Fix Title for docs without a title (#1827)

2025-10-09 12:47:13 +02:00 · 2024-07-14 13:51:11 -07:00
parent 56b175f597
commit da31da33e7
2 changed files with 5 additions and 4 deletions
--- a/backend/danswer/document_index/vespa/index.py
+++ b/backend/danswer/document_index/vespa/index.py
@@ -605,7 +605,7 @@ def _vespa_hit_to_inference_chunk(
        section_continuation=fields[SECTION_CONTINUATION],
        document_id=fields[DOCUMENT_ID],
        source_type=fields[SOURCE_TYPE],
-        title=fields[TITLE],
+        title=fields.get(TITLE),
        semantic_identifier=fields[SEMANTIC_IDENTIFIER],
        boost=fields.get(BOOST, 1),
        recency_bias=fields.get("matchfeatures", {}).get(RECENCY_BIAS, 1.0),
@@ -614,7 +614,7 @@ def _vespa_hit_to_inference_chunk(
        primary_owners=fields.get(PRIMARY_OWNERS),
        secondary_owners=fields.get(SECONDARY_OWNERS),
        metadata=metadata,
-        metadata_suffix=fields.get(METADATA_SUFFIX) or "",
+        metadata_suffix=fields.get(METADATA_SUFFIX),
        match_highlights=match_highlights,
        updated_at=updated_at,
    )
--- a/backend/danswer/search/models.py
+++ b/backend/danswer/search/models.py
@@ -190,11 +190,12 @@ class InferenceChunk(BaseChunk):


 class InferenceChunkUncleaned(InferenceChunk):
-    title: str  # Separate from Semantic Identifier though often same
-    metadata_suffix: str
+    title: str | None  # Separate from Semantic Identifier though often same
+    metadata_suffix: str | None

    def to_inference_chunk(self) -> InferenceChunk:
        # Create a dict of all fields except 'title' and 'metadata_suffix'
+        # Assumes the cleaning has already been applied and just needs to translate to the right type
        inference_chunk_data = {
            k: v
            for k, v in self.dict().items()