Fix Vespa Title Overly Punished when Missing (#995)

2025-04-08 20:08:36 +02:00 · 2024-01-24 15:13:36 -08:00 · 2024-01-24 15:13:36 -08:00 · 50086526e2
commit 50086526e2
parent 7174ea3908
3 changed files with 10 additions and 1 deletions
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@ -9,6 +9,7 @@ SOURCE_LINKS = "source_links"
 SOURCE_LINK = "link"
 SEMANTIC_IDENTIFIER = "semantic_identifier"
 TITLE = "title"
+SKIP_TITLE_EMBEDDING = "skip_title"
 SECTION_CONTINUATION = "section_continuation"
 EMBEDDINGS = "embeddings"
 TITLE_EMBEDDING = "title_embedding"
--- a/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
+++ b/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
@ -11,6 +11,11 @@ schema danswer_chunk {
        field semantic_identifier type string {
            indexing: summary | attribute
        }
+        # Must have an additional field for whether to skip title embeddings
+        # This information cannot be extracted from either the title field or the title embedding
+        field skip_title type bool {
+            indexing: attribute
+        }
        # May not always match the `semantic_identifier` e.g. for Slack docs the
        # `semantic_identifier` will be the channel name, but the `title` will be empty
        field title type string {
@ -149,7 +154,7 @@ schema danswer_chunk {
        function vector_score() {
            expression {
                # If no title, the full vector score comes from the content embedding
-                (query(title_content_ratio) * if(isNan(attribute(title)) == 1, closeness(field, embeddings), closeness(field, title_embedding))) +
+                (query(title_content_ratio) * if(attribute(skip_title), closeness(field, embeddings), closeness(field, title_embedding))) +
                ((1 - query(title_content_ratio)) * closeness(field, embeddings))
            }
        }
--- a/backend/danswer/document_index/vespa/index.py
+++ b/backend/danswer/document_index/vespa/index.py
@ -48,6 +48,7 @@ from danswer.configs.constants import RECENCY_BIAS
 from danswer.configs.constants import SECONDARY_OWNERS
 from danswer.configs.constants import SECTION_CONTINUATION
 from danswer.configs.constants import SEMANTIC_IDENTIFIER
+from danswer.configs.constants import SKIP_TITLE_EMBEDDING
 from danswer.configs.constants import SOURCE_LINKS
 from danswer.configs.constants import SOURCE_TYPE
 from danswer.configs.constants import TITLE
@ -256,6 +257,7 @@ def _index_vespa_chunk(
        CHUNK_ID: chunk.chunk_id,
        BLURB: remove_invalid_unicode_chars(chunk.blurb),
        TITLE: remove_invalid_unicode_chars(title) if title else None,
+        SKIP_TITLE_EMBEDDING: not title,
        CONTENT: remove_invalid_unicode_chars(chunk.content),
        # This duplication of `content` is needed for keyword highlighting :(
        CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content),
@ -560,6 +562,7 @@ def _query_vespa(query_params: Mapping[str, str | int | float]) -> list[Inferenc
    filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None]

    inference_chunks = [_vespa_hit_to_inference_chunk(hit) for hit in filtered_hits]
+    # Good debugging spot: inference_chunks holds the converted Vespa hits at this point
    return inference_chunks