diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py index 7afd3902d..8e6562e04 100644 --- a/backend/danswer/configs/constants.py +++ b/backend/danswer/configs/constants.py @@ -9,6 +9,7 @@ SOURCE_LINKS = "source_links" SOURCE_LINK = "link" SEMANTIC_IDENTIFIER = "semantic_identifier" TITLE = "title" +SKIP_TITLE_EMBEDDING = "skip_title" SECTION_CONTINUATION = "section_continuation" EMBEDDINGS = "embeddings" TITLE_EMBEDDING = "title_embedding" diff --git a/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd b/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd index 48c52ec94..3b6a2ec43 100644 --- a/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd +++ b/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd @@ -11,6 +11,11 @@ schema danswer_chunk { field semantic_identifier type string { indexing: summary | attribute } + # Must have an additional field for whether to skip title embeddings + # This information cannot be extracted from either the title field or the title embedding + field skip_title type bool { + indexing: attribute + } # May not always match the `semantic_identifier` e.g. 
for Slack docs the # `semantic_identifier` will be the channel name, but the `title` will be empty field title type string { @@ -149,7 +154,7 @@ schema danswer_chunk { function vector_score() { expression { # If no title, the full vector score comes from the content embedding - (query(title_content_ratio) * if(isNan(attribute(title)) == 1, closeness(field, embeddings), closeness(field, title_embedding))) + + (query(title_content_ratio) * if(attribute(skip_title), closeness(field, embeddings), closeness(field, title_embedding))) + ((1 - query(title_content_ratio)) * closeness(field, embeddings)) } } diff --git a/backend/danswer/document_index/vespa/index.py b/backend/danswer/document_index/vespa/index.py index 3ab3d7426..0679231e6 100644 --- a/backend/danswer/document_index/vespa/index.py +++ b/backend/danswer/document_index/vespa/index.py @@ -48,6 +48,7 @@ from danswer.configs.constants import RECENCY_BIAS from danswer.configs.constants import SECONDARY_OWNERS from danswer.configs.constants import SECTION_CONTINUATION from danswer.configs.constants import SEMANTIC_IDENTIFIER +from danswer.configs.constants import SKIP_TITLE_EMBEDDING from danswer.configs.constants import SOURCE_LINKS from danswer.configs.constants import SOURCE_TYPE from danswer.configs.constants import TITLE @@ -256,6 +257,7 @@ def _index_vespa_chunk( CHUNK_ID: chunk.chunk_id, BLURB: remove_invalid_unicode_chars(chunk.blurb), TITLE: remove_invalid_unicode_chars(title) if title else None, + SKIP_TITLE_EMBEDDING: not title, CONTENT: remove_invalid_unicode_chars(chunk.content), # This duplication of `content` is needed for keyword highlighting :( CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content), @@ -560,6 +562,7 @@ def _query_vespa(query_params: Mapping[str, str | int | float]) -> list[Inferenc filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None] inference_chunks = [_vespa_hit_to_inference_chunk(hit) for hit in filtered_hits] + # Good Debugging Spot return 
inference_chunks