Fix Vespa Title Overly Punished when Missing (#995)

This commit is contained in:
Yuhong Sun 2024-01-24 15:13:36 -08:00 committed by GitHub
parent 7174ea3908
commit 50086526e2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 10 additions and 1 deletions

View File

@ -9,6 +9,7 @@ SOURCE_LINKS = "source_links"
SOURCE_LINK = "link"
SEMANTIC_IDENTIFIER = "semantic_identifier"
TITLE = "title"
SKIP_TITLE_EMBEDDING = "skip_title"
SECTION_CONTINUATION = "section_continuation"
EMBEDDINGS = "embeddings"
TITLE_EMBEDDING = "title_embedding"

View File

@ -11,6 +11,11 @@ schema danswer_chunk {
field semantic_identifier type string {
indexing: summary | attribute
}
# Explicit flag telling ranking whether to skip the title embedding for this chunk.
# A dedicated field is required because this information cannot be extracted from
# either the title field or the title embedding.
# The indexing code populates it as `not title`, and the vector_score rank function
# reads it via attribute(skip_title) to fall back to the content embedding.
field skip_title type bool {
indexing: attribute
}
# May not always match the `semantic_identifier` e.g. for Slack docs the
# `semantic_identifier` will be the channel name, but the `title` will be empty
field title type string {
@ -149,7 +154,7 @@ schema danswer_chunk {
function vector_score() {
expression {
# If no title, the full vector score comes from the content embedding
(query(title_content_ratio) * if(isNan(attribute(title)) == 1, closeness(field, embeddings), closeness(field, title_embedding))) +
(query(title_content_ratio) * if(attribute(skip_title), closeness(field, embeddings), closeness(field, title_embedding))) +
((1 - query(title_content_ratio)) * closeness(field, embeddings))
}
}

View File

@ -48,6 +48,7 @@ from danswer.configs.constants import RECENCY_BIAS
from danswer.configs.constants import SECONDARY_OWNERS
from danswer.configs.constants import SECTION_CONTINUATION
from danswer.configs.constants import SEMANTIC_IDENTIFIER
from danswer.configs.constants import SKIP_TITLE_EMBEDDING
from danswer.configs.constants import SOURCE_LINKS
from danswer.configs.constants import SOURCE_TYPE
from danswer.configs.constants import TITLE
@ -256,6 +257,7 @@ def _index_vespa_chunk(
CHUNK_ID: chunk.chunk_id,
BLURB: remove_invalid_unicode_chars(chunk.blurb),
TITLE: remove_invalid_unicode_chars(title) if title else None,
SKIP_TITLE_EMBEDDING: not title,
CONTENT: remove_invalid_unicode_chars(chunk.content),
# This duplication of `content` is needed for keyword highlighting :(
CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content),
@ -560,6 +562,7 @@ def _query_vespa(query_params: Mapping[str, str | int | float]) -> list[Inferenc
filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None]
inference_chunks = [_vespa_hit_to_inference_chunk(hit) for hit in filtered_hits]
# Good Debugging Spot
return inference_chunks