mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-04-08 20:08:36 +02:00
Fix Vespa Title Overly Punished when Missing (#995)
This commit is contained in:
parent
7174ea3908
commit
50086526e2
@ -9,6 +9,7 @@ SOURCE_LINKS = "source_links"
|
||||
SOURCE_LINK = "link"
|
||||
SEMANTIC_IDENTIFIER = "semantic_identifier"
|
||||
TITLE = "title"
|
||||
SKIP_TITLE_EMBEDDING = "skip_title"
|
||||
SECTION_CONTINUATION = "section_continuation"
|
||||
EMBEDDINGS = "embeddings"
|
||||
TITLE_EMBEDDING = "title_embedding"
|
||||
|
@ -11,6 +11,11 @@ schema danswer_chunk {
|
||||
field semantic_identifier type string {
|
||||
indexing: summary | attribute
|
||||
}
|
||||
# An additional field is required to record whether to skip the title embedding
|
||||
# This information cannot be extracted from either the title field or the title embedding
|
||||
field skip_title type bool {
|
||||
indexing: attribute
|
||||
}
|
||||
# May not always match the `semantic_identifier` e.g. for Slack docs the
|
||||
# `semantic_identifier` will be the channel name, but the `title` will be empty
|
||||
field title type string {
|
||||
@ -149,7 +154,7 @@ schema danswer_chunk {
|
||||
function vector_score() {
|
||||
expression {
|
||||
# If no title, the full vector score comes from the content embedding
|
||||
(query(title_content_ratio) * if(isNan(attribute(title)) == 1, closeness(field, embeddings), closeness(field, title_embedding))) +
|
||||
(query(title_content_ratio) * if(attribute(skip_title), closeness(field, embeddings), closeness(field, title_embedding))) +
|
||||
((1 - query(title_content_ratio)) * closeness(field, embeddings))
|
||||
}
|
||||
}
|
||||
|
@ -48,6 +48,7 @@ from danswer.configs.constants import RECENCY_BIAS
|
||||
from danswer.configs.constants import SECONDARY_OWNERS
|
||||
from danswer.configs.constants import SECTION_CONTINUATION
|
||||
from danswer.configs.constants import SEMANTIC_IDENTIFIER
|
||||
from danswer.configs.constants import SKIP_TITLE_EMBEDDING
|
||||
from danswer.configs.constants import SOURCE_LINKS
|
||||
from danswer.configs.constants import SOURCE_TYPE
|
||||
from danswer.configs.constants import TITLE
|
||||
@ -256,6 +257,7 @@ def _index_vespa_chunk(
|
||||
CHUNK_ID: chunk.chunk_id,
|
||||
BLURB: remove_invalid_unicode_chars(chunk.blurb),
|
||||
TITLE: remove_invalid_unicode_chars(title) if title else None,
|
||||
SKIP_TITLE_EMBEDDING: not title,
|
||||
CONTENT: remove_invalid_unicode_chars(chunk.content),
|
||||
# This duplication of `content` is needed for keyword highlighting :(
|
||||
CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content),
|
||||
@ -560,6 +562,7 @@ def _query_vespa(query_params: Mapping[str, str | int | float]) -> list[Inferenc
|
||||
filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None]
|
||||
|
||||
inference_chunks = [_vespa_hit_to_inference_chunk(hit) for hit in filtered_hits]
|
||||
# Good Debugging Spot
|
||||
return inference_chunks
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user