From 200d9c9a87532fc127abfd6e254a02d962e52163 Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Thu, 25 Jul 2024 12:30:29 -0700
Subject: [PATCH] k

---
 .../vespa/app_config/schemas/danswer_chunk.sd | 59 ++++++++-----------
 1 file changed, 25 insertions(+), 34 deletions(-)

diff --git a/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd b/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
index 67ece1527..280315c2e 100644
--- a/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
+++ b/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
@@ -20,18 +20,10 @@ schema DANSWER_CHUNK_NAME {
         # `semantic_identifier` will be the channel name, but the `title` will be empty
         field title type string {
             indexing: summary | index | attribute
-            match {
-                gram
-                gram-size: 3
-            }
             index: enable-bm25
         }
         field content type string {
             indexing: summary | index
-            match {
-                gram
-                gram-size: 3
-            }
             index: enable-bm25
         }
         # duplication of `content` is far from ideal, but is needed for
@@ -153,43 +145,44 @@ schema DANSWER_CHUNK_NAME {
             query(query_embedding) tensor(x[VARIABLE_DIM])
         }
 
-        # This must be separate function for normalize_linear to work
-        function vector_score() {
+        function title_vector_score() {
             expression {
-                # If no title, the full vector score comes from the content embedding
-                (query(title_content_ratio) * if(attribute(skip_title), closeness(field, embeddings), closeness(field, title_embedding))) +
-                ((1 - query(title_content_ratio)) * closeness(field, embeddings))
-            }
-        }
-
-        # This must be separate function for normalize_linear to work
-        function keyword_score() {
-            expression {
-                (query(title_content_ratio) * bm25(title)) +
-                ((1 - query(title_content_ratio)) * bm25(content))
+                # If no good matching titles, then it should use the context embeddings rather than having some
+                # irrelevant title have a vector score of 1. This way at least it will be the doc with the highest
+                # matching content score getting the full score
+                max(closeness(field, embeddings), closeness(field, title_embedding))
             }
         }
 
         first-phase {
-            expression: vector_score
+            expression: closeness(field, embeddings)
         }
 
         # Weighted average between Vector Search and BM-25
-        # Each is a weighted average between the Title and Content fields
-        # Finally each doc is boosted by it's user feedback based boost and recency
-        # If any embedding or index field is missing, it just receives a score of 0
-        # Assumptions:
-        # - For a given query + corpus, the BM-25 scores will be relatively similar in distribution
-        #   therefore not normalizing before combining.
-        # - For documents without title, it gets a score of 0 for that and this is ok as documents
-        #   without any title match should be penalized.
         global-phase {
             expression {
                 (
                     # Weighted Vector Similarity Score
-                    (query(alpha) * normalize_linear(vector_score)) +
+                    (
+                        query(alpha) * (
+                            (query(title_content_ratio) * normalize_linear(title_vector_score))
+                            +
+                            ((1 - query(title_content_ratio)) * normalize_linear(closeness(field, embeddings)))
+                        )
+                    )
+
+                    +
+
                     # Weighted Keyword Similarity Score
-                    ((1 - query(alpha)) * normalize_linear(keyword_score))
+                    # Note: for the BM25 Title score, it requires decent stopword removal in the query
+                    # This needs to be the case so there aren't irrelevant titles being normalized to a score of 1
+                    (
+                        (1 - query(alpha)) * (
+                            (query(title_content_ratio) * normalize_linear(bm25(title)))
+                            +
+                            ((1 - query(title_content_ratio)) * normalize_linear(bm25(content)))
+                        )
+                    )
                 )
                 # Boost based on user feedback
                 * document_boost
@@ -204,8 +197,6 @@ schema DANSWER_CHUNK_NAME {
             bm25(content)
             closeness(field, title_embedding)
             closeness(field, embeddings)
-            keyword_score
-            vector_score
             document_boost
             recency_bias
             closest(embeddings)