k

2025-03-19 14:22:10 +01:00 · 2024-07-25 12:30:29 -07:00 · 2024-07-25 12:30:29 -07:00 · 200d9c9a87
commit 200d9c9a87
parent a4d71e08aa
1 changed files with 25 additions and 34 deletions
--- a/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
+++ b/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
@ -20,18 +20,10 @@ schema DANSWER_CHUNK_NAME {
        # `semantic_identifier` will be the channel name, but the `title` will be empty
        field title type string {
            indexing: summary | index | attribute
-            match {
-                gram
-                gram-size: 3
-            }
            index: enable-bm25
        }
        field content type string {
            indexing: summary | index
-            match {
-                gram
-                gram-size: 3
-            }
            index: enable-bm25
        }
        # duplication of `content` is far from ideal, but is needed for 
@ -153,43 +145,44 @@ schema DANSWER_CHUNK_NAME {
            query(query_embedding) tensor<float>(x[VARIABLE_DIM])
        }

-        # This must be separate function for normalize_linear to work
-        function vector_score() {
+        function title_vector_score() {
            expression {
-                # If no title, the full vector score comes from the content embedding
-                (query(title_content_ratio) * if(attribute(skip_title), closeness(field, embeddings), closeness(field, title_embedding))) +
-                ((1 - query(title_content_ratio)) * closeness(field, embeddings))
-            }
-        }
-
-        # This must be separate function for normalize_linear to work
-        function keyword_score() {
-            expression {
-                (query(title_content_ratio) * bm25(title)) +
-                ((1 - query(title_content_ratio)) * bm25(content))
+                # If there is no well-matching title, fall back to the content embeddings rather than letting some
+                # irrelevant title end up with a vector score of 1. This way, the doc with the highest
+                # matching content score is at least the one that gets the full score.
+                max(closeness(field, embeddings), closeness(field, title_embedding))
            }
        }

        first-phase {
-            expression: vector_score
+            expression: closeness(field, embeddings)
        }

        # Weighted average between Vector Search and BM-25
-        # Each is a weighted average between the Title and Content fields
-        # Finally each doc is boosted by it's user feedback based boost and recency
-        # If any embedding or index field is missing, it just receives a score of 0
-        # Assumptions:
-        # - For a given query + corpus, the BM-25 scores will be relatively similar in distribution
-        #   therefore not normalizing before combining.
-        # - For documents without title, it gets a score of 0 for that and this is ok as documents
-        #   without any title match should be penalized.
        global-phase {
            expression {
                (
                    # Weighted Vector Similarity Score
-                    (query(alpha) * normalize_linear(vector_score)) +
+                    (
+                        query(alpha) * (
+                            (query(title_content_ratio) * normalize_linear(title_vector_score))
+                            +
+                            ((1 - query(title_content_ratio)) * normalize_linear(closeness(field, embeddings)))
+                        )
+                    )
+
+                    +
+
                    # Weighted Keyword Similarity Score
-                    ((1 - query(alpha)) * normalize_linear(keyword_score))
+                    # Note: the BM25 title score requires decent stopword removal in the query.
+                    # Without it, irrelevant titles could end up being normalized to a score of 1.
+                    (
+                        (1 - query(alpha)) * (
+                            (query(title_content_ratio) * normalize_linear(bm25(title)))
+                            +
+                            ((1 - query(title_content_ratio)) * normalize_linear(bm25(content)))
+                        )
+                    )
                )
                # Boost based on user feedback
                * document_boost
@ -204,8 +197,6 @@ schema DANSWER_CHUNK_NAME {
            bm25(content)
            closeness(field, title_embedding)
            closeness(field, embeddings)
-            keyword_score
-            vector_score
            document_boost
            recency_bias
            closest(embeddings)