Expanded basic search (#4517)

* initial working version

* ranking profile

* modification for keyword/instruction retrieval

* mypy fixes

* EL comments

* added env var (True for now)

* flipped default to False

* mypy & final EL/CW comments + import issue
joachim-danswer authored on 2025-04-13 23:13:01 -07:00, committed by GitHub
parent e3aab8e85e
commit 2683207a24
14 changed files with 438 additions and 36 deletions


@@ -4,6 +4,8 @@ from datetime import datetime
from typing import Any
from onyx.access.models import DocumentAccess
from onyx.agents.agent_search.shared_graph_utils.models import QueryExpansionType
from onyx.configs.chat_configs import TITLE_CONTENT_RATIO
from onyx.context.search.models import IndexFilters
from onyx.context.search.models import InferenceChunkUncleaned
from onyx.db.enums import EmbeddingPrecision
@@ -351,7 +353,9 @@ class HybridCapable(abc.ABC):
        hybrid_alpha: float,
        time_decay_multiplier: float,
        num_to_retrieve: int,
        ranking_profile_type: QueryExpansionType,
        offset: int = 0,
        title_content_ratio: float | None = TITLE_CONTENT_RATIO,
    ) -> list[InferenceChunkUncleaned]:
        """
        Run hybrid search and return a list of inference chunks.
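
For orientation, here is a minimal self-contained sketch of the expanded interface as an implementer sees it. Everything not shown in the hunk above is an assumption: the method name, the leading parameters, the SEMANTIC member, and the local stand-in types defined only so the sketch runs on its own.

from abc import ABC, abstractmethod
from enum import Enum
from typing import Any


class QueryExpansionType(Enum):
    # Stand-in for onyx.agents.agent_search.shared_graph_utils.models.QueryExpansionType;
    # KEYWORD is confirmed later in this diff, SEMANTIC is assumed.
    KEYWORD = "keyword"
    SEMANTIC = "semantic"


TITLE_CONTENT_RATIO = 0.2  # illustrative; the real default comes from onyx.configs.chat_configs


class HybridCapableSketch(ABC):
    @abstractmethod
    def hybrid_retrieval(  # method name assumed; the hunk only shows the tail of the signature
        self,
        query: str,                      # leading parameters assumed
        query_embedding: list[float],
        hybrid_alpha: float,
        time_decay_multiplier: float,
        num_to_retrieve: int,
        ranking_profile_type: QueryExpansionType,  # new: keyword- vs semantic-tuned ranking profile
        offset: int = 0,
        title_content_ratio: float | None = TITLE_CONTENT_RATIO,
    ) -> list[Any]:  # the real return type is list[InferenceChunkUncleaned]
        """Run hybrid search and return a list of inference chunks."""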


@@ -176,7 +176,7 @@ schema DANSWER_CHUNK_NAME {
        match-features: recency_bias
    }
    rank-profile hybrid_searchVARIABLE_DIM inherits default, default_rank {
    rank-profile hybrid_search_semantic_base_VARIABLE_DIM inherits default, default_rank {
        inputs {
            query(query_embedding) tensor<float>(x[VARIABLE_DIM])
        }
@@ -192,7 +192,75 @@ schema DANSWER_CHUNK_NAME {
        # First phase must be vector to allow hits that have no keyword matches
        first-phase {
            expression: closeness(field, embeddings)
            expression: query(title_content_ratio) * closeness(field, title_embedding) + (1 - query(title_content_ratio)) * closeness(field, embeddings)
        }
        # Weighted average between Vector Search and BM-25
        global-phase {
            expression {
                (
                    # Weighted Vector Similarity Score
                    (
                        query(alpha) * (
                            (query(title_content_ratio) * normalize_linear(title_vector_score))
                            +
                            ((1 - query(title_content_ratio)) * normalize_linear(closeness(field, embeddings)))
                        )
                    )
                    +
                    # Weighted Keyword Similarity Score
                    # Note: for the BM25 Title score, it requires decent stopword removal in the query
                    # This needs to be the case so there aren't irrelevant titles being normalized to a score of 1
                    (
                        (1 - query(alpha)) * (
                            (query(title_content_ratio) * normalize_linear(bm25(title)))
                            +
                            ((1 - query(title_content_ratio)) * normalize_linear(bm25(content)))
                        )
                    )
                )
                # Boost based on user feedback
                * document_boost
                # Decay factor based on time document was last updated
                * recency_bias
                # Boost based on aggregated boost calculation
                * aggregated_chunk_boost
            }
            rerank-count: 1000
        }
        match-features {
            bm25(title)
            bm25(content)
            closeness(field, title_embedding)
            closeness(field, embeddings)
            document_boost
            recency_bias
            aggregated_chunk_boost
            closest(embeddings)
        }
    }
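
For readers less familiar with Vespa ranking expressions, the global phase of the semantic profile corresponds roughly to the following computation. This is a plain-Python paraphrase, not executable Vespa code: normalize_linear() is applied by Vespa per result set, so the score inputs here are assumed to already be scaled to [0, 1].

def semantic_global_phase_score(
    alpha: float,                 # query(alpha): weight of vector vs. BM25 similarity
    title_content_ratio: float,   # query(title_content_ratio): weight of title vs. content
    title_vector_score: float,    # normalized title-embedding closeness (with content fallback)
    content_vector_score: float,  # normalized closeness(field, embeddings)
    bm25_title: float,            # normalized bm25(title)
    bm25_content: float,          # normalized bm25(content)
    document_boost: float,
    recency_bias: float,
    aggregated_chunk_boost: float,
) -> float:
    vector_part = (
        title_content_ratio * title_vector_score
        + (1 - title_content_ratio) * content_vector_score
    )
    keyword_part = (
        title_content_ratio * bm25_title
        + (1 - title_content_ratio) * bm25_content
    )
    blended = alpha * vector_part + (1 - alpha) * keyword_part
    # Multiplicative boosts: user feedback, recency decay, aggregated chunk boost
    return blended * document_boost * recency_bias * aggregated_chunk_boost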
    rank-profile hybrid_search_keyword_base_VARIABLE_DIM inherits default, default_rank {
        inputs {
            query(query_embedding) tensor<float>(x[VARIABLE_DIM])
        }
        function title_vector_score() {
            expression {
                # If no good matching titles, then it should use the context embeddings rather than having some
                # irrelevant title have a vector score of 1. This way at least it will be the doc with the highest
                # matching content score getting the full score
                max(closeness(field, embeddings), closeness(field, title_embedding))
            }
        }
        # For this keyword-oriented profile, the first phase is a blended BM25 score over title and content
        first-phase {
            expression: query(title_content_ratio) * bm25(title) + (1 - query(title_content_ratio)) * bm25(content)
        }
        # Weighted average between Vector Search and BM-25
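
The keyword-base profile mirrors the semantic one but swaps which signal drives the first phase: candidates are first ranked by a blended BM25 score instead of blended embedding closeness, and title_vector_score (also referenced by the semantic profile's global phase above) provides the content fallback when no title matches well. A small plain-Python paraphrase of the two first-phase expressions and that fallback:

def semantic_first_phase(title_content_ratio: float, title_closeness: float, content_closeness: float) -> float:
    # query(title_content_ratio) * closeness(field, title_embedding) + (1 - ratio) * closeness(field, embeddings)
    return title_content_ratio * title_closeness + (1 - title_content_ratio) * content_closeness


def keyword_first_phase(title_content_ratio: float, bm25_title: float, bm25_content: float) -> float:
    # query(title_content_ratio) * bm25(title) + (1 - ratio) * bm25(content)
    return title_content_ratio * bm25_title + (1 - title_content_ratio) * bm25_content


def title_vector_score(content_closeness: float, title_closeness: float) -> float:
    # If no title matches well, fall back to content closeness so an irrelevant
    # title cannot be normalized to a score of 1.
    return max(content_closeness, title_closeness)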


@@ -19,6 +19,7 @@ import httpx # type: ignore
import requests # type: ignore
from retry import retry
from onyx.agents.agent_search.shared_graph_utils.models import QueryExpansionType
from onyx.configs.chat_configs import DOC_TIME_DECAY
from onyx.configs.chat_configs import NUM_RETURNED_HITS
from onyx.configs.chat_configs import TITLE_CONTENT_RATIO
@@ -800,12 +801,14 @@ class VespaIndex(DocumentIndex):
        hybrid_alpha: float,
        time_decay_multiplier: float,
        num_to_retrieve: int,
        ranking_profile_type: QueryExpansionType,
        offset: int = 0,
        title_content_ratio: float | None = TITLE_CONTENT_RATIO,
    ) -> list[InferenceChunkUncleaned]:
        vespa_where_clauses = build_vespa_filters(filters)
        # Needs to be at least as much as the value set in Vespa schema config
        target_hits = max(10 * num_to_retrieve, 1000)
        yql = (
            YQL_BASE.format(index_name=self.index_name)
            + vespa_where_clauses
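
A quick numeric check of the target_hits floor above (a toy snippet, not project code): for typical retrieval sizes the 1000 minimum dominates, keeping the candidate pool at least as large as the value configured in the Vespa schema.

# target_hits = max(10 * num_to_retrieve, 1000)
for num_to_retrieve in (10, 50, 200):
    print(num_to_retrieve, max(10 * num_to_retrieve, 1000))  # 10 -> 1000, 50 -> 1000, 200 -> 2000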
@@ -817,6 +820,11 @@
        final_query = " ".join(final_keywords) if final_keywords else query
        if ranking_profile_type == QueryExpansionType.KEYWORD:
            ranking_profile = f"hybrid_search_keyword_base_{len(query_embedding)}"
        else:
            ranking_profile = f"hybrid_search_semantic_base_{len(query_embedding)}"
        logger.debug(f"Query YQL: {yql}")
        params: dict[str, str | int | float] = {
@@ -832,7 +840,7 @@
            ),
            "hits": num_to_retrieve,
            "offset": offset,
            "ranking.profile": f"hybrid_search{len(query_embedding)}",
            "ranking.profile": ranking_profile,
            "timeout": VESPA_TIMEOUT,
        }
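
Tying the pieces together: the ranking.profile value encodes both the query-expansion flavor and the embedding dimensionality, matching the VARIABLE_DIM suffix rendered into the schema profiles above. A small sketch of that naming convention (the function name and the 768-dimension example are illustrative):

def select_ranking_profile(is_keyword_expansion: bool, query_embedding: list[float]) -> str:
    # Mirrors the selection logic above: keyword expansions get the BM25-first profile,
    # everything else gets the semantic (embedding-first) profile; the suffix is the
    # embedding dimension baked into the deployed schema.
    base = (
        "hybrid_search_keyword_base"
        if is_keyword_expansion
        else "hybrid_search_semantic_base"
    )
    return f"{base}_{len(query_embedding)}"


# e.g. with a 768-dimensional query embedding and keyword-style expansion:
assert select_ranking_profile(True, [0.0] * 768) == "hybrid_search_keyword_base_768"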