improved basic search latency (#4186)

* improved basic search latency
* address PR comments + minor cleanup
2025-07-28 13:53:28 +02:00 · 2025-03-06 14:22:59 -08:00
parent 29382656fc
commit b7da91e3ae
23 changed files with 460 additions and 149 deletions
--- a/backend/onyx/context/search/models.py
+++ b/backend/onyx/context/search/models.py
@@ -16,7 +16,7 @@ from onyx.db.models import SearchSettings
 from onyx.indexing.models import BaseChunk
 from onyx.indexing.models import IndexingSetting
 from shared_configs.enums import RerankerProvider
-
+from shared_configs.model_server_models import Embedding

 MAX_METRICS_CONTENT = (
    200  # Just need enough characters to identify where in the doc the chunk is
@@ -151,6 +151,10 @@ class SearchRequest(ChunkContext):
    evaluation_type: LLMEvaluationType = LLMEvaluationType.UNSPECIFIED
    model_config = ConfigDict(arbitrary_types_allowed=True)

+    precomputed_query_embedding: Embedding | None = None
+    precomputed_is_keyword: bool | None = None
+    precomputed_keywords: list[str] | None = None
+

 class SearchQuery(ChunkContext):
    "Processed Request that is directly passed to the SearchPipeline"
@@ -175,6 +179,8 @@ class SearchQuery(ChunkContext):
    offset: int = 0
    model_config = ConfigDict(frozen=True)

+    precomputed_query_embedding: Embedding | None = None
+

 class RetrievalDetails(ChunkContext):
    # Use LLM to determine whether to do a retrieval or only rely on existing history
--- a/backend/onyx/context/search/pipeline.py
+++ b/backend/onyx/context/search/pipeline.py
@@ -331,6 +331,14 @@ class SearchPipeline:
        self._retrieved_sections = expanded_inference_sections
        return expanded_inference_sections

+    @property
+    def retrieved_sections(self) -> list[InferenceSection]:
+        """Sections from `_get_sections()`, computed on first access then cached."""
+        if self._retrieved_sections is None:
+            self._retrieved_sections = self._get_sections()
+
+        return self._retrieved_sections
+
    @property
    def reranked_sections(self) -> list[InferenceSection]:
        """Reranking is always done at the chunk level since section merging could create arbitrarily
@@ -343,7 +351,7 @@ class SearchPipeline:
        if self._reranked_sections is not None:
            return self._reranked_sections

-        retrieved_sections = self._get_sections()
+        retrieved_sections = self.retrieved_sections
        if self.retrieved_sections_callback is not None:
            self.retrieved_sections_callback(retrieved_sections)

--- a/backend/onyx/context/search/preprocessing/preprocessing.py
+++ b/backend/onyx/context/search/preprocessing/preprocessing.py
@@ -117,8 +117,12 @@ def retrieval_preprocessing(
        else None
    )

+    # Sometimes this is pre-computed in parallel with other heavy tasks to improve
+    # latency, and in that case we don't need to run the model again
    run_query_analysis = (
-        None if skip_query_analysis else FunctionCall(query_analysis, (query,), {})
+        None
+        if (skip_query_analysis or search_request.precomputed_is_keyword is not None)
+        else FunctionCall(query_analysis, (query,), {})
    )

    functions_to_run = [
@@ -143,11 +147,12 @@ def retrieval_preprocessing(

    # The extracted keywords right now are not very reliable, not using for now
    # Can maybe use for highlighting
-    is_keyword, extracted_keywords = (
-        parallel_results[run_query_analysis.result_id]
-        if run_query_analysis
-        else (False, None)
-    )
+    is_keyword, _extracted_keywords = False, None
+    if search_request.precomputed_is_keyword is not None:
+        is_keyword = search_request.precomputed_is_keyword
+        _extracted_keywords = search_request.precomputed_keywords
+    elif run_query_analysis:
+        is_keyword, _extracted_keywords = parallel_results[run_query_analysis.result_id]

    all_query_terms = query.split()
    processed_keywords = (
@@ -247,4 +252,5 @@ def retrieval_preprocessing(
        chunks_above=chunks_above,
        chunks_below=chunks_below,
        full_doc=search_request.full_doc,
+        precomputed_query_embedding=search_request.precomputed_query_embedding,
    )
--- a/backend/onyx/context/search/retrieval/search_runner.py
+++ b/backend/onyx/context/search/retrieval/search_runner.py
@@ -31,7 +31,7 @@ from onyx.utils.timing import log_function_time
 from shared_configs.configs import MODEL_SERVER_HOST
 from shared_configs.configs import MODEL_SERVER_PORT
 from shared_configs.enums import EmbedTextType
-
+from shared_configs.model_server_models import Embedding

 logger = setup_logger()

@@ -109,6 +109,20 @@ def combine_retrieval_results(
    return sorted_chunks


+def get_query_embedding(query: str, db_session: Session) -> Embedding:
+    """Embed `query` as a QUERY-type text using the current search settings' model."""
+    embedding_model = EmbeddingModel.from_db_model(
+        search_settings=get_current_search_settings(db_session),
+        # The below are globally set, this flow always uses the indexing one
+        server_host=MODEL_SERVER_HOST,
+        server_port=MODEL_SERVER_PORT,
+    )
+
+    # encode() is given a one-element list, so the query's embedding is result 0
+    embeddings = embedding_model.encode([query], text_type=EmbedTextType.QUERY)
+    return embeddings[0]
+
+
@log_function_time(print_only=True)
 def doc_index_retrieval(
    query: SearchQuery,
@@ -121,17 +135,10 @@ def doc_index_retrieval(
    from the large chunks to the referenced chunks,
    dedupes the chunks, and cleans the chunks.
    """
-    search_settings = get_current_search_settings(db_session)
-
-    model = EmbeddingModel.from_db_model(
-        search_settings=search_settings,
-        # The below are globally set, this flow always uses the indexing one
-        server_host=MODEL_SERVER_HOST,
-        server_port=MODEL_SERVER_PORT,
+    query_embedding = query.precomputed_query_embedding or get_query_embedding(
+        query.query, db_session
    )

-    query_embedding = model.encode([query.query], text_type=EmbedTextType.QUERY)[0]
-
    top_chunks = document_index.hybrid_retrieval(
        query=query.query,
        query_embedding=query_embedding,
@@ -250,6 +257,9 @@ def retrieve_chunks(
            simplified_queries.add(simplified_rephrase)

            q_copy = query.copy(update={"query": rephrase}, deep=True)
+            # SearchQuery is frozen, so assigning the attribute would raise; clear the
+            # precomputed embedding via a copy so it is recomputed for each rephrase
+            q_copy = q_copy.copy(update={"precomputed_query_embedding": None})
            run_queries.append(
                (
                    doc_index_retrieval,