Expanded basic search (#4517)

* initial working version

* ranking profile

* modification for keyword/instruction retrieval

* mypy fixes

* EL comments

* added env var (True for now)

* flipped default to False

* mypy & final EL/CW comments + import issue
joachim-danswer authored on 2025-04-13 23:13:01 -07:00, committed by GitHub
parent e3aab8e85e
commit 2683207a24
14 changed files with 438 additions and 36 deletions


@@ -4,6 +4,8 @@ from datetime import datetime
from typing import Any
from onyx.access.models import DocumentAccess
from onyx.agents.agent_search.shared_graph_utils.models import QueryExpansionType
from onyx.configs.chat_configs import TITLE_CONTENT_RATIO
from onyx.context.search.models import IndexFilters
from onyx.context.search.models import InferenceChunkUncleaned
from onyx.db.enums import EmbeddingPrecision
@@ -351,7 +353,9 @@ class HybridCapable(abc.ABC):
        hybrid_alpha: float,
        time_decay_multiplier: float,
        num_to_retrieve: int,
        ranking_profile_type: QueryExpansionType,
        offset: int = 0,
        title_content_ratio: float | None = TITLE_CONTENT_RATIO,
    ) -> list[InferenceChunkUncleaned]:
        """
        Run hybrid search and return a list of inference chunks.
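
For orientation, here is a minimal self-contained sketch of the expanded interface as an implementer sees it. Everything not shown in the hunk above is an assumption: the method name, the leading parameters, the SEMANTIC member, and the local stand-in types defined only so the sketch runs on its own.

from abc import ABC, abstractmethod
from enum import Enum
from typing import Any


class QueryExpansionType(Enum):
    # Stand-in for onyx.agents.agent_search.shared_graph_utils.models.QueryExpansionType;
    # KEYWORD is confirmed later in this diff, SEMANTIC is assumed.
    KEYWORD = "keyword"
    SEMANTIC = "semantic"


TITLE_CONTENT_RATIO = 0.2  # illustrative; the real default comes from onyx.configs.chat_configs


class HybridCapableSketch(ABC):
    @abstractmethod
    def hybrid_retrieval(  # method name assumed; the hunk only shows the tail of the signature
        self,
        query: str,                      # leading parameters assumed
        query_embedding: list[float],
        hybrid_alpha: float,
        time_decay_multiplier: float,
        num_to_retrieve: int,
        ranking_profile_type: QueryExpansionType,  # new: keyword- vs semantic-tuned ranking profile
        offset: int = 0,
        title_content_ratio: float | None = TITLE_CONTENT_RATIO,
    ) -> list[Any]:  # the real return type is list[InferenceChunkUncleaned]
        """Run hybrid search and return a list of inference chunks."""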


@@ -176,7 +176,7 @@ schema DANSWER_CHUNK_NAME {
        match-features: recency_bias
    }
    rank-profile hybrid_searchVARIABLE_DIM inherits default, default_rank {
    rank-profile hybrid_search_semantic_base_VARIABLE_DIM inherits default, default_rank {
        inputs {
            query(query_embedding) tensor<float>(x[VARIABLE_DIM])
        }
@@ -192,7 +192,75 @@ schema DANSWER_CHUNK_NAME {
        # First phase must be vector to allow hits that have no keyword matches
        first-phase {
            expression: closeness(field, embeddings)
            expression: query(title_content_ratio) * closeness(field, title_embedding) + (1 - query(title_content_ratio)) * closeness(field, embeddings)
        }
        # Weighted average between Vector Search and BM-25
        global-phase {
            expression {
                (
                    # Weighted Vector Similarity Score
                    (
                        query(alpha) * (
                            (query(title_content_ratio) * normalize_linear(title_vector_score))
                            +
                            ((1 - query(title_content_ratio)) * normalize_linear(closeness(field, embeddings)))
                        )
                    )
                    +
                    # Weighted Keyword Similarity Score
                    # Note: for the BM25 Title score, it requires decent stopword removal in the query
                    # This needs to be the case so there aren't irrelevant titles being normalized to a score of 1
                    (
                        (1 - query(alpha)) * (
                            (query(title_content_ratio) * normalize_linear(bm25(title)))
                            +
                            ((1 - query(title_content_ratio)) * normalize_linear(bm25(content)))
                        )
                    )
                )
                # Boost based on user feedback
                * document_boost
                # Decay factor based on time document was last updated
                * recency_bias
                # Boost based on aggregated boost calculation
                * aggregated_chunk_boost
            }
            rerank-count: 1000
        }
        match-features {
            bm25(title)
            bm25(content)
            closeness(field, title_embedding)
            closeness(field, embeddings)
            document_boost
            recency_bias
            aggregated_chunk_boost
            closest(embeddings)
        }
    }
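
For readers less familiar with Vespa ranking expressions, the global phase of the semantic profile corresponds roughly to the following computation. This is a plain-Python paraphrase, not executable Vespa code: normalize_linear() is applied by Vespa per result set, so the score inputs here are assumed to already be scaled to [0, 1].

def semantic_global_phase_score(
    alpha: float,                 # query(alpha): weight of vector vs. BM25 similarity
    title_content_ratio: float,   # query(title_content_ratio): weight of title vs. content
    title_vector_score: float,    # normalized title-embedding closeness (with content fallback)
    content_vector_score: float,  # normalized closeness(field, embeddings)
    bm25_title: float,            # normalized bm25(title)
    bm25_content: float,          # normalized bm25(content)
    document_boost: float,
    recency_bias: float,
    aggregated_chunk_boost: float,
) -> float:
    vector_part = (
        title_content_ratio * title_vector_score
        + (1 - title_content_ratio) * content_vector_score
    )
    keyword_part = (
        title_content_ratio * bm25_title
        + (1 - title_content_ratio) * bm25_content
    )
    blended = alpha * vector_part + (1 - alpha) * keyword_part
    # Multiplicative boosts: user feedback, recency decay, aggregated chunk boost
    return blended * document_boost * recency_bias * aggregated_chunk_boost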
    rank-profile hybrid_search_keyword_base_VARIABLE_DIM inherits default, default_rank {
        inputs {
            query(query_embedding) tensor<float>(x[VARIABLE_DIM])
        }
        function title_vector_score() {
            expression {
                # If no good matching titles, then it should use the context embeddings rather than having some
                # irrelevant title have a vector score of 1. This way at least it will be the doc with the highest
                # matching content score getting the full score
                max(closeness(field, embeddings), closeness(field, title_embedding))
            }
        }
        # For this keyword-oriented profile, the first phase is a blended BM25 score over title and content
        first-phase {
            expression: query(title_content_ratio) * bm25(title) + (1 - query(title_content_ratio)) * bm25(content)
        }
        # Weighted average between Vector Search and BM-25
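
The keyword-base profile mirrors the semantic one but swaps which signal drives the first phase: candidates are first ranked by a blended BM25 score instead of blended embedding closeness, and title_vector_score (also referenced by the semantic profile's global phase above) provides the content fallback when no title matches well. A small plain-Python paraphrase of the two first-phase expressions and that fallback:

def semantic_first_phase(title_content_ratio: float, title_closeness: float, content_closeness: float) -> float:
    # query(title_content_ratio) * closeness(field, title_embedding) + (1 - ratio) * closeness(field, embeddings)
    return title_content_ratio * title_closeness + (1 - title_content_ratio) * content_closeness


def keyword_first_phase(title_content_ratio: float, bm25_title: float, bm25_content: float) -> float:
    # query(title_content_ratio) * bm25(title) + (1 - ratio) * bm25(content)
    return title_content_ratio * bm25_title + (1 - title_content_ratio) * bm25_content


def title_vector_score(content_closeness: float, title_closeness: float) -> float:
    # If no title matches well, fall back to content closeness so an irrelevant
    # title cannot be normalized to a score of 1.
    return max(content_closeness, title_closeness)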


@@ -19,6 +19,7 @@ import httpx # type: ignore
import requests # type: ignore
from retry import retry
from onyx.agents.agent_search.shared_graph_utils.models import QueryExpansionType
from onyx.configs.chat_configs import DOC_TIME_DECAY
from onyx.configs.chat_configs import NUM_RETURNED_HITS
from onyx.configs.chat_configs import TITLE_CONTENT_RATIO
@@ -800,12 +801,14 @@ class VespaIndex(DocumentIndex):
        hybrid_alpha: float,
        time_decay_multiplier: float,
        num_to_retrieve: int,
        ranking_profile_type: QueryExpansionType,
        offset: int = 0,
        title_content_ratio: float | None = TITLE_CONTENT_RATIO,
    ) -> list[InferenceChunkUncleaned]:
        vespa_where_clauses = build_vespa_filters(filters)
        # Needs to be at least as much as the value set in Vespa schema config
        target_hits = max(10 * num_to_retrieve, 1000)
        yql = (
            YQL_BASE.format(index_name=self.index_name)
            + vespa_where_clauses
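
A quick numeric check of the target_hits floor above (a toy snippet, not project code): for typical retrieval sizes the 1000 minimum dominates, keeping the candidate pool at least as large as the value configured in the Vespa schema.

# target_hits = max(10 * num_to_retrieve, 1000)
for num_to_retrieve in (10, 50, 200):
    print(num_to_retrieve, max(10 * num_to_retrieve, 1000))  # 10 -> 1000, 50 -> 1000, 200 -> 2000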
@@ -817,6 +820,11 @@
        final_query = " ".join(final_keywords) if final_keywords else query
        if ranking_profile_type == QueryExpansionType.KEYWORD:
            ranking_profile = f"hybrid_search_keyword_base_{len(query_embedding)}"
        else:
            ranking_profile = f"hybrid_search_semantic_base_{len(query_embedding)}"
        logger.debug(f"Query YQL: {yql}")
        params: dict[str, str | int | float] = {
@@ -832,7 +840,7 @@
            ),
            "hits": num_to_retrieve,
            "offset": offset,
            "ranking.profile": f"hybrid_search{len(query_embedding)}",
            "ranking.profile": ranking_profile,
            "timeout": VESPA_TIMEOUT,
        }
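
Tying the pieces together: the ranking.profile value encodes both the query-expansion flavor and the embedding dimensionality, matching the VARIABLE_DIM suffix rendered into the schema profiles above. A small sketch of that naming convention (the function name and the 768-dimension example are illustrative):

def select_ranking_profile(is_keyword_expansion: bool, query_embedding: list[float]) -> str:
    # Mirrors the selection logic above: keyword expansions get the BM25-first profile,
    # everything else gets the semantic (embedding-first) profile; the suffix is the
    # embedding dimension baked into the deployed schema.
    base = (
        "hybrid_search_keyword_base"
        if is_keyword_expansion
        else "hybrid_search_semantic_base"
    )
    return f"{base}_{len(query_embedding)}"


# e.g. with a 768-dimensional query embedding and keyword-style expansion:
assert select_ranking_profile(True, [0.0] * 768) == "hybrid_search_keyword_base_768"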