mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-07-05 20:21:02 +02:00
Expanded basic search (#4517)
* initial working version * ranking profile * modification for keyword/instruction retrieval * mypy fixes * EL comments * added env var (True for now) * flipped default to False * mypy & final EL/CW comments + import issue
This commit is contained in:
@ -4,6 +4,8 @@ from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from onyx.access.models import DocumentAccess
|
||||
from onyx.agents.agent_search.shared_graph_utils.models import QueryExpansionType
|
||||
from onyx.configs.chat_configs import TITLE_CONTENT_RATIO
|
||||
from onyx.context.search.models import IndexFilters
|
||||
from onyx.context.search.models import InferenceChunkUncleaned
|
||||
from onyx.db.enums import EmbeddingPrecision
|
||||
@ -351,7 +353,9 @@ class HybridCapable(abc.ABC):
|
||||
hybrid_alpha: float,
|
||||
time_decay_multiplier: float,
|
||||
num_to_retrieve: int,
|
||||
ranking_profile_type: QueryExpansionType,
|
||||
offset: int = 0,
|
||||
title_content_ratio: float | None = TITLE_CONTENT_RATIO,
|
||||
) -> list[InferenceChunkUncleaned]:
|
||||
"""
|
||||
Run hybrid search and return a list of inference chunks.
|
||||
|
@ -176,7 +176,7 @@ schema DANSWER_CHUNK_NAME {
|
||||
match-features: recency_bias
|
||||
}
|
||||
|
||||
rank-profile hybrid_searchVARIABLE_DIM inherits default, default_rank {
|
||||
rank-profile hybrid_search_semantic_base_VARIABLE_DIM inherits default, default_rank {
|
||||
inputs {
|
||||
query(query_embedding) tensor<float>(x[VARIABLE_DIM])
|
||||
}
|
||||
@ -192,7 +192,75 @@ schema DANSWER_CHUNK_NAME {
|
||||
|
||||
# First phase must be vector to allow hits that have no keyword matches
|
||||
first-phase {
|
||||
expression: closeness(field, embeddings)
|
||||
expression: query(title_content_ratio) * closeness(field, title_embedding) + (1 - query(title_content_ratio)) * closeness(field, embeddings)
|
||||
}
|
||||
|
||||
# Weighted average between Vector Search and BM-25
|
||||
global-phase {
|
||||
expression {
|
||||
(
|
||||
# Weighted Vector Similarity Score
|
||||
(
|
||||
query(alpha) * (
|
||||
(query(title_content_ratio) * normalize_linear(title_vector_score))
|
||||
+
|
||||
((1 - query(title_content_ratio)) * normalize_linear(closeness(field, embeddings)))
|
||||
)
|
||||
)
|
||||
|
||||
+
|
||||
|
||||
# Weighted Keyword Similarity Score
|
||||
# Note: for the BM25 Title score, it requires decent stopword removal in the query
|
||||
# This needs to be the case so there aren't irrelevant titles being normalized to a score of 1
|
||||
(
|
||||
(1 - query(alpha)) * (
|
||||
(query(title_content_ratio) * normalize_linear(bm25(title)))
|
||||
+
|
||||
((1 - query(title_content_ratio)) * normalize_linear(bm25(content)))
|
||||
)
|
||||
)
|
||||
)
|
||||
# Boost based on user feedback
|
||||
* document_boost
|
||||
# Decay factor based on time document was last updated
|
||||
* recency_bias
|
||||
# Boost based on aggregated boost calculation
|
||||
* aggregated_chunk_boost
|
||||
}
|
||||
rerank-count: 1000
|
||||
}
|
||||
|
||||
match-features {
|
||||
bm25(title)
|
||||
bm25(content)
|
||||
closeness(field, title_embedding)
|
||||
closeness(field, embeddings)
|
||||
document_boost
|
||||
recency_bias
|
||||
aggregated_chunk_boost
|
||||
closest(embeddings)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
rank-profile hybrid_search_keyword_base_VARIABLE_DIM inherits default, default_rank {
|
||||
inputs {
|
||||
query(query_embedding) tensor<float>(x[VARIABLE_DIM])
|
||||
}
|
||||
|
||||
function title_vector_score() {
|
||||
expression {
|
||||
# If no good matching titles, then it should use the content embeddings rather than having some
|
||||
# irrelevant title have a vector score of 1. This way at least it will be the doc with the highest
|
||||
# matching content score getting the full score
|
||||
max(closeness(field, embeddings), closeness(field, title_embedding))
|
||||
}
|
||||
}
|
||||
|
||||
# First phase for the keyword profile ranks by BM25, weighting title vs. content by title_content_ratio
|
||||
first-phase {
|
||||
expression: query(title_content_ratio) * bm25(title) + (1 - query(title_content_ratio)) * bm25(content)
|
||||
}
|
||||
|
||||
# Weighted average between Vector Search and BM-25
|
||||
|
@ -19,6 +19,7 @@ import httpx # type: ignore
|
||||
import requests # type: ignore
|
||||
from retry import retry
|
||||
|
||||
from onyx.agents.agent_search.shared_graph_utils.models import QueryExpansionType
|
||||
from onyx.configs.chat_configs import DOC_TIME_DECAY
|
||||
from onyx.configs.chat_configs import NUM_RETURNED_HITS
|
||||
from onyx.configs.chat_configs import TITLE_CONTENT_RATIO
|
||||
@ -800,12 +801,14 @@ class VespaIndex(DocumentIndex):
|
||||
hybrid_alpha: float,
|
||||
time_decay_multiplier: float,
|
||||
num_to_retrieve: int,
|
||||
ranking_profile_type: QueryExpansionType,
|
||||
offset: int = 0,
|
||||
title_content_ratio: float | None = TITLE_CONTENT_RATIO,
|
||||
) -> list[InferenceChunkUncleaned]:
|
||||
vespa_where_clauses = build_vespa_filters(filters)
|
||||
# Must be at least as large as the value set in the Vespa schema config
|
||||
target_hits = max(10 * num_to_retrieve, 1000)
|
||||
|
||||
yql = (
|
||||
YQL_BASE.format(index_name=self.index_name)
|
||||
+ vespa_where_clauses
|
||||
@ -817,6 +820,11 @@ class VespaIndex(DocumentIndex):
|
||||
|
||||
final_query = " ".join(final_keywords) if final_keywords else query
|
||||
|
||||
if ranking_profile_type == QueryExpansionType.KEYWORD:
|
||||
ranking_profile = f"hybrid_search_keyword_base_{len(query_embedding)}"
|
||||
else:
|
||||
ranking_profile = f"hybrid_search_semantic_base_{len(query_embedding)}"
|
||||
|
||||
logger.debug(f"Query YQL: {yql}")
|
||||
|
||||
params: dict[str, str | int | float] = {
|
||||
@ -832,7 +840,7 @@ class VespaIndex(DocumentIndex):
|
||||
),
|
||||
"hits": num_to_retrieve,
|
||||
"offset": offset,
|
||||
"ranking.profile": f"hybrid_search{len(query_embedding)}",
|
||||
"ranking.profile": ranking_profile,
|
||||
"timeout": VESPA_TIMEOUT,
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user