mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-22 05:50:57 +02:00
Hybrid Search (#653)
This commit is contained in:
parent
08909b40b0
commit
52c0d6e68b
@ -21,7 +21,9 @@ DOCUMENT_ENCODER_MODEL = (
|
|||||||
DOC_EMBEDDING_DIM = 384
|
DOC_EMBEDDING_DIM = 384
|
||||||
# Model should be chosen with 512 context size, ideally don't change this
|
# Model should be chosen with 512 context size, ideally don't change this
|
||||||
DOC_EMBEDDING_CONTEXT_SIZE = 512
|
DOC_EMBEDDING_CONTEXT_SIZE = 512
|
||||||
NORMALIZE_EMBEDDINGS = (os.environ.get("SKIP_RERANKING") or "False").lower() == "true"
|
NORMALIZE_EMBEDDINGS = (
|
||||||
|
os.environ.get("NORMALIZE_EMBEDDINGS") or "False"
|
||||||
|
).lower() == "true"
|
||||||
# These are only used if reranking is turned off, to normalize the direct retrieval scores for display
|
# These are only used if reranking is turned off, to normalize the direct retrieval scores for display
|
||||||
SIM_SCORE_RANGE_LOW = float(os.environ.get("SIM_SCORE_RANGE_LOW") or 0.0)
|
SIM_SCORE_RANGE_LOW = float(os.environ.get("SIM_SCORE_RANGE_LOW") or 0.0)
|
||||||
SIM_SCORE_RANGE_HIGH = float(os.environ.get("SIM_SCORE_RANGE_HIGH") or 1.0)
|
SIM_SCORE_RANGE_HIGH = float(os.environ.get("SIM_SCORE_RANGE_HIGH") or 1.0)
|
||||||
@ -47,10 +49,8 @@ CROSS_ENCODER_RANGE_MAX = 12
|
|||||||
CROSS_ENCODER_RANGE_MIN = -12
|
CROSS_ENCODER_RANGE_MIN = -12
|
||||||
CROSS_EMBED_CONTEXT_SIZE = 512
|
CROSS_EMBED_CONTEXT_SIZE = 512
|
||||||
|
|
||||||
|
# Unused currently, can't be used with the current default encoder model due to its output range
|
||||||
# Better to keep it loose, surfacing more results better than missing results
|
SEARCH_DISTANCE_CUTOFF = 0
|
||||||
# Currently unused by Vespa
|
|
||||||
SEARCH_DISTANCE_CUTOFF = 0.1 # Cosine similarity (currently), range of -1 to 1 with -1 being completely opposite
|
|
||||||
|
|
||||||
# Intent model max context size
|
# Intent model max context size
|
||||||
QUERY_MAX_CONTEXT_SIZE = 256
|
QUERY_MAX_CONTEXT_SIZE = 256
|
||||||
|
@ -106,7 +106,7 @@ schema danswer_chunk {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function inline document_age() {
|
function inline document_age() {
|
||||||
# Time in years (3 Months if no age found)
|
# Time in years (91.3 days ~= 3 Months ~= 1 fiscal quarter if no age found)
|
||||||
expression: max(if(isNan(attribute(doc_updated_at)) == 1, 7890000, now() - attribute(doc_updated_at)) / 31536000, 0)
|
expression: max(if(isNan(attribute(doc_updated_at)) == 1, 7890000, now() - attribute(doc_updated_at)) / 31536000, 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -122,6 +122,8 @@ schema danswer_chunk {
|
|||||||
first-phase {
|
first-phase {
|
||||||
expression: bm25(content) * document_boost * recency_bias
|
expression: bm25(content) * document_boost * recency_bias
|
||||||
}
|
}
|
||||||
|
|
||||||
|
match-features: recency_bias document_boost bm25(content)
|
||||||
}
|
}
|
||||||
|
|
||||||
rank-profile semantic_search inherits default, default_rank {
|
rank-profile semantic_search inherits default, default_rank {
|
||||||
@ -135,7 +137,7 @@ schema danswer_chunk {
|
|||||||
expression: closeness(field, embeddings)
|
expression: closeness(field, embeddings)
|
||||||
}
|
}
|
||||||
|
|
||||||
match-features: recency_bias closest(embeddings)
|
match-features: recency_bias document_boost closest(embeddings)
|
||||||
}
|
}
|
||||||
|
|
||||||
rank-profile hybrid_search inherits default, default_rank {
|
rank-profile hybrid_search inherits default, default_rank {
|
||||||
@ -148,11 +150,12 @@ schema danswer_chunk {
|
|||||||
}
|
}
|
||||||
|
|
||||||
global-phase {
|
global-phase {
|
||||||
expression: (normalize_linear(closeness(field, embeddings)) + normalize_linear(bm25(content))) * document_boost * recency_bias
|
expression: (normalize_linear(closeness(field, embeddings)) + normalize_linear(bm25(content))) / 2 * document_boost * recency_bias
|
||||||
rerank-count: 1000
|
rerank-count: 1000
|
||||||
}
|
}
|
||||||
|
|
||||||
match-features: recency_bias closest(embeddings)
|
# Cannot pass normalize_linear features in match-features
|
||||||
|
match-features: recency_bias document_boost closest(embeddings)
|
||||||
}
|
}
|
||||||
|
|
||||||
# used when searching from the admin UI for a specific doc to hide / boost
|
# used when searching from the admin UI for a specific doc to hide / boost
|
||||||
|
@ -310,13 +310,15 @@ def _build_vespa_filters(filters: IndexFilters, include_hidden: bool = False) ->
|
|||||||
|
|
||||||
def _build_time_filter(
|
def _build_time_filter(
|
||||||
cutoff: datetime | None,
|
cutoff: datetime | None,
|
||||||
untimed_doc_cutoff: timedelta = timedelta(days=62), # Slightly over 2 Months
|
# Slightly over 3 Months, approximately 1 fiscal quarter
|
||||||
|
untimed_doc_cutoff: timedelta = timedelta(days=92),
|
||||||
) -> str:
|
) -> str:
|
||||||
if not cutoff:
|
if not cutoff:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
# For Documents that don't have an updated at, filter them out for queries asking for
|
# For Documents that don't have an updated at, filter them out for queries asking for
|
||||||
# very recent documents (2 months) default
|
# very recent documents (3 months by default). Documents that don't have an updated at
|
||||||
|
# time are treated as 3 months old when computing the time decay value
|
||||||
include_untimed = datetime.now(timezone.utc) - untimed_doc_cutoff > cutoff
|
include_untimed = datetime.now(timezone.utc) - untimed_doc_cutoff > cutoff
|
||||||
cutoff_secs = int(cutoff.timestamp())
|
cutoff_secs = int(cutoff.timestamp())
|
||||||
|
|
||||||
@ -340,10 +342,6 @@ def _build_vespa_filters(filters: IndexFilters, include_hidden: bool = False) ->
|
|||||||
return filter_str
|
return filter_str
|
||||||
|
|
||||||
|
|
||||||
def _build_vespa_limit(num_to_retrieve: int, offset: int = 0) -> str:
|
|
||||||
return f" limit {num_to_retrieve} offset {offset}"
|
|
||||||
|
|
||||||
|
|
||||||
def _process_dynamic_summary(
|
def _process_dynamic_summary(
|
||||||
dynamic_summary: str, max_summary_length: int = 400
|
dynamic_summary: str, max_summary_length: int = 400
|
||||||
) -> list[str]:
|
) -> list[str]:
|
||||||
@ -605,7 +603,6 @@ class VespaIndex(DocumentIndex):
|
|||||||
# not working as desired
|
# not working as desired
|
||||||
+ '({grammar: "weakAnd"}userInput(@query) '
|
+ '({grammar: "weakAnd"}userInput(@query) '
|
||||||
+ f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
|
+ f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
|
||||||
+ _build_vespa_limit(num_to_retrieve)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
final_query = query_processing(query) if edit_keyword_query else query
|
final_query = query_processing(query) if edit_keyword_query else query
|
||||||
@ -615,7 +612,7 @@ class VespaIndex(DocumentIndex):
|
|||||||
"query": final_query,
|
"query": final_query,
|
||||||
"input.query(decay_factor)": str(DOC_TIME_DECAY * decay_multiplier),
|
"input.query(decay_factor)": str(DOC_TIME_DECAY * decay_multiplier),
|
||||||
"hits": num_to_retrieve,
|
"hits": num_to_retrieve,
|
||||||
"num_to_rerank": 10 * num_to_retrieve,
|
"offset": 0,
|
||||||
"ranking.profile": "keyword_search",
|
"ranking.profile": "keyword_search",
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -640,7 +637,6 @@ class VespaIndex(DocumentIndex):
|
|||||||
# needed for highlighting while the N-gram highlighting is broken /
|
# needed for highlighting while the N-gram highlighting is broken /
|
||||||
# not working as desired
|
# not working as desired
|
||||||
+ f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
|
+ f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
|
||||||
+ _build_vespa_limit(num_to_retrieve)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
query_embedding = embed_query(query)
|
query_embedding = embed_query(query)
|
||||||
@ -649,11 +645,13 @@ class VespaIndex(DocumentIndex):
|
|||||||
" ".join(remove_stop_words(query)) if edit_keyword_query else query
|
" ".join(remove_stop_words(query)) if edit_keyword_query else query
|
||||||
)
|
)
|
||||||
|
|
||||||
params = {
|
params: dict[str, str | int] = {
|
||||||
"yql": yql,
|
"yql": yql,
|
||||||
"query": query_keywords,
|
"query": query_keywords, # Needed for highlighting
|
||||||
"input.query(query_embedding)": str(query_embedding),
|
"input.query(query_embedding)": str(query_embedding),
|
||||||
"input.query(decay_factor)": str(DOC_TIME_DECAY * decay_multiplier),
|
"input.query(decay_factor)": str(DOC_TIME_DECAY * decay_multiplier),
|
||||||
|
"hits": num_to_retrieve,
|
||||||
|
"offset": 0,
|
||||||
"ranking.profile": "semantic_search",
|
"ranking.profile": "semantic_search",
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -668,8 +666,35 @@ class VespaIndex(DocumentIndex):
|
|||||||
distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
|
distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
|
||||||
edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
|
edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
|
||||||
) -> list[InferenceChunk]:
|
) -> list[InferenceChunk]:
|
||||||
# TODO introduce the real hybrid search
|
decay_multiplier = FAVOR_RECENT_DECAY_MULTIPLIER if favor_recent else 1
|
||||||
return self.semantic_retrieval(query, filters, favor_recent, num_to_retrieve)
|
vespa_where_clauses = _build_vespa_filters(filters)
|
||||||
|
# Needs to be at least as large as the value set in the Vespa schema config
|
||||||
|
target_hits = max(10 * num_to_retrieve, 1000)
|
||||||
|
yql = (
|
||||||
|
VespaIndex.yql_base
|
||||||
|
+ vespa_where_clauses
|
||||||
|
+ f"(({{targetHits: {target_hits}}}nearestNeighbor(embeddings, query_embedding)) "
|
||||||
|
+ 'or ({grammar: "weakAnd"}userInput(@query)) '
|
||||||
|
+ f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
|
||||||
|
)
|
||||||
|
|
||||||
|
query_embedding = embed_query(query)
|
||||||
|
|
||||||
|
query_keywords = (
|
||||||
|
" ".join(remove_stop_words(query)) if edit_keyword_query else query
|
||||||
|
)
|
||||||
|
|
||||||
|
params: dict[str, str | int] = {
|
||||||
|
"yql": yql,
|
||||||
|
"query": query_keywords,
|
||||||
|
"input.query(query_embedding)": str(query_embedding),
|
||||||
|
"input.query(decay_factor)": str(DOC_TIME_DECAY * decay_multiplier),
|
||||||
|
"hits": num_to_retrieve,
|
||||||
|
"offset": 0,
|
||||||
|
"ranking.profile": "hybrid_search",
|
||||||
|
}
|
||||||
|
|
||||||
|
return _query_vespa(params)
|
||||||
|
|
||||||
def admin_retrieval(
|
def admin_retrieval(
|
||||||
self,
|
self,
|
||||||
@ -686,14 +711,13 @@ class VespaIndex(DocumentIndex):
|
|||||||
# needed for highlighting while the N-gram highlighting is broken /
|
# needed for highlighting while the N-gram highlighting is broken /
|
||||||
# not working as desired
|
# not working as desired
|
||||||
+ f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
|
+ f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
|
||||||
+ _build_vespa_limit(num_to_retrieve)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
params: dict[str, str | int] = {
|
params: dict[str, str | int] = {
|
||||||
"yql": yql,
|
"yql": yql,
|
||||||
"query": query,
|
"query": query,
|
||||||
"hits": num_to_retrieve,
|
"hits": num_to_retrieve,
|
||||||
"num_to_rerank": 10 * num_to_retrieve,
|
"offset": 0,
|
||||||
"ranking.profile": "admin_search",
|
"ranking.profile": "admin_search",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user