mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-03-19 14:22:10 +01:00
k
This commit is contained in:
parent
a4d71e08aa
commit
200d9c9a87
@ -20,18 +20,10 @@ schema DANSWER_CHUNK_NAME {
|
||||
# `semantic_identifier` will be the channel name, but the `title` will be empty
|
||||
field title type string {
|
||||
indexing: summary | index | attribute
|
||||
match {
|
||||
gram
|
||||
gram-size: 3
|
||||
}
|
||||
index: enable-bm25
|
||||
}
|
||||
field content type string {
|
||||
indexing: summary | index
|
||||
match {
|
||||
gram
|
||||
gram-size: 3
|
||||
}
|
||||
index: enable-bm25
|
||||
}
|
||||
# duplication of `content` is far from ideal, but is needed for
|
||||
@ -153,43 +145,44 @@ schema DANSWER_CHUNK_NAME {
|
||||
query(query_embedding) tensor<float>(x[VARIABLE_DIM])
|
||||
}
|
||||
|
||||
# This must be a separate function for normalize_linear to work
|
||||
function vector_score() {
|
||||
function title_vector_score() {
|
||||
expression {
|
||||
# If no title, the full vector score comes from the content embedding
|
||||
(query(title_content_ratio) * if(attribute(skip_title), closeness(field, embeddings), closeness(field, title_embedding))) +
|
||||
((1 - query(title_content_ratio)) * closeness(field, embeddings))
|
||||
}
|
||||
}
|
||||
|
||||
# This must be a separate function for normalize_linear to work
|
||||
function keyword_score() {
|
||||
expression {
|
||||
(query(title_content_ratio) * bm25(title)) +
|
||||
((1 - query(title_content_ratio)) * bm25(content))
|
||||
# If no good matching titles, then it should use the context embeddings rather than having some
|
||||
# irrelevant title have a vector score of 1. This way at least it will be the doc with the highest
|
||||
# matching content score getting the full score
|
||||
max(closeness(field, embeddings), closeness(field, title_embedding))
|
||||
}
|
||||
}
|
||||
|
||||
first-phase {
|
||||
expression: vector_score
|
||||
expression: closeness(field, embeddings)
|
||||
}
|
||||
|
||||
# Weighted average between Vector Search and BM-25
|
||||
# Each is a weighted average between the Title and Content fields
|
||||
# Finally, each doc is boosted by its user-feedback-based boost and recency
|
||||
# If any embedding or index field is missing, it just receives a score of 0
|
||||
# Assumptions:
|
||||
# - For a given query + corpus, the BM-25 scores will be relatively similar in distribution
|
||||
# therefore not normalizing before combining.
|
||||
# - A document without a title gets a score of 0 for the title component; this is OK, as documents
|
||||
# without any title match should be penalized.
|
||||
global-phase {
|
||||
expression {
|
||||
(
|
||||
# Weighted Vector Similarity Score
|
||||
(query(alpha) * normalize_linear(vector_score)) +
|
||||
(
|
||||
query(alpha) * (
|
||||
(query(title_content_ratio) * normalize_linear(title_vector_score))
|
||||
+
|
||||
((1 - query(title_content_ratio)) * normalize_linear(closeness(field, embeddings)))
|
||||
)
|
||||
)
|
||||
|
||||
+
|
||||
|
||||
# Weighted Keyword Similarity Score
|
||||
((1 - query(alpha)) * normalize_linear(keyword_score))
|
||||
# Note: for the BM25 Title score, it requires decent stopword removal in the query
|
||||
# This needs to be the case so there aren't irrelevant titles being normalized to a score of 1
|
||||
(
|
||||
(1 - query(alpha)) * (
|
||||
(query(title_content_ratio) * normalize_linear(bm25(title)))
|
||||
+
|
||||
((1 - query(title_content_ratio)) * normalize_linear(bm25(content)))
|
||||
)
|
||||
)
|
||||
)
|
||||
# Boost based on user feedback
|
||||
* document_boost
|
||||
@ -204,8 +197,6 @@ schema DANSWER_CHUNK_NAME {
|
||||
bm25(content)
|
||||
closeness(field, title_embedding)
|
||||
closeness(field, embeddings)
|
||||
keyword_score
|
||||
vector_score
|
||||
document_boost
|
||||
recency_bias
|
||||
closest(embeddings)
|
||||
|
Loading…
x
Reference in New Issue
Block a user