Mirror of https://github.com/danswer-ai/danswer.git
Commit a6e6be4037: Fix Divide by Zero Edge Case (#535)
Parent commit: 7d3f8b7c8c
@@ -38,6 +38,9 @@ CROSS_ENCODER_MODEL_ENSEMBLE = [
     "cross-encoder/ms-marco-MiniLM-L-4-v2",
     "cross-encoder/ms-marco-TinyBERT-L-2-v2",
 ]
+# For score normalizing purposes, only way is to know the expected ranges
+CROSS_ENCODER_RANGE_MAX = 12
+CROSS_ENCODER_RANGE_MIN = -12
 CROSS_EMBED_CONTEXT_SIZE = 512
 
 
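The new constants record the raw logit range these ms-marco cross-encoders have been observed to emit (roughly -12 to 12, per the comment), so downstream code can min-max scale scores onto a common range; the next hunk imports them from danswer.configs.model_configs. A minimal sketch of that use, with a hypothetical helper name (normalize_score is not part of this commit):

CROSS_ENCODER_RANGE_MIN = -12
CROSS_ENCODER_RANGE_MAX = 12


def normalize_score(raw_logit: float) -> float:
    # Min-max scale a raw cross-encoder logit into [0, 1], clamping
    # anything that falls outside the expected [-12, 12] range.
    span = CROSS_ENCODER_RANGE_MAX - CROSS_ENCODER_RANGE_MIN
    scaled = (raw_logit - CROSS_ENCODER_RANGE_MIN) / span
    return min(1.0, max(0.0, scaled))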
@@ -16,6 +16,8 @@ from danswer.configs.app_configs import NUM_RETURNED_HITS
 from danswer.configs.model_configs import ASYM_PASSAGE_PREFIX
 from danswer.configs.model_configs import ASYM_QUERY_PREFIX
 from danswer.configs.model_configs import BATCH_SIZE_ENCODE_CHUNKS
+from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MAX
+from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MIN
 from danswer.configs.model_configs import NORMALIZE_EMBEDDINGS
 from danswer.configs.model_configs import SIM_SCORE_RANGE_HIGH
 from danswer.configs.model_configs import SIM_SCORE_RANGE_LOW
@@ -65,9 +67,9 @@ def semantic_reranking(
     query: str,
     chunks: list[InferenceChunk],
     rerank_metrics_callback: Callable[[RerankMetricsContainer], None] | None = None,
+    model_min: int = CROSS_ENCODER_RANGE_MIN,
+    model_max: int = CROSS_ENCODER_RANGE_MAX,
 ) -> list[InferenceChunk]:
-    model_max = 12  # These are just based on observations from model selection
-    model_min = -12
     cross_encoders = get_default_reranking_model_ensemble()
     sim_scores = [
         encoder.predict([(query, chunk.content) for chunk in chunks])  # type: ignore
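Here the hardcoded model_max = 12 and model_min = -12 inside semantic_reranking become keyword defaults backed by the new config constants, so callers can override the observed range per model. A hedged sketch of the rescaling this parameterization supports (scale_sim_scores is illustrative, not the repo's function body):

CROSS_ENCODER_RANGE_MIN = -12
CROSS_ENCODER_RANGE_MAX = 12


def scale_sim_scores(
    raw_scores: list[float],
    model_min: int = CROSS_ENCODER_RANGE_MIN,
    model_max: int = CROSS_ENCODER_RANGE_MAX,
) -> list[float]:
    # Shift raw cross-encoder logits from the observed
    # [model_min, model_max] range into [0, 1].
    span = model_max - model_min
    return [(score - model_min) / span for score in raw_scores]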
@@ -132,23 +134,31 @@ def apply_boost(
     score_max = max(scores)
     score_range = score_max - score_min
 
-    boosted_scores = [
-        ((score - score_min) / score_range) * boost
-        for score, boost in zip(scores, boosts)
-    ]
-    unnormed_boosted_scores = [
-        score * score_range + score_min for score in boosted_scores
-    ]
+    if score_range != 0:
+        boosted_scores = [
+            ((score - score_min) / score_range) * boost
+            for score, boost in zip(scores, boosts)
+        ]
+        unnormed_boosted_scores = [
+            score * score_range + score_min for score in boosted_scores
+        ]
+    else:
+        unnormed_boosted_scores = [
+            score * boost for score, boost in zip(scores, boosts)
+        ]
 
     norm_min = min(norm_min, min(scores))
     norm_max = max(norm_max, max(scores))
+    # This should never be 0 unless user has done some weird/wrong settings
+    norm_range = norm_max - norm_min
 
     # For score display purposes
-    re_normed_scores = [
-        ((score - norm_min) / (norm_max - norm_min))
-        for score in unnormed_boosted_scores
-    ]
+    if norm_range != 0:
+        re_normed_scores = [
+            ((score - norm_min) / norm_range) for score in unnormed_boosted_scores
+        ]
+    else:
+        re_normed_scores = unnormed_boosted_scores
 
     rescored_chunks = list(zip(re_normed_scores, chunks))
     rescored_chunks.sort(key=lambda x: x[0], reverse=True)
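This hunk is the divide-by-zero fix itself. Both min-max normalizations in apply_boost divide by a range (score_max - score_min, then norm_max - norm_min) that collapses to zero whenever every score is identical, for example when only a single chunk is retrieved; the new branches skip the division in that case. A self-contained sketch of the guarded logic, simplified from the diff (boost_and_renorm and its plain-float signature are illustrative, not the repo's API):

def boost_and_renorm(
    scores: list[float],
    boosts: list[float],
    norm_min: float,
    norm_max: float,
) -> list[float]:
    score_min = min(scores)
    score_max = max(scores)
    score_range = score_max - score_min

    if score_range != 0:
        # Scale scores to [0, 1], apply the boosts, then undo the scaling.
        boosted = [
            ((score - score_min) / score_range) * boost
            for score, boost in zip(scores, boosts)
        ]
        unnormed = [score * score_range + score_min for score in boosted]
    else:
        # All scores equal: min-max scaling would divide by zero,
        # so apply the boosts directly.
        unnormed = [score * boost for score, boost in zip(scores, boosts)]

    norm_min = min(norm_min, min(scores))
    norm_max = max(norm_max, max(scores))
    norm_range = norm_max - norm_min

    if norm_range != 0:
        # Renormalize for display, mirroring the guarded branch above.
        return [(score - norm_min) / norm_range for score in unnormed]
    return unnormed


# Single-result edge case that raised ZeroDivisionError before the fix:
print(boost_and_renorm([0.5], [2.0], 0.0, 0.0))  # -> [2.0]

With one chunk, score_range is 0, so the old comprehension divided by zero; the guarded version simply multiplies the lone score by its boost.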