Fix Divide by Zero Edge Case (#535)

Author: Yuhong Sun, 2023-10-08 09:30:30 -07:00 (committed by GitHub)
parent 7d3f8b7c8c
commit a6e6be4037
2 changed files with 27 additions and 14 deletions


@@ -38,6 +38,9 @@ CROSS_ENCODER_MODEL_ENSEMBLE = [
     "cross-encoder/ms-marco-MiniLM-L-4-v2",
     "cross-encoder/ms-marco-TinyBERT-L-2-v2",
 ]
+# For score normalizing purposes, only way is to know the expected ranges
+CROSS_ENCODER_RANGE_MAX = 12
+CROSS_ENCODER_RANGE_MIN = -12
 CROSS_EMBED_CONTEXT_SIZE = 512
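
The two new constants declare the score range the cross-encoder ensemble is expected to produce, so raw logits can be mapped onto a fixed scale. A minimal sketch of how such constants can be used for min-max normalization is below; the helper name and the clamping behavior are illustrative assumptions, not the repository's code.

# Sketch only: normalize a raw cross-encoder logit into [0, 1] using the
# expected range constants. Clamping to the declared range is an assumption.
CROSS_ENCODER_RANGE_MAX = 12
CROSS_ENCODER_RANGE_MIN = -12

def normalize_cross_encoder_score(raw_score: float) -> float:
    # Clamp outliers to the declared range, then min-max normalize.
    clamped = max(CROSS_ENCODER_RANGE_MIN, min(CROSS_ENCODER_RANGE_MAX, raw_score))
    return (clamped - CROSS_ENCODER_RANGE_MIN) / (
        CROSS_ENCODER_RANGE_MAX - CROSS_ENCODER_RANGE_MIN
    )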


@@ -16,6 +16,8 @@ from danswer.configs.app_configs import NUM_RETURNED_HITS
 from danswer.configs.model_configs import ASYM_PASSAGE_PREFIX
 from danswer.configs.model_configs import ASYM_QUERY_PREFIX
 from danswer.configs.model_configs import BATCH_SIZE_ENCODE_CHUNKS
+from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MAX
+from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MIN
 from danswer.configs.model_configs import NORMALIZE_EMBEDDINGS
 from danswer.configs.model_configs import SIM_SCORE_RANGE_HIGH
 from danswer.configs.model_configs import SIM_SCORE_RANGE_LOW
@@ -65,9 +67,9 @@ def semantic_reranking(
     query: str,
     chunks: list[InferenceChunk],
     rerank_metrics_callback: Callable[[RerankMetricsContainer], None] | None = None,
+    model_min: int = CROSS_ENCODER_RANGE_MIN,
+    model_max: int = CROSS_ENCODER_RANGE_MAX,
 ) -> list[InferenceChunk]:
-    model_max = 12  # These are just based on observations from model selection
-    model_min = -12
     cross_encoders = get_default_reranking_model_ensemble()
     sim_scores = [
         encoder.predict([(query, chunk.content) for chunk in chunks])  # type: ignore
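
The previously hardcoded observation-based bounds now become keyword arguments that default to the config constants, so callers can override the expected range without editing the function. A hypothetical usage sketch follows; argument values and the candidate_chunks variable are assumptions for illustration, only the parameter names come from the diff.

# Sketch only: overriding the expected cross-encoder score range per call.
reranked = semantic_reranking(
    query="how do I reset my password?",
    chunks=candidate_chunks,  # list[InferenceChunk] from the retrieval step
    model_min=-15,            # hypothetical wider range for a different ensemble
    model_max=15,
)
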
@@ -132,23 +134,31 @@ def apply_boost(
     score_max = max(scores)
     score_range = score_max - score_min
-    boosted_scores = [
-        ((score - score_min) / score_range) * boost
-        for score, boost in zip(scores, boosts)
-    ]
-    unnormed_boosted_scores = [
-        score * score_range + score_min for score in boosted_scores
-    ]
+    if score_range != 0:
+        boosted_scores = [
+            ((score - score_min) / score_range) * boost
+            for score, boost in zip(scores, boosts)
+        ]
+        unnormed_boosted_scores = [
+            score * score_range + score_min for score in boosted_scores
+        ]
+    else:
+        unnormed_boosted_scores = [
+            score * boost for score, boost in zip(scores, boosts)
+        ]
     norm_min = min(norm_min, min(scores))
     norm_max = max(norm_max, max(scores))
+    # This should never be 0 unless user has done some weird/wrong settings
+    norm_range = norm_max - norm_min
     # For score display purposes
-    re_normed_scores = [
-        ((score - norm_min) / (norm_max - norm_min))
-        for score in unnormed_boosted_scores
-    ]
+    if norm_range != 0:
+        re_normed_scores = [
+            ((score - norm_min) / norm_range) for score in unnormed_boosted_scores
+        ]
+    else:
+        re_normed_scores = unnormed_boosted_scores
     rescored_chunks = list(zip(re_normed_scores, chunks))
     rescored_chunks.sort(key=lambda x: x[0], reverse=True)
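
This hunk is the actual fix for #535: both min-max normalizations in apply_boost now guard against a zero range, which occurs when all candidate scores are identical (for example, when only a single chunk is retrieved) and previously raised a ZeroDivisionError. A standalone sketch of the guarded pattern is below; the function name is illustrative and not part of the repository.

# Sketch only: min-max normalization with a pass-through fallback when the
# score range is zero, mirroring the guard added in this commit.
def min_max_normalize(scores: list[float]) -> list[float]:
    lo, hi = min(scores), max(scores)
    rng = hi - lo
    if rng == 0:
        # All scores identical (e.g. a single retrieved chunk): dividing by
        # the range would raise ZeroDivisionError, so return scores unchanged.
        return scores
    return [(s - lo) / rng for s in scores]

print(min_max_normalize([0.7]))       # [0.7]  -- the edge case that used to crash
print(min_max_normalize([1.0, 3.0]))  # [0.0, 1.0]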