Set a minimum distance angle cutoff (#95)

This commit is contained in:
Yuhong Sun 2023-06-11 17:36:05 -07:00 committed by GitHub
parent f20563c9bc
commit 2bfbf037ee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 9 additions and 1 deletion

View File

@ -18,6 +18,9 @@ CROSS_ENCODER_MODEL_ENSEMBLE = [
"cross-encoder/ms-marco-TinyBERT-L-2-v2",
]
# Better to keep it loose; surfacing more results is better than missing results
SEARCH_DISTANCE_CUTOFF = 0.1 # Cosine similarity (currently), range of -1 to 1 with -1 being completely opposite
QUERY_MAX_CONTEXT_SIZE = 256
# The below is correlated with CHUNK_SIZE in app_configs but not strictly calculated
# To avoid extra overhead of tokenizing for chunking during indexing.

View File

@ -7,6 +7,7 @@ from danswer.configs.app_configs import NUM_RETURNED_HITS
from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
from danswer.configs.constants import ALLOWED_USERS
from danswer.configs.constants import PUBLIC_DOC_PAT
from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
from danswer.datastores.datastore_utils import get_uuid_from_chunk
from danswer.datastores.interfaces import IndexFilter
from danswer.datastores.interfaces import VectorIndex
@ -92,7 +93,8 @@ class QdrantIndex(VectorIndex):
user_id: int | None,
filters: list[IndexFilter] | None,
num_to_retrieve: int = NUM_RETURNED_HITS,
page_size: int = NUM_RERANKED_RESULTS,
page_size: int = NUM_RETURNED_HITS,
distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
) -> list[InferenceChunk]:
query_embedding = get_default_embedding_model().encode(
query
@ -113,6 +115,7 @@ class QdrantIndex(VectorIndex):
query_filter=Filter(must=list(filter_conditions)),
limit=page_size,
offset=page_offset,
score_threshold=distance_cutoff,
)
page_offset += page_size
if not hits:

View File

@ -64,6 +64,8 @@ def retrieve_ranked_documents(
num_hits: int = NUM_RETURNED_HITS,
num_rerank: int = NUM_RERANKED_RESULTS,
) -> tuple[list[InferenceChunk] | None, list[InferenceChunk] | None]:
"""Uses vector similarity to fetch the top num_hits document chunks with a distance cutoff.
Reranks the top num_rerank out of those (instead of all due to latency)"""
top_chunks = datastore.semantic_retrieval(query, user_id, filters, num_hits)
if not top_chunks:
filters_log_msg = json.dumps(filters, separators=(",", ":")).replace("\n", "")