diff --git a/backend/danswer/configs/model_configs.py b/backend/danswer/configs/model_configs.py
index e23ede24b..d10ab00f8 100644
--- a/backend/danswer/configs/model_configs.py
+++ b/backend/danswer/configs/model_configs.py
@@ -18,6 +18,9 @@ CROSS_ENCODER_MODEL_ENSEMBLE = [
     "cross-encoder/ms-marco-TinyBERT-L-2-v2",
 ]
 
+# Better to keep it loose; surfacing more results is better than missing results
+SEARCH_DISTANCE_CUTOFF = 0.1  # Cosine similarity (currently), range of -1 to 1 with -1 being completely opposite
+
 QUERY_MAX_CONTEXT_SIZE = 256
 # The below is correlated with CHUNK_SIZE in app_configs but not strictly calculated
 # To avoid extra overhead of tokenizing for chunking during indexing.
diff --git a/backend/danswer/datastores/qdrant/store.py b/backend/danswer/datastores/qdrant/store.py
index 4ff71626f..333e67906 100644
--- a/backend/danswer/datastores/qdrant/store.py
+++ b/backend/danswer/datastores/qdrant/store.py
@@ -7,6 +7,7 @@ from danswer.configs.app_configs import NUM_RETURNED_HITS
 from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
 from danswer.configs.constants import ALLOWED_USERS
 from danswer.configs.constants import PUBLIC_DOC_PAT
+from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
 from danswer.datastores.datastore_utils import get_uuid_from_chunk
 from danswer.datastores.interfaces import IndexFilter
 from danswer.datastores.interfaces import VectorIndex
@@ -92,7 +93,8 @@ class QdrantIndex(VectorIndex):
         user_id: int | None,
         filters: list[IndexFilter] | None,
         num_to_retrieve: int = NUM_RETURNED_HITS,
-        page_size: int = NUM_RERANKED_RESULTS,
+        page_size: int = NUM_RETURNED_HITS,
+        distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
     ) -> list[InferenceChunk]:
         query_embedding = get_default_embedding_model().encode(
             query
@@ -113,6 +115,7 @@ class QdrantIndex(VectorIndex):
             query_filter=Filter(must=list(filter_conditions)),
             limit=page_size,
             offset=page_offset,
+            score_threshold=distance_cutoff,
         )
         page_offset += page_size
         if not hits:
diff --git a/backend/danswer/search/semantic_search.py b/backend/danswer/search/semantic_search.py
index 0e84982a5..bf074d17f 100644
--- a/backend/danswer/search/semantic_search.py
+++ b/backend/danswer/search/semantic_search.py
@@ -64,6 +64,8 @@ def retrieve_ranked_documents(
     num_hits: int = NUM_RETURNED_HITS,
     num_rerank: int = NUM_RERANKED_RESULTS,
 ) -> tuple[list[InferenceChunk] | None, list[InferenceChunk] | None]:
+    """Uses vector similarity to fetch the top num_hits document chunks, applying a distance cutoff.
+    Reranks only the top num_rerank of those (rather than all of them, to limit latency)."""
     top_chunks = datastore.semantic_retrieval(query, user_id, filters, num_hits)
     if not top_chunks:
         filters_log_msg = json.dumps(filters, separators=(",", ":")).replace("\n", "")
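
Note on the new score_threshold argument: it maps onto Qdrant's built-in score filter, which discards any hit whose similarity score falls below the given value before results are returned. A minimal sketch of the behavior, assuming a locally running Qdrant instance and a collection configured with cosine distance (the collection name and placeholder embedding below are hypothetical, not part of this PR):

    from qdrant_client import QdrantClient

    client = QdrantClient(host="localhost", port=6333)
    query_embedding = [0.1] * 768  # placeholder for a real query embedding

    # Hits scoring below score_threshold are dropped by Qdrant itself,
    # so a loose cutoff like 0.1 only filters out clearly unrelated chunks.
    hits = client.search(
        collection_name="danswer_index",  # hypothetical collection name
        query_vector=query_embedding,
        limit=50,
        score_threshold=0.1,  # cosine similarity in [-1, 1]; -1 is completely opposite
    )

With cosine similarity, scores near 1 indicate near-duplicate vectors and scores near 0 indicate roughly orthogonal (unrelated) ones, which is why the cutoff is kept loose at 0.1: it trims obvious noise without risking dropped results.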