Enable Dedup Flag for Doc Search Endpoint

2025-07-09 06:02:00 +02:00 · 2024-06-06 14:47:13 -07:00
parent bdcfb39724
commit 1c343bbee7
2 changed files with 20 additions and 2 deletions
--- a/backend/danswer/search/utils.py
+++ b/backend/danswer/search/utils.py
@@ -4,6 +4,7 @@ from typing import TypeVar
 from danswer.db.models import SearchDoc as DBSearchDoc
 from danswer.search.models import InferenceChunk
 from danswer.search.models import InferenceSection
+from danswer.search.models import SavedSearchDoc
 from danswer.search.models import SearchDoc


@@ -24,7 +25,9 @@ def dedupe_documents(items: list[T]) -> tuple[list[T], list[int]]:


 def drop_llm_indices(
-    llm_indices: list[int], search_docs: list[DBSearchDoc], dropped_indices: list[int]
+    llm_indices: list[int],
+    search_docs: Sequence[DBSearchDoc | SavedSearchDoc],
+    dropped_indices: list[int],
 ) -> list[int]:
    llm_bools = [True if i in llm_indices else False for i in range(len(search_docs))]
    if dropped_indices:
--- a/backend/ee/danswer/server/query_and_chat/query_backend.py
+++ b/backend/ee/danswer/server/query_and_chat/query_backend.py
@@ -21,6 +21,8 @@ from danswer.search.models import SearchRequest
 from danswer.search.models import SearchResponse
 from danswer.search.pipeline import SearchPipeline
 from danswer.search.utils import chunks_or_sections_to_search_docs
+from danswer.search.utils import dedupe_documents
+from danswer.search.utils import drop_llm_indices
 from danswer.utils.logger import setup_logger
 from ee.danswer.server.query_and_chat.models import DocumentSearchRequest

@@ -63,8 +65,21 @@ def handle_search_request(
    relevant_chunk_indices = search_pipeline.relevant_chunk_indices
    top_docs = chunks_or_sections_to_search_docs(top_sections)

+    # Deduping happens at the last step to avoid harming quality by dropping content early on
+    deduped_docs = top_docs
+    dropped_inds = None
+    if search_request.retrieval_options.dedupe_docs:
+        deduped_docs, dropped_inds = dedupe_documents(top_docs)
+
    # No need to save the docs for this API
-    fake_saved_docs = [SavedSearchDoc.from_search_doc(doc) for doc in top_docs]
+    fake_saved_docs = [SavedSearchDoc.from_search_doc(doc) for doc in deduped_docs]
+
+    if dropped_inds:
+        relevant_chunk_indices = drop_llm_indices(
+            llm_indices=relevant_chunk_indices,
+            search_docs=fake_saved_docs,
+            dropped_indices=dropped_inds,
+        )

    return SearchResponse(
        top_documents=fake_saved_docs, llm_indices=relevant_chunk_indices