Enable Dedup Flag for Doc Search Endpoint

This commit is contained in:
Yuhong Sun
2024-06-06 14:47:13 -07:00
committed by Chris Weaver
parent bdcfb39724
commit 1c343bbee7
2 changed files with 20 additions and 2 deletions

View File

@ -4,6 +4,7 @@ from typing import TypeVar
from danswer.db.models import SearchDoc as DBSearchDoc from danswer.db.models import SearchDoc as DBSearchDoc
from danswer.search.models import InferenceChunk from danswer.search.models import InferenceChunk
from danswer.search.models import InferenceSection from danswer.search.models import InferenceSection
from danswer.search.models import SavedSearchDoc
from danswer.search.models import SearchDoc from danswer.search.models import SearchDoc
@ -24,7 +25,9 @@ def dedupe_documents(items: list[T]) -> tuple[list[T], list[int]]:
def drop_llm_indices( def drop_llm_indices(
llm_indices: list[int], search_docs: list[DBSearchDoc], dropped_indices: list[int] llm_indices: list[int],
search_docs: Sequence[DBSearchDoc | SavedSearchDoc],
dropped_indices: list[int],
) -> list[int]: ) -> list[int]:
llm_bools = [True if i in llm_indices else False for i in range(len(search_docs))] llm_bools = [True if i in llm_indices else False for i in range(len(search_docs))]
if dropped_indices: if dropped_indices:

View File

@ -21,6 +21,8 @@ from danswer.search.models import SearchRequest
from danswer.search.models import SearchResponse from danswer.search.models import SearchResponse
from danswer.search.pipeline import SearchPipeline from danswer.search.pipeline import SearchPipeline
from danswer.search.utils import chunks_or_sections_to_search_docs from danswer.search.utils import chunks_or_sections_to_search_docs
from danswer.search.utils import dedupe_documents
from danswer.search.utils import drop_llm_indices
from danswer.utils.logger import setup_logger from danswer.utils.logger import setup_logger
from ee.danswer.server.query_and_chat.models import DocumentSearchRequest from ee.danswer.server.query_and_chat.models import DocumentSearchRequest
@ -63,8 +65,21 @@ def handle_search_request(
relevant_chunk_indices = search_pipeline.relevant_chunk_indices relevant_chunk_indices = search_pipeline.relevant_chunk_indices
top_docs = chunks_or_sections_to_search_docs(top_sections) top_docs = chunks_or_sections_to_search_docs(top_sections)
# Deduping happens at the last step to avoid harming quality by dropping content early on
deduped_docs = top_docs
dropped_inds = None
if search_request.retrieval_options.dedupe_docs:
deduped_docs, dropped_inds = dedupe_documents(top_docs)
# No need to save the docs for this API # No need to save the docs for this API
fake_saved_docs = [SavedSearchDoc.from_search_doc(doc) for doc in top_docs] fake_saved_docs = [SavedSearchDoc.from_search_doc(doc) for doc in deduped_docs]
if dropped_inds:
relevant_chunk_indices = drop_llm_indices(
llm_indices=relevant_chunk_indices,
search_docs=fake_saved_docs,
dropped_indices=dropped_inds,
)
return SearchResponse( return SearchResponse(
top_documents=fake_saved_docs, llm_indices=relevant_chunk_indices top_documents=fake_saved_docs, llm_indices=relevant_chunk_indices