mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-07-07 13:10:24 +02:00
Enable Dedup Flag for Doc Search Endpoint
This commit is contained in:
@ -4,6 +4,7 @@ from typing import TypeVar
|
|||||||
from danswer.db.models import SearchDoc as DBSearchDoc
|
from danswer.db.models import SearchDoc as DBSearchDoc
|
||||||
from danswer.search.models import InferenceChunk
|
from danswer.search.models import InferenceChunk
|
||||||
from danswer.search.models import InferenceSection
|
from danswer.search.models import InferenceSection
|
||||||
|
from danswer.search.models import SavedSearchDoc
|
||||||
from danswer.search.models import SearchDoc
|
from danswer.search.models import SearchDoc
|
||||||
|
|
||||||
|
|
||||||
@ -24,7 +25,9 @@ def dedupe_documents(items: list[T]) -> tuple[list[T], list[int]]:
|
|||||||
|
|
||||||
|
|
||||||
def drop_llm_indices(
|
def drop_llm_indices(
|
||||||
llm_indices: list[int], search_docs: list[DBSearchDoc], dropped_indices: list[int]
|
llm_indices: list[int],
|
||||||
|
search_docs: Sequence[DBSearchDoc | SavedSearchDoc],
|
||||||
|
dropped_indices: list[int],
|
||||||
) -> list[int]:
|
) -> list[int]:
|
||||||
llm_bools = [True if i in llm_indices else False for i in range(len(search_docs))]
|
llm_bools = [True if i in llm_indices else False for i in range(len(search_docs))]
|
||||||
if dropped_indices:
|
if dropped_indices:
|
||||||
|
@ -21,6 +21,8 @@ from danswer.search.models import SearchRequest
|
|||||||
from danswer.search.models import SearchResponse
|
from danswer.search.models import SearchResponse
|
||||||
from danswer.search.pipeline import SearchPipeline
|
from danswer.search.pipeline import SearchPipeline
|
||||||
from danswer.search.utils import chunks_or_sections_to_search_docs
|
from danswer.search.utils import chunks_or_sections_to_search_docs
|
||||||
|
from danswer.search.utils import dedupe_documents
|
||||||
|
from danswer.search.utils import drop_llm_indices
|
||||||
from danswer.utils.logger import setup_logger
|
from danswer.utils.logger import setup_logger
|
||||||
from ee.danswer.server.query_and_chat.models import DocumentSearchRequest
|
from ee.danswer.server.query_and_chat.models import DocumentSearchRequest
|
||||||
|
|
||||||
@ -63,8 +65,21 @@ def handle_search_request(
|
|||||||
relevant_chunk_indices = search_pipeline.relevant_chunk_indices
|
relevant_chunk_indices = search_pipeline.relevant_chunk_indices
|
||||||
top_docs = chunks_or_sections_to_search_docs(top_sections)
|
top_docs = chunks_or_sections_to_search_docs(top_sections)
|
||||||
|
|
||||||
|
# Deduping happens at the last step to avoid harming quality by dropping content early on
|
||||||
|
deduped_docs = top_docs
|
||||||
|
dropped_inds = None
|
||||||
|
if search_request.retrieval_options.dedupe_docs:
|
||||||
|
deduped_docs, dropped_inds = dedupe_documents(top_docs)
|
||||||
|
|
||||||
# No need to save the docs for this API
|
# No need to save the docs for this API
|
||||||
fake_saved_docs = [SavedSearchDoc.from_search_doc(doc) for doc in top_docs]
|
fake_saved_docs = [SavedSearchDoc.from_search_doc(doc) for doc in deduped_docs]
|
||||||
|
|
||||||
|
if dropped_inds:
|
||||||
|
relevant_chunk_indices = drop_llm_indices(
|
||||||
|
llm_indices=relevant_chunk_indices,
|
||||||
|
search_docs=fake_saved_docs,
|
||||||
|
dropped_indices=dropped_inds,
|
||||||
|
)
|
||||||
|
|
||||||
return SearchResponse(
|
return SearchResponse(
|
||||||
top_documents=fake_saved_docs, llm_indices=relevant_chunk_indices
|
top_documents=fake_saved_docs, llm_indices=relevant_chunk_indices
|
||||||
|
Reference in New Issue
Block a user