Fix duplicate docs (#4378)

* Initial

* Fix duplicate docs

* Add tests

* Switch to list comprehension

* Fix test
This commit is contained in:
Chris Weaver
2025-03-28 15:25:26 -07:00
committed by GitHub
parent 0d0588a0c1
commit 22e00a1f5c
11 changed files with 117 additions and 40 deletions

View File

@@ -14,7 +14,6 @@ from ee.onyx.server.query_and_chat.models import (
BasicCreateChatMessageWithHistoryRequest,
)
from ee.onyx.server.query_and_chat.models import ChatBasicResponse
from ee.onyx.server.query_and_chat.models import SimpleDoc
from onyx.auth.users import current_user
from onyx.chat.chat_utils import combine_message_thread
from onyx.chat.chat_utils import create_chat_chain
@@ -56,25 +55,6 @@ logger = setup_logger()
router = APIRouter(prefix="/chat")
def _translate_doc_response_to_simple_doc(
    doc_response: QADocsResponse,
) -> list[SimpleDoc]:
    """Convert a QADocsResponse into the legacy ``SimpleDoc`` list shape.

    Kept for backwards compatibility with clients that still read
    ``simple_search_docs``. Falsy (empty) match highlights are dropped.
    """

    def _to_simple_doc(doc) -> SimpleDoc:
        # filter out empty highlight strings before handing them to the client
        non_empty_highlights = [h for h in doc.match_highlights if h]
        return SimpleDoc(
            id=doc.document_id,
            semantic_identifier=doc.semantic_identifier,
            link=doc.link,
            blurb=doc.blurb,
            match_highlights=non_empty_highlights,
            source_type=doc.source_type,
            metadata=doc.metadata,
        )

    return [_to_simple_doc(doc) for doc in doc_response.top_documents]
def _get_final_context_doc_indices(
final_context_docs: list[LlmDoc] | None,
top_docs: list[SavedSearchDoc] | None,
@@ -111,9 +91,6 @@ def _convert_packet_stream_to_response(
elif isinstance(packet, QADocsResponse):
response.top_documents = packet.top_documents
# TODO: deprecate `simple_search_docs`
response.simple_search_docs = _translate_doc_response_to_simple_doc(packet)
# This is a no-op if agent_sub_questions hasn't already been filled
if packet.level is not None and packet.level_question_num is not None:
id = (packet.level, packet.level_question_num)

View File

@@ -163,8 +163,6 @@ class ChatBasicResponse(BaseModel):
cited_documents: dict[int, str] | None = None
# FOR BACKWARDS COMPATIBILITY
# TODO: deprecate both of these
simple_search_docs: list[SimpleDoc] | None = None
llm_chunks_indices: list[int] | None = None
# agentic fields

View File

@@ -301,6 +301,10 @@ def prune_sections(
def _merge_doc_chunks(chunks: list[InferenceChunk]) -> InferenceSection:
assert (
len(set([chunk.document_id for chunk in chunks])) == 1
), "One distinct document must be passed into merge_doc_chunks"
# Assuming there are no duplicates by this point
sorted_chunks = sorted(chunks, key=lambda x: x.chunk_id)

View File

@@ -339,6 +339,12 @@ class SearchPipeline:
self._retrieved_sections = self._get_sections()
return self._retrieved_sections
@property
def merged_retrieved_sections(self) -> list[InferenceSection]:
    """Retrieved sections with same-document sections merged together.

    Intended for UI display: prevents several sections of one document
    from being rendered as separate "documents"."""
    retrieved = self.retrieved_sections
    return _merge_sections(sections=retrieved)
@property
def reranked_sections(self) -> list[InferenceSection]:
"""Reranking is always done at the chunk level since section merging could create arbitrarily

View File

@@ -353,7 +353,8 @@ class SearchTool(Tool[SearchToolOverrideKwargs]):
)
yield from yield_search_responses(
query=query,
get_retrieved_sections=lambda: search_pipeline.retrieved_sections,
# give back the merged sections to prevent duplicate docs from appearing in the UI
get_retrieved_sections=lambda: search_pipeline.merged_retrieved_sections,
get_final_context_sections=lambda: search_pipeline.final_context_sections,
search_query_info=search_query_info,
get_section_relevance=lambda: search_pipeline.section_relevance,

View File

@@ -5,6 +5,7 @@ import requests
from requests.models import Response
from onyx.context.search.models import RetrievalDetails
from onyx.context.search.models import SavedSearchDoc
from onyx.file_store.models import FileDescriptor
from onyx.llm.override_models import LLMOverride
from onyx.llm.override_models import PromptOverride
@@ -97,17 +98,24 @@ class ChatSessionManager:
for data in response_data:
if "rephrased_query" in data:
analyzed.rephrased_query = data["rephrased_query"]
elif "tool_name" in data:
if "tool_name" in data:
analyzed.tool_name = data["tool_name"]
analyzed.tool_result = (
data.get("tool_result")
if analyzed.tool_name == "run_search"
else None
)
elif "relevance_summaries" in data:
if "relevance_summaries" in data:
analyzed.relevance_summaries = data["relevance_summaries"]
elif "answer_piece" in data and data["answer_piece"]:
if "answer_piece" in data and data["answer_piece"]:
analyzed.full_message += data["answer_piece"]
if "top_documents" in data:
assert (
analyzed.top_documents is None
), "top_documents should only be set once"
analyzed.top_documents = [
SavedSearchDoc(**doc) for doc in data["top_documents"]
]
return analyzed

View File

@@ -10,6 +10,7 @@ from pydantic import Field
from onyx.auth.schemas import UserRole
from onyx.configs.constants import QAFeedbackType
from onyx.context.search.enums import RecencyBiasSetting
from onyx.context.search.models import SavedSearchDoc
from onyx.db.enums import AccessType
from onyx.server.documents.models import DocumentSource
from onyx.server.documents.models import IndexAttemptSnapshot
@@ -157,7 +158,7 @@ class StreamedResponse(BaseModel):
full_message: str = ""
rephrased_query: str | None = None
tool_name: str | None = None
top_documents: list[dict[str, Any]] | None = None
top_documents: list[SavedSearchDoc] | None = None
relevance_summaries: list[dict[str, Any]] | None = None
tool_result: Any | None = None
user: str | None = None

View File

@@ -16,10 +16,7 @@ from tests.integration.common_utils.test_models import DATestCCPair
from tests.integration.common_utils.test_models import DATestUser
def test_send_message_simple_with_history(reset: None) -> None:
# Creating an admin user (first user created is automatically an admin)
admin_user: DATestUser = UserManager.create(name="admin_user")
def test_send_message_simple_with_history(reset: None, admin_user: DATestUser) -> None:
# create connectors
cc_pair_1: DATestCCPair = CCPairManager.create_from_scratch(
user_performing_action=admin_user,
@@ -53,13 +50,13 @@ def test_send_message_simple_with_history(reset: None) -> None:
response_json = response.json()
# Check that the top document is the correct document
assert response_json["simple_search_docs"][0]["id"] == cc_pair_1.documents[0].id
assert response_json["top_documents"][0]["document_id"] == cc_pair_1.documents[0].id
# assert that the metadata is correct
for doc in cc_pair_1.documents:
found_doc = next(
(x for x in response_json["simple_search_docs"] if x["id"] == doc.id), None
(x for x in response_json["top_documents"] if x["document_id"] == doc.id),
None,
)
assert found_doc
assert found_doc["metadata"]["document_id"] == doc.id

View File

@@ -0,0 +1,42 @@
from collections.abc import Callable
import pytest
from onyx.configs.constants import DocumentSource
from tests.integration.common_utils.managers.api_key import APIKeyManager
from tests.integration.common_utils.managers.cc_pair import CCPairManager
from tests.integration.common_utils.managers.document import DocumentManager
from tests.integration.common_utils.test_models import DATestAPIKey
from tests.integration.common_utils.test_models import DATestUser
from tests.integration.common_utils.test_models import SimpleTestDocument
DocumentBuilderType = Callable[[list[str]], list[SimpleTestDocument]]
@pytest.fixture
def document_builder(admin_user: DATestUser) -> DocumentBuilderType:
    """Yield a callable that seeds documents through the ingestion API.

    One API key and one ingestion-API connector are created up front and
    shared by every invocation of the returned builder; the builder seeds
    one document per content string and returns them in the same order.
    """
    api_key: DATestAPIKey = APIKeyManager.create(
        user_performing_action=admin_user,
    )

    # connector that all seeded documents will be attached to
    cc_pair = CCPairManager.create_from_scratch(
        source=DocumentSource.INGESTION_API,
        user_performing_action=admin_user,
    )

    def _build(contents: list[str]) -> list[SimpleTestDocument]:
        return [
            DocumentManager.seed_doc_with_content(
                cc_pair=cc_pair,
                content=content,
                api_key=api_key,
            )
            for content in contents
        ]

    return _build

View File

@@ -5,12 +5,11 @@ import pytest
from tests.integration.common_utils.constants import API_SERVER_URL
from tests.integration.common_utils.managers.chat import ChatSessionManager
from tests.integration.common_utils.managers.llm_provider import LLMProviderManager
from tests.integration.common_utils.managers.user import UserManager
from tests.integration.common_utils.test_models import DATestUser
from tests.integration.tests.streaming_endpoints.conftest import DocumentBuilderType
def test_send_message_simple_with_history(reset: None) -> None:
admin_user: DATestUser = UserManager.create(name="admin_user")
def test_send_message_simple_with_history(reset: None, admin_user: DATestUser) -> None:
LLMProviderManager.create(user_performing_action=admin_user)
test_chat_session = ChatSessionManager.create(user_performing_action=admin_user)
@@ -24,6 +23,44 @@ def test_send_message_simple_with_history(reset: None) -> None:
assert len(response.full_message) > 0
def test_send_message__basic_searches(
    reset: None, admin_user: DATestUser, document_builder: DocumentBuilderType
) -> None:
    """Verify seeded docs come back via ``top_documents`` without duplicates.

    First searches with a single short doc seeded, then seeds a long doc
    that spans multiple chunks and checks it still appears exactly once,
    ranked below the more relevant short doc.
    """
    query = "run a search for 'test'"
    short_content = "test"
    long_content = "blah blah blah blah" * 100

    LLMProviderManager.create(user_performing_action=admin_user)

    def _run_search():
        # fresh chat session each time for simplicity
        session = ChatSessionManager.create(user_performing_action=admin_user)
        return ChatSessionManager.send_message(
            chat_session_id=session.id,
            message=query,
            user_performing_action=admin_user,
        )

    short_doc = document_builder([short_content])[0]
    response = _run_search()
    assert response.top_documents is not None
    assert len(response.top_documents) == 1
    assert response.top_documents[0].document_id == short_doc.id

    # the long doc is split into multiple chunks, but must surface only once
    long_doc = document_builder([long_content])[0]
    response = _run_search()
    assert response.top_documents is not None
    assert len(response.top_documents) == 2
    # short doc is the better match and should therefore be ranked first
    assert response.top_documents[0].document_id == short_doc.id
    assert response.top_documents[1].document_id == long_doc.id
@pytest.mark.skip(
reason="enable for autorun when we have a testing environment with semantically useful data"
)

View File

@@ -26,7 +26,13 @@ export const buildDocumentSummaryDisplay = (
matchHighlights: string[],
blurb: string
) => {
if (!matchHighlights || matchHighlights.length === 0) {
// if there are no match highlights, or if it's really short, just use the blurb
// this is to prevent the UI from showing something like `...` for the summary
const MIN_MATCH_HIGHLIGHT_LENGTH = 5;
if (
!matchHighlights ||
matchHighlights.length <= MIN_MATCH_HIGHLIGHT_LENGTH
) {
return blurb;
}