mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-07-28 13:53:28 +02:00
Fix duplicate docs (#4378)
* Initial * Fix duplicate docs * Add tests * Switch to list comprehension * Fix test
This commit is contained in:
@@ -14,7 +14,6 @@ from ee.onyx.server.query_and_chat.models import (
|
||||
BasicCreateChatMessageWithHistoryRequest,
|
||||
)
|
||||
from ee.onyx.server.query_and_chat.models import ChatBasicResponse
|
||||
from ee.onyx.server.query_and_chat.models import SimpleDoc
|
||||
from onyx.auth.users import current_user
|
||||
from onyx.chat.chat_utils import combine_message_thread
|
||||
from onyx.chat.chat_utils import create_chat_chain
|
||||
@@ -56,25 +55,6 @@ logger = setup_logger()
|
||||
router = APIRouter(prefix="/chat")
|
||||
|
||||
|
||||
def _translate_doc_response_to_simple_doc(
|
||||
doc_response: QADocsResponse,
|
||||
) -> list[SimpleDoc]:
|
||||
return [
|
||||
SimpleDoc(
|
||||
id=doc.document_id,
|
||||
semantic_identifier=doc.semantic_identifier,
|
||||
link=doc.link,
|
||||
blurb=doc.blurb,
|
||||
match_highlights=[
|
||||
highlight for highlight in doc.match_highlights if highlight
|
||||
],
|
||||
source_type=doc.source_type,
|
||||
metadata=doc.metadata,
|
||||
)
|
||||
for doc in doc_response.top_documents
|
||||
]
|
||||
|
||||
|
||||
def _get_final_context_doc_indices(
|
||||
final_context_docs: list[LlmDoc] | None,
|
||||
top_docs: list[SavedSearchDoc] | None,
|
||||
@@ -111,9 +91,6 @@ def _convert_packet_stream_to_response(
|
||||
elif isinstance(packet, QADocsResponse):
|
||||
response.top_documents = packet.top_documents
|
||||
|
||||
# TODO: deprecate `simple_search_docs`
|
||||
response.simple_search_docs = _translate_doc_response_to_simple_doc(packet)
|
||||
|
||||
# This is a no-op if agent_sub_questions hasn't already been filled
|
||||
if packet.level is not None and packet.level_question_num is not None:
|
||||
id = (packet.level, packet.level_question_num)
|
||||
|
@@ -163,8 +163,6 @@ class ChatBasicResponse(BaseModel):
|
||||
cited_documents: dict[int, str] | None = None
|
||||
|
||||
# FOR BACKWARDS COMPATIBILITY
|
||||
# TODO: deprecate both of these
|
||||
simple_search_docs: list[SimpleDoc] | None = None
|
||||
llm_chunks_indices: list[int] | None = None
|
||||
|
||||
# agentic fields
|
||||
|
@@ -301,6 +301,10 @@ def prune_sections(
|
||||
|
||||
|
||||
def _merge_doc_chunks(chunks: list[InferenceChunk]) -> InferenceSection:
|
||||
assert (
|
||||
len(set([chunk.document_id for chunk in chunks])) == 1
|
||||
), "One distinct document must be passed into merge_doc_chunks"
|
||||
|
||||
# Assuming there are no duplicates by this point
|
||||
sorted_chunks = sorted(chunks, key=lambda x: x.chunk_id)
|
||||
|
||||
|
@@ -339,6 +339,12 @@ class SearchPipeline:
|
||||
self._retrieved_sections = self._get_sections()
|
||||
return self._retrieved_sections
|
||||
|
||||
@property
|
||||
def merged_retrieved_sections(self) -> list[InferenceSection]:
|
||||
"""Should be used to display in the UI in order to prevent displaying
|
||||
multiple sections for the same document as separate "documents"."""
|
||||
return _merge_sections(sections=self.retrieved_sections)
|
||||
|
||||
@property
|
||||
def reranked_sections(self) -> list[InferenceSection]:
|
||||
"""Reranking is always done at the chunk level since section merging could create arbitrarily
|
||||
|
@@ -353,7 +353,8 @@ class SearchTool(Tool[SearchToolOverrideKwargs]):
|
||||
)
|
||||
yield from yield_search_responses(
|
||||
query=query,
|
||||
get_retrieved_sections=lambda: search_pipeline.retrieved_sections,
|
||||
# give back the merged sections to prevent duplicate docs from appearing in the UI
|
||||
get_retrieved_sections=lambda: search_pipeline.merged_retrieved_sections,
|
||||
get_final_context_sections=lambda: search_pipeline.final_context_sections,
|
||||
search_query_info=search_query_info,
|
||||
get_section_relevance=lambda: search_pipeline.section_relevance,
|
||||
|
@@ -5,6 +5,7 @@ import requests
|
||||
from requests.models import Response
|
||||
|
||||
from onyx.context.search.models import RetrievalDetails
|
||||
from onyx.context.search.models import SavedSearchDoc
|
||||
from onyx.file_store.models import FileDescriptor
|
||||
from onyx.llm.override_models import LLMOverride
|
||||
from onyx.llm.override_models import PromptOverride
|
||||
@@ -97,17 +98,24 @@ class ChatSessionManager:
|
||||
for data in response_data:
|
||||
if "rephrased_query" in data:
|
||||
analyzed.rephrased_query = data["rephrased_query"]
|
||||
elif "tool_name" in data:
|
||||
if "tool_name" in data:
|
||||
analyzed.tool_name = data["tool_name"]
|
||||
analyzed.tool_result = (
|
||||
data.get("tool_result")
|
||||
if analyzed.tool_name == "run_search"
|
||||
else None
|
||||
)
|
||||
elif "relevance_summaries" in data:
|
||||
if "relevance_summaries" in data:
|
||||
analyzed.relevance_summaries = data["relevance_summaries"]
|
||||
elif "answer_piece" in data and data["answer_piece"]:
|
||||
if "answer_piece" in data and data["answer_piece"]:
|
||||
analyzed.full_message += data["answer_piece"]
|
||||
if "top_documents" in data:
|
||||
assert (
|
||||
analyzed.top_documents is None
|
||||
), "top_documents should only be set once"
|
||||
analyzed.top_documents = [
|
||||
SavedSearchDoc(**doc) for doc in data["top_documents"]
|
||||
]
|
||||
|
||||
return analyzed
|
||||
|
||||
|
@@ -10,6 +10,7 @@ from pydantic import Field
|
||||
from onyx.auth.schemas import UserRole
|
||||
from onyx.configs.constants import QAFeedbackType
|
||||
from onyx.context.search.enums import RecencyBiasSetting
|
||||
from onyx.context.search.models import SavedSearchDoc
|
||||
from onyx.db.enums import AccessType
|
||||
from onyx.server.documents.models import DocumentSource
|
||||
from onyx.server.documents.models import IndexAttemptSnapshot
|
||||
@@ -157,7 +158,7 @@ class StreamedResponse(BaseModel):
|
||||
full_message: str = ""
|
||||
rephrased_query: str | None = None
|
||||
tool_name: str | None = None
|
||||
top_documents: list[dict[str, Any]] | None = None
|
||||
top_documents: list[SavedSearchDoc] | None = None
|
||||
relevance_summaries: list[dict[str, Any]] | None = None
|
||||
tool_result: Any | None = None
|
||||
user: str | None = None
|
||||
|
@@ -16,10 +16,7 @@ from tests.integration.common_utils.test_models import DATestCCPair
|
||||
from tests.integration.common_utils.test_models import DATestUser
|
||||
|
||||
|
||||
def test_send_message_simple_with_history(reset: None) -> None:
|
||||
# Creating an admin user (first user created is automatically an admin)
|
||||
admin_user: DATestUser = UserManager.create(name="admin_user")
|
||||
|
||||
def test_send_message_simple_with_history(reset: None, admin_user: DATestUser) -> None:
|
||||
# create connectors
|
||||
cc_pair_1: DATestCCPair = CCPairManager.create_from_scratch(
|
||||
user_performing_action=admin_user,
|
||||
@@ -53,13 +50,13 @@ def test_send_message_simple_with_history(reset: None) -> None:
|
||||
response_json = response.json()
|
||||
|
||||
# Check that the top document is the correct document
|
||||
assert response_json["simple_search_docs"][0]["id"] == cc_pair_1.documents[0].id
|
||||
assert response_json["top_documents"][0]["document_id"] == cc_pair_1.documents[0].id
|
||||
|
||||
# assert that the metadata is correct
|
||||
for doc in cc_pair_1.documents:
|
||||
found_doc = next(
|
||||
(x for x in response_json["simple_search_docs"] if x["id"] == doc.id), None
|
||||
(x for x in response_json["top_documents"] if x["document_id"] == doc.id),
|
||||
None,
|
||||
)
|
||||
assert found_doc
|
||||
assert found_doc["metadata"]["document_id"] == doc.id
|
||||
|
@@ -0,0 +1,42 @@
|
||||
from collections.abc import Callable
|
||||
|
||||
import pytest
|
||||
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from tests.integration.common_utils.managers.api_key import APIKeyManager
|
||||
from tests.integration.common_utils.managers.cc_pair import CCPairManager
|
||||
from tests.integration.common_utils.managers.document import DocumentManager
|
||||
from tests.integration.common_utils.test_models import DATestAPIKey
|
||||
from tests.integration.common_utils.test_models import DATestUser
|
||||
from tests.integration.common_utils.test_models import SimpleTestDocument
|
||||
|
||||
|
||||
DocumentBuilderType = Callable[[list[str]], list[SimpleTestDocument]]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def document_builder(admin_user: DATestUser) -> DocumentBuilderType:
|
||||
api_key: DATestAPIKey = APIKeyManager.create(
|
||||
user_performing_action=admin_user,
|
||||
)
|
||||
|
||||
# create connector
|
||||
cc_pair_1 = CCPairManager.create_from_scratch(
|
||||
source=DocumentSource.INGESTION_API,
|
||||
user_performing_action=admin_user,
|
||||
)
|
||||
|
||||
def _document_builder(contents: list[str]) -> list[SimpleTestDocument]:
|
||||
# seed documents
|
||||
docs: list[SimpleTestDocument] = [
|
||||
DocumentManager.seed_doc_with_content(
|
||||
cc_pair=cc_pair_1,
|
||||
content=content,
|
||||
api_key=api_key,
|
||||
)
|
||||
for content in contents
|
||||
]
|
||||
|
||||
return docs
|
||||
|
||||
return _document_builder
|
@@ -5,12 +5,11 @@ import pytest
|
||||
from tests.integration.common_utils.constants import API_SERVER_URL
|
||||
from tests.integration.common_utils.managers.chat import ChatSessionManager
|
||||
from tests.integration.common_utils.managers.llm_provider import LLMProviderManager
|
||||
from tests.integration.common_utils.managers.user import UserManager
|
||||
from tests.integration.common_utils.test_models import DATestUser
|
||||
from tests.integration.tests.streaming_endpoints.conftest import DocumentBuilderType
|
||||
|
||||
|
||||
def test_send_message_simple_with_history(reset: None) -> None:
|
||||
admin_user: DATestUser = UserManager.create(name="admin_user")
|
||||
def test_send_message_simple_with_history(reset: None, admin_user: DATestUser) -> None:
|
||||
LLMProviderManager.create(user_performing_action=admin_user)
|
||||
|
||||
test_chat_session = ChatSessionManager.create(user_performing_action=admin_user)
|
||||
@@ -24,6 +23,44 @@ def test_send_message_simple_with_history(reset: None) -> None:
|
||||
assert len(response.full_message) > 0
|
||||
|
||||
|
||||
def test_send_message__basic_searches(
|
||||
reset: None, admin_user: DATestUser, document_builder: DocumentBuilderType
|
||||
) -> None:
|
||||
MESSAGE = "run a search for 'test'"
|
||||
SHORT_DOC_CONTENT = "test"
|
||||
LONG_DOC_CONTENT = "blah blah blah blah" * 100
|
||||
|
||||
LLMProviderManager.create(user_performing_action=admin_user)
|
||||
|
||||
short_doc = document_builder([SHORT_DOC_CONTENT])[0]
|
||||
|
||||
test_chat_session = ChatSessionManager.create(user_performing_action=admin_user)
|
||||
response = ChatSessionManager.send_message(
|
||||
chat_session_id=test_chat_session.id,
|
||||
message=MESSAGE,
|
||||
user_performing_action=admin_user,
|
||||
)
|
||||
assert response.top_documents is not None
|
||||
assert len(response.top_documents) == 1
|
||||
assert response.top_documents[0].document_id == short_doc.id
|
||||
|
||||
# make sure this doc is really long so that it will be split into multiple chunks
|
||||
long_doc = document_builder([LONG_DOC_CONTENT])[0]
|
||||
|
||||
# new chat session for simplicity
|
||||
test_chat_session = ChatSessionManager.create(user_performing_action=admin_user)
|
||||
response = ChatSessionManager.send_message(
|
||||
chat_session_id=test_chat_session.id,
|
||||
message=MESSAGE,
|
||||
user_performing_action=admin_user,
|
||||
)
|
||||
assert response.top_documents is not None
|
||||
assert len(response.top_documents) == 2
|
||||
# short doc should be more relevant and thus first
|
||||
assert response.top_documents[0].document_id == short_doc.id
|
||||
assert response.top_documents[1].document_id == long_doc.id
|
||||
|
||||
|
||||
@pytest.mark.skip(
|
||||
reason="enable for autorun when we have a testing environment with semantically useful data"
|
||||
)
|
||||
|
@@ -26,7 +26,13 @@ export const buildDocumentSummaryDisplay = (
|
||||
matchHighlights: string[],
|
||||
blurb: string
|
||||
) => {
|
||||
if (!matchHighlights || matchHighlights.length === 0) {
|
||||
// if there are no match highlights, or if it's really short, just use the blurb
|
||||
// this is to prevent the UI from showing something like `...` for the summary
|
||||
const MIN_MATCH_HIGHLIGHT_LENGTH = 5;
|
||||
if (
|
||||
!matchHighlights ||
|
||||
matchHighlights.length <= MIN_MATCH_HIGHLIGHT_LENGTH
|
||||
) {
|
||||
return blurb;
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user