From e390906ac1607d7b38197f438f2dcbe94ea014c8 Mon Sep 17 00:00:00 2001 From: Chris Weaver <25087905+Weves@users.noreply.github.com> Date: Sat, 29 Apr 2023 16:49:27 -0700 Subject: [PATCH] Make Document source required (#4) --- backend/danswer/configs/constants.py | 4 ---- backend/danswer/connectors/google_drive/batch.py | 3 ++- backend/danswer/connectors/models.py | 3 +++ backend/danswer/connectors/slack/batch.py | 3 +++ backend/danswer/connectors/slack/pull.py | 2 ++ backend/danswer/connectors/web/batch.py | 3 ++- backend/danswer/datastores/qdrant/indexing.py | 4 +--- backend/tests/unit/qa_service/chunking/test_chunk.py | 4 +++- 8 files changed, 16 insertions(+), 10 deletions(-) diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py index 5813da77c..8570f205b 100644 --- a/backend/danswer/configs/constants.py +++ b/backend/danswer/configs/constants.py @@ -15,13 +15,9 @@ class DocumentSource(Enum): Slack = 1 Web = 2 GoogleDrive = 3 - Unknown = 4 def __str__(self): return self.name def __int__(self): return self.value - - -WEB_SOURCE = "Web" diff --git a/backend/danswer/connectors/google_drive/batch.py b/backend/danswer/connectors/google_drive/batch.py index 649bbb0c3..6c510358d 100644 --- a/backend/danswer/connectors/google_drive/batch.py +++ b/backend/danswer/connectors/google_drive/batch.py @@ -128,7 +128,8 @@ class BatchGoogleDriveLoader(BatchLoader): Document( id=file["webViewLink"], sections=[Section(link=file["webViewLink"], text=full_context)], - metadata={SOURCE_TYPE: DocumentSource.GoogleDrive}, + source=DocumentSource.GoogleDrive, + metadata={}, ) ) diff --git a/backend/danswer/connectors/models.py b/backend/danswer/connectors/models.py index 0ff49ff1b..137b32d21 100644 --- a/backend/danswer/connectors/models.py +++ b/backend/danswer/connectors/models.py @@ -1,6 +1,8 @@ from dataclasses import dataclass from typing import Any +from danswer.configs.constants import DocumentSource + @dataclass class Section: @@ -12,6 +14,7 @@ 
class Section: class Document: id: str sections: list[Section] + source: DocumentSource metadata: dict[str, Any] diff --git a/backend/danswer/connectors/slack/batch.py b/backend/danswer/connectors/slack/batch.py index 55a9e8e52..4043ffae6 100644 --- a/backend/danswer/connectors/slack/batch.py +++ b/backend/danswer/connectors/slack/batch.py @@ -6,6 +6,7 @@ from typing import Any from typing import cast from danswer.configs.app_configs import INDEX_BATCH_SIZE +from danswer.configs.constants import DocumentSource from danswer.connectors.models import Document from danswer.connectors.models import Section from danswer.connectors.slack.utils import get_message_link @@ -31,6 +32,7 @@ def _process_batch_event( text=event["text"], ) ], + source=matching_doc.source, metadata=matching_doc.metadata, ) @@ -44,6 +46,7 @@ def _process_batch_event( text=event["text"], ) ], + source=DocumentSource.Slack, metadata={}, ) diff --git a/backend/danswer/connectors/slack/pull.py b/backend/danswer/connectors/slack/pull.py index cccdc170b..7b3af9d3e 100644 --- a/backend/danswer/connectors/slack/pull.py +++ b/backend/danswer/connectors/slack/pull.py @@ -6,6 +6,7 @@ from typing import cast from typing import List from danswer.configs.app_configs import INDEX_BATCH_SIZE +from danswer.configs.constants import DocumentSource from danswer.connectors.models import Document from danswer.connectors.models import Section from danswer.connectors.slack.utils import get_client @@ -173,6 +174,7 @@ def thread_to_doc(channel_id: str, thread: ThreadType) -> Document: ) for m in thread ], + source=DocumentSource.Slack, metadata={}, ) diff --git a/backend/danswer/connectors/web/batch.py b/backend/danswer/connectors/web/batch.py index 0c33c7979..e1596fc20 100644 --- a/backend/danswer/connectors/web/batch.py +++ b/backend/danswer/connectors/web/batch.py @@ -93,7 +93,8 @@ class BatchWebLoader(BatchLoader): Document( id=current_url, sections=[Section(link=current_url, text=page_text)], - metadata={SOURCE_TYPE: 
DocumentSource.Web}, + source=DocumentSource.Web, + metadata={}, ) ) diff --git a/backend/danswer/datastores/qdrant/indexing.py b/backend/danswer/datastores/qdrant/indexing.py index 90414db2f..b1d3753b0 100644 --- a/backend/danswer/datastores/qdrant/indexing.py +++ b/backend/danswer/datastores/qdrant/indexing.py @@ -53,9 +53,7 @@ def index_chunks( DOCUMENT_ID: document.id, CHUNK_ID: chunk.chunk_id, CONTENT: chunk.content, - SOURCE_TYPE: str( - document.metadata.get("source_type", DocumentSource.Unknown) - ), + SOURCE_TYPE: str(document.source), SOURCE_LINKS: chunk.source_links, SECTION_CONTINUATION: chunk.section_continuation, ALLOWED_USERS: [], # TODO diff --git a/backend/tests/unit/qa_service/chunking/test_chunk.py b/backend/tests/unit/qa_service/chunking/test_chunk.py index d4614f359..2e72da5ae 100644 --- a/backend/tests/unit/qa_service/chunking/test_chunk.py +++ b/backend/tests/unit/qa_service/chunking/test_chunk.py @@ -2,6 +2,7 @@ import unittest from danswer.chunking.chunk import chunk_document from danswer.chunking.chunk import chunk_large_section +from danswer.configs.constants import DocumentSource from danswer.connectors.models import Document from danswer.connectors.models import Section @@ -20,7 +21,6 @@ class TestDocumentChunking(unittest.TestCase): self.large_section = Section(text=WAR_AND_PEACE, link="https://www.test.com/") self.document = Document( id="test_document", - metadata={"source_type": "testing"}, sections=[ Section( text="Here is some testing text", link="https://www.test.com/0" @@ -39,6 +39,8 @@ class TestDocumentChunking(unittest.TestCase): text="should be combined into one", link="https://www.test.com/5" ), ], + source=DocumentSource.Web, # arbitrarily picking Web; it doesn't matter for this test + metadata={}, ) def test_chunk_large_section(self):