Make Document source required (#4)

2025-03-17 21:32:36 +01:00 · 2023-04-29 16:49:27 -07:00 · 2023-04-29 16:49:27 -07:00 · e390906ac1
commit e390906ac1
parent f2d3d8269d
8 changed files with 16 additions and 10 deletions
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@ -15,13 +15,9 @@ class DocumentSource(Enum):
    Slack = 1
    Web = 2
    GoogleDrive = 3
-    Unknown = 4

    def __str__(self):
        return self.name

    def __int__(self):
        return self.value
-
-
-WEB_SOURCE = "Web"
--- a/backend/danswer/connectors/google_drive/batch.py
+++ b/backend/danswer/connectors/google_drive/batch.py
@ -128,7 +128,8 @@ class BatchGoogleDriveLoader(BatchLoader):
                    Document(
                        id=file["webViewLink"],
                        sections=[Section(link=file["webViewLink"], text=full_context)],
-                        metadata={SOURCE_TYPE: DocumentSource.GoogleDrive},
+                        source=DocumentSource.GoogleDrive,
+                        metadata={},
                    )
                )

--- a/backend/danswer/connectors/models.py
+++ b/backend/danswer/connectors/models.py
@ -1,6 +1,8 @@
 from dataclasses import dataclass
 from typing import Any

+from danswer.configs.constants import DocumentSource
+

@dataclass
 class Section:
@ -12,6 +14,7 @@ class Section:
 class Document:
    id: str
    sections: list[Section]
+    source: DocumentSource
    metadata: dict[str, Any]


--- a/backend/danswer/connectors/slack/batch.py
+++ b/backend/danswer/connectors/slack/batch.py
@ -6,6 +6,7 @@ from typing import Any
 from typing import cast

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
+from danswer.configs.constants import DocumentSource
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.connectors.slack.utils import get_message_link
@ -31,6 +32,7 @@ def _process_batch_event(
                        text=event["text"],
                    )
                ],
+                source=matching_doc.source,
                metadata=matching_doc.metadata,
            )

@ -44,6 +46,7 @@ def _process_batch_event(
                    text=event["text"],
                )
            ],
+            source=DocumentSource.Slack,
            metadata={},
        )

--- a/backend/danswer/connectors/slack/pull.py
+++ b/backend/danswer/connectors/slack/pull.py
@ -6,6 +6,7 @@ from typing import cast
 from typing import List

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
+from danswer.configs.constants import DocumentSource
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.connectors.slack.utils import get_client
@ -173,6 +174,7 @@ def thread_to_doc(channel_id: str, thread: ThreadType) -> Document:
            )
            for m in thread
        ],
+        source=DocumentSource.Slack,
        metadata={},
    )

--- a/backend/danswer/connectors/web/batch.py
+++ b/backend/danswer/connectors/web/batch.py
@ -93,7 +93,8 @@ class BatchWebLoader(BatchLoader):
                        Document(
                            id=current_url,
                            sections=[Section(link=current_url, text=page_text)],
-                            metadata={SOURCE_TYPE: DocumentSource.Web},
+                            source=DocumentSource.Web,
+                            metadata={},
                        )
                    )

--- a/backend/danswer/datastores/qdrant/indexing.py
+++ b/backend/danswer/datastores/qdrant/indexing.py
@ -53,9 +53,7 @@ def index_chunks(
                    DOCUMENT_ID: document.id,
                    CHUNK_ID: chunk.chunk_id,
                    CONTENT: chunk.content,
-                    SOURCE_TYPE: str(
-                        document.metadata.get("source_type", DocumentSource.Unknown)
-                    ),
+                    SOURCE_TYPE: str(document.source),
                    SOURCE_LINKS: chunk.source_links,
                    SECTION_CONTINUATION: chunk.section_continuation,
                    ALLOWED_USERS: [],  # TODO
--- a/backend/tests/unit/qa_service/chunking/test_chunk.py
+++ b/backend/tests/unit/qa_service/chunking/test_chunk.py
@ -2,6 +2,7 @@ import unittest

 from danswer.chunking.chunk import chunk_document
 from danswer.chunking.chunk import chunk_large_section
+from danswer.configs.constants import DocumentSource
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section

@ -20,7 +21,6 @@ class TestDocumentChunking(unittest.TestCase):
        self.large_section = Section(text=WAR_AND_PEACE, link="https://www.test.com/")
        self.document = Document(
            id="test_document",
-            metadata={"source_type": "testing"},
            sections=[
                Section(
                    text="Here is some testing text", link="https://www.test.com/0"
@ -39,6 +39,8 @@ class TestDocumentChunking(unittest.TestCase):
                    text="should be combined into one", link="https://www.test.com/5"
                ),
            ],
+            source=DocumentSource.Web,  # arbitrarily picking Web; the source doesn't matter for this test
+            metadata={},
        )

    def test_chunk_large_section(self):