Make Document source required (#4)

This commit is contained in:
Chris Weaver 2023-04-29 16:49:27 -07:00 committed by GitHub
parent f2d3d8269d
commit e390906ac1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 16 additions and 10 deletions

View File

@ -15,13 +15,9 @@ class DocumentSource(Enum):
Slack = 1
Web = 2
GoogleDrive = 3
Unknown = 4
def __str__(self):
return self.name
def __int__(self):
return self.value
WEB_SOURCE = "Web"

View File

@ -128,7 +128,8 @@ class BatchGoogleDriveLoader(BatchLoader):
Document(
id=file["webViewLink"],
sections=[Section(link=file["webViewLink"], text=full_context)],
metadata={SOURCE_TYPE: DocumentSource.GoogleDrive},
source=DocumentSource.GoogleDrive,
metadata={},
)
)

View File

@ -1,6 +1,8 @@
from dataclasses import dataclass
from typing import Any
from danswer.configs.constants import DocumentSource
@dataclass
class Section:
@ -12,6 +14,7 @@ class Section:
class Document:
id: str
sections: list[Section]
source: DocumentSource
metadata: dict[str, Any]

View File

@ -6,6 +6,7 @@ from typing import Any
from typing import cast
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.connectors.slack.utils import get_message_link
@ -31,6 +32,7 @@ def _process_batch_event(
text=event["text"],
)
],
source=matching_doc.source,
metadata=matching_doc.metadata,
)
@ -44,6 +46,7 @@ def _process_batch_event(
text=event["text"],
)
],
source=DocumentSource.Slack,
metadata={},
)

View File

@ -6,6 +6,7 @@ from typing import cast
from typing import List
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.connectors.slack.utils import get_client
@ -173,6 +174,7 @@ def thread_to_doc(channel_id: str, thread: ThreadType) -> Document:
)
for m in thread
],
source=DocumentSource.Slack,
metadata={},
)

View File

@ -93,7 +93,8 @@ class BatchWebLoader(BatchLoader):
Document(
id=current_url,
sections=[Section(link=current_url, text=page_text)],
metadata={SOURCE_TYPE: DocumentSource.Web},
source=DocumentSource.Web,
metadata={},
)
)

View File

@ -53,9 +53,7 @@ def index_chunks(
DOCUMENT_ID: document.id,
CHUNK_ID: chunk.chunk_id,
CONTENT: chunk.content,
SOURCE_TYPE: str(
document.metadata.get("source_type", DocumentSource.Unknown)
),
SOURCE_TYPE: str(document.source),
SOURCE_LINKS: chunk.source_links,
SECTION_CONTINUATION: chunk.section_continuation,
ALLOWED_USERS: [], # TODO

View File

@ -2,6 +2,7 @@ import unittest
from danswer.chunking.chunk import chunk_document
from danswer.chunking.chunk import chunk_large_section
from danswer.configs.constants import DocumentSource
from danswer.connectors.models import Document
from danswer.connectors.models import Section
@ -20,7 +21,6 @@ class TestDocumentChunking(unittest.TestCase):
self.large_section = Section(text=WAR_AND_PEACE, link="https://www.test.com/")
self.document = Document(
id="test_document",
metadata={"source_type": "testing"},
sections=[
Section(
text="Here is some testing text", link="https://www.test.com/0"
@ -39,6 +39,8 @@ class TestDocumentChunking(unittest.TestCase):
text="should be combined into one", link="https://www.test.com/5"
),
],
source=DocumentSource.Web, # arbitrarily picking web, doesn't matter for this test
metadata={},
)
def test_chunk_large_section(self):