mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-03-17 21:32:36 +01:00
Make Document source required (#4)
This commit is contained in:
parent
f2d3d8269d
commit
e390906ac1
@ -15,13 +15,9 @@ class DocumentSource(Enum):
|
||||
Slack = 1
|
||||
Web = 2
|
||||
GoogleDrive = 3
|
||||
Unknown = 4
|
||||
|
||||
def __str__(self):
|
||||
return self.name
|
||||
|
||||
def __int__(self):
|
||||
return self.value
|
||||
|
||||
|
||||
WEB_SOURCE = "Web"
|
||||
|
@ -128,7 +128,8 @@ class BatchGoogleDriveLoader(BatchLoader):
|
||||
Document(
|
||||
id=file["webViewLink"],
|
||||
sections=[Section(link=file["webViewLink"], text=full_context)],
|
||||
metadata={SOURCE_TYPE: DocumentSource.GoogleDrive},
|
||||
source=DocumentSource.GoogleDrive,
|
||||
metadata={},
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -1,6 +1,8 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from danswer.configs.constants import DocumentSource
|
||||
|
||||
|
||||
@dataclass
|
||||
class Section:
|
||||
@ -12,6 +14,7 @@ class Section:
|
||||
class Document:
|
||||
id: str
|
||||
sections: list[Section]
|
||||
source: DocumentSource
|
||||
metadata: dict[str, Any]
|
||||
|
||||
|
||||
|
@ -6,6 +6,7 @@ from typing import Any
|
||||
from typing import cast
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.connectors.slack.utils import get_message_link
|
||||
@ -31,6 +32,7 @@ def _process_batch_event(
|
||||
text=event["text"],
|
||||
)
|
||||
],
|
||||
source=matching_doc.source,
|
||||
metadata=matching_doc.metadata,
|
||||
)
|
||||
|
||||
@ -44,6 +46,7 @@ def _process_batch_event(
|
||||
text=event["text"],
|
||||
)
|
||||
],
|
||||
source=DocumentSource.Slack,
|
||||
metadata={},
|
||||
)
|
||||
|
||||
|
@ -6,6 +6,7 @@ from typing import cast
|
||||
from typing import List
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.connectors.slack.utils import get_client
|
||||
@ -173,6 +174,7 @@ def thread_to_doc(channel_id: str, thread: ThreadType) -> Document:
|
||||
)
|
||||
for m in thread
|
||||
],
|
||||
source=DocumentSource.Slack,
|
||||
metadata={},
|
||||
)
|
||||
|
||||
|
@ -93,7 +93,8 @@ class BatchWebLoader(BatchLoader):
|
||||
Document(
|
||||
id=current_url,
|
||||
sections=[Section(link=current_url, text=page_text)],
|
||||
metadata={SOURCE_TYPE: DocumentSource.Web},
|
||||
source=DocumentSource.Web,
|
||||
metadata={},
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -53,9 +53,7 @@ def index_chunks(
|
||||
DOCUMENT_ID: document.id,
|
||||
CHUNK_ID: chunk.chunk_id,
|
||||
CONTENT: chunk.content,
|
||||
SOURCE_TYPE: str(
|
||||
document.metadata.get("source_type", DocumentSource.Unknown)
|
||||
),
|
||||
SOURCE_TYPE: str(document.source),
|
||||
SOURCE_LINKS: chunk.source_links,
|
||||
SECTION_CONTINUATION: chunk.section_continuation,
|
||||
ALLOWED_USERS: [], # TODO
|
||||
|
@ -2,6 +2,7 @@ import unittest
|
||||
|
||||
from danswer.chunking.chunk import chunk_document
|
||||
from danswer.chunking.chunk import chunk_large_section
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
|
||||
@ -20,7 +21,6 @@ class TestDocumentChunking(unittest.TestCase):
|
||||
self.large_section = Section(text=WAR_AND_PEACE, link="https://www.test.com/")
|
||||
self.document = Document(
|
||||
id="test_document",
|
||||
metadata={"source_type": "testing"},
|
||||
sections=[
|
||||
Section(
|
||||
text="Here is some testing text", link="https://www.test.com/0"
|
||||
@ -39,6 +39,8 @@ class TestDocumentChunking(unittest.TestCase):
|
||||
text="should be combined into one", link="https://www.test.com/5"
|
||||
),
|
||||
],
|
||||
source=DocumentSource.Web,  # arbitrarily picking Web; the source doesn't matter for this test
|
||||
metadata={},
|
||||
)
|
||||
|
||||
def test_chunk_large_section(self):
|
||||
|
Loading…
x
Reference in New Issue
Block a user