mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-27 12:29:41 +02:00
Make Document source required (#4)
This commit is contained in:
@@ -15,13 +15,9 @@ class DocumentSource(Enum):
|
|||||||
Slack = 1
|
Slack = 1
|
||||||
Web = 2
|
Web = 2
|
||||||
GoogleDrive = 3
|
GoogleDrive = 3
|
||||||
Unknown = 4
|
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.name
|
return self.name
|
||||||
|
|
||||||
def __int__(self):
|
def __int__(self):
|
||||||
return self.value
|
return self.value
|
||||||
|
|
||||||
|
|
||||||
WEB_SOURCE = "Web"
|
|
||||||
|
@@ -128,7 +128,8 @@ class BatchGoogleDriveLoader(BatchLoader):
|
|||||||
Document(
|
Document(
|
||||||
id=file["webViewLink"],
|
id=file["webViewLink"],
|
||||||
sections=[Section(link=file["webViewLink"], text=full_context)],
|
sections=[Section(link=file["webViewLink"], text=full_context)],
|
||||||
metadata={SOURCE_TYPE: DocumentSource.GoogleDrive},
|
source=DocumentSource.GoogleDrive,
|
||||||
|
metadata={},
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@@ -1,6 +1,8 @@
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from danswer.configs.constants import DocumentSource
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Section:
|
class Section:
|
||||||
@@ -12,6 +14,7 @@ class Section:
|
|||||||
class Document:
|
class Document:
|
||||||
id: str
|
id: str
|
||||||
sections: list[Section]
|
sections: list[Section]
|
||||||
|
source: DocumentSource
|
||||||
metadata: dict[str, Any]
|
metadata: dict[str, Any]
|
||||||
|
|
||||||
|
|
||||||
|
@@ -6,6 +6,7 @@ from typing import Any
|
|||||||
from typing import cast
|
from typing import cast
|
||||||
|
|
||||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||||
|
from danswer.configs.constants import DocumentSource
|
||||||
from danswer.connectors.models import Document
|
from danswer.connectors.models import Document
|
||||||
from danswer.connectors.models import Section
|
from danswer.connectors.models import Section
|
||||||
from danswer.connectors.slack.utils import get_message_link
|
from danswer.connectors.slack.utils import get_message_link
|
||||||
@@ -31,6 +32,7 @@ def _process_batch_event(
|
|||||||
text=event["text"],
|
text=event["text"],
|
||||||
)
|
)
|
||||||
],
|
],
|
||||||
|
source=matching_doc.source,
|
||||||
metadata=matching_doc.metadata,
|
metadata=matching_doc.metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -44,6 +46,7 @@ def _process_batch_event(
|
|||||||
text=event["text"],
|
text=event["text"],
|
||||||
)
|
)
|
||||||
],
|
],
|
||||||
|
source=DocumentSource.Slack,
|
||||||
metadata={},
|
metadata={},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@@ -6,6 +6,7 @@ from typing import cast
|
|||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||||
|
from danswer.configs.constants import DocumentSource
|
||||||
from danswer.connectors.models import Document
|
from danswer.connectors.models import Document
|
||||||
from danswer.connectors.models import Section
|
from danswer.connectors.models import Section
|
||||||
from danswer.connectors.slack.utils import get_client
|
from danswer.connectors.slack.utils import get_client
|
||||||
@@ -173,6 +174,7 @@ def thread_to_doc(channel_id: str, thread: ThreadType) -> Document:
|
|||||||
)
|
)
|
||||||
for m in thread
|
for m in thread
|
||||||
],
|
],
|
||||||
|
source=DocumentSource.Slack,
|
||||||
metadata={},
|
metadata={},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@@ -93,7 +93,8 @@ class BatchWebLoader(BatchLoader):
|
|||||||
Document(
|
Document(
|
||||||
id=current_url,
|
id=current_url,
|
||||||
sections=[Section(link=current_url, text=page_text)],
|
sections=[Section(link=current_url, text=page_text)],
|
||||||
metadata={SOURCE_TYPE: DocumentSource.Web},
|
source=DocumentSource.Web,
|
||||||
|
metadata={},
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@@ -53,9 +53,7 @@ def index_chunks(
|
|||||||
DOCUMENT_ID: document.id,
|
DOCUMENT_ID: document.id,
|
||||||
CHUNK_ID: chunk.chunk_id,
|
CHUNK_ID: chunk.chunk_id,
|
||||||
CONTENT: chunk.content,
|
CONTENT: chunk.content,
|
||||||
SOURCE_TYPE: str(
|
SOURCE_TYPE: str(document.source),
|
||||||
document.metadata.get("source_type", DocumentSource.Unknown)
|
|
||||||
),
|
|
||||||
SOURCE_LINKS: chunk.source_links,
|
SOURCE_LINKS: chunk.source_links,
|
||||||
SECTION_CONTINUATION: chunk.section_continuation,
|
SECTION_CONTINUATION: chunk.section_continuation,
|
||||||
ALLOWED_USERS: [], # TODO
|
ALLOWED_USERS: [], # TODO
|
||||||
|
@@ -2,6 +2,7 @@ import unittest
|
|||||||
|
|
||||||
from danswer.chunking.chunk import chunk_document
|
from danswer.chunking.chunk import chunk_document
|
||||||
from danswer.chunking.chunk import chunk_large_section
|
from danswer.chunking.chunk import chunk_large_section
|
||||||
|
from danswer.configs.constants import DocumentSource
|
||||||
from danswer.connectors.models import Document
|
from danswer.connectors.models import Document
|
||||||
from danswer.connectors.models import Section
|
from danswer.connectors.models import Section
|
||||||
|
|
||||||
@@ -20,7 +21,6 @@ class TestDocumentChunking(unittest.TestCase):
|
|||||||
self.large_section = Section(text=WAR_AND_PEACE, link="https://www.test.com/")
|
self.large_section = Section(text=WAR_AND_PEACE, link="https://www.test.com/")
|
||||||
self.document = Document(
|
self.document = Document(
|
||||||
id="test_document",
|
id="test_document",
|
||||||
metadata={"source_type": "testing"},
|
|
||||||
sections=[
|
sections=[
|
||||||
Section(
|
Section(
|
||||||
text="Here is some testing text", link="https://www.test.com/0"
|
text="Here is some testing text", link="https://www.test.com/0"
|
||||||
@@ -39,6 +39,8 @@ class TestDocumentChunking(unittest.TestCase):
|
|||||||
text="should be combined into one", link="https://www.test.com/5"
|
text="should be combined into one", link="https://www.test.com/5"
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
|
source=DocumentSource.Web,  # arbitrarily picking Web; the source doesn't matter for this test
|
||||||
|
metadata={},
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_chunk_large_section(self):
|
def test_chunk_large_section(self):
|
||||||
|
Reference in New Issue
Block a user