mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-20 13:05:49 +02:00
Add more metadata options for File connector (#797)
This commit is contained in:
@@ -12,6 +12,7 @@ from danswer.connectors.cross_connector_utils.file_utils import detect_encoding
|
|||||||
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
|
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
|
||||||
from danswer.connectors.cross_connector_utils.file_utils import read_file
|
from danswer.connectors.cross_connector_utils.file_utils import read_file
|
||||||
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
|
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
|
||||||
|
from danswer.connectors.cross_connector_utils.time_utils import time_str_to_utc
|
||||||
from danswer.connectors.file.utils import check_file_ext_is_valid
|
from danswer.connectors.file.utils import check_file_ext_is_valid
|
||||||
from danswer.connectors.file.utils import get_file_ext
|
from danswer.connectors.file.utils import get_file_ext
|
||||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||||
@@ -63,15 +64,20 @@ def _process_file(
|
|||||||
else:
|
else:
|
||||||
file_content_raw, metadata = read_file(file)
|
file_content_raw, metadata = read_file(file)
|
||||||
|
|
||||||
|
dt_str = metadata.get("doc_updated_at")
|
||||||
|
final_time_updated = time_str_to_utc(dt_str) if dt_str else time_updated
|
||||||
|
|
||||||
return [
|
return [
|
||||||
Document(
|
Document(
|
||||||
id=file_name,
|
id=file_name,
|
||||||
sections=[
|
sections=[
|
||||||
Section(link=metadata.get("link", None), text=file_content_raw.strip())
|
Section(link=metadata.get("link"), text=file_content_raw.strip())
|
||||||
],
|
],
|
||||||
source=DocumentSource.FILE,
|
source=DocumentSource.FILE,
|
||||||
semantic_identifier=file_name,
|
semantic_identifier=file_name,
|
||||||
doc_updated_at=time_updated,
|
doc_updated_at=final_time_updated,
|
||||||
|
primary_owners=metadata.get("primary_owners"),
|
||||||
|
secondary_owners=metadata.get("secondary_owners"),
|
||||||
metadata={},
|
metadata={},
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
Reference in New Issue
Block a user