mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-27 08:21:00 +02:00
added support for docx in gdrive
rebase from main
This commit is contained in:
parent
d6ca865034
commit
a4b47e0243
@ -1,5 +1,6 @@
|
|||||||
import datetime
|
import datetime
|
||||||
import io
|
import io
|
||||||
|
import tempfile
|
||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
from collections.abc import Sequence
|
from collections.abc import Sequence
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
@ -31,8 +32,9 @@ SCOPES = [
|
|||||||
]
|
]
|
||||||
SUPPORTED_DRIVE_DOC_TYPES = [
|
SUPPORTED_DRIVE_DOC_TYPES = [
|
||||||
"application/vnd.google-apps.document",
|
"application/vnd.google-apps.document",
|
||||||
"application/pdf",
|
|
||||||
"application/vnd.google-apps.spreadsheet",
|
"application/vnd.google-apps.spreadsheet",
|
||||||
|
"application/pdf",
|
||||||
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
]
|
]
|
||||||
DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder"
|
DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder"
|
||||||
ID_KEY = "id"
|
ID_KEY = "id"
|
||||||
@ -46,7 +48,10 @@ def get_folder_id(
|
|||||||
"""
|
"""
|
||||||
Get the ID of a folder given its name and the ID of its parent folder.
|
Get the ID of a folder given its name and the ID of its parent folder.
|
||||||
"""
|
"""
|
||||||
query = f"'{parent_id}' in parents and name='{folder_name}' and mimeType='{DRIVE_FOLDER_TYPE}'"
|
query = (
|
||||||
|
f"'{parent_id}' in parents and name='{folder_name}' and "
|
||||||
|
f"mimeType='{DRIVE_FOLDER_TYPE}'"
|
||||||
|
)
|
||||||
results = (
|
results = (
|
||||||
service.files()
|
service.files()
|
||||||
.list(q=query, spaces="drive", fields="nextPageToken, files(id, name)")
|
.list(q=query, spaces="drive", fields="nextPageToken, files(id, name)")
|
||||||
@ -140,6 +145,19 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
|
|||||||
.decode("utf-8")
|
.decode("utf-8")
|
||||||
)
|
)
|
||||||
# Default download to PDF since most types can be exported as a PDF
|
# Default download to PDF since most types can be exported as a PDF
|
||||||
|
elif (
|
||||||
|
mime_type
|
||||||
|
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||||
|
):
|
||||||
|
import docx2txt
|
||||||
|
|
||||||
|
response = service.files().get_media(fileId=file["id"]).execute()
|
||||||
|
word_stream = io.BytesIO(response)
|
||||||
|
with tempfile.NamedTemporaryFile(delete=False) as temp:
|
||||||
|
temp.write(word_stream.getvalue())
|
||||||
|
temp_path = temp.name
|
||||||
|
return docx2txt.process(temp_path)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
response = service.files().get_media(fileId=file["id"]).execute()
|
response = service.files().get_media(fileId=file["id"]).execute()
|
||||||
pdf_stream = io.BytesIO(response)
|
pdf_stream = io.BytesIO(response)
|
||||||
@ -174,9 +192,10 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
|
|||||||
found_parent_id = get_folder_id(
|
found_parent_id = get_folder_id(
|
||||||
service=service, parent_id=parent_id, folder_name=folder_name
|
service=service, parent_id=parent_id, folder_name=folder_name
|
||||||
)
|
)
|
||||||
if found_parent_id is None:
|
if parent_id is None:
|
||||||
raise ValueError(f"Folder path '{path}' not found in Google Drive")
|
raise ValueError(
|
||||||
parent_id = found_parent_id
|
f"Folder '{folder_name}' in path '{path}' not found in Google Drive"
|
||||||
|
)
|
||||||
folder_ids.append(parent_id)
|
folder_ids.append(parent_id)
|
||||||
|
|
||||||
return folder_ids
|
return folder_ids
|
||||||
|
@ -3,3 +3,6 @@ plugins = sqlalchemy.ext.mypy.plugin
|
|||||||
mypy_path = $MYPY_CONFIG_FILE_DIR
|
mypy_path = $MYPY_CONFIG_FILE_DIR
|
||||||
explicit_package_bases = True
|
explicit_package_bases = True
|
||||||
disallow_untyped_defs = True
|
disallow_untyped_defs = True
|
||||||
|
|
||||||
|
[flake8]
|
||||||
|
max-line-length = 88
|
||||||
|
Loading…
x
Reference in New Issue
Block a user