mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-26 16:01:09 +02:00
added support for docx in gdrive
rebase from main
This commit is contained in:
parent
d6ca865034
commit
a4b47e0243
@ -1,5 +1,6 @@
|
||||
import datetime
|
||||
import io
|
||||
import tempfile
|
||||
from collections.abc import Generator
|
||||
from collections.abc import Sequence
|
||||
from itertools import chain
|
||||
@ -31,8 +32,9 @@ SCOPES = [
|
||||
]
|
||||
SUPPORTED_DRIVE_DOC_TYPES = [
|
||||
"application/vnd.google-apps.document",
|
||||
"application/pdf",
|
||||
"application/vnd.google-apps.spreadsheet",
|
||||
"application/pdf",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
]
|
||||
DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder"
|
||||
ID_KEY = "id"
|
||||
@ -46,7 +48,10 @@ def get_folder_id(
|
||||
"""
|
||||
Get the ID of a folder given its name and the ID of its parent folder.
|
||||
"""
|
||||
query = f"'{parent_id}' in parents and name='{folder_name}' and mimeType='{DRIVE_FOLDER_TYPE}'"
|
||||
query = (
|
||||
f"'{parent_id}' in parents and name='{folder_name}' and "
|
||||
f"mimeType='{DRIVE_FOLDER_TYPE}'"
|
||||
)
|
||||
results = (
|
||||
service.files()
|
||||
.list(q=query, spaces="drive", fields="nextPageToken, files(id, name)")
|
||||
@ -140,6 +145,19 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
|
||||
.decode("utf-8")
|
||||
)
|
||||
# Default download to PDF since most types can be exported as a PDF
|
||||
elif (
|
||||
mime_type
|
||||
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
):
|
||||
import docx2txt
|
||||
|
||||
response = service.files().get_media(fileId=file["id"]).execute()
|
||||
word_stream = io.BytesIO(response)
|
||||
with tempfile.NamedTemporaryFile(delete=False) as temp:
|
||||
temp.write(word_stream.getvalue())
|
||||
temp_path = temp.name
|
||||
return docx2txt.process(temp_path)
|
||||
|
||||
else:
|
||||
response = service.files().get_media(fileId=file["id"]).execute()
|
||||
pdf_stream = io.BytesIO(response)
|
||||
@ -174,9 +192,10 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
|
||||
found_parent_id = get_folder_id(
|
||||
service=service, parent_id=parent_id, folder_name=folder_name
|
||||
)
|
||||
if found_parent_id is None:
|
||||
raise ValueError(f"Folder path '{path}' not found in Google Drive")
|
||||
parent_id = found_parent_id
|
||||
if parent_id is None:
|
||||
raise ValueError(
|
||||
f"Folder '{folder_name}' in path '{path}' not found in Google Drive"
|
||||
)
|
||||
folder_ids.append(parent_id)
|
||||
|
||||
return folder_ids
|
||||
|
@ -3,3 +3,6 @@ plugins = sqlalchemy.ext.mypy.plugin
|
||||
mypy_path = $MYPY_CONFIG_FILE_DIR
|
||||
explicit_package_bases = True
|
||||
disallow_untyped_defs = True
|
||||
|
||||
[flake8]
|
||||
max-line-length = 88
|
||||
|
Loading…
x
Reference in New Issue
Block a user