Added support for .docx files in the Google Drive connector

Rebased onto main
This commit is contained in:
Sid Ravinutala 2023-07-21 00:48:23 +00:00
parent d6ca865034
commit a4b47e0243
2 changed files with 27 additions and 5 deletions

View File

@ -1,5 +1,6 @@
import datetime import datetime
import io import io
import tempfile
from collections.abc import Generator from collections.abc import Generator
from collections.abc import Sequence from collections.abc import Sequence
from itertools import chain from itertools import chain
@ -31,8 +32,9 @@ SCOPES = [
] ]
SUPPORTED_DRIVE_DOC_TYPES = [ SUPPORTED_DRIVE_DOC_TYPES = [
"application/vnd.google-apps.document", "application/vnd.google-apps.document",
"application/pdf",
"application/vnd.google-apps.spreadsheet", "application/vnd.google-apps.spreadsheet",
"application/pdf",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
] ]
DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder" DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder"
ID_KEY = "id" ID_KEY = "id"
@ -46,7 +48,10 @@ def get_folder_id(
""" """
Get the ID of a folder given its name and the ID of its parent folder. Get the ID of a folder given its name and the ID of its parent folder.
""" """
query = f"'{parent_id}' in parents and name='{folder_name}' and mimeType='{DRIVE_FOLDER_TYPE}'" query = (
f"'{parent_id}' in parents and name='{folder_name}' and "
f"mimeType='{DRIVE_FOLDER_TYPE}'"
)
results = ( results = (
service.files() service.files()
.list(q=query, spaces="drive", fields="nextPageToken, files(id, name)") .list(q=query, spaces="drive", fields="nextPageToken, files(id, name)")
@ -140,6 +145,19 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
.decode("utf-8") .decode("utf-8")
) )
# Default download to PDF since most types can be exported as a PDF # Default download to PDF since most types can be exported as a PDF
elif (
mime_type
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
import docx2txt
response = service.files().get_media(fileId=file["id"]).execute()
word_stream = io.BytesIO(response)
with tempfile.NamedTemporaryFile(delete=False) as temp:
temp.write(word_stream.getvalue())
temp_path = temp.name
return docx2txt.process(temp_path)
else: else:
response = service.files().get_media(fileId=file["id"]).execute() response = service.files().get_media(fileId=file["id"]).execute()
pdf_stream = io.BytesIO(response) pdf_stream = io.BytesIO(response)
@ -174,9 +192,10 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
found_parent_id = get_folder_id( found_parent_id = get_folder_id(
service=service, parent_id=parent_id, folder_name=folder_name service=service, parent_id=parent_id, folder_name=folder_name
) )
if found_parent_id is None: if parent_id is None:
raise ValueError(f"Folder path '{path}' not found in Google Drive") raise ValueError(
parent_id = found_parent_id f"Folder '{folder_name}' in path '{path}' not found in Google Drive"
)
folder_ids.append(parent_id) folder_ids.append(parent_id)
return folder_ids return folder_ids

View File

@ -3,3 +3,6 @@ plugins = sqlalchemy.ext.mypy.plugin
mypy_path = $MYPY_CONFIG_FILE_DIR mypy_path = $MYPY_CONFIG_FILE_DIR
explicit_package_bases = True explicit_package_bases = True
disallow_untyped_defs = True disallow_untyped_defs = True
[flake8]
max-line-length = 88