From a4b47e0243e84fcfdc65c5df771d2ecf820f5a1a Mon Sep 17 00:00:00 2001 From: Sid Ravinutala Date: Fri, 21 Jul 2023 00:48:23 +0000 Subject: [PATCH] added support for docx in gdrive rebase from main --- .../connectors/google_drive/connector.py | 29 +++++++++++++++---- backend/setup.cfg | 3 ++ 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py index 15f74c7d39..fa8ae1b517 100644 --- a/backend/danswer/connectors/google_drive/connector.py +++ b/backend/danswer/connectors/google_drive/connector.py @@ -1,5 +1,6 @@ import datetime import io +import tempfile from collections.abc import Generator from collections.abc import Sequence from itertools import chain @@ -31,8 +32,9 @@ SCOPES = [ ] SUPPORTED_DRIVE_DOC_TYPES = [ "application/vnd.google-apps.document", - "application/pdf", "application/vnd.google-apps.spreadsheet", + "application/pdf", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", ] DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder" ID_KEY = "id" @@ -46,7 +48,10 @@ def get_folder_id( """ Get the ID of a folder given its name and the ID of its parent folder. """ - query = f"'{parent_id}' in parents and name='{folder_name}' and mimeType='{DRIVE_FOLDER_TYPE}'" + query = ( + f"'{parent_id}' in parents and name='{folder_name}' and " + f"mimeType='{DRIVE_FOLDER_TYPE}'" + ) results = ( service.files() .list(q=query, spaces="drive", fields="nextPageToken, files(id, name)") @@ -140,6 +145,19 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str: .decode("utf-8") ) # Default download to PDF since most types can be exported as a PDF + elif ( + mime_type + == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ): + import docx2txt + + response = service.files().get_media(fileId=file["id"]).execute() + word_stream = io.BytesIO(response) + with tempfile.NamedTemporaryFile(delete=False) as temp: + temp.write(word_stream.getvalue()) + temp_path = temp.name + return docx2txt.process(temp_path) + else: response = service.files().get_media(fileId=file["id"]).execute() pdf_stream = io.BytesIO(response) @@ -174,9 +192,10 @@ class GoogleDriveConnector(LoadConnector, PollConnector): found_parent_id = get_folder_id( service=service, parent_id=parent_id, folder_name=folder_name ) - if found_parent_id is None: - raise ValueError(f"Folder path '{path}' not found in Google Drive") - parent_id = found_parent_id + if parent_id is None: + raise ValueError( + f"Folder '{folder_name}' in path '{path}' not found in Google Drive" + ) folder_ids.append(parent_id) return folder_ids diff --git a/backend/setup.cfg b/backend/setup.cfg index 7142984b8f..11e5d3a704 100644 --- a/backend/setup.cfg +++ b/backend/setup.cfg @@ -3,3 +3,6 @@ plugins = sqlalchemy.ext.mypy.plugin mypy_path = $MYPY_CONFIG_FILE_DIR explicit_package_bases = True disallow_untyped_defs = True + +[flake8] +max-line-length = 88