From a4b47e0243e84fcfdc65c5df771d2ecf820f5a1a Mon Sep 17 00:00:00 2001
From: Sid Ravinutala <sid.ravi1@gmail.com>
Date: Fri, 21 Jul 2023 00:48:23 +0000
Subject: [PATCH] added support for docx in gdrive rebase from main

---
 .../connectors/google_drive/connector.py      | 29 +++++++++++++++----
 backend/setup.cfg                             |  3 ++
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py
index 15f74c7d39..fa8ae1b517 100644
--- a/backend/danswer/connectors/google_drive/connector.py
+++ b/backend/danswer/connectors/google_drive/connector.py
@@ -1,5 +1,6 @@
 import datetime
 import io
+import tempfile
 from collections.abc import Generator
 from collections.abc import Sequence
 from itertools import chain
@@ -31,8 +32,9 @@ SCOPES = [
 ]
 SUPPORTED_DRIVE_DOC_TYPES = [
     "application/vnd.google-apps.document",
-    "application/pdf",
     "application/vnd.google-apps.spreadsheet",
+    "application/pdf",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
 ]
 DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder"
 ID_KEY = "id"
@@ -46,7 +48,10 @@ def get_folder_id(
     """
     Get the ID of a folder given its name and the ID of its parent folder.
     """
-    query = f"'{parent_id}' in parents and name='{folder_name}' and mimeType='{DRIVE_FOLDER_TYPE}'"
+    query = (
+        f"'{parent_id}' in parents and name='{folder_name}' and "
+        f"mimeType='{DRIVE_FOLDER_TYPE}'"
+    )
     results = (
         service.files()
         .list(q=query, spaces="drive", fields="nextPageToken, files(id, name)")
@@ -140,6 +145,19 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
             .decode("utf-8")
         )
     # Default download to PDF since most types can be exported as a PDF
+    elif (
+        mime_type
+        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    ):
+        import docx2txt
+
+        response = service.files().get_media(fileId=file["id"]).execute()
+        word_stream = io.BytesIO(response)
+        with tempfile.NamedTemporaryFile(delete=False) as temp:
+            temp.write(word_stream.getvalue())
+            temp_path = temp.name
+        return docx2txt.process(temp_path)
+
     else:
         response = service.files().get_media(fileId=file["id"]).execute()
         pdf_stream = io.BytesIO(response)
@@ -174,9 +192,10 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
                 found_parent_id = get_folder_id(
                     service=service, parent_id=parent_id, folder_name=folder_name
                 )
-                if found_parent_id is None:
-                    raise ValueError(f"Folder path '{path}' not found in Google Drive")
-                parent_id = found_parent_id
+                if parent_id is None:
+                    raise ValueError(
+                        f"Folder '{folder_name}' in path '{path}' not found in Google Drive"
+                    )
             folder_ids.append(parent_id)
 
         return folder_ids
diff --git a/backend/setup.cfg b/backend/setup.cfg
index 7142984b8f..11e5d3a704 100644
--- a/backend/setup.cfg
+++ b/backend/setup.cfg
@@ -3,3 +3,6 @@ plugins = sqlalchemy.ext.mypy.plugin
 mypy_path = $MYPY_CONFIG_FILE_DIR
 explicit_package_bases = True
 disallow_untyped_defs = True
+
+[flake8]
+max-line-length = 88