From 81a4934bb86b22be337d8d9707aeafd9e99e49b7 Mon Sep 17 00:00:00 2001 From: Sid Ravinutala Date: Thu, 17 Aug 2023 11:54:00 -0400 Subject: [PATCH] Google drive shared files fix + shortcuts (#300) Also fixes foreign key constraint issue when manually wiping postgres + keeps track of accessed folders --- backend/danswer/configs/app_configs.py | 1 + .../connectors/google_drive/connector.py | 138 +++++++++++++----- backend/scripts/reset_postgres.py | 1 + .../google-drive/ConnectorEditPopup.tsx | 5 + .../GoogleDriveConnectorsTable.tsx | 14 ++ .../admin/connectors/google-drive/page.tsx | 10 ++ web/src/lib/types.ts | 1 + 7 files changed, 137 insertions(+), 33 deletions(-) diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py index e49c64847..f61fc9770 100644 --- a/backend/danswer/configs/app_configs.py +++ b/backend/danswer/configs/app_configs.py @@ -93,6 +93,7 @@ POSTGRES_DB = os.environ.get("POSTGRES_DB", "postgres") # Connector Configs ##### GOOGLE_DRIVE_INCLUDE_SHARED = False +GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get( "FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage" ) diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py index d7dea8c9d..dff8aa515 100644 --- a/backend/danswer/connectors/google_drive/connector.py +++ b/backend/danswer/connectors/google_drive/connector.py @@ -11,6 +11,7 @@ from google.oauth2.credentials import Credentials # type: ignore from googleapiclient import discovery # type: ignore from PyPDF2 import PdfReader +from danswer.configs.app_configs import GOOGLE_DRIVE_FOLLOW_SHORTCUTS from danswer.configs.app_configs import GOOGLE_DRIVE_INCLUDE_SHARED from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource @@ -40,9 +41,7 @@ SUPPORTED_DRIVE_DOC_TYPES = [ "application/vnd.openxmlformats-officedocument.wordprocessingml.document", ] DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder" -ID_KEY = "id" -LINK_KEY = "link" -TYPE_KEY = "type" +DRIVE_SHORTCUT_TYPE = "application/vnd.google-apps.shortcut" GoogleDriveFileType = dict[str, Any] @@ -51,6 +50,7 @@ def _run_drive_file_query( service: discovery.Resource, query: str, include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED, + follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS, batch_size: int = INDEX_BATCH_SIZE, ) -> Generator[GoogleDriveFileType, None, None]: next_page_token = "" @@ -61,7 +61,11 @@ def _run_drive_file_query( .list( pageSize=batch_size, supportsAllDrives=include_shared, - fields="nextPageToken, files(mimeType, id, name, webViewLink)", + includeItemsFromAllDrives=include_shared, + fields=( + "nextPageToken, files(mimeType, id, name, " + "webViewLink, shortcutDetails)" + ), pageToken=next_page_token, q=query, ) @@ -70,45 +74,82 @@ def _run_drive_file_query( next_page_token = results.get("nextPageToken") files = results["files"] for file in files: + if follow_shortcuts and "shortcutDetails" in file: + file = service.files().get( + fileId=file["shortcutDetails"]["targetId"], + supportsAllDrives=include_shared, + fields="mimeType, id, name, webViewLink, shortcutDetails", + ) + file = file.execute() yield file def _get_folder_id( - service: discovery.Resource, parent_id: str, folder_name: str + service: discovery.Resource, + parent_id: str, + folder_name: str, + include_shared: bool, + follow_shortcuts: bool, ) -> str | None: """ Get the ID of a folder given its name and the ID of its parent folder. """ - query = ( - f"'{parent_id}' in parents and name='{folder_name}' and " - f"mimeType='{DRIVE_FOLDER_TYPE}'" - ) + query = f"'{parent_id}' in parents and name='{folder_name}' and " + if follow_shortcuts: + query += f"(mimeType='{DRIVE_FOLDER_TYPE}' or mimeType='{DRIVE_SHORTCUT_TYPE}')" + else: + query += f"mimeType='{DRIVE_FOLDER_TYPE}'" + results = ( service.files() - .list(q=query, spaces="drive", fields="nextPageToken, files(id, name)") + .list( + q=query, + spaces="drive", + fields="nextPageToken, files(id, name, shortcutDetails)", + supportsAllDrives=include_shared, + includeItemsFromAllDrives=include_shared, + ) .execute() ) items = results.get("files", []) - return items[0]["id"] if items else None + + folder_id = None + if items: + if follow_shortcuts and "shortcutDetails" in items[0]: + folder_id = items[0]["shortcutDetails"]["targetId"] + else: + folder_id = items[0]["id"] + return folder_id def _get_folders( service: discovery.Resource, folder_id: str | None = None, # if specified, only fetches files within this folder include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED, + follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS, batch_size: int = INDEX_BATCH_SIZE, ) -> Generator[GoogleDriveFileType, None, None]: query = f"mimeType = '{DRIVE_FOLDER_TYPE}' " + if follow_shortcuts: + query = "(" + query + f" or mimeType = '{DRIVE_SHORTCUT_TYPE}'" + ") " + if folder_id: query += f"and '{folder_id}' in parents " query = query.rstrip() # remove the trailing space(s) - yield from _run_drive_file_query( + for file in _run_drive_file_query( service=service, query=query, include_shared=include_shared, + follow_shortcuts=follow_shortcuts, batch_size=batch_size, - ) + ): + # Need to check this since file may have been a target of a shortcut + # and not necessarily a folder + if file["mimeType"] == DRIVE_FOLDER_TYPE: + yield file + else: + pass def _get_files( @@ -117,6 +158,7 @@ def _get_files( time_range_end: SecondsSinceUnixEpoch | None = None, folder_id: str | None = None, # if specified, only fetches files within this folder include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED, + follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS, supported_drive_doc_types: list[str] = SUPPORTED_DRIVE_DOC_TYPES, batch_size: int = INDEX_BATCH_SIZE, ) -> Generator[GoogleDriveFileType, None, None]: @@ -137,6 +179,7 @@ def _get_files( service=service, query=query, include_shared=include_shared, + follow_shortcuts=follow_shortcuts, batch_size=batch_size, ) for file in files: @@ -147,12 +190,15 @@ def _get_files( def get_all_files_batched( service: discovery.Resource, include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED, + follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS, batch_size: int = INDEX_BATCH_SIZE, time_range_start: SecondsSinceUnixEpoch | None = None, time_range_end: SecondsSinceUnixEpoch | None = None, folder_id: str | None = None, # if specified, only fetches files within this folder - # if True, will fetch files in sub-folders of the specified folder ID. Only applies if folder_id is specified. + # if True, will fetch files in sub-folders of the specified folder ID. + # Only applies if folder_id is specified. traverse_subfolders: bool = True, + folder_ids_traversed: list[str] | None = None, ) -> Generator[list[GoogleDriveFileType], None, None]: """Gets all files matching the criteria specified by the args from Google Drive in batches of size `batch_size`. @@ -163,6 +209,7 @@ def get_all_files_batched( time_range_end=time_range_end, folder_id=folder_id, include_shared=include_shared, + follow_shortcuts=follow_shortcuts, batch_size=batch_size, ) yield from batch_generator( @@ -174,23 +221,33 @@ def get_all_files_batched( ) if traverse_subfolders: + folder_ids_traversed = folder_ids_traversed or [] subfolders = _get_folders( service=service, folder_id=folder_id, include_shared=include_shared, + follow_shortcuts=follow_shortcuts, batch_size=batch_size, ) for subfolder in subfolders: - logger.info("Fetching all files in subfolder: " + subfolder["name"]) - yield from get_all_files_batched( - service=service, - include_shared=include_shared, - batch_size=batch_size, - time_range_start=time_range_start, - time_range_end=time_range_end, - folder_id=subfolder["id"], - traverse_subfolders=traverse_subfolders, - ) + if subfolder["id"] not in folder_ids_traversed: + logger.info("Fetching all files in subfolder: " + subfolder["name"]) + folder_ids_traversed.append(subfolder["id"]) + yield from get_all_files_batched( + service=service, + include_shared=include_shared, + follow_shortcuts=follow_shortcuts, + batch_size=batch_size, + time_range_start=time_range_start, + time_range_end=time_range_end, + folder_id=subfolder["id"], + traverse_subfolders=traverse_subfolders, + folder_ids_traversed=folder_ids_traversed, + ) + else: + logger.debug( + "Skipping subfolder since already traversed: " + subfolder["name"] + ) def extract_text(file: dict[str, str], service: discovery.Resource) -> str: @@ -209,7 +266,6 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str: .execute() .decode("utf-8") ) - # Default download to PDF since most types can be exported as a PDF elif ( mime_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" @@ -220,7 +276,7 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str: temp.write(word_stream.getvalue()) temp_path = temp.name return docx2txt.process(temp_path) - + # Default download to PDF since most types can be exported as a PDF else: response = service.files().get_media(fileId=file["id"]).execute() pdf_stream = io.BytesIO(response) @@ -236,15 +292,20 @@ class GoogleDriveConnector(LoadConnector, PollConnector): folder_paths: list[str] | None = None, batch_size: int = INDEX_BATCH_SIZE, include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED, + follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS, ) -> None: self.folder_paths = folder_paths or [] self.batch_size = batch_size self.include_shared = include_shared + self.follow_shortcuts = follow_shortcuts self.creds: Credentials | None = None @staticmethod def _process_folder_paths( - service: discovery.Resource, folder_paths: list[str] + service: discovery.Resource, + folder_paths: list[str], + include_shared: bool, + follow_shortcuts: bool, ) -> list[str]: """['Folder/Sub Folder'] -> ['']""" folder_ids: list[str] = [] @@ -253,10 +314,19 @@ class GoogleDriveConnector(LoadConnector, PollConnector): parent_id = "root" for folder_name in folder_names: found_parent_id = _get_folder_id( - service=service, parent_id=parent_id, folder_name=folder_name + service=service, + parent_id=parent_id, + folder_name=folder_name, + include_shared=include_shared, + follow_shortcuts=follow_shortcuts, ) if found_parent_id is None: - raise ValueError(f"Folder path '{path}' not found in Google Drive") + raise ValueError( + ( + f"Folder '{folder_name}' in path '{path}' " + "not found in Google Drive" + ) + ) parent_id = found_parent_id folder_ids.append(parent_id) @@ -283,7 +353,7 @@ class GoogleDriveConnector(LoadConnector, PollConnector): service = discovery.build("drive", "v3", credentials=self.creds) folder_ids: Sequence[str | None] = self._process_folder_paths( - service, self.folder_paths + service, self.folder_paths, self.include_shared, self.follow_shortcuts ) if not folder_ids: folder_ids = [None] @@ -293,6 +363,7 @@ class GoogleDriveConnector(LoadConnector, PollConnector): get_all_files_batched( service=service, include_shared=self.include_shared, + follow_shortcuts=self.follow_shortcuts, batch_size=self.batch_size, time_range_start=start, time_range_end=end, @@ -326,9 +397,10 @@ class GoogleDriveConnector(LoadConnector, PollConnector): def poll_source( self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch ) -> GenerateDocumentsOutput: - # need to subtract 10 minutes from start time to account for modifiedTime propogation - # if a document is modified, it takes some time for the API to reflect these changes - # if we do not have an offset, then we may "miss" the update when polling + # need to subtract 10 minutes from start time to account for modifiedTime + # propogation if a document is modified, it takes some time for the API to + # reflect these changes if we do not have an offset, then we may "miss" the + # update when polling yield from self._fetch_docs_from_drive( max(start - DRIVE_START_TIME_OFFSET, 0, 0), end ) diff --git a/backend/scripts/reset_postgres.py b/backend/scripts/reset_postgres.py index 19f1f0ca2..60eff9185 100644 --- a/backend/scripts/reset_postgres.py +++ b/backend/scripts/reset_postgres.py @@ -34,6 +34,7 @@ def wipe_all_rows(database: str) -> None: cur.execute(f"DELETE FROM document") cur.execute(f"DELETE FROM connector_credential_pair") cur.execute(f"DELETE FROM index_attempt") + cur.execute(f"DELETE FROM credential") conn.commit() for table_name in table_names: diff --git a/web/src/app/admin/connectors/google-drive/ConnectorEditPopup.tsx b/web/src/app/admin/connectors/google-drive/ConnectorEditPopup.tsx index cfd27ce5d..b4a84b726 100644 --- a/web/src/app/admin/connectors/google-drive/ConnectorEditPopup.tsx +++ b/web/src/app/admin/connectors/google-drive/ConnectorEditPopup.tsx @@ -47,6 +47,10 @@ export const ConnectorEditPopup = ({ existingConnector, onSubmit }: Props) => { name="include_shared" label="Include Shared Files" /> + )} validationSchema={Yup.object().shape({ @@ -58,6 +62,7 @@ export const ConnectorEditPopup = ({ existingConnector, onSubmit }: Props) => { ) .required(), include_shared: Yup.boolean().required(), + follow_shortcuts: Yup.boolean().required(), })} onSubmit={onSubmit} /> diff --git a/web/src/app/admin/connectors/google-drive/GoogleDriveConnectorsTable.tsx b/web/src/app/admin/connectors/google-drive/GoogleDriveConnectorsTable.tsx index 6b365f6cc..da88e0bff 100644 --- a/web/src/app/admin/connectors/google-drive/GoogleDriveConnectorsTable.tsx +++ b/web/src/app/admin/connectors/google-drive/GoogleDriveConnectorsTable.tsx @@ -90,6 +90,10 @@ export const GoogleDriveConnectorsTable = ({ header: "Include Shared", key: "include_shared", }, + { + header: "Follow Shortcuts", + key: "follow_shortcuts", + }, { header: "Status", key: "status", @@ -132,6 +136,16 @@ export const GoogleDriveConnectorsTable = ({ )} ), + follow_shortcuts: ( +
+ {connectorIndexingStatus.connector.connector_specific_config + .follow_shortcuts ? ( + Yes + ) : ( + No + )} +
+ ), status: ( + )} validationSchema={Yup.object().shape({ @@ -292,10 +300,12 @@ const GoogleDriveConnectorManagement = ({ ) .required(), include_shared: Yup.boolean().required(), + follow_shortcuts: Yup.boolean().required(), })} initialValues={{ folder_paths: [], include_shared: false, + follow_shortcuts: false, }} refreshFreq={10 * 60} // 10 minutes onSubmit={async (isSuccess, responseJson) => { diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts index 2655e601d..82d0684c7 100644 --- a/web/src/lib/types.ts +++ b/web/src/lib/types.ts @@ -57,6 +57,7 @@ export interface GithubConfig { export interface GoogleDriveConfig { folder_paths?: string[]; include_shared?: boolean; + follow_shortcuts?: boolean; } export interface BookstackConfig {}