diff --git a/backend/onyx/connectors/google_drive/connector.py b/backend/onyx/connectors/google_drive/connector.py index 9a8db3a992..d74cda80c4 100644 --- a/backend/onyx/connectors/google_drive/connector.py +++ b/backend/onyx/connectors/google_drive/connector.py @@ -43,7 +43,6 @@ from onyx.connectors.google_utils.google_utils import execute_paginated_retrieva from onyx.connectors.google_utils.google_utils import GoogleFields from onyx.connectors.google_utils.resources import get_admin_service from onyx.connectors.google_utils.resources import get_drive_service -from onyx.connectors.google_utils.resources import get_google_docs_service from onyx.connectors.google_utils.resources import GoogleDriveService from onyx.connectors.google_utils.shared_constants import ( DB_CREDENTIALS_PRIMARY_ADMIN_KEY, @@ -62,7 +61,6 @@ from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document from onyx.connectors.models import EntityFailure from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface -from onyx.utils.lazy import lazy_eval from onyx.utils.logger import setup_logger from onyx.utils.retry_wrapper import retry_builder from onyx.utils.threadpool_concurrency import parallel_yield @@ -86,36 +84,6 @@ def _extract_ids_from_urls(urls: list[str]) -> list[str]: return [urlparse(url).path.strip("/").split("/")[-1] for url in urls] -def _convert_single_file( - creds: Any, - allow_images: bool, - size_threshold: int, - retriever_email: str, - file: dict[str, Any], -) -> Document | ConnectorFailure | None: - # We used to always get the user email from the file owners when available, - # but this was causing issues with shared folders where the owner was not included in the service account - # now we use the email of the account that successfully listed the file. Leaving this in case we end up - # wanting to retry with file owners and/or admin email at some point. - # user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email - - user_email = retriever_email - # Only construct these services when needed - user_drive_service = lazy_eval( - lambda: get_drive_service(creds, user_email=user_email) - ) - docs_service = lazy_eval( - lambda: get_google_docs_service(creds, user_email=user_email) - ) - return convert_drive_item_to_document( - file=file, - drive_service=user_drive_service, - docs_service=docs_service, - allow_images=allow_images, - size_threshold=size_threshold, - ) - - def _clean_requested_drive_ids( requested_drive_ids: set[str], requested_folder_ids: set[str], @@ -963,7 +931,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck try: # Prepare a partial function with the credentials and admin email convert_func = partial( - _convert_single_file, + convert_drive_item_to_document, self.creds, self.allow_images, self.size_threshold, diff --git a/backend/onyx/connectors/google_drive/doc_conversion.py b/backend/onyx/connectors/google_drive/doc_conversion.py index 78cf6a51f4..246a08a5d7 100644 --- a/backend/onyx/connectors/google_drive/doc_conversion.py +++ b/backend/onyx/connectors/google_drive/doc_conversion.py @@ -1,8 +1,9 @@ import io -from collections.abc import Callable from datetime import datetime +from typing import Any from typing import cast +from googleapiclient.errors import HttpError # type: ignore from googleapiclient.http import MediaIoBaseDownload # type: ignore from onyx.configs.constants import DocumentSource @@ -13,7 +14,8 @@ from onyx.connectors.google_drive.models import GDriveMimeType from onyx.connectors.google_drive.models import GoogleDriveFileType from onyx.connectors.google_drive.section_extraction import get_document_sections from onyx.connectors.google_drive.section_extraction import HEADING_DELIMITER -from onyx.connectors.google_utils.resources import GoogleDocsService +from onyx.connectors.google_utils.resources import get_drive_service +from onyx.connectors.google_utils.resources import get_google_docs_service from onyx.connectors.google_utils.resources import GoogleDriveService from onyx.connectors.models import ConnectorFailure from onyx.connectors.models import Document @@ -39,6 +41,7 @@ logger = setup_logger() # This is not a standard valid unicode char, it is used by the docs advanced API to # represent smart chips (elements like dates and doc links). SMART_CHIP_CHAR = "\ue907" +WEB_VIEW_LINK_KEY = "webViewLink" # Mapping of Google Drive mime types to export formats GOOGLE_MIME_TYPES_TO_EXPORT = { @@ -111,7 +114,7 @@ def _download_and_extract_sections_basic( file_id = file["id"] file_name = file["name"] mime_type = file["mimeType"] - link = file.get("webViewLink", "") + link = file.get(WEB_VIEW_LINK_KEY, "") # skip images if not explicitly enabled if not allow_images and is_gdrive_image_mime_type(mime_type): @@ -294,18 +297,30 @@ def align_basic_advanced( return new_sections +# We used to always get the user email from the file owners when available, +# but this was causing issues with shared folders where the owner was not included in the service account +# now we use the email of the account that successfully listed the file. Leaving this in case we end up +# wanting to retry with file owners and/or admin email at some point. +# user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email def convert_drive_item_to_document( - file: GoogleDriveFileType, - drive_service: Callable[[], GoogleDriveService], - docs_service: Callable[[], GoogleDocsService], + creds: Any, allow_images: bool, size_threshold: int, + retriever_email: str, + file: GoogleDriveFileType, ) -> Document | ConnectorFailure | None: """ Main entry point for converting a Google Drive file => Document object. """ - doc_id = "" + doc_id = file.get(WEB_VIEW_LINK_KEY, "") sections: list[TextSection | ImageSection] = [] + # Only construct these services when needed + drive_service = lazy_eval( + lambda: get_drive_service(creds, user_email=retriever_email) + ) + docs_service = lazy_eval( + lambda: get_google_docs_service(creds, user_email=retriever_email) + ) try: # skip shortcuts or folders @@ -358,7 +373,7 @@ def convert_drive_item_to_document( logger.warning(f"No content extracted from {file.get('name')}. Skipping.") return None - doc_id = file["webViewLink"] + doc_id = file[WEB_VIEW_LINK_KEY] # Create the document return Document( @@ -376,8 +391,16 @@ def convert_drive_item_to_document( ), ) except Exception as e: - error_str = f"Error converting file '{file.get('name')}' to Document: {e}" - logger.exception(error_str) + file_name = file.get("name") + error_str = f"Error converting file '{file_name}' to Document: {e}" + if isinstance(e, HttpError) and e.status_code == 403: + logger.warning( + f"Uncommon permissions error while downloading file. User " + f"{retriever_email} was able to see file {file_name} " + "but cannot download it." + ) + logger.warning(error_str) + return ConnectorFailure( failed_document=DocumentFailure( document_id=doc_id, @@ -395,7 +418,7 @@ def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None: if file.get("mimeType") in [DRIVE_FOLDER_TYPE, DRIVE_SHORTCUT_TYPE]: return None return SlimDocument( - id=file["webViewLink"], + id=file[WEB_VIEW_LINK_KEY], perm_sync_data={ "doc_id": file.get("id"), "drive_id": file.get("driveId"),