extra logging for uncommon permissions cases (#4532)

* extra logging for uncommon permissions cases

* address CW comments
This commit is contained in:
evan-danswer 2025-04-15 11:56:17 -07:00 committed by GitHub
parent ae9f8c3071
commit a8cba7abae
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 35 additions and 44 deletions

View File

@ -43,7 +43,6 @@ from onyx.connectors.google_utils.google_utils import execute_paginated_retrieva
from onyx.connectors.google_utils.google_utils import GoogleFields
from onyx.connectors.google_utils.resources import get_admin_service
from onyx.connectors.google_utils.resources import get_drive_service
from onyx.connectors.google_utils.resources import get_google_docs_service
from onyx.connectors.google_utils.resources import GoogleDriveService
from onyx.connectors.google_utils.shared_constants import (
DB_CREDENTIALS_PRIMARY_ADMIN_KEY,
@ -62,7 +61,6 @@ from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import EntityFailure
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.lazy import lazy_eval
from onyx.utils.logger import setup_logger
from onyx.utils.retry_wrapper import retry_builder
from onyx.utils.threadpool_concurrency import parallel_yield
@ -86,36 +84,6 @@ def _extract_ids_from_urls(urls: list[str]) -> list[str]:
return [urlparse(url).path.strip("/").split("/")[-1] for url in urls]
def _convert_single_file(
creds: Any,
allow_images: bool,
size_threshold: int,
retriever_email: str,
file: dict[str, Any],
) -> Document | ConnectorFailure | None:
# We used to always get the user email from the file owners when available,
# but this was causing issues with shared folders where the owner was not included in the service account
# now we use the email of the account that successfully listed the file. Leaving this in case we end up
# wanting to retry with file owners and/or admin email at some point.
# user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email
user_email = retriever_email
# Only construct these services when needed
user_drive_service = lazy_eval(
lambda: get_drive_service(creds, user_email=user_email)
)
docs_service = lazy_eval(
lambda: get_google_docs_service(creds, user_email=user_email)
)
return convert_drive_item_to_document(
file=file,
drive_service=user_drive_service,
docs_service=docs_service,
allow_images=allow_images,
size_threshold=size_threshold,
)
def _clean_requested_drive_ids(
requested_drive_ids: set[str],
requested_folder_ids: set[str],
@ -963,7 +931,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
try:
# Prepare a partial function with the credentials and admin email
convert_func = partial(
_convert_single_file,
convert_drive_item_to_document,
self.creds,
self.allow_images,
self.size_threshold,

View File

@ -1,8 +1,9 @@
import io
from collections.abc import Callable
from datetime import datetime
from typing import Any
from typing import cast
from googleapiclient.errors import HttpError # type: ignore
from googleapiclient.http import MediaIoBaseDownload # type: ignore
from onyx.configs.constants import DocumentSource
@ -13,7 +14,8 @@ from onyx.connectors.google_drive.models import GDriveMimeType
from onyx.connectors.google_drive.models import GoogleDriveFileType
from onyx.connectors.google_drive.section_extraction import get_document_sections
from onyx.connectors.google_drive.section_extraction import HEADING_DELIMITER
from onyx.connectors.google_utils.resources import GoogleDocsService
from onyx.connectors.google_utils.resources import get_drive_service
from onyx.connectors.google_utils.resources import get_google_docs_service
from onyx.connectors.google_utils.resources import GoogleDriveService
from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import Document
@ -39,6 +41,7 @@ logger = setup_logger()
# This is not a standard valid unicode char, it is used by the docs advanced API to
# represent smart chips (elements like dates and doc links).
SMART_CHIP_CHAR = "\ue907"
WEB_VIEW_LINK_KEY = "webViewLink"
# Mapping of Google Drive mime types to export formats
GOOGLE_MIME_TYPES_TO_EXPORT = {
@ -111,7 +114,7 @@ def _download_and_extract_sections_basic(
file_id = file["id"]
file_name = file["name"]
mime_type = file["mimeType"]
link = file.get("webViewLink", "")
link = file.get(WEB_VIEW_LINK_KEY, "")
# skip images if not explicitly enabled
if not allow_images and is_gdrive_image_mime_type(mime_type):
@ -294,18 +297,30 @@ def align_basic_advanced(
return new_sections
# We used to always get the user email from the file owners when available,
# but this was causing issues with shared folders where the owner was not included in the service account
# now we use the email of the account that successfully listed the file. Leaving this in case we end up
# wanting to retry with file owners and/or admin email at some point.
# user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email
def convert_drive_item_to_document(
file: GoogleDriveFileType,
drive_service: Callable[[], GoogleDriveService],
docs_service: Callable[[], GoogleDocsService],
creds: Any,
allow_images: bool,
size_threshold: int,
retriever_email: str,
file: GoogleDriveFileType,
) -> Document | ConnectorFailure | None:
"""
Main entry point for converting a Google Drive file => Document object.
"""
doc_id = ""
doc_id = file.get(WEB_VIEW_LINK_KEY, "")
sections: list[TextSection | ImageSection] = []
# Only construct these services when needed
drive_service = lazy_eval(
lambda: get_drive_service(creds, user_email=retriever_email)
)
docs_service = lazy_eval(
lambda: get_google_docs_service(creds, user_email=retriever_email)
)
try:
# skip shortcuts or folders
@ -358,7 +373,7 @@ def convert_drive_item_to_document(
logger.warning(f"No content extracted from {file.get('name')}. Skipping.")
return None
doc_id = file["webViewLink"]
doc_id = file[WEB_VIEW_LINK_KEY]
# Create the document
return Document(
@ -376,8 +391,16 @@ def convert_drive_item_to_document(
),
)
except Exception as e:
error_str = f"Error converting file '{file.get('name')}' to Document: {e}"
logger.exception(error_str)
file_name = file.get("name")
error_str = f"Error converting file '{file_name}' to Document: {e}"
if isinstance(e, HttpError) and e.status_code == 403:
logger.warning(
f"Uncommon permissions error while downloading file. User "
f"{retriever_email} was able to see file {file_name} "
"but cannot download it."
)
logger.warning(error_str)
return ConnectorFailure(
failed_document=DocumentFailure(
document_id=doc_id,
@ -395,7 +418,7 @@ def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None:
if file.get("mimeType") in [DRIVE_FOLDER_TYPE, DRIVE_SHORTCUT_TYPE]:
return None
return SlimDocument(
id=file["webViewLink"],
id=file[WEB_VIEW_LINK_KEY],
perm_sync_data={
"doc_id": file.get("id"),
"drive_id": file.get("driveId"),