mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-05 20:49:48 +02:00
extra logging for uncommon permissions cases (#4532)
* extra logging for uncommon permissions cases * address CW comments
This commit is contained in:
parent
ae9f8c3071
commit
a8cba7abae
@ -43,7 +43,6 @@ from onyx.connectors.google_utils.google_utils import execute_paginated_retrieva
|
||||
from onyx.connectors.google_utils.google_utils import GoogleFields
|
||||
from onyx.connectors.google_utils.resources import get_admin_service
|
||||
from onyx.connectors.google_utils.resources import get_drive_service
|
||||
from onyx.connectors.google_utils.resources import get_google_docs_service
|
||||
from onyx.connectors.google_utils.resources import GoogleDriveService
|
||||
from onyx.connectors.google_utils.shared_constants import (
|
||||
DB_CREDENTIALS_PRIMARY_ADMIN_KEY,
|
||||
@ -62,7 +61,6 @@ from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import EntityFailure
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.utils.lazy import lazy_eval
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.retry_wrapper import retry_builder
|
||||
from onyx.utils.threadpool_concurrency import parallel_yield
|
||||
@ -86,36 +84,6 @@ def _extract_ids_from_urls(urls: list[str]) -> list[str]:
|
||||
return [urlparse(url).path.strip("/").split("/")[-1] for url in urls]
|
||||
|
||||
|
||||
def _convert_single_file(
|
||||
creds: Any,
|
||||
allow_images: bool,
|
||||
size_threshold: int,
|
||||
retriever_email: str,
|
||||
file: dict[str, Any],
|
||||
) -> Document | ConnectorFailure | None:
|
||||
# We used to always get the user email from the file owners when available,
|
||||
# but this was causing issues with shared folders where the owner was not included in the service account
|
||||
# now we use the email of the account that successfully listed the file. Leaving this in case we end up
|
||||
# wanting to retry with file owners and/or admin email at some point.
|
||||
# user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email
|
||||
|
||||
user_email = retriever_email
|
||||
# Only construct these services when needed
|
||||
user_drive_service = lazy_eval(
|
||||
lambda: get_drive_service(creds, user_email=user_email)
|
||||
)
|
||||
docs_service = lazy_eval(
|
||||
lambda: get_google_docs_service(creds, user_email=user_email)
|
||||
)
|
||||
return convert_drive_item_to_document(
|
||||
file=file,
|
||||
drive_service=user_drive_service,
|
||||
docs_service=docs_service,
|
||||
allow_images=allow_images,
|
||||
size_threshold=size_threshold,
|
||||
)
|
||||
|
||||
|
||||
def _clean_requested_drive_ids(
|
||||
requested_drive_ids: set[str],
|
||||
requested_folder_ids: set[str],
|
||||
@ -963,7 +931,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
|
||||
try:
|
||||
# Prepare a partial function with the credentials and admin email
|
||||
convert_func = partial(
|
||||
_convert_single_file,
|
||||
convert_drive_item_to_document,
|
||||
self.creds,
|
||||
self.allow_images,
|
||||
self.size_threshold,
|
||||
|
@ -1,8 +1,9 @@
|
||||
import io
|
||||
from collections.abc import Callable
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
|
||||
from googleapiclient.errors import HttpError # type: ignore
|
||||
from googleapiclient.http import MediaIoBaseDownload # type: ignore
|
||||
|
||||
from onyx.configs.constants import DocumentSource
|
||||
@ -13,7 +14,8 @@ from onyx.connectors.google_drive.models import GDriveMimeType
|
||||
from onyx.connectors.google_drive.models import GoogleDriveFileType
|
||||
from onyx.connectors.google_drive.section_extraction import get_document_sections
|
||||
from onyx.connectors.google_drive.section_extraction import HEADING_DELIMITER
|
||||
from onyx.connectors.google_utils.resources import GoogleDocsService
|
||||
from onyx.connectors.google_utils.resources import get_drive_service
|
||||
from onyx.connectors.google_utils.resources import get_google_docs_service
|
||||
from onyx.connectors.google_utils.resources import GoogleDriveService
|
||||
from onyx.connectors.models import ConnectorFailure
|
||||
from onyx.connectors.models import Document
|
||||
@ -39,6 +41,7 @@ logger = setup_logger()
|
||||
# This is not a standard valid unicode char, it is used by the docs advanced API to
|
||||
# represent smart chips (elements like dates and doc links).
|
||||
SMART_CHIP_CHAR = "\ue907"
|
||||
WEB_VIEW_LINK_KEY = "webViewLink"
|
||||
|
||||
# Mapping of Google Drive mime types to export formats
|
||||
GOOGLE_MIME_TYPES_TO_EXPORT = {
|
||||
@ -111,7 +114,7 @@ def _download_and_extract_sections_basic(
|
||||
file_id = file["id"]
|
||||
file_name = file["name"]
|
||||
mime_type = file["mimeType"]
|
||||
link = file.get("webViewLink", "")
|
||||
link = file.get(WEB_VIEW_LINK_KEY, "")
|
||||
|
||||
# skip images if not explicitly enabled
|
||||
if not allow_images and is_gdrive_image_mime_type(mime_type):
|
||||
@ -294,18 +297,30 @@ def align_basic_advanced(
|
||||
return new_sections
|
||||
|
||||
|
||||
# We used to always get the user email from the file owners when available,
|
||||
# but this was causing issues with shared folders where the owner was not included in the service account
|
||||
# now we use the email of the account that successfully listed the file. Leaving this in case we end up
|
||||
# wanting to retry with file owners and/or admin email at some point.
|
||||
# user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email
|
||||
def convert_drive_item_to_document(
|
||||
file: GoogleDriveFileType,
|
||||
drive_service: Callable[[], GoogleDriveService],
|
||||
docs_service: Callable[[], GoogleDocsService],
|
||||
creds: Any,
|
||||
allow_images: bool,
|
||||
size_threshold: int,
|
||||
retriever_email: str,
|
||||
file: GoogleDriveFileType,
|
||||
) -> Document | ConnectorFailure | None:
|
||||
"""
|
||||
Main entry point for converting a Google Drive file => Document object.
|
||||
"""
|
||||
doc_id = ""
|
||||
doc_id = file.get(WEB_VIEW_LINK_KEY, "")
|
||||
sections: list[TextSection | ImageSection] = []
|
||||
# Only construct these services when needed
|
||||
drive_service = lazy_eval(
|
||||
lambda: get_drive_service(creds, user_email=retriever_email)
|
||||
)
|
||||
docs_service = lazy_eval(
|
||||
lambda: get_google_docs_service(creds, user_email=retriever_email)
|
||||
)
|
||||
|
||||
try:
|
||||
# skip shortcuts or folders
|
||||
@ -358,7 +373,7 @@ def convert_drive_item_to_document(
|
||||
logger.warning(f"No content extracted from {file.get('name')}. Skipping.")
|
||||
return None
|
||||
|
||||
doc_id = file["webViewLink"]
|
||||
doc_id = file[WEB_VIEW_LINK_KEY]
|
||||
|
||||
# Create the document
|
||||
return Document(
|
||||
@ -376,8 +391,16 @@ def convert_drive_item_to_document(
|
||||
),
|
||||
)
|
||||
except Exception as e:
|
||||
error_str = f"Error converting file '{file.get('name')}' to Document: {e}"
|
||||
logger.exception(error_str)
|
||||
file_name = file.get("name")
|
||||
error_str = f"Error converting file '{file_name}' to Document: {e}"
|
||||
if isinstance(e, HttpError) and e.status_code == 403:
|
||||
logger.warning(
|
||||
f"Uncommon permissions error while downloading file. User "
|
||||
f"{retriever_email} was able to see file {file_name} "
|
||||
"but cannot download it."
|
||||
)
|
||||
logger.warning(error_str)
|
||||
|
||||
return ConnectorFailure(
|
||||
failed_document=DocumentFailure(
|
||||
document_id=doc_id,
|
||||
@ -395,7 +418,7 @@ def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None:
|
||||
if file.get("mimeType") in [DRIVE_FOLDER_TYPE, DRIVE_SHORTCUT_TYPE]:
|
||||
return None
|
||||
return SlimDocument(
|
||||
id=file["webViewLink"],
|
||||
id=file[WEB_VIEW_LINK_KEY],
|
||||
perm_sync_data={
|
||||
"doc_id": file.get("id"),
|
||||
"drive_id": file.get("driveId"),
|
||||
|
Loading…
x
Reference in New Issue
Block a user