drive perm sync logs + misc deployment improvements (#4788)

* some logs * give postgres more memory * give postgres more memory * give postgres more memory * revert * give postgres more memory * bump external access limit * vespa timeout * deployment consistency * bump vespa version * skip upgrade check * retry permission by ids * logs * fix temp docx file issue * fix drive file deduping * RK comments * mypy * aggregate logs
2025-07-12 14:12:53 +02:00 · 2025-06-01 19:36:57 -04:00
parent b19e3a500b
commit 4c71a5f5ff
15 changed files with 44 additions and 10 deletions
--- a/backend/ee/onyx/external_permissions/google_drive/doc_sync.py
+++ b/backend/ee/onyx/external_permissions/google_drive/doc_sync.py
@ -200,7 +200,9 @@ def gdrive_doc_sync(

    slim_doc_generator = _get_slim_doc_generator(cc_pair, google_drive_connector)

+    total_processed = 0
    for slim_doc_batch in slim_doc_generator:
+        logger.info(f"Drive perm sync: Processing {len(slim_doc_batch)} documents")
        for slim_doc in slim_doc_batch:
            if callback:
                if callback.should_stop():
@ -216,3 +218,5 @@ def gdrive_doc_sync(
                external_access=ext_access,
                doc_id=slim_doc.id,
            )
+        total_processed += len(slim_doc_batch)
+        logger.info(f"Drive perm sync: Processed {total_processed} total documents")
--- a/backend/ee/onyx/external_permissions/google_drive/permission_retrieval.py
+++ b/backend/ee/onyx/external_permissions/google_drive/permission_retrieval.py
@ -1,3 +1,5 @@
+from retry import retry
+
 from ee.onyx.external_permissions.google_drive.models import GoogleDrivePermission
 from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
 from onyx.connectors.google_utils.resources import RefreshableDriveObject
@ -6,6 +8,7 @@ from onyx.utils.logger import setup_logger
 logger = setup_logger()


+@retry(tries=3, delay=2, backoff=2)
 def get_permissions_by_ids(
    drive_service: RefreshableDriveObject,
    doc_id: str,
--- a/backend/onyx/access/models.py
+++ b/backend/onyx/access/models.py
@ -11,7 +11,7 @@ class ExternalAccess:

    # arbitrary limit to prevent excessively large permissions sets
    # not internally enforced ... the caller can check this before using the instance
-    MAX_NUM_ENTRIES = 1000
+    MAX_NUM_ENTRIES = 5000

    # Emails of external users with access to the doc externally
    external_user_emails: set[str]
--- a/backend/onyx/connectors/google_drive/connector.py
+++ b/backend/onyx/connectors/google_drive/connector.py
@ -27,6 +27,7 @@ from onyx.connectors.google_drive.doc_conversion import build_slim_document
 from onyx.connectors.google_drive.doc_conversion import (
    convert_drive_item_to_document,
 )
+from onyx.connectors.google_drive.doc_conversion import onyx_document_id_from_drive_file
 from onyx.connectors.google_drive.file_retrieval import crawl_folders_for_files
 from onyx.connectors.google_drive.file_retrieval import get_all_files_for_oauth
 from onyx.connectors.google_drive.file_retrieval import (
@ -922,8 +923,9 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
                ).timestamp(),
                current_folder_or_drive_id=file.parent_id,
            )
-            if file.drive_file["id"] not in checkpoint.all_retrieved_file_ids:
-                checkpoint.all_retrieved_file_ids.add(file.drive_file["id"])
+            document_id = onyx_document_id_from_drive_file(file.drive_file)
+            if document_id not in checkpoint.all_retrieved_file_ids:
+                checkpoint.all_retrieved_file_ids.add(document_id)
                yield file

    def _manage_oauth_retrieval(
@ -1138,6 +1140,10 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
                raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e
            raise e
        checkpoint.retrieved_folder_and_drive_ids = self._retrieved_folder_and_drive_ids
+
+        logger.info(
+            f"num drive files retrieved: {len(checkpoint.all_retrieved_file_ids)}"
+        )
        if checkpoint.completion_stage == DriveRetrievalStage.DONE:
            checkpoint.has_more = False
        return checkpoint
@ -1186,6 +1192,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
                    end=end,
                    callback=callback,
                )
+            logger.info("Drive perm sync: Slim doc retrieval complete")

        except Exception as e:
            if MISSING_SCOPES_ERROR_STR in str(e):
--- a/backend/onyx/connectors/google_drive/doc_conversion.py
+++ b/backend/onyx/connectors/google_drive/doc_conversion.py
@ -62,6 +62,10 @@ GOOGLE_MIME_TYPES = {
 }


+def onyx_document_id_from_drive_file(file: GoogleDriveFileType) -> str:
+    return file[WEB_VIEW_LINK_KEY]
+
+
 def _summarize_drive_image(
    image_data: bytes, image_name: str, image_analysis_llm: LLM | None
 ) -> str:
@ -380,7 +384,6 @@ def _convert_drive_item_to_document(
    """
    Main entry point for converting a Google Drive file => Document object.
    """
-    doc_id = file.get(WEB_VIEW_LINK_KEY, "")
    sections: list[TextSection | ImageSection] = []
    # Only construct these services when needed
    drive_service = lazy_eval(
@ -389,6 +392,7 @@ def _convert_drive_item_to_document(
    docs_service = lazy_eval(
        lambda: get_google_docs_service(creds, user_email=retriever_email)
    )
+    doc_id = "unknown"

    try:
        # skip shortcuts or folders
@ -441,7 +445,7 @@ def _convert_drive_item_to_document(
            logger.warning(f"No content extracted from {file.get('name')}. Skipping.")
            return None

-        doc_id = file[WEB_VIEW_LINK_KEY]
+        doc_id = onyx_document_id_from_drive_file(file)

        # Create the document
        return Document(
@ -488,7 +492,7 @@ def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None:
    if file.get("mimeType") in [DRIVE_FOLDER_TYPE, DRIVE_SHORTCUT_TYPE]:
        return None
    return SlimDocument(
-        id=file[WEB_VIEW_LINK_KEY],
+        id=onyx_document_id_from_drive_file(file),
        perm_sync_data={
            "doc_id": file.get("id"),
            "drive_id": file.get("driveId"),
--- a/backend/onyx/document_index/vespa_constants.py
+++ b/backend/onyx/document_index/vespa_constants.py
@ -36,7 +36,7 @@ MAX_OR_CONDITIONS = 10
 # up from 500ms for now, since we've seen quite a few timeouts
 # in the long term, we are looking to improve the performance of Vespa
 # so that we can bring this back to default
-VESPA_TIMEOUT = "3s"
+VESPA_TIMEOUT = "10s"
 BATCH_SIZE = 128  # Specific to Vespa

 TENANT_ID = "tenant_id"
--- a/backend/onyx/file_processing/extract_file_text.py
+++ b/backend/onyx/file_processing/extract_file_text.py
@ -301,7 +301,7 @@ def read_pdf_file(


 def docx_to_text_and_images(
-    file: IO[Any],
+    file: IO[Any], file_name: str = ""
 ) -> tuple[str, Sequence[tuple[bytes, str]]]:
    """
    Extract text from a docx. If embed_images=True, also extract inline images.
@ -310,7 +310,11 @@ def docx_to_text_and_images(
    paragraphs = []
    embedded_images: list[tuple[bytes, str]] = []

-    doc = docx.Document(file)
+    try:
+        doc = docx.Document(file)
+    except BadZipFile as e:
+        logger.warning(f"Failed to extract text from {file_name or 'docx file'}: {e}")
+        return "", []

    # Grab text from paragraphs
    for paragraph in doc.paragraphs:
--- a/deployment/docker_compose/docker-compose.dev.yml
+++ b/deployment/docker_compose/docker-compose.dev.yml
@ -378,6 +378,7 @@ services:

  relational_db:
    image: postgres:15.2-alpine
+    shm_size: 1g
    command: -c 'max_connections=250'
    restart: always
    environment:
--- a/deployment/docker_compose/docker-compose.gpu-dev.yml
+++ b/deployment/docker_compose/docker-compose.gpu-dev.yml
@ -324,6 +324,7 @@ services:

  relational_db:
    image: postgres:15.2-alpine
+    shm_size: 1g
    command: -c 'max_connections=250'
    restart: always
    environment:
--- a/deployment/docker_compose/docker-compose.multitenant-dev.yml
+++ b/deployment/docker_compose/docker-compose.multitenant-dev.yml
@ -351,6 +351,7 @@ services:

  relational_db:
    image: postgres:15.2-alpine
+    shm_size: 1g
    command: -c 'max_connections=250'
    restart: always
    environment:
--- a/deployment/docker_compose/docker-compose.prod-cloud.yml
+++ b/deployment/docker_compose/docker-compose.prod-cloud.yml
@ -88,6 +88,7 @@ services:

  relational_db:
    image: postgres:15.2-alpine
+    shm_size: 1g
    command: -c 'max_connections=250'
    restart: always
    # POSTGRES_USER and POSTGRES_PASSWORD should be set in .env file
--- a/deployment/docker_compose/docker-compose.prod-no-letsencrypt.yml
+++ b/deployment/docker_compose/docker-compose.prod-no-letsencrypt.yml
@ -166,6 +166,7 @@ services:

  relational_db:
    image: postgres:15.2-alpine
+    shm_size: 1g
    command: -c 'max_connections=250'
    restart: always
    # POSTGRES_USER and POSTGRES_PASSWORD should be set in .env file
--- a/deployment/docker_compose/docker-compose.prod.yml
+++ b/deployment/docker_compose/docker-compose.prod.yml
@ -120,6 +120,7 @@ services:

  relational_db:
    image: postgres:15.2-alpine
+    shm_size: 1g
    command: -c 'max_connections=250'
    restart: always
    # POSTGRES_USER and POSTGRES_PASSWORD should be set in .env file
@ -193,8 +194,10 @@ services:

  # This container name cannot have an underscore in it due to Vespa expectations of the URL
  index:
-    image: vespaengine/vespa:8.277.17
+    image: vespaengine/vespa:8.524.25
    restart: always
+    environment:
+      - VESPA_SKIP_UPGRADE_CHECK=true
    ports:
      - "19071:19071"
      - "8081:8081"
--- a/deployment/docker_compose/docker-compose.search-testing.yml
+++ b/deployment/docker_compose/docker-compose.search-testing.yml
@ -148,6 +148,7 @@ services:

  relational_db:
    image: postgres:15.2-alpine
+    shm_size: 1g
    command: -c 'max_connections=250'
    restart: always
    environment:
--- a/deployment/helm/charts/onyx/values.yaml
+++ b/deployment/helm/charts/onyx/values.yaml
@ -7,6 +7,9 @@ postgresql:
    persistence:
      storageClass: ""
      size: 5Gi
+    shmVolume:
+      enabled: true
+      sizeLimit: 1Gi
  enabled: true
  auth:
    existingSecret: onyx-secrets