From 4c71a5f5ff013ed87bf3422b2b8a87b46f024a0a Mon Sep 17 00:00:00 2001 From: Evan Lohn Date: Sun, 1 Jun 2025 19:36:57 -0400 Subject: [PATCH] drive perm sync logs + misc deployment improvements (#4788) * some logs * give postgres more memory * give postgres more memory * give postgres more memory * revert * give postgres more memory * bump external access limit * vespa timeout * deployment consistency * bump vespa version * skip upgrade check * retry permission by ids * logs * fix temp docx file issue * fix drive file deduping * RK comments * mypy * aggregate logs --- .../external_permissions/google_drive/doc_sync.py | 4 ++++ .../google_drive/permission_retrieval.py | 3 +++ backend/onyx/access/models.py | 2 +- backend/onyx/connectors/google_drive/connector.py | 11 +++++++++-- .../onyx/connectors/google_drive/doc_conversion.py | 10 +++++++--- backend/onyx/document_index/vespa_constants.py | 2 +- backend/onyx/file_processing/extract_file_text.py | 8 ++++++-- deployment/docker_compose/docker-compose.dev.yml | 1 + deployment/docker_compose/docker-compose.gpu-dev.yml | 1 + .../docker_compose/docker-compose.multitenant-dev.yml | 1 + .../docker_compose/docker-compose.prod-cloud.yml | 1 + .../docker-compose.prod-no-letsencrypt.yml | 1 + deployment/docker_compose/docker-compose.prod.yml | 5 ++++- .../docker_compose/docker-compose.search-testing.yml | 1 + deployment/helm/charts/onyx/values.yaml | 3 +++ 15 files changed, 44 insertions(+), 10 deletions(-) diff --git a/backend/ee/onyx/external_permissions/google_drive/doc_sync.py b/backend/ee/onyx/external_permissions/google_drive/doc_sync.py index 34a309c32b7..707986bd97d 100644 --- a/backend/ee/onyx/external_permissions/google_drive/doc_sync.py +++ b/backend/ee/onyx/external_permissions/google_drive/doc_sync.py @@ -200,7 +200,9 @@ def gdrive_doc_sync( slim_doc_generator = _get_slim_doc_generator(cc_pair, google_drive_connector) + total_processed = 0 for slim_doc_batch in slim_doc_generator: + logger.info(f"Drive perm sync: 
Processing {len(slim_doc_batch)} documents") for slim_doc in slim_doc_batch: if callback: if callback.should_stop(): @@ -216,3 +218,5 @@ def gdrive_doc_sync( external_access=ext_access, doc_id=slim_doc.id, ) + total_processed += len(slim_doc_batch) + logger.info(f"Drive perm sync: Processed {total_processed} total documents") diff --git a/backend/ee/onyx/external_permissions/google_drive/permission_retrieval.py b/backend/ee/onyx/external_permissions/google_drive/permission_retrieval.py index 7d3057d75bd..f33006ceee7 100644 --- a/backend/ee/onyx/external_permissions/google_drive/permission_retrieval.py +++ b/backend/ee/onyx/external_permissions/google_drive/permission_retrieval.py @@ -1,3 +1,5 @@ +from retry import retry + from ee.onyx.external_permissions.google_drive.models import GoogleDrivePermission from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval from onyx.connectors.google_utils.resources import RefreshableDriveObject @@ -6,6 +8,7 @@ from onyx.utils.logger import setup_logger logger = setup_logger() +@retry(tries=3, delay=2, backoff=2) def get_permissions_by_ids( drive_service: RefreshableDriveObject, doc_id: str, diff --git a/backend/onyx/access/models.py b/backend/onyx/access/models.py index 4b542aae0f6..d7291fcf67d 100644 --- a/backend/onyx/access/models.py +++ b/backend/onyx/access/models.py @@ -11,7 +11,7 @@ class ExternalAccess: # arbitrary limit to prevent excessively large permissions sets # not internally enforced ... 
the caller can check this before using the instance - MAX_NUM_ENTRIES = 1000 + MAX_NUM_ENTRIES = 5000 # Emails of external users with access to the doc externally external_user_emails: set[str] diff --git a/backend/onyx/connectors/google_drive/connector.py b/backend/onyx/connectors/google_drive/connector.py index ab19c032c4a..5b6de2b1dba 100644 --- a/backend/onyx/connectors/google_drive/connector.py +++ b/backend/onyx/connectors/google_drive/connector.py @@ -27,6 +27,7 @@ from onyx.connectors.google_drive.doc_conversion import build_slim_document from onyx.connectors.google_drive.doc_conversion import ( convert_drive_item_to_document, ) +from onyx.connectors.google_drive.doc_conversion import onyx_document_id_from_drive_file from onyx.connectors.google_drive.file_retrieval import crawl_folders_for_files from onyx.connectors.google_drive.file_retrieval import get_all_files_for_oauth from onyx.connectors.google_drive.file_retrieval import ( @@ -922,8 +923,9 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck ).timestamp(), current_folder_or_drive_id=file.parent_id, ) - if file.drive_file["id"] not in checkpoint.all_retrieved_file_ids: - checkpoint.all_retrieved_file_ids.add(file.drive_file["id"]) + document_id = onyx_document_id_from_drive_file(file.drive_file) + if document_id not in checkpoint.all_retrieved_file_ids: + checkpoint.all_retrieved_file_ids.add(document_id) yield file def _manage_oauth_retrieval( @@ -1138,6 +1140,10 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e raise e checkpoint.retrieved_folder_and_drive_ids = self._retrieved_folder_and_drive_ids + + logger.info( + f"num drive files retrieved: {len(checkpoint.all_retrieved_file_ids)}" + ) if checkpoint.completion_stage == DriveRetrievalStage.DONE: checkpoint.has_more = False return checkpoint @@ -1186,6 +1192,7 @@ class GoogleDriveConnector(SlimConnector, 
CheckpointedConnector[GoogleDriveCheck end=end, callback=callback, ) + logger.info("Drive perm sync: Slim doc retrieval complete") except Exception as e: if MISSING_SCOPES_ERROR_STR in str(e): diff --git a/backend/onyx/connectors/google_drive/doc_conversion.py b/backend/onyx/connectors/google_drive/doc_conversion.py index e50d7e70269..1ed641ef42c 100644 --- a/backend/onyx/connectors/google_drive/doc_conversion.py +++ b/backend/onyx/connectors/google_drive/doc_conversion.py @@ -62,6 +62,10 @@ GOOGLE_MIME_TYPES = { } +def onyx_document_id_from_drive_file(file: GoogleDriveFileType) -> str: + return file[WEB_VIEW_LINK_KEY] + + def _summarize_drive_image( image_data: bytes, image_name: str, image_analysis_llm: LLM | None ) -> str: @@ -380,7 +384,6 @@ def _convert_drive_item_to_document( """ Main entry point for converting a Google Drive file => Document object. """ - doc_id = file.get(WEB_VIEW_LINK_KEY, "") sections: list[TextSection | ImageSection] = [] # Only construct these services when needed drive_service = lazy_eval( @@ -389,6 +392,7 @@ def _convert_drive_item_to_document( docs_service = lazy_eval( lambda: get_google_docs_service(creds, user_email=retriever_email) ) + doc_id = "unknown" try: # skip shortcuts or folders @@ -441,7 +445,7 @@ def _convert_drive_item_to_document( logger.warning(f"No content extracted from {file.get('name')}. 
Skipping.") return None - doc_id = file[WEB_VIEW_LINK_KEY] + doc_id = onyx_document_id_from_drive_file(file) # Create the document return Document( @@ -488,7 +492,7 @@ def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None: if file.get("mimeType") in [DRIVE_FOLDER_TYPE, DRIVE_SHORTCUT_TYPE]: return None return SlimDocument( - id=file[WEB_VIEW_LINK_KEY], + id=onyx_document_id_from_drive_file(file), perm_sync_data={ "doc_id": file.get("id"), "drive_id": file.get("driveId"), diff --git a/backend/onyx/document_index/vespa_constants.py b/backend/onyx/document_index/vespa_constants.py index da82ed9287b..87e552c9cfb 100644 --- a/backend/onyx/document_index/vespa_constants.py +++ b/backend/onyx/document_index/vespa_constants.py @@ -36,7 +36,7 @@ MAX_OR_CONDITIONS = 10 # up from 500ms for now, since we've seen quite a few timeouts # in the long term, we are looking to improve the performance of Vespa # so that we can bring this back to default -VESPA_TIMEOUT = "3s" +VESPA_TIMEOUT = "10s" BATCH_SIZE = 128 # Specific to Vespa TENANT_ID = "tenant_id" diff --git a/backend/onyx/file_processing/extract_file_text.py b/backend/onyx/file_processing/extract_file_text.py index b0711641865..3197942e922 100644 --- a/backend/onyx/file_processing/extract_file_text.py +++ b/backend/onyx/file_processing/extract_file_text.py @@ -301,7 +301,7 @@ def read_pdf_file( def docx_to_text_and_images( - file: IO[Any], + file: IO[Any], file_name: str = "" ) -> tuple[str, Sequence[tuple[bytes, str]]]: """ Extract text from a docx. If embed_images=True, also extract inline images. 
@@ -310,7 +310,11 @@ def docx_to_text_and_images( paragraphs = [] embedded_images: list[tuple[bytes, str]] = [] - doc = docx.Document(file) + try: + doc = docx.Document(file) + except BadZipFile as e: + logger.warning(f"Failed to extract text from {file_name or 'docx file'}: {e}") + return "", [] # Grab text from paragraphs for paragraph in doc.paragraphs: diff --git a/deployment/docker_compose/docker-compose.dev.yml b/deployment/docker_compose/docker-compose.dev.yml index 5f3a028d03d..db0d0e0ebea 100644 --- a/deployment/docker_compose/docker-compose.dev.yml +++ b/deployment/docker_compose/docker-compose.dev.yml @@ -378,6 +378,7 @@ services: relational_db: image: postgres:15.2-alpine + shm_size: 1g command: -c 'max_connections=250' restart: always environment: diff --git a/deployment/docker_compose/docker-compose.gpu-dev.yml b/deployment/docker_compose/docker-compose.gpu-dev.yml index 5e921037206..bf0710e44fd 100644 --- a/deployment/docker_compose/docker-compose.gpu-dev.yml +++ b/deployment/docker_compose/docker-compose.gpu-dev.yml @@ -324,6 +324,7 @@ services: relational_db: image: postgres:15.2-alpine + shm_size: 1g command: -c 'max_connections=250' restart: always environment: diff --git a/deployment/docker_compose/docker-compose.multitenant-dev.yml b/deployment/docker_compose/docker-compose.multitenant-dev.yml index 53273dc40cb..8f2b06edb1d 100644 --- a/deployment/docker_compose/docker-compose.multitenant-dev.yml +++ b/deployment/docker_compose/docker-compose.multitenant-dev.yml @@ -351,6 +351,7 @@ services: relational_db: image: postgres:15.2-alpine + shm_size: 1g command: -c 'max_connections=250' restart: always environment: diff --git a/deployment/docker_compose/docker-compose.prod-cloud.yml b/deployment/docker_compose/docker-compose.prod-cloud.yml index 37a032f1d38..452755d3aef 100644 --- a/deployment/docker_compose/docker-compose.prod-cloud.yml +++ b/deployment/docker_compose/docker-compose.prod-cloud.yml @@ -88,6 +88,7 @@ services: relational_db: image: 
postgres:15.2-alpine + shm_size: 1g command: -c 'max_connections=250' restart: always # POSTGRES_USER and POSTGRES_PASSWORD should be set in .env file diff --git a/deployment/docker_compose/docker-compose.prod-no-letsencrypt.yml b/deployment/docker_compose/docker-compose.prod-no-letsencrypt.yml index b8d103fa90b..3f7efe88b19 100644 --- a/deployment/docker_compose/docker-compose.prod-no-letsencrypt.yml +++ b/deployment/docker_compose/docker-compose.prod-no-letsencrypt.yml @@ -166,6 +166,7 @@ services: relational_db: image: postgres:15.2-alpine + shm_size: 1g command: -c 'max_connections=250' restart: always # POSTGRES_USER and POSTGRES_PASSWORD should be set in .env file diff --git a/deployment/docker_compose/docker-compose.prod.yml b/deployment/docker_compose/docker-compose.prod.yml index 30f00b97fd9..e46319ba1e2 100644 --- a/deployment/docker_compose/docker-compose.prod.yml +++ b/deployment/docker_compose/docker-compose.prod.yml @@ -120,6 +120,7 @@ services: relational_db: image: postgres:15.2-alpine + shm_size: 1g command: -c 'max_connections=250' restart: always # POSTGRES_USER and POSTGRES_PASSWORD should be set in .env file @@ -193,8 +194,10 @@ services: # This container name cannot have an underscore in it due to Vespa expectations of the URL index: - image: vespaengine/vespa:8.277.17 + image: vespaengine/vespa:8.524.25 restart: always + environment: + - VESPA_SKIP_UPGRADE_CHECK=true ports: - "19071:19071" - "8081:8081" diff --git a/deployment/docker_compose/docker-compose.search-testing.yml b/deployment/docker_compose/docker-compose.search-testing.yml index 8d60614878a..3a53fbda504 100644 --- a/deployment/docker_compose/docker-compose.search-testing.yml +++ b/deployment/docker_compose/docker-compose.search-testing.yml @@ -148,6 +148,7 @@ services: relational_db: image: postgres:15.2-alpine + shm_size: 1g command: -c 'max_connections=250' restart: always environment: diff --git a/deployment/helm/charts/onyx/values.yaml 
b/deployment/helm/charts/onyx/values.yaml index f0e68205e4b..bf0c69da404 100644 --- a/deployment/helm/charts/onyx/values.yaml +++ b/deployment/helm/charts/onyx/values.yaml @@ -7,6 +7,9 @@ postgresql: persistence: storageClass: "" size: 5Gi + shmVolume: + enabled: true + sizeLimit: 1Gi enabled: true auth: existingSecret: onyx-secrets