drive perm sync logs + misc deployment improvements (#4788)

* some logs

* give postgres more memory

* give postgres more memory

* give postgres more memory

* revert

* give postgres more memory

* bump external access limit

* vespa timeout

* deployment consistency

* bump vespa version

* skip upgrade check

* retry permission by ids

* logs

* fix temp docx file issue

* fix drive file deduping

* RK comments

* mypy

* aggregate logs
This commit is contained in:
Evan Lohn
2025-06-01 19:36:57 -04:00
committed by GitHub
parent b19e3a500b
commit 4c71a5f5ff
15 changed files with 44 additions and 10 deletions

View File

@ -200,7 +200,9 @@ def gdrive_doc_sync(
slim_doc_generator = _get_slim_doc_generator(cc_pair, google_drive_connector)
total_processed = 0
for slim_doc_batch in slim_doc_generator:
logger.info(f"Drive perm sync: Processing {len(slim_doc_batch)} documents")
for slim_doc in slim_doc_batch:
if callback:
if callback.should_stop():
@ -216,3 +218,5 @@ def gdrive_doc_sync(
external_access=ext_access,
doc_id=slim_doc.id,
)
total_processed += len(slim_doc_batch)
logger.info(f"Drive perm sync: Processed {total_processed} total documents")

View File

@ -1,3 +1,5 @@
from retry import retry
from ee.onyx.external_permissions.google_drive.models import GoogleDrivePermission
from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
from onyx.connectors.google_utils.resources import RefreshableDriveObject
@ -6,6 +8,7 @@ from onyx.utils.logger import setup_logger
logger = setup_logger()
@retry(tries=3, delay=2, backoff=2)
def get_permissions_by_ids(
drive_service: RefreshableDriveObject,
doc_id: str,

View File

@ -11,7 +11,7 @@ class ExternalAccess:
# arbitrary limit to prevent excessively large permissions sets
# not internally enforced ... the caller can check this before using the instance
MAX_NUM_ENTRIES = 1000
MAX_NUM_ENTRIES = 5000
# Emails of external users with access to the doc externally
external_user_emails: set[str]

View File

@ -27,6 +27,7 @@ from onyx.connectors.google_drive.doc_conversion import build_slim_document
from onyx.connectors.google_drive.doc_conversion import (
convert_drive_item_to_document,
)
from onyx.connectors.google_drive.doc_conversion import onyx_document_id_from_drive_file
from onyx.connectors.google_drive.file_retrieval import crawl_folders_for_files
from onyx.connectors.google_drive.file_retrieval import get_all_files_for_oauth
from onyx.connectors.google_drive.file_retrieval import (
@ -922,8 +923,9 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
).timestamp(),
current_folder_or_drive_id=file.parent_id,
)
if file.drive_file["id"] not in checkpoint.all_retrieved_file_ids:
checkpoint.all_retrieved_file_ids.add(file.drive_file["id"])
document_id = onyx_document_id_from_drive_file(file.drive_file)
if document_id not in checkpoint.all_retrieved_file_ids:
checkpoint.all_retrieved_file_ids.add(document_id)
yield file
def _manage_oauth_retrieval(
@ -1138,6 +1140,10 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e
raise e
checkpoint.retrieved_folder_and_drive_ids = self._retrieved_folder_and_drive_ids
logger.info(
f"num drive files retrieved: {len(checkpoint.all_retrieved_file_ids)}"
)
if checkpoint.completion_stage == DriveRetrievalStage.DONE:
checkpoint.has_more = False
return checkpoint
@ -1186,6 +1192,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
end=end,
callback=callback,
)
logger.info("Drive perm sync: Slim doc retrieval complete")
except Exception as e:
if MISSING_SCOPES_ERROR_STR in str(e):

View File

@ -62,6 +62,10 @@ GOOGLE_MIME_TYPES = {
}
def onyx_document_id_from_drive_file(file: GoogleDriveFileType) -> str:
    """Return the document id for a Google Drive file.

    The id is the value stored under ``WEB_VIEW_LINK_KEY`` in ``file``.
    The lookup is a plain subscript, so a missing key is not defaulted
    (presumably raises ``KeyError`` if ``file`` is a dict -- confirm
    against the ``GoogleDriveFileType`` definition).
    """
    document_id = file[WEB_VIEW_LINK_KEY]
    return document_id
def _summarize_drive_image(
image_data: bytes, image_name: str, image_analysis_llm: LLM | None
) -> str:
@ -380,7 +384,6 @@ def _convert_drive_item_to_document(
"""
Main entry point for converting a Google Drive file => Document object.
"""
doc_id = file.get(WEB_VIEW_LINK_KEY, "")
sections: list[TextSection | ImageSection] = []
# Only construct these services when needed
drive_service = lazy_eval(
@ -389,6 +392,7 @@ def _convert_drive_item_to_document(
docs_service = lazy_eval(
lambda: get_google_docs_service(creds, user_email=retriever_email)
)
doc_id = "unknown"
try:
# skip shortcuts or folders
@ -441,7 +445,7 @@ def _convert_drive_item_to_document(
logger.warning(f"No content extracted from {file.get('name')}. Skipping.")
return None
doc_id = file[WEB_VIEW_LINK_KEY]
doc_id = onyx_document_id_from_drive_file(file)
# Create the document
return Document(
@ -488,7 +492,7 @@ def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None:
if file.get("mimeType") in [DRIVE_FOLDER_TYPE, DRIVE_SHORTCUT_TYPE]:
return None
return SlimDocument(
id=file[WEB_VIEW_LINK_KEY],
id=onyx_document_id_from_drive_file(file),
perm_sync_data={
"doc_id": file.get("id"),
"drive_id": file.get("driveId"),

View File

@ -36,7 +36,7 @@ MAX_OR_CONDITIONS = 10
# up from 500ms for now, since we've seen quite a few timeouts
# in the long term, we are looking to improve the performance of Vespa
# so that we can bring this back to default
VESPA_TIMEOUT = "3s"
VESPA_TIMEOUT = "10s"
BATCH_SIZE = 128 # Specific to Vespa
TENANT_ID = "tenant_id"

View File

@ -301,7 +301,7 @@ def read_pdf_file(
def docx_to_text_and_images(
file: IO[Any],
file: IO[Any], file_name: str = ""
) -> tuple[str, Sequence[tuple[bytes, str]]]:
"""
Extract text from a docx. If embed_images=True, also extract inline images.
@ -310,7 +310,11 @@ def docx_to_text_and_images(
paragraphs = []
embedded_images: list[tuple[bytes, str]] = []
doc = docx.Document(file)
try:
doc = docx.Document(file)
except BadZipFile as e:
logger.warning(f"Failed to extract text from {file_name or 'docx file'}: {e}")
return "", []
# Grab text from paragraphs
for paragraph in doc.paragraphs:

View File

@ -378,6 +378,7 @@ services:
relational_db:
image: postgres:15.2-alpine
shm_size: 1g
command: -c 'max_connections=250'
restart: always
environment:

View File

@ -324,6 +324,7 @@ services:
relational_db:
image: postgres:15.2-alpine
shm_size: 1g
command: -c 'max_connections=250'
restart: always
environment:

View File

@ -351,6 +351,7 @@ services:
relational_db:
image: postgres:15.2-alpine
shm_size: 1g
command: -c 'max_connections=250'
restart: always
environment:

View File

@ -88,6 +88,7 @@ services:
relational_db:
image: postgres:15.2-alpine
shm_size: 1g
command: -c 'max_connections=250'
restart: always
# POSTGRES_USER and POSTGRES_PASSWORD should be set in .env file

View File

@ -166,6 +166,7 @@ services:
relational_db:
image: postgres:15.2-alpine
shm_size: 1g
command: -c 'max_connections=250'
restart: always
# POSTGRES_USER and POSTGRES_PASSWORD should be set in .env file

View File

@ -120,6 +120,7 @@ services:
relational_db:
image: postgres:15.2-alpine
shm_size: 1g
command: -c 'max_connections=250'
restart: always
# POSTGRES_USER and POSTGRES_PASSWORD should be set in .env file
@ -193,8 +194,10 @@ services:
# This container name cannot have an underscore in it due to Vespa expectations of the URL
index:
image: vespaengine/vespa:8.277.17
image: vespaengine/vespa:8.524.25
restart: always
environment:
- VESPA_SKIP_UPGRADE_CHECK=true
ports:
- "19071:19071"
- "8081:8081"

View File

@ -148,6 +148,7 @@ services:
relational_db:
image: postgres:15.2-alpine
shm_size: 1g
command: -c 'max_connections=250'
restart: always
environment:

View File

@ -7,6 +7,9 @@ postgresql:
persistence:
storageClass: ""
size: 5Gi
shmVolume:
enabled: true
sizeLimit: 1Gi
enabled: true
auth:
existingSecret: onyx-secrets