mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-07-12 14:12:53 +02:00
drive perm sync logs + misc deployment improvements (#4788)
* some logs * give postgres more memory * give postgres more memory * give postgres more memory * revert * give postgres more memory * bump external access limit * vespa timeout * deployment consistency * bump vespa version * skip upgrade check * retry permission by ids * logs * fix temp docx file issue * fix drive file deduping * RK comments * mypy * aggregate logs
This commit is contained in:
@ -200,7 +200,9 @@ def gdrive_doc_sync(
|
||||
|
||||
slim_doc_generator = _get_slim_doc_generator(cc_pair, google_drive_connector)
|
||||
|
||||
total_processed = 0
|
||||
for slim_doc_batch in slim_doc_generator:
|
||||
logger.info(f"Drive perm sync: Processing {len(slim_doc_batch)} documents")
|
||||
for slim_doc in slim_doc_batch:
|
||||
if callback:
|
||||
if callback.should_stop():
|
||||
@ -216,3 +218,5 @@ def gdrive_doc_sync(
|
||||
external_access=ext_access,
|
||||
doc_id=slim_doc.id,
|
||||
)
|
||||
total_processed += len(slim_doc_batch)
|
||||
logger.info(f"Drive perm sync: Processed {total_processed} total documents")
|
||||
|
@ -1,3 +1,5 @@
|
||||
from retry import retry
|
||||
|
||||
from ee.onyx.external_permissions.google_drive.models import GoogleDrivePermission
|
||||
from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
|
||||
from onyx.connectors.google_utils.resources import RefreshableDriveObject
|
||||
@ -6,6 +8,7 @@ from onyx.utils.logger import setup_logger
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
@retry(tries=3, delay=2, backoff=2)
|
||||
def get_permissions_by_ids(
|
||||
drive_service: RefreshableDriveObject,
|
||||
doc_id: str,
|
||||
|
@ -11,7 +11,7 @@ class ExternalAccess:
|
||||
|
||||
# arbitrary limit to prevent excessively large permission sets
|
||||
# not internally enforced ... the caller can check this before using the instance
|
||||
MAX_NUM_ENTRIES = 1000
|
||||
MAX_NUM_ENTRIES = 5000
|
||||
|
||||
# Emails of external users with access to the doc externally
|
||||
external_user_emails: set[str]
|
||||
|
@ -27,6 +27,7 @@ from onyx.connectors.google_drive.doc_conversion import build_slim_document
|
||||
from onyx.connectors.google_drive.doc_conversion import (
|
||||
convert_drive_item_to_document,
|
||||
)
|
||||
from onyx.connectors.google_drive.doc_conversion import onyx_document_id_from_drive_file
|
||||
from onyx.connectors.google_drive.file_retrieval import crawl_folders_for_files
|
||||
from onyx.connectors.google_drive.file_retrieval import get_all_files_for_oauth
|
||||
from onyx.connectors.google_drive.file_retrieval import (
|
||||
@ -922,8 +923,9 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
|
||||
).timestamp(),
|
||||
current_folder_or_drive_id=file.parent_id,
|
||||
)
|
||||
if file.drive_file["id"] not in checkpoint.all_retrieved_file_ids:
|
||||
checkpoint.all_retrieved_file_ids.add(file.drive_file["id"])
|
||||
document_id = onyx_document_id_from_drive_file(file.drive_file)
|
||||
if document_id not in checkpoint.all_retrieved_file_ids:
|
||||
checkpoint.all_retrieved_file_ids.add(document_id)
|
||||
yield file
|
||||
|
||||
def _manage_oauth_retrieval(
|
||||
@ -1138,6 +1140,10 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
|
||||
raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e
|
||||
raise e
|
||||
checkpoint.retrieved_folder_and_drive_ids = self._retrieved_folder_and_drive_ids
|
||||
|
||||
logger.info(
|
||||
f"num drive files retrieved: {len(checkpoint.all_retrieved_file_ids)}"
|
||||
)
|
||||
if checkpoint.completion_stage == DriveRetrievalStage.DONE:
|
||||
checkpoint.has_more = False
|
||||
return checkpoint
|
||||
@ -1186,6 +1192,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
|
||||
end=end,
|
||||
callback=callback,
|
||||
)
|
||||
logger.info("Drive perm sync: Slim doc retrieval complete")
|
||||
|
||||
except Exception as e:
|
||||
if MISSING_SCOPES_ERROR_STR in str(e):
|
||||
|
@ -62,6 +62,10 @@ GOOGLE_MIME_TYPES = {
|
||||
}
|
||||
|
||||
|
||||
def onyx_document_id_from_drive_file(file: GoogleDriveFileType) -> str:
    """Return the Onyx document ID for a Google Drive file.

    The ID is whatever value the file metadata stores under
    ``WEB_VIEW_LINK_KEY``. The lookup uses subscripting, so a missing key
    raises instead of yielding a default.
    """
    document_id = file[WEB_VIEW_LINK_KEY]
    return document_id
|
||||
|
||||
|
||||
def _summarize_drive_image(
|
||||
image_data: bytes, image_name: str, image_analysis_llm: LLM | None
|
||||
) -> str:
|
||||
@ -380,7 +384,6 @@ def _convert_drive_item_to_document(
|
||||
"""
|
||||
Main entry point for converting a Google Drive file => Document object.
|
||||
"""
|
||||
doc_id = file.get(WEB_VIEW_LINK_KEY, "")
|
||||
sections: list[TextSection | ImageSection] = []
|
||||
# Only construct these services when needed
|
||||
drive_service = lazy_eval(
|
||||
@ -389,6 +392,7 @@ def _convert_drive_item_to_document(
|
||||
docs_service = lazy_eval(
|
||||
lambda: get_google_docs_service(creds, user_email=retriever_email)
|
||||
)
|
||||
doc_id = "unknown"
|
||||
|
||||
try:
|
||||
# skip shortcuts or folders
|
||||
@ -441,7 +445,7 @@ def _convert_drive_item_to_document(
|
||||
logger.warning(f"No content extracted from {file.get('name')}. Skipping.")
|
||||
return None
|
||||
|
||||
doc_id = file[WEB_VIEW_LINK_KEY]
|
||||
doc_id = onyx_document_id_from_drive_file(file)
|
||||
|
||||
# Create the document
|
||||
return Document(
|
||||
@ -488,7 +492,7 @@ def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None:
|
||||
if file.get("mimeType") in [DRIVE_FOLDER_TYPE, DRIVE_SHORTCUT_TYPE]:
|
||||
return None
|
||||
return SlimDocument(
|
||||
id=file[WEB_VIEW_LINK_KEY],
|
||||
id=onyx_document_id_from_drive_file(file),
|
||||
perm_sync_data={
|
||||
"doc_id": file.get("id"),
|
||||
"drive_id": file.get("driveId"),
|
||||
|
@ -36,7 +36,7 @@ MAX_OR_CONDITIONS = 10
|
||||
# up from 500ms for now, since we've seen quite a few timeouts
|
||||
# in the long term, we are looking to improve the performance of Vespa
|
||||
# so that we can bring this back to default
|
||||
VESPA_TIMEOUT = "3s"
|
||||
VESPA_TIMEOUT = "10s"
|
||||
BATCH_SIZE = 128 # Specific to Vespa
|
||||
|
||||
TENANT_ID = "tenant_id"
|
||||
|
@ -301,7 +301,7 @@ def read_pdf_file(
|
||||
|
||||
|
||||
def docx_to_text_and_images(
|
||||
file: IO[Any],
|
||||
file: IO[Any], file_name: str = ""
|
||||
) -> tuple[str, Sequence[tuple[bytes, str]]]:
|
||||
"""
|
||||
Extract text from a docx. If embed_images=True, also extract inline images.
|
||||
@ -310,7 +310,11 @@ def docx_to_text_and_images(
|
||||
paragraphs = []
|
||||
embedded_images: list[tuple[bytes, str]] = []
|
||||
|
||||
doc = docx.Document(file)
|
||||
try:
|
||||
doc = docx.Document(file)
|
||||
except BadZipFile as e:
|
||||
logger.warning(f"Failed to extract text from {file_name or 'docx file'}: {e}")
|
||||
return "", []
|
||||
|
||||
# Grab text from paragraphs
|
||||
for paragraph in doc.paragraphs:
|
||||
|
@ -378,6 +378,7 @@ services:
|
||||
|
||||
relational_db:
|
||||
image: postgres:15.2-alpine
|
||||
shm_size: 1g
|
||||
command: -c 'max_connections=250'
|
||||
restart: always
|
||||
environment:
|
||||
|
@ -324,6 +324,7 @@ services:
|
||||
|
||||
relational_db:
|
||||
image: postgres:15.2-alpine
|
||||
shm_size: 1g
|
||||
command: -c 'max_connections=250'
|
||||
restart: always
|
||||
environment:
|
||||
|
@ -351,6 +351,7 @@ services:
|
||||
|
||||
relational_db:
|
||||
image: postgres:15.2-alpine
|
||||
shm_size: 1g
|
||||
command: -c 'max_connections=250'
|
||||
restart: always
|
||||
environment:
|
||||
|
@ -88,6 +88,7 @@ services:
|
||||
|
||||
relational_db:
|
||||
image: postgres:15.2-alpine
|
||||
shm_size: 1g
|
||||
command: -c 'max_connections=250'
|
||||
restart: always
|
||||
# POSTGRES_USER and POSTGRES_PASSWORD should be set in .env file
|
||||
|
@ -166,6 +166,7 @@ services:
|
||||
|
||||
relational_db:
|
||||
image: postgres:15.2-alpine
|
||||
shm_size: 1g
|
||||
command: -c 'max_connections=250'
|
||||
restart: always
|
||||
# POSTGRES_USER and POSTGRES_PASSWORD should be set in .env file
|
||||
|
@ -120,6 +120,7 @@ services:
|
||||
|
||||
relational_db:
|
||||
image: postgres:15.2-alpine
|
||||
shm_size: 1g
|
||||
command: -c 'max_connections=250'
|
||||
restart: always
|
||||
# POSTGRES_USER and POSTGRES_PASSWORD should be set in .env file
|
||||
@ -193,8 +194,10 @@ services:
|
||||
|
||||
# This container name cannot have an underscore in it due to Vespa expectations of the URL
|
||||
index:
|
||||
image: vespaengine/vespa:8.277.17
|
||||
image: vespaengine/vespa:8.524.25
|
||||
restart: always
|
||||
environment:
|
||||
- VESPA_SKIP_UPGRADE_CHECK=true
|
||||
ports:
|
||||
- "19071:19071"
|
||||
- "8081:8081"
|
||||
|
@ -148,6 +148,7 @@ services:
|
||||
|
||||
relational_db:
|
||||
image: postgres:15.2-alpine
|
||||
shm_size: 1g
|
||||
command: -c 'max_connections=250'
|
||||
restart: always
|
||||
environment:
|
||||
|
@ -7,6 +7,9 @@ postgresql:
|
||||
persistence:
|
||||
storageClass: ""
|
||||
size: 5Gi
|
||||
shmVolume:
|
||||
enabled: true
|
||||
sizeLimit: 1Gi
|
||||
enabled: true
|
||||
auth:
|
||||
existingSecret: onyx-secrets
|
||||
|
Reference in New Issue
Block a user