Mirror of https://github.com/danswer-ai/danswer.git, synced 2025-07-03 11:11:45 +02:00
Fix tons of users w/o drive access causing timeouts (#4437)
@@ -445,6 +445,9 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
                 logger.warning(
                     f"User '{user_email}' does not have access to the drive APIs."
                 )
+                # mark this user as done so we don't try to retrieve anything for them
+                # again
+                curr_stage.stage = DriveRetrievalStage.DONE
                 return
             raise
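The three added lines are the core of the fix: on a 403 the connector now records DriveRetrievalStage.DONE for that user in the checkpoint's completion map before returning, so resumed runs skip the user instead of retrying forever. A minimal, self-contained sketch of the pattern (every name except DriveRetrievalStage is hypothetical):

from enum import Enum


class DriveRetrievalStage(str, Enum):
    START = "start"
    DONE = "done"


class PermissionDenied(Exception):
    """Stand-in for the googleapiclient HttpError the connector catches."""


def fetch_drive_files(user_email: str) -> list[str]:
    # stand-in for the real Drive API call
    if user_email.endswith("@no-drive.example.com"):
        raise PermissionDenied(user_email)
    return [f"{user_email}/doc-1"]


def retrieve_for_user(
    user_email: str, completion_map: dict[str, DriveRetrievalStage]
) -> list[str]:
    try:
        return fetch_drive_files(user_email)
    except PermissionDenied:
        # mark this user as done so resumed runs never retry them
        completion_map[user_email] = DriveRetrievalStage.DONE
        return []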
@@ -581,6 +584,25 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
             drive_ids_to_retrieve, checkpoint
         )
 
+        # only process emails that we haven't already completed retrieval for
+        non_completed_org_emails = [
+            user_email
+            for user_email, stage in checkpoint.completion_map.items()
+            if stage != DriveRetrievalStage.DONE
+        ]
+
+        # don't process too many emails before returning a checkpoint. This is
+        # to resolve the case where there are a ton of emails that don't have access
+        # to the drive APIs. Without this, we could loop through these emails for
+        # more than 3 hours, causing a timeout and stalling progress.
+        email_batch_takes_us_to_completion = True
+        MAX_EMAILS_TO_PROCESS_BEFORE_CHECKPOINTING = 50
+        if len(non_completed_org_emails) > MAX_EMAILS_TO_PROCESS_BEFORE_CHECKPOINTING:
+            non_completed_org_emails = non_completed_org_emails[
+                :MAX_EMAILS_TO_PROCESS_BEFORE_CHECKPOINTING
+            ]
+            email_batch_takes_us_to_completion = False
+
         user_retrieval_gens = [
             self._impersonate_user_for_retrieval(
                 email,
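The batching above bounds each run: only users whose stage is not DONE are considered, and at most MAX_EMAILS_TO_PROCESS_BEFORE_CHECKPOINTING (50) of them are processed before the connector hands back a checkpoint. Without the cap, an org with thousands of no-access accounts could keep a single run spinning past the 3-hour timeout. A simplified sketch of the selection rule (the function name and plain-string stages are hypothetical):

MAX_EMAILS_TO_PROCESS_BEFORE_CHECKPOINTING = 50


def next_email_batch(completion_map: dict[str, str]) -> tuple[list[str], bool]:
    """Return (emails to process this run, whether this batch finishes the org)."""
    pending = [email for email, stage in completion_map.items() if stage != "done"]
    if len(pending) > MAX_EMAILS_TO_PROCESS_BEFORE_CHECKPOINTING:
        # truncate and tell the caller to checkpoint and resume later
        return pending[:MAX_EMAILS_TO_PROCESS_BEFORE_CHECKPOINTING], False
    return pending, True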
@@ -591,10 +613,14 @@
                 start,
                 end,
             )
-            for email in all_org_emails
+            for email in non_completed_org_emails
         ]
         yield from parallel_yield(user_retrieval_gens, max_workers=MAX_DRIVE_WORKERS)
 
+        # if there are more emails to process, don't mark as complete
+        if not email_batch_takes_us_to_completion:
+            return
+
         remaining_folders = (
             drive_ids_to_retrieve | folder_ids_to_retrieve
         ) - self._retrieved_ids
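The early return is what makes the cap safe: when the batch was truncated, the connector exits before the folder and shared-drive sweep and without marking the checkpoint complete, so the scheduler re-enters with the updated completion map. A toy driver loop illustrating the resulting contract (all scaffolding here is hypothetical, not connector code):

def run_until_complete(completion_map: dict[str, str], max_per_run: int = 50) -> int:
    runs = 0
    while True:
        runs += 1
        pending = [e for e, s in completion_map.items() if s != "done"]
        for email in pending[:max_per_run]:
            completion_map[email] = "done"  # stand-in for real per-user retrieval
        if len(pending) <= max_per_run:
            # this batch took us to completion; the folder sweep may now run
            return runs


# 120 pending users with a cap of 50 finish in three resumable runs:
print(run_until_complete({f"u{i}@example.com": "pending" for i in range(120)}))  # 3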
@@ -332,14 +332,15 @@ def wait_on_background(task: TimeoutThread[R]) -> R:
     return task.result
 
 
-def _next_or_none(ind: int, g: Iterator[R]) -> tuple[int, R | None]:
-    return ind, next(g, None)
+def _next_or_none(ind: int, gen: Iterator[R]) -> tuple[int, R | None]:
+    return ind, next(gen, None)
 
 
 def parallel_yield(gens: list[Iterator[R]], max_workers: int = 10) -> Iterator[R]:
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         future_to_index: dict[Future[tuple[int, R | None]], int] = {
-            executor.submit(_next_or_none, i, g): i for i, g in enumerate(gens)
+            executor.submit(_next_or_none, ind, gen): ind
+            for ind, gen in enumerate(gens)
         }
 
         next_ind = len(gens)
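This last hunk only renames i/g to ind/gen in parallel_yield's helper and reflows the dict comprehension; parallel_yield itself is the utility that interleaves the per-user generators as results arrive. The visible lines stop at next_ind = len(gens), so the wait-and-resubmit loop in the sketch below is a plausible reconstruction of how such a function can work, not necessarily the repo's exact implementation:

from collections.abc import Iterator
from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, wait
from typing import TypeVar

R = TypeVar("R")


def _next_or_none(ind: int, gen: Iterator[R]) -> tuple[int, R | None]:
    return ind, next(gen, None)


def parallel_yield(gens: list[Iterator[R]], max_workers: int = 10) -> Iterator[R]:
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # one in-flight "fetch next item" task per generator
        future_to_index: dict[Future[tuple[int, R | None]], int] = {
            executor.submit(_next_or_none, ind, gen): ind
            for ind, gen in enumerate(gens)
        }
        while future_to_index:
            done, _ = wait(future_to_index, return_when=FIRST_COMPLETED)
            for future in done:
                ind = future_to_index.pop(future)
                _, value = future.result()
                if value is not None:  # None means that generator is exhausted
                    yield value
                    # resubmit the same generator for its next value
                    future_to_index[executor.submit(_next_or_none, ind, gens[ind])] = ind


# interleaves values from both generators as workers produce them
print(sorted(parallel_yield([iter(range(3)), iter(range(3, 6))])))  # [0, 1, 2, 3, 4, 5]

Note the sentinel convention: _next_or_none maps an exhausted generator to None, so a generator that legitimately yields None would be cut short; that is presumably acceptable here because the retrieval generators yield document objects.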