mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-08 16:05:32 +02:00
id not set in checkpoint FINAL (#4656)
* it will never happen again. * fix perm sync issue * fix perm sync issue2 * ensure member emails map is populated * other fix for perm sync * address CW comments * nit
This commit is contained in:
@@ -68,7 +68,7 @@ def _fetch_permissions_for_permission_ids(
|
|||||||
retrieval_function=drive_service.permissions().list,
|
retrieval_function=drive_service.permissions().list,
|
||||||
list_key="permissions",
|
list_key="permissions",
|
||||||
fileId=doc_id,
|
fileId=doc_id,
|
||||||
fields="permissions(id, emailAddress, type, domain)",
|
fields="permissions(id, emailAddress, type, domain),nextPageToken",
|
||||||
supportsAllDrives=True,
|
supportsAllDrives=True,
|
||||||
continue_on_404_or_403=True,
|
continue_on_404_or_403=True,
|
||||||
)
|
)
|
||||||
|
@@ -51,7 +51,7 @@ def _get_drive_members(
|
|||||||
drive_service.permissions().list,
|
drive_service.permissions().list,
|
||||||
list_key="permissions",
|
list_key="permissions",
|
||||||
fileId=drive_id,
|
fileId=drive_id,
|
||||||
fields="permissions(emailAddress, type)",
|
fields="permissions(emailAddress, type),nextPageToken",
|
||||||
supportsAllDrives=True,
|
supportsAllDrives=True,
|
||||||
# can only set `useDomainAdminAccess` to true if the user
|
# can only set `useDomainAdminAccess` to true if the user
|
||||||
# is an admin
|
# is an admin
|
||||||
@@ -107,7 +107,7 @@ def _map_group_email_to_member_emails(
|
|||||||
admin_service.members().list,
|
admin_service.members().list,
|
||||||
list_key="members",
|
list_key="members",
|
||||||
groupKey=group_email,
|
groupKey=group_email,
|
||||||
fields="members(email)",
|
fields="members(email),nextPageToken",
|
||||||
):
|
):
|
||||||
group_member_emails.add(member["email"])
|
group_member_emails.add(member["email"])
|
||||||
|
|
||||||
@@ -127,6 +127,11 @@ def _build_onyx_groups(
|
|||||||
for drive_id, (group_emails, user_emails) in drive_id_to_members_map.items():
|
for drive_id, (group_emails, user_emails) in drive_id_to_members_map.items():
|
||||||
all_member_emails: set[str] = user_emails
|
all_member_emails: set[str] = user_emails
|
||||||
for group_email in group_emails:
|
for group_email in group_emails:
|
||||||
|
if group_email not in group_email_to_member_emails_map:
|
||||||
|
logger.warning(
|
||||||
|
f"Group email {group_email} not found in group_email_to_member_emails_map"
|
||||||
|
)
|
||||||
|
continue
|
||||||
all_member_emails.update(group_email_to_member_emails_map[group_email])
|
all_member_emails.update(group_email_to_member_emails_map[group_email])
|
||||||
onyx_groups.append(
|
onyx_groups.append(
|
||||||
ExternalUserGroup(
|
ExternalUserGroup(
|
||||||
|
@@ -12,6 +12,7 @@ from onyx.configs.constants import DocumentSource
|
|||||||
from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
|
from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
|
||||||
from onyx.connectors.google_utils.google_auth import get_google_creds
|
from onyx.connectors.google_utils.google_auth import get_google_creds
|
||||||
from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
|
from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
|
||||||
|
from onyx.connectors.google_utils.google_utils import execute_single_retrieval
|
||||||
from onyx.connectors.google_utils.resources import get_admin_service
|
from onyx.connectors.google_utils.resources import get_admin_service
|
||||||
from onyx.connectors.google_utils.resources import get_gmail_service
|
from onyx.connectors.google_utils.resources import get_gmail_service
|
||||||
from onyx.connectors.google_utils.shared_constants import (
|
from onyx.connectors.google_utils.shared_constants import (
|
||||||
@@ -301,7 +302,7 @@ class GmailConnector(LoadConnector, PollConnector, SlimConnector):
|
|||||||
q=query,
|
q=query,
|
||||||
continue_on_404_or_403=True,
|
continue_on_404_or_403=True,
|
||||||
):
|
):
|
||||||
full_threads = execute_paginated_retrieval(
|
full_threads = execute_single_retrieval(
|
||||||
retrieval_function=gmail_service.users().threads().get,
|
retrieval_function=gmail_service.users().threads().get,
|
||||||
list_key=None,
|
list_key=None,
|
||||||
userId=user_email,
|
userId=user_email,
|
||||||
|
@@ -377,7 +377,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
|
|||||||
cv.notify_all()
|
cv.notify_all()
|
||||||
|
|
||||||
# when entering the iterator with a previous id in the checkpoint, the user
|
# when entering the iterator with a previous id in the checkpoint, the user
|
||||||
# just finished that drive from a previous run.
|
# has just finished that drive from a previous run.
|
||||||
if (
|
if (
|
||||||
completion.stage == DriveRetrievalStage.MY_DRIVE_FILES
|
completion.stage == DriveRetrievalStage.MY_DRIVE_FILES
|
||||||
and completion.current_folder_or_drive_id is not None
|
and completion.current_folder_or_drive_id is not None
|
||||||
@@ -492,9 +492,14 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
|
|||||||
if resuming:
|
if resuming:
|
||||||
drive_id = curr_stage.current_folder_or_drive_id
|
drive_id = curr_stage.current_folder_or_drive_id
|
||||||
if drive_id is None:
|
if drive_id is None:
|
||||||
raise ValueError("drive id not set in checkpoint")
|
logger.warning(
|
||||||
resume_start = curr_stage.completed_until
|
f"drive id not set in checkpoint for user {user_email}. "
|
||||||
yield from _yield_from_drive(drive_id, resume_start)
|
"This happens occasionally when the connector is interrupted "
|
||||||
|
"and resumed."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
resume_start = curr_stage.completed_until
|
||||||
|
yield from _yield_from_drive(drive_id, resume_start)
|
||||||
# Don't enter resuming case for folder retrieval
|
# Don't enter resuming case for folder retrieval
|
||||||
resuming = False
|
resuming = False
|
||||||
|
|
||||||
@@ -536,9 +541,14 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
|
|||||||
if resuming:
|
if resuming:
|
||||||
folder_id = curr_stage.current_folder_or_drive_id
|
folder_id = curr_stage.current_folder_or_drive_id
|
||||||
if folder_id is None:
|
if folder_id is None:
|
||||||
raise ValueError("folder id not set in checkpoint")
|
logger.warning(
|
||||||
resume_start = curr_stage.completed_until
|
f"folder id not set in checkpoint for user {user_email}. "
|
||||||
yield from _yield_from_folder_crawl(folder_id, resume_start)
|
"This happens occasionally when the connector is interrupted "
|
||||||
|
"and resumed."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
resume_start = curr_stage.completed_until
|
||||||
|
yield from _yield_from_folder_crawl(folder_id, resume_start)
|
||||||
last_processed_folder = folder_id
|
last_processed_folder = folder_id
|
||||||
|
|
||||||
skipping_seen_folders = last_processed_folder is not None
|
skipping_seen_folders = last_processed_folder is not None
|
||||||
@@ -1040,6 +1050,9 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
|
|||||||
if len(files_batch) < self.batch_size:
|
if len(files_batch) < self.batch_size:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"Yielding batch of {len(files_batch)} files; num seen doc ids: {len(checkpoint.all_retrieved_file_ids)}"
|
||||||
|
)
|
||||||
yield from _yield_batch(files_batch)
|
yield from _yield_batch(files_batch)
|
||||||
files_batch = []
|
files_batch = []
|
||||||
|
|
||||||
|
@@ -108,6 +108,54 @@ def get_file_owners(file: GoogleDriveFileType) -> list[str]:
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _execute_single_retrieval(
|
||||||
|
retrieval_function: Callable,
|
||||||
|
continue_on_404_or_403: bool = False,
|
||||||
|
**request_kwargs: Any,
|
||||||
|
) -> GoogleDriveFileType:
|
||||||
|
"""Execute a single retrieval from Google Drive API"""
|
||||||
|
try:
|
||||||
|
results = retrieval_function(**request_kwargs).execute()
|
||||||
|
except HttpError as e:
|
||||||
|
if e.resp.status >= 500:
|
||||||
|
results = add_retries(
|
||||||
|
lambda: retrieval_function(**request_kwargs).execute()
|
||||||
|
)()
|
||||||
|
elif e.resp.status == 404 or e.resp.status == 403:
|
||||||
|
if continue_on_404_or_403:
|
||||||
|
logger.debug(f"Error executing request: {e}")
|
||||||
|
results = {}
|
||||||
|
else:
|
||||||
|
raise e
|
||||||
|
elif e.resp.status == 429:
|
||||||
|
results = _execute_with_retry(
|
||||||
|
lambda: retrieval_function(**request_kwargs).execute()
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.exception("Error executing request:")
|
||||||
|
raise e
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def execute_single_retrieval(
    retrieval_function: Callable,
    list_key: str | None = None,
    continue_on_404_or_403: bool = False,
    **request_kwargs: Any,
) -> Iterator[GoogleDriveFileType]:
    """Run one non-paginated request and yield its result(s).

    This is a generator, so the request is only sent once iteration starts.
    The response is obtained from ``_execute_single_retrieval``.

    Args:
        retrieval_function: Callable that builds the request object.
        list_key: When truthy, the key in the response whose items are
            yielded one by one (nothing is yielded if the key is absent).
            When falsy, the whole response is yielded as a single item.
        continue_on_404_or_403: Forwarded to ``_execute_single_retrieval``.
        **request_kwargs: Keyword arguments forwarded to
            ``retrieval_function``.

    Yields:
        Either each entry under ``list_key`` or the full response object.
    """
    response = _execute_single_retrieval(
        retrieval_function,
        continue_on_404_or_403,
        **request_kwargs,
    )

    if not list_key:
        # No list to unpack: the response itself is the single result.
        yield response
        return

    yield from response.get(list_key, [])
|
||||||
|
|
||||||
|
|
||||||
def execute_paginated_retrieval(
|
def execute_paginated_retrieval(
|
||||||
retrieval_function: Callable,
|
retrieval_function: Callable,
|
||||||
list_key: str | None = None,
|
list_key: str | None = None,
|
||||||
@@ -119,32 +167,20 @@ def execute_paginated_retrieval(
|
|||||||
retrieval_function: The specific list function to call (e.g., service.files().list)
|
retrieval_function: The specific list function to call (e.g., service.files().list)
|
||||||
**kwargs: Arguments to pass to the list function
|
**kwargs: Arguments to pass to the list function
|
||||||
"""
|
"""
|
||||||
|
if "fields" not in kwargs or "nextPageToken" not in kwargs["fields"]:
|
||||||
|
raise ValueError(
|
||||||
|
"fields must contain nextPageToken for execute_paginated_retrieval"
|
||||||
|
)
|
||||||
next_page_token = kwargs.get(PAGE_TOKEN_KEY, "")
|
next_page_token = kwargs.get(PAGE_TOKEN_KEY, "")
|
||||||
while next_page_token is not None:
|
while next_page_token is not None:
|
||||||
request_kwargs = kwargs.copy()
|
request_kwargs = kwargs.copy()
|
||||||
if next_page_token:
|
if next_page_token:
|
||||||
request_kwargs[PAGE_TOKEN_KEY] = next_page_token
|
request_kwargs[PAGE_TOKEN_KEY] = next_page_token
|
||||||
|
results = _execute_single_retrieval(
|
||||||
try:
|
retrieval_function,
|
||||||
results = retrieval_function(**request_kwargs).execute()
|
continue_on_404_or_403,
|
||||||
except HttpError as e:
|
**request_kwargs,
|
||||||
if e.resp.status >= 500:
|
)
|
||||||
results = add_retries(
|
|
||||||
lambda: retrieval_function(**request_kwargs).execute()
|
|
||||||
)()
|
|
||||||
elif e.resp.status == 404 or e.resp.status == 403:
|
|
||||||
if continue_on_404_or_403:
|
|
||||||
logger.debug(f"Error executing request: {e}")
|
|
||||||
results = {}
|
|
||||||
else:
|
|
||||||
raise e
|
|
||||||
elif e.resp.status == 429:
|
|
||||||
results = _execute_with_retry(
|
|
||||||
lambda: retrieval_function(**request_kwargs).execute()
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.exception("Error executing request:")
|
|
||||||
raise e
|
|
||||||
|
|
||||||
next_page_token = results.get(NEXT_PAGE_TOKEN_KEY)
|
next_page_token = results.get(NEXT_PAGE_TOKEN_KEY)
|
||||||
if list_key:
|
if list_key:
|
||||||
|
@@ -101,7 +101,7 @@ def get_group_map(google_drive_connector: GoogleDriveConnector) -> dict[str, lis
|
|||||||
admin_service.groups().list,
|
admin_service.groups().list,
|
||||||
list_key="groups",
|
list_key="groups",
|
||||||
domain=google_drive_connector.google_domain,
|
domain=google_drive_connector.google_domain,
|
||||||
fields="groups(email)",
|
fields="groups(email),nextPageToken",
|
||||||
):
|
):
|
||||||
# The id is the group email
|
# The id is the group email
|
||||||
group_email = group["email"]
|
group_email = group["email"]
|
||||||
@@ -112,7 +112,7 @@ def get_group_map(google_drive_connector: GoogleDriveConnector) -> dict[str, lis
|
|||||||
admin_service.members().list,
|
admin_service.members().list,
|
||||||
list_key="members",
|
list_key="members",
|
||||||
groupKey=group_email,
|
groupKey=group_email,
|
||||||
fields="members(email)",
|
fields="members(email),nextPageToken",
|
||||||
):
|
):
|
||||||
group_member_emails.append(member["email"])
|
group_member_emails.append(member["email"])
|
||||||
group_map[group_email] = group_member_emails
|
group_map[group_email] = group_member_emails
|
||||||
|
Reference in New Issue
Block a user