mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-10-10 13:15:18 +02:00
full drive perm sync
This commit is contained in:
@@ -619,8 +619,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
|
|||||||
|
|
||||||
if checkpoint.completion_stage == DriveRetrievalStage.USER_EMAILS:
|
if checkpoint.completion_stage == DriveRetrievalStage.USER_EMAILS:
|
||||||
all_org_emails: list[str] = self._get_all_user_emails()
|
all_org_emails: list[str] = self._get_all_user_emails()
|
||||||
if not is_slim:
|
checkpoint.user_emails = all_org_emails
|
||||||
checkpoint.user_emails = all_org_emails
|
|
||||||
checkpoint.completion_stage = DriveRetrievalStage.DRIVE_IDS
|
checkpoint.completion_stage = DriveRetrievalStage.DRIVE_IDS
|
||||||
else:
|
else:
|
||||||
if checkpoint.user_emails is None:
|
if checkpoint.user_emails is None:
|
||||||
@@ -730,9 +729,8 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
|
|||||||
elif self.include_shared_drives:
|
elif self.include_shared_drives:
|
||||||
sorted_drive_ids = sorted(all_drive_ids)
|
sorted_drive_ids = sorted(all_drive_ids)
|
||||||
|
|
||||||
if not is_slim:
|
checkpoint.drive_ids_to_retrieve = sorted_drive_ids
|
||||||
checkpoint.drive_ids_to_retrieve = sorted_drive_ids
|
checkpoint.folder_ids_to_retrieve = sorted_folder_ids
|
||||||
checkpoint.folder_ids_to_retrieve = sorted_folder_ids
|
|
||||||
checkpoint.completion_stage = next_stage
|
checkpoint.completion_stage = next_stage
|
||||||
else:
|
else:
|
||||||
if checkpoint.drive_ids_to_retrieve is None:
|
if checkpoint.drive_ids_to_retrieve is None:
|
||||||
@@ -908,9 +906,6 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
|
|||||||
start=start,
|
start=start,
|
||||||
end=end,
|
end=end,
|
||||||
)
|
)
|
||||||
if is_slim:
|
|
||||||
yield from drive_files
|
|
||||||
return
|
|
||||||
|
|
||||||
for file in drive_files:
|
for file in drive_files:
|
||||||
logger.debug(
|
logger.debug(
|
||||||
@@ -1146,13 +1141,14 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
|
|||||||
|
|
||||||
def _extract_slim_docs_from_google_drive(
|
def _extract_slim_docs_from_google_drive(
|
||||||
self,
|
self,
|
||||||
|
checkpoint: GoogleDriveCheckpoint,
|
||||||
start: SecondsSinceUnixEpoch | None = None,
|
start: SecondsSinceUnixEpoch | None = None,
|
||||||
end: SecondsSinceUnixEpoch | None = None,
|
end: SecondsSinceUnixEpoch | None = None,
|
||||||
callback: IndexingHeartbeatInterface | None = None,
|
callback: IndexingHeartbeatInterface | None = None,
|
||||||
) -> GenerateSlimDocumentOutput:
|
) -> GenerateSlimDocumentOutput:
|
||||||
slim_batch = []
|
slim_batch = []
|
||||||
for file in self._fetch_drive_items(
|
for file in self._fetch_drive_items(
|
||||||
checkpoint=self.build_dummy_checkpoint(),
|
checkpoint=checkpoint,
|
||||||
is_slim=True,
|
is_slim=True,
|
||||||
start=start,
|
start=start,
|
||||||
end=end,
|
end=end,
|
||||||
@@ -1179,9 +1175,15 @@ class GoogleDriveConnector(SlimConnector, CheckpointedConnector[GoogleDriveCheck
|
|||||||
callback: IndexingHeartbeatInterface | None = None,
|
callback: IndexingHeartbeatInterface | None = None,
|
||||||
) -> GenerateSlimDocumentOutput:
|
) -> GenerateSlimDocumentOutput:
|
||||||
try:
|
try:
|
||||||
yield from self._extract_slim_docs_from_google_drive(
|
checkpoint = self.build_dummy_checkpoint()
|
||||||
start, end, callback=callback
|
while checkpoint.completion_stage != DriveRetrievalStage.DONE:
|
||||||
)
|
yield from self._extract_slim_docs_from_google_drive(
|
||||||
|
checkpoint=checkpoint,
|
||||||
|
start=start,
|
||||||
|
end=end,
|
||||||
|
callback=callback,
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if MISSING_SCOPES_ERROR_STR in str(e):
|
if MISSING_SCOPES_ERROR_STR in str(e):
|
||||||
raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e
|
raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e
|
||||||
|
@@ -30,7 +30,7 @@ FILE_FIELDS = (
|
|||||||
)
|
)
|
||||||
SLIM_FILE_FIELDS = (
|
SLIM_FILE_FIELDS = (
|
||||||
f"nextPageToken, files(mimeType, driveId, id, name, {PERMISSION_FULL_DESCRIPTION}, "
|
f"nextPageToken, files(mimeType, driveId, id, name, {PERMISSION_FULL_DESCRIPTION}, "
|
||||||
"permissionIds, webViewLink, owners(emailAddress))"
|
"permissionIds, webViewLink, owners(emailAddress), modifiedTime)"
|
||||||
)
|
)
|
||||||
FOLDER_FIELDS = "nextPageToken, files(id, name, permissions, modifiedTime, webViewLink, shortcutDetails)"
|
FOLDER_FIELDS = "nextPageToken, files(id, name, permissions, modifiedTime, webViewLink, shortcutDetails)"
|
||||||
|
|
||||||
|
@@ -154,4 +154,21 @@ def test_gdrive_perm_sync_with_real_data(
|
|||||||
f"but is accessible to {emails_with_access}. Raw result: {doc_to_raw_result_mapping[doc_id]} "
|
f"but is accessible to {emails_with_access}. Raw result: {doc_to_raw_result_mapping[doc_id]} "
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Verify that we checked every file in ACCESS_MAPPING
|
||||||
|
all_expected_files = set()
|
||||||
|
for file_ids in ACCESS_MAPPING.values():
|
||||||
|
all_expected_files.update(file_ids)
|
||||||
|
|
||||||
|
checked_file_ids = {
|
||||||
|
url_to_id_mapping[doc_id]
|
||||||
|
for doc_id in doc_to_email_mapping
|
||||||
|
if doc_id in url_to_id_mapping
|
||||||
|
}
|
||||||
|
|
||||||
|
assert all_expected_files == checked_file_ids, (
|
||||||
|
f"Not all expected files were checked. "
|
||||||
|
f"Missing files: {all_expected_files - checked_file_ids}, "
|
||||||
|
f"Extra files checked: {checked_file_ids - all_expected_files}"
|
||||||
|
)
|
||||||
|
|
||||||
print(f"Checked permissions for {checked_files} files from drive_id_mapping.json")
|
print(f"Checked permissions for {checked_files} files from drive_id_mapping.json")
|
||||||
|
Reference in New Issue
Block a user