From b25668c83a32bf7870265189f428841242cbe899 Mon Sep 17 00:00:00 2001 From: hagen-danswer Date: Sat, 18 Jan 2025 16:08:50 -0800 Subject: [PATCH] fixed group sync to account for changes in drive permissions (#3666) * fixed group sync to account for changes in drive permissions * mypy * addressed * reeeeeeeee --- .../google_drive/doc_sync.py | 5 +- .../google_drive/group_sync.py | 152 +++++++++++++++--- .../onyx/connectors/google_drive/connector.py | 6 +- .../connectors/google_drive/doc_conversion.py | 1 + .../connectors/google_drive/file_retrieval.py | 2 +- 5 files changed, 135 insertions(+), 31 deletions(-) diff --git a/backend/ee/onyx/external_permissions/google_drive/doc_sync.py b/backend/ee/onyx/external_permissions/google_drive/doc_sync.py index 381975219ed9..f5ff08c16912 100644 --- a/backend/ee/onyx/external_permissions/google_drive/doc_sync.py +++ b/backend/ee/onyx/external_permissions/google_drive/doc_sync.py @@ -120,9 +120,12 @@ def _get_permissions_from_slim_doc( elif permission_type == "anyone": public = True + drive_id = permission_info.get("drive_id") + group_ids = group_emails | ({drive_id} if drive_id is not None else set()) + return ExternalAccess( external_user_emails=user_emails, - external_user_group_ids=group_emails, + external_user_group_ids=group_ids, is_public=public, ) diff --git a/backend/ee/onyx/external_permissions/google_drive/group_sync.py b/backend/ee/onyx/external_permissions/google_drive/group_sync.py index 4fc15da00d4f..7d1a27dbe915 100644 --- a/backend/ee/onyx/external_permissions/google_drive/group_sync.py +++ b/backend/ee/onyx/external_permissions/google_drive/group_sync.py @@ -1,16 +1,127 @@ from ee.onyx.db.external_perm import ExternalUserGroup from onyx.connectors.google_drive.connector import GoogleDriveConnector from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval +from onyx.connectors.google_utils.resources import AdminService from onyx.connectors.google_utils.resources import get_admin_service 
+from onyx.connectors.google_utils.resources import get_drive_service
 from onyx.db.models import ConnectorCredentialPair
 from onyx.utils.logger import setup_logger
 
 logger = setup_logger()
 
 
+def _get_drive_members(
+    google_drive_connector: GoogleDriveConnector,
+) -> dict[str, tuple[set[str], set[str]]]:
+    """
+    This builds a map of drive ids to their members (group and user emails).
+    E.g. {
+        "drive_id_1": ({"group_email_1"}, {"user_email_1", "user_email_2"}),
+        "drive_id_2": ({"group_email_3"}, {"user_email_3"}),
+    }
+    """
+    drive_ids = google_drive_connector.get_all_drive_ids()
+
+    drive_id_to_members_map: dict[str, tuple[set[str], set[str]]] = {}
+    drive_service = get_drive_service(
+        google_drive_connector.creds,
+        google_drive_connector.primary_admin_email,
+    )
+
+    for drive_id in drive_ids:
+        group_emails: set[str] = set()
+        user_emails: set[str] = set()
+        for permission in execute_paginated_retrieval(
+            drive_service.permissions().list,
+            list_key="permissions",
+            fileId=drive_id,
+            fields="permissions(emailAddress, type)",
+            supportsAllDrives=True,
+        ):
+            if permission["type"] == "group":
+                group_emails.add(permission["emailAddress"])
+            elif permission["type"] == "user":
+                user_emails.add(permission["emailAddress"])
+        drive_id_to_members_map[drive_id] = (group_emails, user_emails)
+    return drive_id_to_members_map
+
+
+def _get_all_groups(
+    admin_service: AdminService,
+    google_domain: str,
+) -> set[str]:
+    """
+    This gets all the group emails.
+    """
+    group_emails: set[str] = set()
+    for group in execute_paginated_retrieval(
+        admin_service.groups().list,
+        list_key="groups",
+        domain=google_domain,
+        fields="groups(email)",
+    ):
+        group_emails.add(group["email"])
+    return group_emails
+
+
+def _map_group_email_to_member_emails(
+    admin_service: AdminService,
+    group_emails: set[str],
+) -> dict[str, set[str]]:
+    """
+    This maps group emails to their member emails.
+    """
+    group_to_member_map: dict[str, set[str]] = {}
+    for group_email in group_emails:
+        group_member_emails: set[str] = set()
+        for member in execute_paginated_retrieval(
+            admin_service.members().list,
+            list_key="members",
+            groupKey=group_email,
+            fields="members(email)",
+        ):
+            group_member_emails.add(member["email"])
+
+        group_to_member_map[group_email] = group_member_emails
+    return group_to_member_map
+
+
+def _build_onyx_groups(
+    drive_id_to_members_map: dict[str, tuple[set[str], set[str]]],
+    group_email_to_member_emails_map: dict[str, set[str]],
+) -> list[ExternalUserGroup]:
+    """
+    This builds one onyx group per drive id and one per group email.
+    """
+    onyx_groups: list[ExternalUserGroup] = []
+
+    # Drive level access means irrevocable access to all the files in the
+    # drive, so each drive becomes a group: direct users plus group members.
+    for drive_id, (group_emails, user_emails) in drive_id_to_members_map.items():
+        all_member_emails: set[str] = set(user_emails)  # copy, keep input intact
+        for group_email in group_emails:
+            # Groups not listed for the domain have no member entry; skip them
+            if group_email not in group_email_to_member_emails_map:
+                logger.warning(f"Unknown group {group_email} on drive {drive_id}")
+                continue
+            all_member_emails.update(group_email_to_member_emails_map[group_email])
+        onyx_groups.append(
+            ExternalUserGroup(id=drive_id, user_emails=list(all_member_emails))
+        )
+
+    # Convert all group member definitions to onyx groups
+    for group_email, member_emails in group_email_to_member_emails_map.items():
+        onyx_groups.append(
+            ExternalUserGroup(id=group_email, user_emails=list(member_emails))
+        )
+
+    return onyx_groups
+
+
 def gdrive_group_sync(
     cc_pair: ConnectorCredentialPair,
 ) -> list[ExternalUserGroup]:
+    # Initialize connector and build credential/service objects
     google_drive_connector = GoogleDriveConnector(
         **cc_pair.connector.connector_specific_config
     )
@@ -19,34 +130,23 @@ def gdrive_group_sync(
         google_drive_connector.creds, google_drive_connector.primary_admin_email
     )
 
-    onyx_groups: list[ExternalUserGroup] = []
-    for group in execute_paginated_retrieval(
-        admin_service.groups().list,
-        list_key="groups",
-        domain=google_drive_connector.google_domain,
-        fields="groups(email)",
-    ):
- # The id is the group email - group_email = group["email"] + # Get all drive members + drive_id_to_members_map = _get_drive_members(google_drive_connector) - # Gather group member emails - group_member_emails: list[str] = [] - for member in execute_paginated_retrieval( - admin_service.members().list, - list_key="members", - groupKey=group_email, - fields="members(email)", - ): - group_member_emails.append(member["email"]) + # Get all group emails + all_group_emails = _get_all_groups( + admin_service, google_drive_connector.google_domain + ) - if not group_member_emails: - continue + # Map group emails to their members + group_email_to_member_emails_map = _map_group_email_to_member_emails( + admin_service, all_group_emails + ) - onyx_groups.append( - ExternalUserGroup( - id=group_email, - user_emails=list(group_member_emails), - ) - ) + # Convert the maps to onyx groups + onyx_groups = _build_onyx_groups( + drive_id_to_members_map=drive_id_to_members_map, + group_email_to_member_emails_map=group_email_to_member_emails_map, + ) return onyx_groups diff --git a/backend/onyx/connectors/google_drive/connector.py b/backend/onyx/connectors/google_drive/connector.py index 9089a551bcc1..d16007f52abe 100644 --- a/backend/onyx/connectors/google_drive/connector.py +++ b/backend/onyx/connectors/google_drive/connector.py @@ -258,7 +258,7 @@ class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector): user_emails.append(email) return user_emails - def _get_all_drive_ids(self) -> set[str]: + def get_all_drive_ids(self) -> set[str]: primary_drive_service = get_drive_service( creds=self.creds, user_email=self.primary_admin_email, @@ -353,7 +353,7 @@ class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector): ) -> Iterator[GoogleDriveFileType]: all_org_emails: list[str] = self._get_all_user_emails() - all_drive_ids: set[str] = self._get_all_drive_ids() + all_drive_ids: set[str] = self.get_all_drive_ids() drive_ids_to_retrieve: set[str] = set() 
folder_ids_to_retrieve: set[str] = set() @@ -437,7 +437,7 @@ class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector): # If all 3 are true, we already yielded from get_all_files_for_oauth return - all_drive_ids = self._get_all_drive_ids() + all_drive_ids = self.get_all_drive_ids() drive_ids_to_retrieve: set[str] = set() folder_ids_to_retrieve: set[str] = set() if self._requested_shared_drive_ids or self._requested_folder_ids: diff --git a/backend/onyx/connectors/google_drive/doc_conversion.py b/backend/onyx/connectors/google_drive/doc_conversion.py index 440e576e0d32..fc89654a43fd 100644 --- a/backend/onyx/connectors/google_drive/doc_conversion.py +++ b/backend/onyx/connectors/google_drive/doc_conversion.py @@ -252,6 +252,7 @@ def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None: id=file["webViewLink"], perm_sync_data={ "doc_id": file.get("id"), + "drive_id": file.get("driveId"), "permissions": file.get("permissions", []), "permission_ids": file.get("permissionIds", []), "name": file.get("name"), diff --git a/backend/onyx/connectors/google_drive/file_retrieval.py b/backend/onyx/connectors/google_drive/file_retrieval.py index da5a4bf8d070..4e459bd3bdeb 100644 --- a/backend/onyx/connectors/google_drive/file_retrieval.py +++ b/backend/onyx/connectors/google_drive/file_retrieval.py @@ -19,7 +19,7 @@ FILE_FIELDS = ( "shortcutDetails, owners(emailAddress), size)" ) SLIM_FILE_FIELDS = ( - "nextPageToken, files(mimeType, id, name, permissions(emailAddress, type), " + "nextPageToken, files(mimeType, driveId, id, name, permissions(emailAddress, type), " "permissionIds, webViewLink, owners(emailAddress))" ) FOLDER_FIELDS = "nextPageToken, files(id, name, permissions, modifiedTime, webViewLink, shortcutDetails)"