From 101b010c5cf2287966e81e1821eb1787e51ebe90 Mon Sep 17 00:00:00 2001 From: hagen-danswer Date: Thu, 10 Oct 2024 10:37:27 -0700 Subject: [PATCH] Improved logging and added comments (#2763) * Improved logging and added comments * fix exception logging * cleanup --- .../confluence/doc_sync.py | 156 ++++++++++++------ .../external_permissions/permission_sync.py | 8 +- 2 files changed, 107 insertions(+), 57 deletions(-) diff --git a/backend/ee/danswer/external_permissions/confluence/doc_sync.py b/backend/ee/danswer/external_permissions/confluence/doc_sync.py index b6812adb9e7a..e87dcf1d79f3 100644 --- a/backend/ee/danswer/external_permissions/confluence/doc_sync.py +++ b/backend/ee/danswer/external_permissions/confluence/doc_sync.py @@ -24,6 +24,57 @@ logger = setup_logger() _REQUEST_PAGINATION_LIMIT = 100 +def _extract_user_email(subjects: dict[str, Any]) -> str | None: + # If the subject is a user, then return the user's email + user = subjects.get("user", {}) + result = user.get("results", [{}])[0] + return result.get("email") + + +def _extract_group_name(subjects: dict[str, Any]) -> str | None: + # If the subject is a group, then return the group's name + group = subjects.get("group", {}) + result = group.get("results", [{}])[0] + return result.get("name") + + +def _is_public_read_permission(permission: dict[str, Any]) -> bool: + # If the permission is a public read permission, then return True + operation = permission.get("operation", {}) + operation_value = operation.get("operation") + anonymous_access = permission.get("anonymousAccess", False) + return operation_value == "read" and anonymous_access + + +def _extract_read_access_restrictions( + restrictions: dict[str, Any] +) -> tuple[list[str], list[str]]: + """ + WARNING: This function includes no paginated retrieval. So if a page is private + within the space and has over 200 users or over 200 groups with explicitly read + access, this function will leave out some users or groups. + 200 is a large amount so it is unlikely, but just be aware. + """ + read_access = restrictions.get("read", {}) + read_access_restrictions = read_access.get("restrictions", {}) + + # Extract the users with read access + read_access_user = read_access_restrictions.get("user", {}) + read_access_user_jsons = read_access_user.get("results", []) + read_access_user_emails = [ + user["email"] for user in read_access_user_jsons if user.get("email") + ] + + # Extract the groups with read access + read_access_group = read_access_restrictions.get("group", {}) + read_access_group_jsons = read_access_group.get("results", []) + read_access_group_names = [ + group["name"] for group in read_access_group_jsons if group.get("name") + ] + + return read_access_user_emails, read_access_group_names + + def _get_space_permissions( db_session: Session, confluence_client: Confluence, @@ -33,27 +84,29 @@ def _get_space_permissions( confluence_client.get_space_permissions ) - space_permissions = get_space_permissions(space_id).get("permissions", []) + space_permissions_result = get_space_permissions(space_id) + logger.debug(f"space_permissions_result: {space_permissions_result}") + + space_permissions = space_permissions_result.get("permissions", []) user_emails = set() # Confluence enforces that group names are unique group_names = set() is_externally_public = False for permission in space_permissions: - subs = permission.get("subjects") - if subs: + subjects = permission.get("subjects") + if subjects: # If there are subjects, then there are explicit users or groups with access - if email := subs.get("user", {}).get("results", [{}])[0].get("email"): + if email := _extract_user_email(subjects): user_emails.add(email) - if group_name := subs.get("group", {}).get("results", [{}])[0].get("name"): + if group_name := _extract_group_name(subjects): group_names.add(group_name) else: # If there are no subjects, then the permission is for everyone - if permission.get("operation", {}).get( - "operation" - ) == "read" and permission.get("anonymousAccess", False): + if _is_public_read_permission(permission): # If the permission specifies read access for anonymous users, then # the space is publicly accessible is_externally_public = True + batch_add_non_web_user_if_not_exists__no_commit( db_session=db_session, emails=list(user_emails) ) @@ -64,42 +117,29 @@ def _get_space_permissions( ) -def _get_restrictions_for_page( +def _get_page_specific_restrictions( db_session: Session, page: dict[str, Any], - space_permissions: ExternalAccess, -) -> ExternalAccess: - """ - WARNING: This function includes no pagination. So if a page is private within - the space and has over 200 users or over 200 groups with explicitly read access, - this function will leave out some users or groups. - 200 is a large amount so it is unlikely, but just be aware. - """ - restrictions_json = page.get("restrictions", {}) - read_access_dict = restrictions_json.get("read", {}).get("restrictions", {}) +) -> ExternalAccess | None: + user_emails, group_names = _extract_read_access_restrictions( + restrictions=page.get("restrictions", {}) + ) - read_access_user_jsons = read_access_dict.get("user", {}).get("results", []) - read_access_group_jsons = read_access_dict.get("group", {}).get("results", []) + # If there are no restrictions found, then the page + # inherits the space's restrictions so return None + is_space_public = user_emails == [] and group_names == [] + if is_space_public: + return None - is_space_public = read_access_user_jsons == [] and read_access_group_jsons == [] - - if not is_space_public: - read_access_user_emails = [ - user["email"] for user in read_access_user_jsons if user.get("email") - ] - read_access_groups = [group["name"] for group in read_access_group_jsons] - batch_add_non_web_user_if_not_exists__no_commit( - db_session=db_session, emails=list(read_access_user_emails) - ) - external_access = ExternalAccess( - external_user_emails=set(read_access_user_emails), - external_user_group_ids=set(read_access_groups), - is_public=False, - ) - else: - external_access = space_permissions - - return external_access + batch_add_non_web_user_if_not_exists__no_commit( + db_session=db_session, emails=list(user_emails) + ) + return ExternalAccess( + external_user_emails=set(user_emails), + external_user_group_ids=set(group_names), + # there is no way for a page to be individually public if the space isn't public + is_public=False, + ) def _fetch_attachment_document_ids_for_page_paginated( @@ -182,14 +222,13 @@ def _fetch_all_page_restrictions_for_space( db_session: Session, confluence_client: Confluence, space_id: str, - space_permissions: ExternalAccess, -) -> dict[str, ExternalAccess]: +) -> dict[str, ExternalAccess | None]: all_pages = _fetch_all_pages_paginated( confluence_client=confluence_client, space_id=space_id, ) - document_restrictions: dict[str, ExternalAccess] = {} + document_restrictions: dict[str, ExternalAccess | None] = {} for page in all_pages: """ This assigns the same permissions to all attachments of a page and @@ -199,24 +238,32 @@ def _fetch_all_page_restrictions_for_space( may not be their own standalone documents. This is likely fine as we just upsert a document with just permissions. """ - attachment_document_ids = [ + document_ids = [] + + # Add the page's document id + document_ids.append( build_confluence_document_id( base_url=confluence_client.url, content_url=page["_links"]["webui"], ) - ] - attachment_document_ids.extend( + ) + + # Add the page's attachments document ids + document_ids.extend( _fetch_attachment_document_ids_for_page_paginated( confluence_client=confluence_client, page=page ) ) - page_permissions = _get_restrictions_for_page( + + # Get the page's specific restrictions + page_permissions = _get_page_specific_restrictions( db_session=db_session, page=page, - space_permissions=space_permissions, ) - for attachment_document_id in attachment_document_ids: - document_restrictions[attachment_document_id] = page_permissions + + # Apply the page's specific restrictions to the page and its attachments + for document_id in document_ids: + document_restrictions[document_id] = page_permissions return document_restrictions @@ -243,12 +290,15 @@ def confluence_doc_sync( db_session=db_session, confluence_client=confluence_client, space_id=cc_pair.connector.connector_specific_config["space"], - space_permissions=space_permissions, ) - for doc_id, ext_access in fresh_doc_permissions.items(): + for doc_id, page_specific_access in fresh_doc_permissions.items(): + # If there are no page specific restrictions, then + # the page inherits the space's restrictions + page_access = page_specific_access or space_permissions + upsert_document_external_perms__no_commit( db_session=db_session, doc_id=doc_id, - external_access=ext_access, + external_access=page_access, source_type=cc_pair.connector.source, ) diff --git a/backend/ee/danswer/external_permissions/permission_sync.py b/backend/ee/danswer/external_permissions/permission_sync.py index 0f07411da52e..3f5e66f875c4 100644 --- a/backend/ee/danswer/external_permissions/permission_sync.py +++ b/backend/ee/danswer/external_permissions/permission_sync.py @@ -43,8 +43,8 @@ def run_external_group_permission_sync( # update postgres db_session.commit() - except Exception as e: - logger.error(f"Error updating document index: {e}") + except Exception: + logger.exception("Error Syncing Group Permissions") db_session.rollback() @@ -107,6 +107,6 @@ def run_external_doc_permission_sync( # update postgres db_session.commit() - except Exception as e: - logger.error(f"Error Syncing Permissions: {e}") + except Exception: + logger.exception("Error Syncing Document Permissions") db_session.rollback()