Improved logging and added comments (#2763)

* Improved logging and added comments

* fix exception logging

* cleanup
This commit is contained in:
hagen-danswer
2024-10-10 10:37:27 -07:00
committed by GitHub
parent b212b228fb
commit 101b010c5c
2 changed files with 107 additions and 57 deletions

View File

@@ -24,6 +24,57 @@ logger = setup_logger()
_REQUEST_PAGINATION_LIMIT = 100
def _extract_user_email(subjects: dict[str, Any]) -> str | None:
# If the subject is a user, then return the user's email
user = subjects.get("user", {})
result = user.get("results", [{}])[0]
return result.get("email")
def _extract_group_name(subjects: dict[str, Any]) -> str | None:
# If the subject is a group, then return the group's name
group = subjects.get("group", {})
result = group.get("results", [{}])[0]
return result.get("name")
def _is_public_read_permission(permission: dict[str, Any]) -> bool:
# If the permission is a public read permission, then return True
operation = permission.get("operation", {})
operation_value = operation.get("operation")
anonymous_access = permission.get("anonymousAccess", False)
return operation_value == "read" and anonymous_access
def _extract_read_access_restrictions(
restrictions: dict[str, Any]
) -> tuple[list[str], list[str]]:
"""
WARNING: This function includes no paginated retrieval. So if a page is private
within the space and has over 200 users or over 200 groups with explicitly read
access, this function will leave out some users or groups.
200 is a large amount so it is unlikely, but just be aware.
"""
read_access = restrictions.get("read", {})
read_access_restrictions = read_access.get("restrictions", {})
# Extract the users with read access
read_access_user = read_access_restrictions.get("user", {})
read_access_user_jsons = read_access_user.get("results", [])
read_access_user_emails = [
user["email"] for user in read_access_user_jsons if user.get("email")
]
# Extract the groups with read access
read_access_group = read_access_restrictions.get("group", {})
read_access_group_jsons = read_access_group.get("results", [])
read_access_group_names = [
group["name"] for group in read_access_group_jsons if group.get("name")
]
return read_access_user_emails, read_access_group_names
def _get_space_permissions(
db_session: Session,
confluence_client: Confluence,
@@ -33,27 +84,29 @@ def _get_space_permissions(
confluence_client.get_space_permissions
)
space_permissions = get_space_permissions(space_id).get("permissions", [])
space_permissions_result = get_space_permissions(space_id)
logger.debug(f"space_permissions_result: {space_permissions_result}")
space_permissions = space_permissions_result.get("permissions", [])
user_emails = set()
# Confluence enforces that group names are unique
group_names = set()
is_externally_public = False
for permission in space_permissions:
subs = permission.get("subjects")
if subs:
subjects = permission.get("subjects")
if subjects:
# If there are subjects, then there are explicit users or groups with access
if email := subs.get("user", {}).get("results", [{}])[0].get("email"):
if email := _extract_user_email(subjects):
user_emails.add(email)
if group_name := subs.get("group", {}).get("results", [{}])[0].get("name"):
if group_name := _extract_group_name(subjects):
group_names.add(group_name)
else:
# If there are no subjects, then the permission is for everyone
if permission.get("operation", {}).get(
"operation"
) == "read" and permission.get("anonymousAccess", False):
if _is_public_read_permission(permission):
# If the permission specifies read access for anonymous users, then
# the space is publicly accessible
is_externally_public = True
batch_add_non_web_user_if_not_exists__no_commit(
db_session=db_session, emails=list(user_emails)
)
@@ -64,42 +117,29 @@ def _get_space_permissions(
)
def _get_restrictions_for_page(
def _get_page_specific_restrictions(
db_session: Session,
page: dict[str, Any],
space_permissions: ExternalAccess,
) -> ExternalAccess:
"""
WARNING: This function includes no pagination. So if a page is private within
the space and has over 200 users or over 200 groups with explicitly read access,
this function will leave out some users or groups.
200 is a large amount so it is unlikely, but just be aware.
"""
restrictions_json = page.get("restrictions", {})
read_access_dict = restrictions_json.get("read", {}).get("restrictions", {})
) -> ExternalAccess | None:
user_emails, group_names = _extract_read_access_restrictions(
restrictions=page.get("restrictions", {})
)
read_access_user_jsons = read_access_dict.get("user", {}).get("results", [])
read_access_group_jsons = read_access_dict.get("group", {}).get("results", [])
# If there are no restrictions found, then the page
# inherits the space's restrictions so return None
is_space_public = user_emails == [] and group_names == []
if is_space_public:
return None
is_space_public = read_access_user_jsons == [] and read_access_group_jsons == []
if not is_space_public:
read_access_user_emails = [
user["email"] for user in read_access_user_jsons if user.get("email")
]
read_access_groups = [group["name"] for group in read_access_group_jsons]
batch_add_non_web_user_if_not_exists__no_commit(
db_session=db_session, emails=list(read_access_user_emails)
)
external_access = ExternalAccess(
external_user_emails=set(read_access_user_emails),
external_user_group_ids=set(read_access_groups),
is_public=False,
)
else:
external_access = space_permissions
return external_access
batch_add_non_web_user_if_not_exists__no_commit(
db_session=db_session, emails=list(user_emails)
)
return ExternalAccess(
external_user_emails=set(user_emails),
external_user_group_ids=set(group_names),
# there is no way for a page to be individually public if the space isn't public
is_public=False,
)
def _fetch_attachment_document_ids_for_page_paginated(
@@ -182,14 +222,13 @@ def _fetch_all_page_restrictions_for_space(
db_session: Session,
confluence_client: Confluence,
space_id: str,
space_permissions: ExternalAccess,
) -> dict[str, ExternalAccess]:
) -> dict[str, ExternalAccess | None]:
all_pages = _fetch_all_pages_paginated(
confluence_client=confluence_client,
space_id=space_id,
)
document_restrictions: dict[str, ExternalAccess] = {}
document_restrictions: dict[str, ExternalAccess | None] = {}
for page in all_pages:
"""
This assigns the same permissions to all attachments of a page and
@@ -199,24 +238,32 @@ def _fetch_all_page_restrictions_for_space(
may not be their own standalone documents. This is likely fine as we just upsert a
document with just permissions.
"""
attachment_document_ids = [
document_ids = []
# Add the page's document id
document_ids.append(
build_confluence_document_id(
base_url=confluence_client.url,
content_url=page["_links"]["webui"],
)
]
attachment_document_ids.extend(
)
# Add the page's attachments document ids
document_ids.extend(
_fetch_attachment_document_ids_for_page_paginated(
confluence_client=confluence_client, page=page
)
)
page_permissions = _get_restrictions_for_page(
# Get the page's specific restrictions
page_permissions = _get_page_specific_restrictions(
db_session=db_session,
page=page,
space_permissions=space_permissions,
)
for attachment_document_id in attachment_document_ids:
document_restrictions[attachment_document_id] = page_permissions
# Apply the page's specific restrictions to the page and its attachments
for document_id in document_ids:
document_restrictions[document_id] = page_permissions
return document_restrictions
@@ -243,12 +290,15 @@ def confluence_doc_sync(
db_session=db_session,
confluence_client=confluence_client,
space_id=cc_pair.connector.connector_specific_config["space"],
space_permissions=space_permissions,
)
for doc_id, ext_access in fresh_doc_permissions.items():
for doc_id, page_specific_access in fresh_doc_permissions.items():
# If there are no page specific restrictions, then
# the page inherits the space's restrictions
page_access = page_specific_access or space_permissions
upsert_document_external_perms__no_commit(
db_session=db_session,
doc_id=doc_id,
external_access=ext_access,
external_access=page_access,
source_type=cc_pair.connector.source,
)

View File

@@ -43,8 +43,8 @@ def run_external_group_permission_sync(
# update postgres
db_session.commit()
except Exception as e:
logger.error(f"Error updating document index: {e}")
except Exception:
logger.exception("Error Syncing Group Permissions")
db_session.rollback()
@@ -107,6 +107,6 @@ def run_external_doc_permission_sync(
# update postgres
db_session.commit()
except Exception as e:
logger.error(f"Error Syncing Permissions: {e}")
except Exception:
logger.exception("Error Syncing Document Permissions")
db_session.rollback()