mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-20 04:37:09 +02:00
Improved logging and added comments (#2763)
* Improved logging and added comments * fix exception logging * cleanup
This commit is contained in:
@@ -24,6 +24,57 @@ logger = setup_logger()
|
||||
_REQUEST_PAGINATION_LIMIT = 100
|
||||
|
||||
|
||||
def _extract_user_email(subjects: dict[str, Any]) -> str | None:
|
||||
# If the subject is a user, then return the user's email
|
||||
user = subjects.get("user", {})
|
||||
result = user.get("results", [{}])[0]
|
||||
return result.get("email")
|
||||
|
||||
|
||||
def _extract_group_name(subjects: dict[str, Any]) -> str | None:
|
||||
# If the subject is a group, then return the group's name
|
||||
group = subjects.get("group", {})
|
||||
result = group.get("results", [{}])[0]
|
||||
return result.get("name")
|
||||
|
||||
|
||||
def _is_public_read_permission(permission: dict[str, Any]) -> bool:
|
||||
# If the permission is a public read permission, then return True
|
||||
operation = permission.get("operation", {})
|
||||
operation_value = operation.get("operation")
|
||||
anonymous_access = permission.get("anonymousAccess", False)
|
||||
return operation_value == "read" and anonymous_access
|
||||
|
||||
|
||||
def _extract_read_access_restrictions(
|
||||
restrictions: dict[str, Any]
|
||||
) -> tuple[list[str], list[str]]:
|
||||
"""
|
||||
WARNING: This function includes no paginated retrieval. So if a page is private
|
||||
within the space and has over 200 users or over 200 groups with explicitly read
|
||||
access, this function will leave out some users or groups.
|
||||
200 is a large amount so it is unlikely, but just be aware.
|
||||
"""
|
||||
read_access = restrictions.get("read", {})
|
||||
read_access_restrictions = read_access.get("restrictions", {})
|
||||
|
||||
# Extract the users with read access
|
||||
read_access_user = read_access_restrictions.get("user", {})
|
||||
read_access_user_jsons = read_access_user.get("results", [])
|
||||
read_access_user_emails = [
|
||||
user["email"] for user in read_access_user_jsons if user.get("email")
|
||||
]
|
||||
|
||||
# Extract the groups with read access
|
||||
read_access_group = read_access_restrictions.get("group", {})
|
||||
read_access_group_jsons = read_access_group.get("results", [])
|
||||
read_access_group_names = [
|
||||
group["name"] for group in read_access_group_jsons if group.get("name")
|
||||
]
|
||||
|
||||
return read_access_user_emails, read_access_group_names
|
||||
|
||||
|
||||
def _get_space_permissions(
|
||||
db_session: Session,
|
||||
confluence_client: Confluence,
|
||||
@@ -33,27 +84,29 @@ def _get_space_permissions(
|
||||
confluence_client.get_space_permissions
|
||||
)
|
||||
|
||||
space_permissions = get_space_permissions(space_id).get("permissions", [])
|
||||
space_permissions_result = get_space_permissions(space_id)
|
||||
logger.debug(f"space_permissions_result: {space_permissions_result}")
|
||||
|
||||
space_permissions = space_permissions_result.get("permissions", [])
|
||||
user_emails = set()
|
||||
# Confluence enforces that group names are unique
|
||||
group_names = set()
|
||||
is_externally_public = False
|
||||
for permission in space_permissions:
|
||||
subs = permission.get("subjects")
|
||||
if subs:
|
||||
subjects = permission.get("subjects")
|
||||
if subjects:
|
||||
# If there are subjects, then there are explicit users or groups with access
|
||||
if email := subs.get("user", {}).get("results", [{}])[0].get("email"):
|
||||
if email := _extract_user_email(subjects):
|
||||
user_emails.add(email)
|
||||
if group_name := subs.get("group", {}).get("results", [{}])[0].get("name"):
|
||||
if group_name := _extract_group_name(subjects):
|
||||
group_names.add(group_name)
|
||||
else:
|
||||
# If there are no subjects, then the permission is for everyone
|
||||
if permission.get("operation", {}).get(
|
||||
"operation"
|
||||
) == "read" and permission.get("anonymousAccess", False):
|
||||
if _is_public_read_permission(permission):
|
||||
# If the permission specifies read access for anonymous users, then
|
||||
# the space is publicly accessible
|
||||
is_externally_public = True
|
||||
|
||||
batch_add_non_web_user_if_not_exists__no_commit(
|
||||
db_session=db_session, emails=list(user_emails)
|
||||
)
|
||||
@@ -64,42 +117,29 @@ def _get_space_permissions(
|
||||
)
|
||||
|
||||
|
||||
def _get_restrictions_for_page(
|
||||
def _get_page_specific_restrictions(
|
||||
db_session: Session,
|
||||
page: dict[str, Any],
|
||||
space_permissions: ExternalAccess,
|
||||
) -> ExternalAccess:
|
||||
"""
|
||||
WARNING: This function includes no pagination. So if a page is private within
|
||||
the space and has over 200 users or over 200 groups with explicitly read access,
|
||||
this function will leave out some users or groups.
|
||||
200 is a large amount so it is unlikely, but just be aware.
|
||||
"""
|
||||
restrictions_json = page.get("restrictions", {})
|
||||
read_access_dict = restrictions_json.get("read", {}).get("restrictions", {})
|
||||
) -> ExternalAccess | None:
|
||||
user_emails, group_names = _extract_read_access_restrictions(
|
||||
restrictions=page.get("restrictions", {})
|
||||
)
|
||||
|
||||
read_access_user_jsons = read_access_dict.get("user", {}).get("results", [])
|
||||
read_access_group_jsons = read_access_dict.get("group", {}).get("results", [])
|
||||
# If there are no restrictions found, then the page
|
||||
# inherits the space's restrictions so return None
|
||||
is_space_public = user_emails == [] and group_names == []
|
||||
if is_space_public:
|
||||
return None
|
||||
|
||||
is_space_public = read_access_user_jsons == [] and read_access_group_jsons == []
|
||||
|
||||
if not is_space_public:
|
||||
read_access_user_emails = [
|
||||
user["email"] for user in read_access_user_jsons if user.get("email")
|
||||
]
|
||||
read_access_groups = [group["name"] for group in read_access_group_jsons]
|
||||
batch_add_non_web_user_if_not_exists__no_commit(
|
||||
db_session=db_session, emails=list(read_access_user_emails)
|
||||
)
|
||||
external_access = ExternalAccess(
|
||||
external_user_emails=set(read_access_user_emails),
|
||||
external_user_group_ids=set(read_access_groups),
|
||||
is_public=False,
|
||||
)
|
||||
else:
|
||||
external_access = space_permissions
|
||||
|
||||
return external_access
|
||||
batch_add_non_web_user_if_not_exists__no_commit(
|
||||
db_session=db_session, emails=list(user_emails)
|
||||
)
|
||||
return ExternalAccess(
|
||||
external_user_emails=set(user_emails),
|
||||
external_user_group_ids=set(group_names),
|
||||
# there is no way for a page to be individually public if the space isn't public
|
||||
is_public=False,
|
||||
)
|
||||
|
||||
|
||||
def _fetch_attachment_document_ids_for_page_paginated(
|
||||
@@ -182,14 +222,13 @@ def _fetch_all_page_restrictions_for_space(
|
||||
db_session: Session,
|
||||
confluence_client: Confluence,
|
||||
space_id: str,
|
||||
space_permissions: ExternalAccess,
|
||||
) -> dict[str, ExternalAccess]:
|
||||
) -> dict[str, ExternalAccess | None]:
|
||||
all_pages = _fetch_all_pages_paginated(
|
||||
confluence_client=confluence_client,
|
||||
space_id=space_id,
|
||||
)
|
||||
|
||||
document_restrictions: dict[str, ExternalAccess] = {}
|
||||
document_restrictions: dict[str, ExternalAccess | None] = {}
|
||||
for page in all_pages:
|
||||
"""
|
||||
This assigns the same permissions to all attachments of a page and
|
||||
@@ -199,24 +238,32 @@ def _fetch_all_page_restrictions_for_space(
|
||||
may not be their own standalone documents. This is likely fine as we just upsert a
|
||||
document with just permissions.
|
||||
"""
|
||||
attachment_document_ids = [
|
||||
document_ids = []
|
||||
|
||||
# Add the page's document id
|
||||
document_ids.append(
|
||||
build_confluence_document_id(
|
||||
base_url=confluence_client.url,
|
||||
content_url=page["_links"]["webui"],
|
||||
)
|
||||
]
|
||||
attachment_document_ids.extend(
|
||||
)
|
||||
|
||||
# Add the page's attachments document ids
|
||||
document_ids.extend(
|
||||
_fetch_attachment_document_ids_for_page_paginated(
|
||||
confluence_client=confluence_client, page=page
|
||||
)
|
||||
)
|
||||
page_permissions = _get_restrictions_for_page(
|
||||
|
||||
# Get the page's specific restrictions
|
||||
page_permissions = _get_page_specific_restrictions(
|
||||
db_session=db_session,
|
||||
page=page,
|
||||
space_permissions=space_permissions,
|
||||
)
|
||||
for attachment_document_id in attachment_document_ids:
|
||||
document_restrictions[attachment_document_id] = page_permissions
|
||||
|
||||
# Apply the page's specific restrictions to the page and its attachments
|
||||
for document_id in document_ids:
|
||||
document_restrictions[document_id] = page_permissions
|
||||
|
||||
return document_restrictions
|
||||
|
||||
@@ -243,12 +290,15 @@ def confluence_doc_sync(
|
||||
db_session=db_session,
|
||||
confluence_client=confluence_client,
|
||||
space_id=cc_pair.connector.connector_specific_config["space"],
|
||||
space_permissions=space_permissions,
|
||||
)
|
||||
for doc_id, ext_access in fresh_doc_permissions.items():
|
||||
for doc_id, page_specific_access in fresh_doc_permissions.items():
|
||||
# If there are no page specific restrictions, then
|
||||
# the page inherits the space's restrictions
|
||||
page_access = page_specific_access or space_permissions
|
||||
|
||||
upsert_document_external_perms__no_commit(
|
||||
db_session=db_session,
|
||||
doc_id=doc_id,
|
||||
external_access=ext_access,
|
||||
external_access=page_access,
|
||||
source_type=cc_pair.connector.source,
|
||||
)
|
||||
|
@@ -43,8 +43,8 @@ def run_external_group_permission_sync(
|
||||
|
||||
# update postgres
|
||||
db_session.commit()
|
||||
except Exception as e:
|
||||
logger.error(f"Error updating document index: {e}")
|
||||
except Exception:
|
||||
logger.exception("Error Syncing Group Permissions")
|
||||
db_session.rollback()
|
||||
|
||||
|
||||
@@ -107,6 +107,6 @@ def run_external_doc_permission_sync(
|
||||
|
||||
# update postgres
|
||||
db_session.commit()
|
||||
except Exception as e:
|
||||
logger.error(f"Error Syncing Permissions: {e}")
|
||||
except Exception:
|
||||
logger.exception("Error Syncing Document Permissions")
|
||||
db_session.rollback()
|
||||
|
Reference in New Issue
Block a user