diff --git a/backend/ee/onyx/configs/app_configs.py b/backend/ee/onyx/configs/app_configs.py index b675ed4d3..9e97a0366 100644 --- a/backend/ee/onyx/configs/app_configs.py +++ b/backend/ee/onyx/configs/app_configs.py @@ -15,6 +15,12 @@ SAML_CONF_DIR = os.environ.get("SAML_CONF_DIR") or "/app/ee/onyx/configs/saml_co CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY = int( os.environ.get("CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY") or 5 * 60 ) +# This is a boolean that determines if anonymous access is public +# Default behavior is to not make the page public and instead add a group +# that contains all the users that we found in Confluence +CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC = ( + os.environ.get("CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC", "").lower() == "true" +) # In seconds, default is 5 minutes CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY = int( os.environ.get("CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60 diff --git a/backend/ee/onyx/external_permissions/confluence/constants.py b/backend/ee/onyx/external_permissions/confluence/constants.py new file mode 100644 index 000000000..0b6ec6dca --- /dev/null +++ b/backend/ee/onyx/external_permissions/confluence/constants.py @@ -0,0 +1,4 @@ +# This is a group that we use to store all the users that we found in Confluence +# Instead of setting a page to public, we just add this group so that the page +# is only accessible to users who have confluence accounts. +ALL_CONF_EMAILS_GROUP_NAME = "All_Confluence_Users_Found_By_Onyx" diff --git a/backend/ee/onyx/external_permissions/confluence/doc_sync.py b/backend/ee/onyx/external_permissions/confluence/doc_sync.py index 3e2a44480..708be895d 100644 --- a/backend/ee/onyx/external_permissions/confluence/doc_sync.py +++ b/backend/ee/onyx/external_permissions/confluence/doc_sync.py @@ -4,6 +4,8 @@ https://confluence.atlassian.com/conf85/check-who-can-view-a-page-1283360557.htm """ from typing import Any +from ee.onyx.configs.app_configs import CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC +from ee.onyx.external_permissions.confluence.constants import ALL_CONF_EMAILS_GROUP_NAME from onyx.access.models import DocExternalAccess from onyx.access.models import ExternalAccess from onyx.connectors.confluence.connector import ConfluenceConnector @@ -31,14 +33,32 @@ def _get_server_space_permissions( permission_category.get("spacePermissions", []) ) + is_public = False user_names = set() group_names = set() for permission in viewspace_permissions: - if user_name := permission.get("userName"): + user_name = permission.get("userName") + if user_name: user_names.add(user_name) - if group_name := permission.get("groupName"): + group_name = permission.get("groupName") + if group_name: group_names.add(group_name) + # It seems that if anonymous access is turned on for the site and space, + # then the space is publicly accessible. + # For confluence server, we make a group that contains all users + # that exist in confluence and then just add that group to the space permissions + # if anonymous access is turned on for the site and space or we set is_public = True + # if they set the env variable CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC to True so + # that we can support confluence server deployments that want anonymous access + # to be public (we cant test this because its paywalled) + if user_name is None and group_name is None: + # Defaults to False + if CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC: + is_public = True + else: + group_names.add(ALL_CONF_EMAILS_GROUP_NAME) + user_emails = set() for user_name in user_names: user_email = get_user_email_from_username__server(confluence_client, user_name) @@ -50,11 +70,7 @@ def _get_server_space_permissions( return ExternalAccess( external_user_emails=user_emails, external_user_group_ids=group_names, - # TODO: Check if the space is publicly accessible - # Currently, we assume the space is not public - # We need to check if anonymous access is turned on for the site and space - # This information is paywalled so it remains unimplemented - is_public=False, + is_public=is_public, ) @@ -134,7 +150,7 @@ def _get_space_permissions( def _extract_read_access_restrictions( confluence_client: OnyxConfluence, restrictions: dict[str, Any] -) -> ExternalAccess | None: +) -> tuple[set[str], set[str]]: """ Converts a page's restrictions dict into an ExternalAccess object. If there are no restrictions, then return None @@ -177,21 +193,57 @@ def _extract_read_access_restrictions( group["name"] for group in read_access_group_jsons if group.get("name") ] + return set(read_access_user_emails), set(read_access_group_names) + + +def _get_all_page_restrictions( + confluence_client: OnyxConfluence, + perm_sync_data: dict[str, Any], +) -> ExternalAccess | None: + """ + This function gets the restrictions for a page by taking the intersection + of the page's restrictions and the restrictions of all the ancestors + of the page. + If the page/ancestor has no restrictions, then it is ignored (no intersection). + If no restrictions are found anywhere, then return None, indicating that the page + should inherit the space's restrictions. + """ + found_user_emails: set[str] = set() + found_group_names: set[str] = set() + + found_user_emails, found_group_names = _extract_read_access_restrictions( + confluence_client=confluence_client, + restrictions=perm_sync_data.get("restrictions", {}), + ) + + ancestors: list[dict[str, Any]] = perm_sync_data.get("ancestors", []) + for ancestor in ancestors: + ancestor_user_emails, ancestor_group_names = _extract_read_access_restrictions( + confluence_client=confluence_client, + restrictions=ancestor.get("restrictions", {}), + ) + if not ancestor_user_emails and not ancestor_group_names: + # This ancestor has no restrictions, so it has no effect on + # the page's restrictions, so we ignore it + continue + + found_user_emails.intersection_update(ancestor_user_emails) + found_group_names.intersection_update(ancestor_group_names) + # If there are no restrictions found, then the page # inherits the space's restrictions so return None - is_space_public = read_access_user_emails == [] and read_access_group_names == [] - if is_space_public: + if not found_user_emails and not found_group_names: return None return ExternalAccess( - external_user_emails=set(read_access_user_emails), - external_user_group_ids=set(read_access_group_names), + external_user_emails=found_user_emails, + external_user_group_ids=found_group_names, # there is no way for a page to be individually public if the space isn't public is_public=False, ) -def _fetch_all_page_restrictions_for_space( +def _fetch_all_page_restrictions( confluence_client: OnyxConfluence, slim_docs: list[SlimDocument], space_permissions_by_space_key: dict[str, ExternalAccess], @@ -208,11 +260,11 @@ def _fetch_all_page_restrictions_for_space( raise ValueError( f"No permission sync data found for document {slim_doc.id}" ) - restrictions = _extract_read_access_restrictions( + + if restrictions := _get_all_page_restrictions( confluence_client=confluence_client, - restrictions=slim_doc.perm_sync_data.get("restrictions", {}), - ) - if restrictions: + perm_sync_data=slim_doc.perm_sync_data, + ): document_restrictions.append( DocExternalAccess( doc_id=slim_doc.id, @@ -301,7 +353,7 @@ def confluence_doc_sync( slim_docs.extend(doc_batch) logger.debug("Fetching all page restrictions for space") - return _fetch_all_page_restrictions_for_space( + return _fetch_all_page_restrictions( confluence_client=confluence_connector.confluence_client, slim_docs=slim_docs, space_permissions_by_space_key=space_permissions_by_space_key, diff --git a/backend/ee/onyx/external_permissions/confluence/group_sync.py b/backend/ee/onyx/external_permissions/confluence/group_sync.py index e81c84eb2..c43783340 100644 --- a/backend/ee/onyx/external_permissions/confluence/group_sync.py +++ b/backend/ee/onyx/external_permissions/confluence/group_sync.py @@ -1,11 +1,11 @@ from ee.onyx.db.external_perm import ExternalUserGroup +from ee.onyx.external_permissions.confluence.constants import ALL_CONF_EMAILS_GROUP_NAME from onyx.connectors.confluence.onyx_confluence import build_confluence_client from onyx.connectors.confluence.onyx_confluence import OnyxConfluence from onyx.connectors.confluence.utils import get_user_email_from_username__server from onyx.db.models import ConnectorCredentialPair from onyx.utils.logger import setup_logger - logger = setup_logger() @@ -53,6 +53,7 @@ def confluence_group_sync( confluence_client=confluence_client, ) onyx_groups: list[ExternalUserGroup] = [] + all_found_emails = set() for group_id, group_member_emails in group_member_email_map.items(): onyx_groups.append( ExternalUserGroup( @@ -60,5 +61,15 @@ def confluence_group_sync( user_emails=list(group_member_emails), ) ) + all_found_emails.update(group_member_emails) + + # This is so that when we find a public confleunce server page, we can + # give access to all users only in if they have an email in Confluence + if cc_pair.connector.connector_specific_config.get("is_cloud", False): + all_found_group = ExternalUserGroup( + id=ALL_CONF_EMAILS_GROUP_NAME, + user_emails=list(all_found_emails), + ) + onyx_groups.append(all_found_group) return onyx_groups diff --git a/backend/onyx/connectors/confluence/connector.py b/backend/onyx/connectors/confluence/connector.py index 422521f0d..a067dc4a2 100644 --- a/backend/onyx/connectors/confluence/connector.py +++ b/backend/onyx/connectors/confluence/connector.py @@ -52,6 +52,8 @@ _RESTRICTIONS_EXPANSION_FIELDS = [ "space", "restrictions.read.restrictions.user", "restrictions.read.restrictions.group", + "ancestors.restrictions.read.restrictions.user", + "ancestors.restrictions.read.restrictions.group", ] _SLIM_DOC_BATCH_SIZE = 5000 @@ -323,9 +325,11 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector): # These will be used by doc_sync.py to sync permissions page_restrictions = page.get("restrictions") page_space_key = page.get("space", {}).get("key") + page_ancestors = page.get("ancestors", []) page_perm_sync_data = { "restrictions": page_restrictions or {}, "space_key": page_space_key, + "ancestors": page_ancestors or [], } doc_metadata_list.append(