Properly account for anonymous access in Confluence

This commit is contained in:
hagen-danswer 2025-01-05 12:11:04 -08:00 committed by Chris Weaver
parent ddec239fef
commit e100a5e965
5 changed files with 96 additions and 19 deletions

View File

@ -15,6 +15,12 @@ SAML_CONF_DIR = os.environ.get("SAML_CONF_DIR") or "/app/ee/onyx/configs/saml_co
# Falls back to 5 * 60 when the variable is unset or empty
# (presumably seconds, like the doc-sync frequency below -- TODO confirm)
CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY = int(
    os.getenv("CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY") or 5 * 60
)
# Controls how anonymous access in Confluence is interpreted.
# Only the (case-insensitive) value "true" enables it. By default the page is
# NOT made public; instead we add a group that contains all the users that we
# found in Confluence.
CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC = (
    os.getenv("CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC", default="").lower() == "true"
)
# In seconds, default is 5 minutes
CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY = int(
os.environ.get("CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60

View File

@ -0,0 +1,4 @@
# Name of the synthetic group that holds all the users that we found in
# Confluence. Instead of marking a page as public, we add this group to its
# permissions so that the page is only accessible to users who have
# Confluence accounts.
ALL_CONF_EMAILS_GROUP_NAME = "All_Confluence_Users_Found_By_Onyx"

View File

@ -4,6 +4,8 @@ https://confluence.atlassian.com/conf85/check-who-can-view-a-page-1283360557.htm
"""
from typing import Any
from ee.onyx.configs.app_configs import CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC
from ee.onyx.external_permissions.confluence.constants import ALL_CONF_EMAILS_GROUP_NAME
from onyx.access.models import DocExternalAccess
from onyx.access.models import ExternalAccess
from onyx.connectors.confluence.connector import ConfluenceConnector
@ -31,14 +33,32 @@ def _get_server_space_permissions(
permission_category.get("spacePermissions", [])
)
is_public = False
user_names = set()
group_names = set()
for permission in viewspace_permissions:
if user_name := permission.get("userName"):
user_name = permission.get("userName")
if user_name:
user_names.add(user_name)
if group_name := permission.get("groupName"):
group_name = permission.get("groupName")
if group_name:
group_names.add(group_name)
# It seems that if anonymous access is turned on for the site and space,
# then the space is publicly accessible.
# For Confluence Server, we make a group that contains all users that exist
# in Confluence and add that group to the space permissions when anonymous
# access is turned on for the site and space. Alternatively, if the env
# variable CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC is set to true, we set
# is_public = True so that we can support Confluence Server deployments that
# want anonymous access to be public (we can't test this because it's paywalled).
if user_name is None and group_name is None:
# Defaults to False
if CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC:
is_public = True
else:
group_names.add(ALL_CONF_EMAILS_GROUP_NAME)
user_emails = set()
for user_name in user_names:
user_email = get_user_email_from_username__server(confluence_client, user_name)
@ -50,11 +70,7 @@ def _get_server_space_permissions(
return ExternalAccess(
external_user_emails=user_emails,
external_user_group_ids=group_names,
# TODO: Check if the space is publicly accessible
# Currently, we assume the space is not public
# We need to check if anonymous access is turned on for the site and space
# This information is paywalled so it remains unimplemented
is_public=False,
is_public=is_public,
)
@ -134,7 +150,7 @@ def _get_space_permissions(
def _extract_read_access_restrictions(
confluence_client: OnyxConfluence, restrictions: dict[str, Any]
) -> ExternalAccess | None:
) -> tuple[set[str], set[str]]:
"""
Extracts the read-access user emails and group names from a page's
restrictions dict and returns them as a (user_emails, group_names) tuple of sets.
If there are no read restrictions, then both sets are empty.
@ -177,21 +193,57 @@ def _extract_read_access_restrictions(
group["name"] for group in read_access_group_jsons if group.get("name")
]
return set(read_access_user_emails), set(read_access_group_names)
def _get_all_page_restrictions(
    confluence_client: OnyxConfluence,
    perm_sync_data: dict[str, Any],
) -> ExternalAccess | None:
    """
    Computes the effective read restrictions for a page by intersecting the
    page's own restrictions with the restrictions of all of its ancestors.

    A page/ancestor without any read restrictions places no constraint and is
    ignored. The intersection is seeded from the FIRST restricted level found
    (the page itself or an ancestor), so that an unrestricted page underneath
    a restricted ancestor still picks up the ancestor's restrictions instead
    of silently falling back to the space permissions.

    Args:
        confluence_client: client passed through to
            `_extract_read_access_restrictions`.
        perm_sync_data: per-document sync data; the "restrictions" and
            "ancestors" keys are read (both optional).

    Returns:
        None if no restrictions are found anywhere, indicating that the page
        should inherit the space's restrictions. Otherwise an ExternalAccess
        holding the intersected users/groups. The sets may be empty when the
        restrictions of different levels do not overlap, in which case the
        page is locked down rather than opened up to the whole space.
    """
    found_user_emails, found_group_names = _extract_read_access_restrictions(
        confluence_client=confluence_client,
        restrictions=perm_sync_data.get("restrictions") or {},
    )
    # Tracks whether ANY level (page or ancestor) carried restrictions. This is
    # distinct from "the intersection is non-empty".
    has_restrictions = bool(found_user_emails or found_group_names)

    ancestors: list[dict[str, Any]] = perm_sync_data.get("ancestors") or []
    for ancestor in ancestors:
        ancestor_user_emails, ancestor_group_names = _extract_read_access_restrictions(
            confluence_client=confluence_client,
            restrictions=ancestor.get("restrictions") or {},
        )
        if not ancestor_user_emails and not ancestor_group_names:
            # This ancestor has no restrictions, so it has no effect on
            # the page's restrictions, so we ignore it
            continue

        if not has_restrictions:
            # First restricted level: seed the result with it. Intersecting
            # with the (empty) initial sets would always yield nothing.
            found_user_emails = set(ancestor_user_emails)
            found_group_names = set(ancestor_group_names)
            has_restrictions = True
            continue

        # NOTE: users and groups are intersected independently, which is
        # conservative (a user allowed by email on one level and via a group
        # on another is dropped), but never grants extra access.
        found_user_emails.intersection_update(ancestor_user_emails)
        found_group_names.intersection_update(ancestor_group_names)

    # If there are no restrictions found, then the page
    # inherits the space's restrictions so return None
    if not has_restrictions:
        return None

    return ExternalAccess(
        external_user_emails=found_user_emails,
        external_user_group_ids=found_group_names,
        # there is no way for a page to be individually public if the space isn't public
        is_public=False,
    )
def _fetch_all_page_restrictions_for_space(
def _fetch_all_page_restrictions(
confluence_client: OnyxConfluence,
slim_docs: list[SlimDocument],
space_permissions_by_space_key: dict[str, ExternalAccess],
@ -208,11 +260,11 @@ def _fetch_all_page_restrictions_for_space(
raise ValueError(
f"No permission sync data found for document {slim_doc.id}"
)
restrictions = _extract_read_access_restrictions(
if restrictions := _get_all_page_restrictions(
confluence_client=confluence_client,
restrictions=slim_doc.perm_sync_data.get("restrictions", {}),
)
if restrictions:
perm_sync_data=slim_doc.perm_sync_data,
):
document_restrictions.append(
DocExternalAccess(
doc_id=slim_doc.id,
@ -301,7 +353,7 @@ def confluence_doc_sync(
slim_docs.extend(doc_batch)
logger.debug("Fetching all page restrictions for space")
return _fetch_all_page_restrictions_for_space(
return _fetch_all_page_restrictions(
confluence_client=confluence_connector.confluence_client,
slim_docs=slim_docs,
space_permissions_by_space_key=space_permissions_by_space_key,

View File

@ -1,11 +1,11 @@
from ee.onyx.db.external_perm import ExternalUserGroup
from ee.onyx.external_permissions.confluence.constants import ALL_CONF_EMAILS_GROUP_NAME
from onyx.connectors.confluence.onyx_confluence import build_confluence_client
from onyx.connectors.confluence.onyx_confluence import OnyxConfluence
from onyx.connectors.confluence.utils import get_user_email_from_username__server
from onyx.db.models import ConnectorCredentialPair
from onyx.utils.logger import setup_logger
logger = setup_logger()
@ -53,6 +53,7 @@ def confluence_group_sync(
confluence_client=confluence_client,
)
onyx_groups: list[ExternalUserGroup] = []
all_found_emails = set()
for group_id, group_member_emails in group_member_email_map.items():
onyx_groups.append(
ExternalUserGroup(
@ -60,5 +61,15 @@ def confluence_group_sync(
user_emails=list(group_member_emails),
)
)
all_found_emails.update(group_member_emails)
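# This is so that when we find a public Confluence Server page, we can
# give access only to users who have an email in Confluence.
# NOTE(review): the comment above refers to Confluence Server, but the
# condition below checks `is_cloud` -- confirm this is not inverted.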
if cc_pair.connector.connector_specific_config.get("is_cloud", False):
all_found_group = ExternalUserGroup(
id=ALL_CONF_EMAILS_GROUP_NAME,
user_emails=list(all_found_emails),
)
onyx_groups.append(all_found_group)
return onyx_groups

View File

@ -52,6 +52,8 @@ _RESTRICTIONS_EXPANSION_FIELDS = [
"space",
"restrictions.read.restrictions.user",
"restrictions.read.restrictions.group",
"ancestors.restrictions.read.restrictions.user",
"ancestors.restrictions.read.restrictions.group",
]
_SLIM_DOC_BATCH_SIZE = 5000
@ -323,9 +325,11 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
# These will be used by doc_sync.py to sync permissions
page_restrictions = page.get("restrictions")
page_space_key = page.get("space", {}).get("key")
page_ancestors = page.get("ancestors", [])
page_perm_sync_data = {
"restrictions": page_restrictions or {},
"space_key": page_space_key,
"ancestors": page_ancestors or [],
}
doc_metadata_list.append(