mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-10 17:50:50 +02:00
* first cut at slack oauth flow * fix usage of hooks * fix button spacing * add additional error logging * no dev redirect * early cut at google drive oauth * second pass * switch to production uri's * try handling oauth_interactive differently * pass through client id and secret if uploaded * fix call * fix test * temporarily disable check for testing * Revert "temporarily disable check for testing" This reverts commit 4b5a022a5fe38b05355a561616068af8e969def2. * support visibility in test * missed file * first cut at confluence oauth * work in progress * work in progress * work in progress * work in progress * work in progress * first cut at distributed locking * WIP to make test work * add some dev mode affordances and gate usage of redis behind dynamic credentials * mypy and credentials provider fixes * WIP * fix created at * fix setting initialValue on everything * remove debugging, fix ??? some TextFormField issues * npm fixes * comment cleanup * fix comments * pin the size of the card section * more review fixes * more fixes --------- Co-authored-by: Richard Kuo <rkuo@rkuo.com> Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
397 lines
15 KiB
Python
397 lines
15 KiB
Python
"""
|
|
Rules defined here:
|
|
https://confluence.atlassian.com/conf85/check-who-can-view-a-page-1283360557.html
|
|
"""
|
|
from typing import Any
|
|
|
|
from ee.onyx.configs.app_configs import CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC
|
|
from ee.onyx.external_permissions.confluence.constants import ALL_CONF_EMAILS_GROUP_NAME
|
|
from onyx.access.models import DocExternalAccess
|
|
from onyx.access.models import ExternalAccess
|
|
from onyx.connectors.confluence.connector import ConfluenceConnector
|
|
from onyx.connectors.confluence.onyx_confluence import (
|
|
get_user_email_from_username__server,
|
|
)
|
|
from onyx.connectors.confluence.onyx_confluence import OnyxConfluence
|
|
from onyx.connectors.credentials_provider import OnyxDBCredentialsProvider
|
|
from onyx.connectors.models import SlimDocument
|
|
from onyx.db.models import ConnectorCredentialPair
|
|
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
|
from onyx.utils.logger import setup_logger
|
|
from shared_configs.contextvars import get_current_tenant_id
|
|
|
|
# Module-level logger shared by every function in this file.
logger = setup_logger()
|
|
|
|
# Value of a permission category's "type" field that is matched when
# collecting the view permissions of a Confluence Server space.
_VIEWSPACE_PERMISSION_TYPE = "VIEWSPACE"
|
|
# Page size requested when listing spaces (also used to detect the final page).
_REQUEST_PAGINATION_LIMIT = 5000
|
|
|
|
|
|
def _get_server_space_permissions(
    confluence_client: OnyxConfluence, space_key: str
) -> ExternalAccess:
    """
    Builds the ExternalAccess for a single Confluence Server space.

    Only permission categories of type VIEWSPACE are considered. Each entry
    may name a user and/or a group; user names are resolved to emails via
    Confluence. An entry naming neither is treated as anonymous access.
    """
    space_permissions = confluence_client.get_all_space_permissions_server(
        space_key=space_key
    )

    # Flatten every VIEWSPACE category into one list of permission entries
    view_entries = [
        entry
        for category in space_permissions
        if category.get("type") == _VIEWSPACE_PERMISSION_TYPE
        for entry in category.get("spacePermissions", [])
    ]

    is_public = False
    user_names = set()
    group_names = set()
    for entry in view_entries:
        entry_user = entry.get("userName")
        entry_group = entry.get("groupName")
        if entry_user:
            user_names.add(entry_user)
        if entry_group:
            group_names.add(entry_group)

        if entry_user is not None or entry_group is not None:
            continue

        # An entry with neither a user nor a group appears to mean that
        # anonymous access is enabled for the site and the space.
        # By default we approximate that with a group holding every known
        # Confluence user. Deployments that want anonymous access to mean
        # "public" opt in through CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC
        # (defaults to False; untested because the feature is paywalled).
        if CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC:
            is_public = True
        else:
            group_names.add(ALL_CONF_EMAILS_GROUP_NAME)

    # Resolve every collected user name to an email address
    user_emails = set()
    for user_name in user_names:
        resolved_email = get_user_email_from_username__server(
            confluence_client, user_name
        )
        if not resolved_email:
            logger.warning(f"Email for user {user_name} not found in Confluence")
            continue
        user_emails.add(resolved_email)

    if not (user_emails or group_names):
        logger.warning(
            "No user emails or group names found in Confluence space permissions"
            f"\nSpace key: {space_key}"
            f"\nSpace permissions: {space_permissions}"
        )

    return ExternalAccess(
        external_user_emails=user_emails,
        external_user_group_ids=group_names,
        is_public=is_public,
    )
|
|
|
|
|
|
def _get_cloud_space_permissions(
    confluence_client: OnyxConfluence, space_key: str
) -> ExternalAccess:
    """
    Builds the ExternalAccess for a single Confluence Cloud space.

    The space is fetched with its permissions expanded. For every permission
    that names subjects, the email of the first user subject and the name of
    the first group subject are collected. A permission without subjects only
    matters when it grants the "read" operation to anonymous users, in which
    case the space is marked public.

    Missing, null or empty `results` lists are treated as "no subject"
    instead of raising.
    """

    def _first_result(subject: Any) -> dict[str, Any]:
        # `results` may be absent, null or an empty list; indexing [0]
        # blindly raises IndexError in the empty-list case.
        results = (subject or {}).get("results") or []
        return (results[0] or {}) if results else {}

    space_permissions_result = confluence_client.get_space(
        space_key=space_key, expand="permissions"
    )
    space_permissions = space_permissions_result.get("permissions") or []

    user_emails = set()
    group_names = set()
    is_externally_public = False
    for permission in space_permissions:
        subs = permission.get("subjects")
        if subs:
            # Subjects present: explicit users or groups have access
            if email := _first_result(subs.get("user")).get("email"):
                user_emails.add(email)
            if group_name := _first_result(subs.get("group")).get("name"):
                group_names.add(group_name)
            continue

        # No subjects: the permission applies to everyone. Read access for
        # anonymous users makes the space publicly accessible.
        operation = (permission.get("operation") or {}).get("operation")
        if operation == "read" and permission.get("anonymousAccess", False):
            is_externally_public = True

    return ExternalAccess(
        external_user_emails=user_emails,
        external_user_group_ids=group_names,
        is_public=is_externally_public,
    )
|
|
|
|
|
|
def _get_space_permissions(
    confluence_client: OnyxConfluence,
    is_cloud: bool,
) -> dict[str, ExternalAccess]:
    """
    Lists the spaces of the Confluence instance and fetches the permissions
    of each one.

    Args:
        confluence_client: client used for every Confluence request.
        is_cloud: selects the Cloud or the Server permission lookup.

    Returns:
        A mapping of space key to that space's ExternalAccess.

    Exceptions raised by the client or by the per-space lookups propagate.
    """
    logger.debug("Getting space permissions")

    # Page through the spaces in the Confluence instance
    all_space_keys: list[str] = []
    start = 0
    while True:
        spaces_batch = confluence_client.get_all_spaces(
            start=start, limit=_REQUEST_PAGINATION_LIMIT
        )
        batch_results = spaces_batch.get("results", [])
        for space in batch_results:
            space_key = space.get("key")
            if not space_key:
                # A key-less entry cannot be looked up or used as a dict key
                logger.warning(f"Skipping Confluence space without a key: {space}")
                continue
            all_space_keys.append(space_key)

        # NOTE(review): assumes the response echoes the effective page size
        # in "limit" when the server caps the requested one — confirm against
        # the Confluence REST docs. Without it, a capped first page looks
        # like the last page and later spaces are never listed.
        reported_limit = spaces_batch.get("limit")
        page_size = _REQUEST_PAGINATION_LIMIT
        if (
            isinstance(reported_limit, int)
            and 0 < reported_limit < _REQUEST_PAGINATION_LIMIT
        ):
            page_size = reported_limit

        if len(batch_results) < page_size:
            break

        start += len(batch_results)

    # Fetch the permissions of each space
    logger.debug(f"Got {len(all_space_keys)} spaces from confluence")
    space_permissions_by_space_key: dict[str, ExternalAccess] = {}
    for space_key in all_space_keys:
        if is_cloud:
            space_permissions = _get_cloud_space_permissions(
                confluence_client=confluence_client, space_key=space_key
            )
        else:
            space_permissions = _get_server_space_permissions(
                confluence_client=confluence_client, space_key=space_key
            )

        space_permissions_by_space_key[space_key] = space_permissions

    return space_permissions_by_space_key
|
|
|
|
|
|
def _extract_read_access_restrictions(
    confluence_client: OnyxConfluence, restrictions: dict[str, Any]
) -> tuple[set[str], set[str]]:
    """
    Pulls the read-access users and groups out of a restrictions dict.

    Returns a (user_emails, group_names) tuple of sets. Both sets are empty
    when the dict carries no usable read restrictions.
    """
    read_restrictions = restrictions.get("read", {}).get("restrictions", {})

    # Users with read access
    user_emails: set[str] = set()
    for user in read_restrictions.get("user", {}).get("results", []):
        # Prefer the email carried by the restriction entry itself
        if user.get("email"):
            user_emails.add(user["email"])
            continue

        # Otherwise resolve the username to an email through Confluence
        if user.get("username"):
            looked_up_email = get_user_email_from_username__server(
                confluence_client=confluence_client, user_name=user["username"]
            )
            if looked_up_email:
                user_emails.add(looked_up_email)
            else:
                logger.warning(
                    f"Email for user {user['username']} not found in Confluence"
                )
            continue

        # Neither a usable email nor a username. An email that is present
        # but empty gets an extra pair of warnings.
        if user.get("email") is not None:
            logger.warning(f"Cant find email for user {user.get('displayName')}")
            logger.warning(
                "This user needs to make their email accessible in Confluence Settings"
            )

        logger.warning(f"no user email or username for {user}")

    # Groups with read access
    group_names: set[str] = {
        group["name"]
        for group in read_restrictions.get("group", {}).get("results", [])
        if group.get("name")
    }

    return user_emails, group_names
|
|
|
|
|
|
def _get_all_page_restrictions(
    confluence_client: OnyxConfluence,
    perm_sync_data: dict[str, Any],
) -> ExternalAccess | None:
    """
    Computes a page's read restrictions as the intersection of its own
    restrictions and those of all its ancestors.

    A page or ancestor without restrictions is ignored (it does not take
    part in the intersection). User emails and group names are intersected
    separately.

    Returns:
        None when no restrictions are found anywhere, meaning the page
        should inherit the space's restrictions. Otherwise an ExternalAccess
        holding the intersection. It may be empty when the restrictions are
        disjoint; in that case nobody is granted access, instead of falling
        back to the broader space permissions.
    """
    # The page itself comes first, followed by its ancestors
    restriction_sources: list[dict[str, Any]] = [
        perm_sync_data,
        *perm_sync_data.get("ancestors", []),
    ]

    found_user_emails: set[str] = set()
    found_group_names: set[str] = set()
    restrictions_found = False
    for source in restriction_sources:
        user_emails, group_names = _extract_read_access_restrictions(
            confluence_client=confluence_client,
            restrictions=source.get("restrictions", {}),
        )
        if not user_emails and not group_names:
            # No restrictions here, so this level has no effect
            continue

        if not restrictions_found:
            # Seed with the first real restriction. Seeding with an
            # unrestricted page's empty sets would erase every ancestor
            # restriction in the intersections below.
            found_user_emails, found_group_names = user_emails, group_names
            restrictions_found = True
            continue

        found_user_emails.intersection_update(user_emails)
        found_group_names.intersection_update(group_names)

    # Only a page with no restrictions at any level inherits from the space
    if not restrictions_found:
        return None

    return ExternalAccess(
        external_user_emails=found_user_emails,
        external_user_group_ids=found_group_names,
        # there is no way for a page to be individually public if the space isn't public
        is_public=False,
    )
|
|
|
|
|
|
def _fetch_all_page_restrictions(
    confluence_client: OnyxConfluence,
    slim_docs: list[SlimDocument],
    space_permissions_by_space_key: dict[str, ExternalAccess],
    is_cloud: bool,
    callback: IndexingHeartbeatInterface | None,
) -> list[DocExternalAccess]:
    """
    Resolves the external access of every slim document.

    A page with its own (or inherited ancestor) restrictions uses those.
    Otherwise the permissions of its space apply; they are taken from
    `space_permissions_by_space_key` and fetched on demand (and stored back
    into that dict) when missing. Documents whose space permissions cannot
    be determined are skipped with a warning.

    Raises:
        RuntimeError: when the callback signals a stop.
        ValueError: when a document has no permission sync data.
    """
    # Pick the space-permission lookup once instead of branching per document
    fetch_space_permissions = (
        _get_cloud_space_permissions if is_cloud else _get_server_space_permissions
    )

    document_restrictions: list[DocExternalAccess] = []
    for slim_doc in slim_docs:
        if callback:
            if callback.should_stop():
                raise RuntimeError("confluence_doc_sync: Stop signal detected")

            callback.progress("confluence_doc_sync:fetch_all_page_restrictions", 1)

        sync_data = slim_doc.perm_sync_data
        if sync_data is None:
            raise ValueError(
                f"No permission sync data found for document {slim_doc.id}"
            )

        page_access = _get_all_page_restrictions(
            confluence_client=confluence_client,
            perm_sync_data=sync_data,
        )
        if page_access:
            # Page-level restrictions take precedence over the space's
            document_restrictions.append(
                DocExternalAccess(doc_id=slim_doc.id, external_access=page_access)
            )
            continue

        space_key = sync_data.get("space_key")
        space_permissions = space_permissions_by_space_key.get(space_key)
        if not space_permissions:
            logger.debug(
                f"Individually fetching space permissions for space {space_key}"
            )
            try:
                # Not cached yet: fetch and remember for later documents
                fetched_permissions = fetch_space_permissions(
                    confluence_client=confluence_client, space_key=space_key
                )
                space_permissions_by_space_key[space_key] = fetched_permissions
                space_permissions = fetched_permissions
            except Exception as e:
                logger.warning(
                    f"Error fetching space permissions for space {space_key}: {e}"
                )

        if not space_permissions:
            logger.warning(
                f"No permissions found for document {slim_doc.id} in space {space_key}"
            )
            continue

        # No page restrictions, so the space's permissions apply
        document_restrictions.append(
            DocExternalAccess(doc_id=slim_doc.id, external_access=space_permissions)
        )

        has_any_access = (
            space_permissions.is_public
            or space_permissions.external_user_emails
            or space_permissions.external_user_group_ids
        )
        if not has_any_access:
            logger.warning(
                f"Permissions are empty for document: {slim_doc.id}\n"
                "This means space permissions are may be wrong for"
                f" Space key: {space_key}"
            )

    logger.debug("Finished fetching all page restrictions for space")
    return document_restrictions
|
|
|
|
|
|
def confluence_doc_sync(
    cc_pair: ConnectorCredentialPair,
    callback: IndexingHeartbeatInterface | None,
) -> list[DocExternalAccess]:
    """
    Computes the external permissions of every Confluence document reachable
    through the given connector/credential pair.

    Builds a ConfluenceConnector from the pair's connector config, fetches
    the permissions of all spaces and all slim documents, and returns one
    DocExternalAccess per document whose access could be resolved.
    Storing the result is not done here (presumably the caller's job).

    Raises:
        RuntimeError: when the callback signals a stop.
    """
    logger.debug("Starting confluence doc sync")

    connector_config = cc_pair.connector.connector_specific_config
    confluence_connector = ConfluenceConnector(**connector_config)
    confluence_connector.set_credentials_provider(
        OnyxDBCredentialsProvider(
            get_current_tenant_id(), "confluence", cc_pair.credential_id
        )
    )

    is_cloud = connector_config.get("is_cloud", False)

    space_permissions_by_space_key = _get_space_permissions(
        confluence_client=confluence_connector.confluence_client,
        is_cloud=is_cloud,
    )

    logger.debug("Fetching all slim documents from confluence")
    slim_docs = []
    slim_doc_batches = confluence_connector.retrieve_all_slim_documents(
        callback=callback
    )
    for doc_batch in slim_doc_batches:
        logger.debug(f"Got {len(doc_batch)} slim documents from confluence")
        if callback:
            # Check for a stop request between batches, then report progress
            if callback.should_stop():
                raise RuntimeError("confluence_doc_sync: Stop signal detected")

            callback.progress("confluence_doc_sync", 1)

        slim_docs.extend(doc_batch)

    logger.debug("Fetching all page restrictions for space")
    return _fetch_all_page_restrictions(
        confluence_client=confluence_connector.confluence_client,
        slim_docs=slim_docs,
        space_permissions_by_space_key=space_permissions_by_space_key,
        is_cloud=is_cloud,
        callback=callback,
    )
|