More logging/fixes (#3364)

* More logging for external group syncing

* Fixed edge case where some spaces were not being fetched

* made refresh frequency for confluence syncs configurable

* clarity
This commit is contained in:
hagen-danswer 2024-12-06 13:56:29 -08:00 committed by GitHub
parent 53b3dcbace
commit 53428f6e9c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 62 additions and 20 deletions

View File

@ -11,6 +11,14 @@ SAML_CONF_DIR = os.environ.get("SAML_CONF_DIR") or "/app/ee/danswer/configs/saml
#####
# Auto Permission Sync
#####
# In seconds, default is 5 minutes
CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY = int(
os.environ.get("CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY") or 5 * 60
)
# In seconds, default is 5 minutes
CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY = int(
os.environ.get("CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60
)
NUM_PERMISSION_WORKERS = int(os.environ.get("NUM_PERMISSION_WORKERS") or 2)

View File

@ -10,6 +10,9 @@ from danswer.access.utils import prefix_group_w_source
from danswer.configs.constants import DocumentSource
from danswer.db.models import User__ExternalUserGroupId
from danswer.db.users import batch_add_ext_perm_user_if_not_exists
from danswer.utils.logger import setup_logger
logger = setup_logger()
class ExternalUserGroup(BaseModel):
@ -73,7 +76,13 @@ def replace_user__ext_group_for_cc_pair(
new_external_permissions = []
for external_group in group_defs:
for user_email in external_group.user_emails:
user_id = email_id_map[user_email]
user_id = email_id_map.get(user_email)
if user_id is None:
logger.warning(
f"User in group {external_group.id}"
f" with email {user_email} not found"
)
continue
new_external_permissions.append(
User__ExternalUserGroupId(
user_id=user_id,

View File

@ -195,6 +195,7 @@ def _fetch_all_page_restrictions_for_space(
confluence_client: OnyxConfluence,
slim_docs: list[SlimDocument],
space_permissions_by_space_key: dict[str, ExternalAccess],
is_cloud: bool,
) -> list[DocExternalAccess]:
"""
For all pages, if a page has restrictions, then use those restrictions.
@ -222,29 +223,50 @@ def _fetch_all_page_restrictions_for_space(
continue
space_key = slim_doc.perm_sync_data.get("space_key")
if space_permissions := space_permissions_by_space_key.get(space_key):
# If there are no restrictions, then use the space's restrictions
document_restrictions.append(
DocExternalAccess(
doc_id=slim_doc.id,
external_access=space_permissions,
)
if not (space_permissions := space_permissions_by_space_key.get(space_key)):
logger.debug(
f"Individually fetching space permissions for space {space_key}"
)
if (
not space_permissions.is_public
and not space_permissions.external_user_emails
and not space_permissions.external_user_group_ids
):
try:
# If the space permissions are not in the cache, then fetch them
if is_cloud:
retrieved_space_permissions = _get_cloud_space_permissions(
confluence_client=confluence_client, space_key=space_key
)
else:
retrieved_space_permissions = _get_server_space_permissions(
confluence_client=confluence_client, space_key=space_key
)
space_permissions_by_space_key[space_key] = retrieved_space_permissions
space_permissions = retrieved_space_permissions
except Exception as e:
logger.warning(
f"Permissions are empty for document: {slim_doc.id}\n"
"This means space permissions are may be wrong for"
f" Space key: {space_key}"
f"Error fetching space permissions for space {space_key}: {e}"
)
if not space_permissions:
logger.warning(
f"No permissions found for document {slim_doc.id} in space {space_key}"
)
continue
logger.warning(
f"No permissions found for document {slim_doc.id} in space {space_key}"
# If there are no restrictions, then use the space's restrictions
document_restrictions.append(
DocExternalAccess(
doc_id=slim_doc.id,
external_access=space_permissions,
)
)
if (
not space_permissions.is_public
and not space_permissions.external_user_emails
and not space_permissions.external_user_group_ids
):
logger.warning(
f"Permissions are empty for document: {slim_doc.id}\n"
"This means space permissions are may be wrong for"
f" Space key: {space_key}"
)
logger.debug("Finished fetching all page restrictions for space")
return document_restrictions
@ -283,4 +305,5 @@ def confluence_doc_sync(
confluence_client=confluence_connector.confluence_client,
slim_docs=slim_docs,
space_permissions_by_space_key=space_permissions_by_space_key,
is_cloud=is_cloud,
)

View File

@ -3,6 +3,8 @@ from collections.abc import Callable
from danswer.access.models import DocExternalAccess
from danswer.configs.constants import DocumentSource
from danswer.db.models import ConnectorCredentialPair
from ee.danswer.configs.app_configs import CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY
from ee.danswer.configs.app_configs import CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY
from ee.danswer.db.external_perm import ExternalUserGroup
from ee.danswer.external_permissions.confluence.doc_sync import confluence_doc_sync
from ee.danswer.external_permissions.confluence.group_sync import confluence_group_sync
@ -56,7 +58,7 @@ GROUP_PERMISSIONS_IS_CC_PAIR_AGNOSTIC: set[DocumentSource] = {
# If nothing is specified here, we run the doc_sync every time the celery beat runs
DOC_PERMISSION_SYNC_PERIODS: dict[DocumentSource, int] = {
# Polling is not supported so we fetch all doc permissions every 5 minutes
DocumentSource.CONFLUENCE: 5 * 60,
DocumentSource.CONFLUENCE: CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY,
DocumentSource.SLACK: 5 * 60,
}
@ -64,7 +66,7 @@ DOC_PERMISSION_SYNC_PERIODS: dict[DocumentSource, int] = {
EXTERNAL_GROUP_SYNC_PERIODS: dict[DocumentSource, int] = {
# Polling is not supported so we fetch all group permissions every 30 minutes
DocumentSource.GOOGLE_DRIVE: 5 * 60,
DocumentSource.CONFLUENCE: 30 * 60,
DocumentSource.CONFLUENCE: CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY,
}