Confluence connector: per-attachment permission sync data and filetype filter.

Summary of the hunks below:
  * connector.py: while collecting slim documents, attachments rejected by
    validate_attachment_filetype() are skipped. Each remaining attachment gets
    its own perm_sync_data, falling back to the parent page's restrictions
    and space key when the attachment carries none.
  * utils.py: the media-type check is factored out of attachment_to_content()
    into validate_attachment_filetype(); extract_referenced_attachment_names
    is renamed with a leading underscore.
  * doc_sync.py: the "No permissions found" warning now includes the space key.
  * test_confluence_permissions_basic.py: the fixture passes is_cloud=True.

NOTE(review): line breaks and indentation were reconstructed from the hunk
headers' line counts and from Python nesting; confirm context-line whitespace
against the target tree (or apply with whitespace-tolerant options).

diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py
index 1869a72b475e..c909fe90219b 100644
--- a/backend/danswer/connectors/confluence/connector.py
+++ b/backend/danswer/connectors/confluence/connector.py
@@ -15,6 +15,7 @@ from danswer.connectors.confluence.utils import attachment_to_content
 from danswer.connectors.confluence.utils import build_confluence_document_id
 from danswer.connectors.confluence.utils import datetime_from_string
 from danswer.connectors.confluence.utils import extract_text_from_confluence_html
+from danswer.connectors.confluence.utils import validate_attachment_filetype
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import GenerateSlimDocumentOutput
 from danswer.connectors.interfaces import LoadConnector
@@ -276,9 +277,11 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
         ):
             # If the page has restrictions, add them to the perm_sync_data
             # These will be used by doc_sync.py to sync permissions
-            perm_sync_data = {
-                "restrictions": page.get("restrictions", {}),
-                "space_key": page.get("space", {}).get("key"),
+            page_restrictions = page.get("restrictions")
+            page_space_key = page.get("space", {}).get("key")
+            page_perm_sync_data = {
+                "restrictions": page_restrictions or {},
+                "space_key": page_space_key,
             }
 
             doc_metadata_list.append(
@@ -288,7 +291,7 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
                         page["_links"]["webui"],
                         self.is_cloud,
                     ),
-                    perm_sync_data=perm_sync_data,
+                    perm_sync_data=page_perm_sync_data,
                 )
             )
             attachment_cql = f"type=attachment and container='{page['id']}'"
@@ -298,6 +301,21 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
                 expand=restrictions_expand,
                 limit=_SLIM_DOC_BATCH_SIZE,
             ):
+                if not validate_attachment_filetype(attachment):
+                    continue
+                attachment_restrictions = attachment.get("restrictions")
+                if not attachment_restrictions:
+                    attachment_restrictions = page_restrictions
+
+                attachment_space_key = attachment.get("space", {}).get("key")
+                if not attachment_space_key:
+                    attachment_space_key = page_space_key
+
+                attachment_perm_sync_data = {
+                    "restrictions": attachment_restrictions or {},
+                    "space_key": attachment_space_key,
+                }
+
                 doc_metadata_list.append(
                     SlimDocument(
                         id=build_confluence_document_id(
@@ -305,7 +323,7 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
                             attachment["_links"]["webui"],
                             self.is_cloud,
                         ),
-                        perm_sync_data=perm_sync_data,
+                        perm_sync_data=attachment_perm_sync_data,
                     )
                 )
             if len(doc_metadata_list) > _SLIM_DOC_BATCH_SIZE:
diff --git a/backend/danswer/connectors/confluence/utils.py b/backend/danswer/connectors/confluence/utils.py
index f5511063d88e..991d03e6571e 100644
--- a/backend/danswer/connectors/confluence/utils.py
+++ b/backend/danswer/connectors/confluence/utils.py
@@ -177,19 +177,23 @@ def extract_text_from_confluence_html(
     return format_document_soup(soup)
 
 
-def attachment_to_content(
-    confluence_client: OnyxConfluence,
-    attachment: dict[str, Any],
-) -> str | None:
-    """If it returns None, assume that we should skip this attachment."""
-    if attachment["metadata"]["mediaType"] in [
+def validate_attachment_filetype(attachment: dict[str, Any]) -> bool:
+    return attachment["metadata"]["mediaType"] not in [
         "image/jpeg",
         "image/png",
         "image/gif",
         "image/svg+xml",
         "video/mp4",
         "video/quicktime",
-    ]:
+    ]
+
+
+def attachment_to_content(
+    confluence_client: OnyxConfluence,
+    attachment: dict[str, Any],
+) -> str | None:
+    """If it returns None, assume that we should skip this attachment."""
+    if not validate_attachment_filetype(attachment):
         return None
 
     download_link = confluence_client.url + attachment["_links"]["download"]
@@ -245,7 +249,7 @@ def build_confluence_document_id(
     return f"{base_url}{content_url}"
 
 
-def extract_referenced_attachment_names(page_text: str) -> list[str]:
+def _extract_referenced_attachment_names(page_text: str) -> list[str]:
     """Parse a Confluence html page to generate a list of current
         attachments in use
 
diff --git a/backend/ee/danswer/external_permissions/confluence/doc_sync.py b/backend/ee/danswer/external_permissions/confluence/doc_sync.py
index 73a4f6c50a74..81ec008d9b51 100644
--- a/backend/ee/danswer/external_permissions/confluence/doc_sync.py
+++ b/backend/ee/danswer/external_permissions/confluence/doc_sync.py
@@ -242,7 +242,9 @@ def _fetch_all_page_restrictions_for_space(
             )
             continue
 
-        logger.warning(f"No permissions found for document {slim_doc.id}")
+        logger.warning(
+            f"No permissions found for document {slim_doc.id} in space {space_key}"
+        )
 
     logger.debug("Finished fetching all page restrictions for space")
     return document_restrictions
diff --git a/backend/tests/daily/connectors/confluence/test_confluence_permissions_basic.py b/backend/tests/daily/connectors/confluence/test_confluence_permissions_basic.py
index e5cc74cb1bc2..35d2da61cf11 100644
--- a/backend/tests/daily/connectors/confluence/test_confluence_permissions_basic.py
+++ b/backend/tests/daily/connectors/confluence/test_confluence_permissions_basic.py
@@ -9,6 +9,7 @@ from danswer.connectors.confluence.connector import ConfluenceConnector
 def confluence_connector() -> ConfluenceConnector:
     connector = ConfluenceConnector(
         wiki_base="https://danswerai.atlassian.net",
+        is_cloud=True,
     )
     connector.load_credentials(
         {