diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py
index 1869a72b47..e63c949f7b 100644
--- a/backend/danswer/connectors/confluence/connector.py
+++ b/backend/danswer/connectors/confluence/connector.py
@@ -13,6 +13,7 @@ from danswer.connectors.confluence.onyx_confluence import build_confluence_clien
 from danswer.connectors.confluence.onyx_confluence import OnyxConfluence
 from danswer.connectors.confluence.utils import attachment_to_content
 from danswer.connectors.confluence.utils import build_confluence_document_id
+from danswer.connectors.confluence.utils import check_attachment_filetype
 from danswer.connectors.confluence.utils import datetime_from_string
 from danswer.connectors.confluence.utils import extract_text_from_confluence_html
 from danswer.connectors.interfaces import GenerateDocumentsOutput
@@ -298,6 +299,8 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
                 expand=restrictions_expand,
                 limit=_SLIM_DOC_BATCH_SIZE,
             ):
+                if not check_attachment_filetype(attachment):
+                    continue
                 doc_metadata_list.append(
                     SlimDocument(
                         id=build_confluence_document_id(
diff --git a/backend/danswer/connectors/confluence/utils.py b/backend/danswer/connectors/confluence/utils.py
index f5511063d8..a22650ceb0 100644
--- a/backend/danswer/connectors/confluence/utils.py
+++ b/backend/danswer/connectors/confluence/utils.py
@@ -177,19 +177,24 @@ def extract_text_from_confluence_html(
     return format_document_soup(soup)
 
 
-def attachment_to_content(
-    confluence_client: OnyxConfluence,
-    attachment: dict[str, Any],
-) -> str | None:
-    """If it returns None, assume that we should skip this attachment."""
-    if attachment["metadata"]["mediaType"] in [
+def check_attachment_filetype(attachment: dict[str, Any]) -> bool:
+    """Return True unless the attachment is one of the skipped image/video types."""
+    return attachment["metadata"]["mediaType"] not in [
         "image/jpeg",
         "image/png",
         "image/gif",
         "image/svg+xml",
         "video/mp4",
         "video/quicktime",
-    ]:
+    ]
+
+
+def attachment_to_content(
+    confluence_client: OnyxConfluence,
+    attachment: dict[str, Any],
+) -> str | None:
+    """If it returns None, assume that we should skip this attachment."""
+    if not check_attachment_filetype(attachment):
         return None
 
     download_link = confluence_client.url + attachment["_links"]["download"]
@@ -245,7 +250,7 @@ def build_confluence_document_id(
     return f"{base_url}{content_url}"
 
 
-def extract_referenced_attachment_names(page_text: str) -> list[str]:
+def _extract_referenced_attachment_names(page_text: str) -> list[str]:
     """Parse a Confluence html page to generate a list of current
         attachments in use
 
diff --git a/backend/ee/danswer/external_permissions/confluence/doc_sync.py b/backend/ee/danswer/external_permissions/confluence/doc_sync.py
index 73a4f6c50a..52159410de 100644
--- a/backend/ee/danswer/external_permissions/confluence/doc_sync.py
+++ b/backend/ee/danswer/external_permissions/confluence/doc_sync.py
@@ -95,7 +95,6 @@ def _get_cloud_space_permissions(
 
 def _get_space_permissions(
     confluence_client: OnyxConfluence,
-    is_cloud: bool,
 ) -> dict[str, ExternalAccess]:
     logger.debug("Getting space permissions")
     # Gets all the spaces in the Confluence instance
@@ -117,7 +116,7 @@ def _get_space_permissions(
     logger.debug(f"Got {len(all_space_keys)} spaces from confluence")
     space_permissions_by_space_key: dict[str, ExternalAccess] = {}
     for space_key in all_space_keys:
-        if is_cloud:
+        if confluence_client.cloud:
             space_permissions = _get_cloud_space_permissions(
                 confluence_client=confluence_client, space_key=space_key
             )
@@ -242,7 +241,9 @@ def _fetch_all_page_restrictions_for_space(
             )
             continue
 
-        logger.warning(f"No permissions found for document {slim_doc.id}")
+        logger.warning(
+            f"No permissions found for document {slim_doc.id} in space {space_key}"
+        )
 
     logger.debug("Finished fetching all page restrictions for space")
     return document_restrictions
@@ -263,11 +264,8 @@ def confluence_doc_sync(
     )
     confluence_connector.load_credentials(cc_pair.credential.credential_json)
 
-    is_cloud = cc_pair.connector.connector_specific_config.get("is_cloud", False)
-
     space_permissions_by_space_key = _get_space_permissions(
-        confluence_client=confluence_connector.confluence_client,
-        is_cloud=is_cloud,
+        confluence_client=confluence_connector.confluence_client
     )
 
     slim_docs = []