Added filter to slim connector and logging for space permissions

This commit is contained in:
hagen-danswer 2024-12-06 07:55:54 -08:00
parent c50cd20156
commit 7c6981e052
3 changed files with 20 additions and 15 deletions

View File

@@ -13,6 +13,7 @@ from danswer.connectors.confluence.onyx_confluence import build_confluence_clien
from danswer.connectors.confluence.onyx_confluence import OnyxConfluence from danswer.connectors.confluence.onyx_confluence import OnyxConfluence
from danswer.connectors.confluence.utils import attachment_to_content from danswer.connectors.confluence.utils import attachment_to_content
from danswer.connectors.confluence.utils import build_confluence_document_id from danswer.connectors.confluence.utils import build_confluence_document_id
from danswer.connectors.confluence.utils import check_attachment_filetype
from danswer.connectors.confluence.utils import datetime_from_string from danswer.connectors.confluence.utils import datetime_from_string
from danswer.connectors.confluence.utils import extract_text_from_confluence_html from danswer.connectors.confluence.utils import extract_text_from_confluence_html
from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import GenerateDocumentsOutput
@@ -298,6 +299,8 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
expand=restrictions_expand, expand=restrictions_expand,
limit=_SLIM_DOC_BATCH_SIZE, limit=_SLIM_DOC_BATCH_SIZE,
): ):
if not check_attachment_filetype(attachment):
continue
doc_metadata_list.append( doc_metadata_list.append(
SlimDocument( SlimDocument(
id=build_confluence_document_id( id=build_confluence_document_id(

View File

@@ -177,19 +177,23 @@ def extract_text_from_confluence_html(
return format_document_soup(soup) return format_document_soup(soup)
def check_attachment_filetype(attachment: dict[str, Any]) -> bool:
    """Return True when the attachment's media type should be processed.

    Image and video attachments are not converted to content. Callers skip an
    attachment when this function returns False, so the listed media types
    must yield False and every other media type must yield True.

    Args:
        attachment: Attachment payload; only
            ``attachment["metadata"]["mediaType"]`` is read.

    Returns:
        False if the media type is one of the skipped image/video types,
        True otherwise.

    Raises:
        KeyError: If ``"metadata"`` or ``"mediaType"`` is missing.
    """
    # `not in`: membership in this list means "skip", and callers skip on a
    # falsy result (`if not check_attachment_filetype(...)`).
    return attachment["metadata"]["mediaType"] not in [
        "image/jpeg",
        "image/png",
        "image/gif",
        "image/svg+xml",
        "video/mp4",
        "video/quicktime",
    ]
def attachment_to_content(
confluence_client: OnyxConfluence,
attachment: dict[str, Any],
) -> str | None:
"""If it returns None, assume that we should skip this attachment."""
if not check_attachment_filetype(attachment):
return None return None
download_link = confluence_client.url + attachment["_links"]["download"] download_link = confluence_client.url + attachment["_links"]["download"]
@@ -245,7 +249,7 @@ def build_confluence_document_id(
return f"{base_url}{content_url}" return f"{base_url}{content_url}"
def extract_referenced_attachment_names(page_text: str) -> list[str]: def _extract_referenced_attachment_names(page_text: str) -> list[str]:
"""Parse a Confluence html page to generate a list of current """Parse a Confluence html page to generate a list of current
attachments in use attachments in use

View File

@@ -95,7 +95,6 @@ def _get_cloud_space_permissions(
def _get_space_permissions( def _get_space_permissions(
confluence_client: OnyxConfluence, confluence_client: OnyxConfluence,
is_cloud: bool,
) -> dict[str, ExternalAccess]: ) -> dict[str, ExternalAccess]:
logger.debug("Getting space permissions") logger.debug("Getting space permissions")
# Gets all the spaces in the Confluence instance # Gets all the spaces in the Confluence instance
@@ -117,7 +116,7 @@ def _get_space_permissions(
logger.debug(f"Got {len(all_space_keys)} spaces from confluence") logger.debug(f"Got {len(all_space_keys)} spaces from confluence")
space_permissions_by_space_key: dict[str, ExternalAccess] = {} space_permissions_by_space_key: dict[str, ExternalAccess] = {}
for space_key in all_space_keys: for space_key in all_space_keys:
if is_cloud: if confluence_client.cloud:
space_permissions = _get_cloud_space_permissions( space_permissions = _get_cloud_space_permissions(
confluence_client=confluence_client, space_key=space_key confluence_client=confluence_client, space_key=space_key
) )
@@ -242,7 +241,9 @@ def _fetch_all_page_restrictions_for_space(
) )
continue continue
logger.warning(f"No permissions found for document {slim_doc.id}") logger.warning(
f"No permissions found for document {slim_doc.id} in space {space_key}"
)
logger.debug("Finished fetching all page restrictions for space") logger.debug("Finished fetching all page restrictions for space")
return document_restrictions return document_restrictions
@@ -263,11 +264,8 @@ def confluence_doc_sync(
) )
confluence_connector.load_credentials(cc_pair.credential.credential_json) confluence_connector.load_credentials(cc_pair.credential.credential_json)
is_cloud = cc_pair.connector.connector_specific_config.get("is_cloud", False)
space_permissions_by_space_key = _get_space_permissions( space_permissions_by_space_key = _get_space_permissions(
confluence_client=confluence_connector.confluence_client, confluence_client=confluence_connector.confluence_client
is_cloud=is_cloud,
) )
slim_docs = [] slim_docs = []