Merge pull request #3359 from danswer-ai/conf-logging-filter

Added filter to slim connector and logging for space permissions
This commit is contained in:
hagen-danswer
2024-12-06 09:03:07 -08:00
committed by GitHub
4 changed files with 39 additions and 14 deletions

View File

@@ -15,6 +15,7 @@ from danswer.connectors.confluence.utils import attachment_to_content
from danswer.connectors.confluence.utils import build_confluence_document_id from danswer.connectors.confluence.utils import build_confluence_document_id
from danswer.connectors.confluence.utils import datetime_from_string from danswer.connectors.confluence.utils import datetime_from_string
from danswer.connectors.confluence.utils import extract_text_from_confluence_html from danswer.connectors.confluence.utils import extract_text_from_confluence_html
from danswer.connectors.confluence.utils import validate_attachment_filetype
from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import GenerateSlimDocumentOutput from danswer.connectors.interfaces import GenerateSlimDocumentOutput
from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import LoadConnector
@@ -276,9 +277,11 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
): ):
# If the page has restrictions, add them to the perm_sync_data # If the page has restrictions, add them to the perm_sync_data
# These will be used by doc_sync.py to sync permissions # These will be used by doc_sync.py to sync permissions
perm_sync_data = { page_restrictions = page.get("restrictions")
"restrictions": page.get("restrictions", {}), page_space_key = page.get("space", {}).get("key")
"space_key": page.get("space", {}).get("key"), page_perm_sync_data = {
"restrictions": page_restrictions or {},
"space_key": page_space_key,
} }
doc_metadata_list.append( doc_metadata_list.append(
@@ -288,7 +291,7 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
page["_links"]["webui"], page["_links"]["webui"],
self.is_cloud, self.is_cloud,
), ),
perm_sync_data=perm_sync_data, perm_sync_data=page_perm_sync_data,
) )
) )
attachment_cql = f"type=attachment and container='{page['id']}'" attachment_cql = f"type=attachment and container='{page['id']}'"
@@ -298,6 +301,21 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
expand=restrictions_expand, expand=restrictions_expand,
limit=_SLIM_DOC_BATCH_SIZE, limit=_SLIM_DOC_BATCH_SIZE,
): ):
if not validate_attachment_filetype(attachment):
continue
attachment_restrictions = attachment.get("restrictions")
if not attachment_restrictions:
attachment_restrictions = page_restrictions
attachment_space_key = attachment.get("space", {}).get("key")
if not attachment_space_key:
attachment_space_key = page_space_key
attachment_perm_sync_data = {
"restrictions": attachment_restrictions or {},
"space_key": attachment_space_key,
}
doc_metadata_list.append( doc_metadata_list.append(
SlimDocument( SlimDocument(
id=build_confluence_document_id( id=build_confluence_document_id(
@@ -305,7 +323,7 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
attachment["_links"]["webui"], attachment["_links"]["webui"],
self.is_cloud, self.is_cloud,
), ),
perm_sync_data=perm_sync_data, perm_sync_data=attachment_perm_sync_data,
) )
) )
if len(doc_metadata_list) > _SLIM_DOC_BATCH_SIZE: if len(doc_metadata_list) > _SLIM_DOC_BATCH_SIZE:

View File

@@ -177,19 +177,23 @@ def extract_text_from_confluence_html(
return format_document_soup(soup) return format_document_soup(soup)
def attachment_to_content( def validate_attachment_filetype(attachment: dict[str, Any]) -> bool:
confluence_client: OnyxConfluence, return attachment["metadata"]["mediaType"] not in [
attachment: dict[str, Any],
) -> str | None:
"""If it returns None, assume that we should skip this attachment."""
if attachment["metadata"]["mediaType"] in [
"image/jpeg", "image/jpeg",
"image/png", "image/png",
"image/gif", "image/gif",
"image/svg+xml", "image/svg+xml",
"video/mp4", "video/mp4",
"video/quicktime", "video/quicktime",
]: ]
def attachment_to_content(
confluence_client: OnyxConfluence,
attachment: dict[str, Any],
) -> str | None:
"""If it returns None, assume that we should skip this attachment."""
if not validate_attachment_filetype(attachment):
return None return None
download_link = confluence_client.url + attachment["_links"]["download"] download_link = confluence_client.url + attachment["_links"]["download"]
@@ -245,7 +249,7 @@ def build_confluence_document_id(
return f"{base_url}{content_url}" return f"{base_url}{content_url}"
def extract_referenced_attachment_names(page_text: str) -> list[str]: def _extract_referenced_attachment_names(page_text: str) -> list[str]:
"""Parse a Confluence html page to generate a list of current """Parse a Confluence html page to generate a list of current
attachments in use attachments in use

View File

@@ -242,7 +242,9 @@ def _fetch_all_page_restrictions_for_space(
) )
continue continue
logger.warning(f"No permissions found for document {slim_doc.id}") logger.warning(
f"No permissions found for document {slim_doc.id} in space {space_key}"
)
logger.debug("Finished fetching all page restrictions for space") logger.debug("Finished fetching all page restrictions for space")
return document_restrictions return document_restrictions

View File

@@ -9,6 +9,7 @@ from danswer.connectors.confluence.connector import ConfluenceConnector
def confluence_connector() -> ConfluenceConnector: def confluence_connector() -> ConfluenceConnector:
connector = ConfluenceConnector( connector = ConfluenceConnector(
wiki_base="https://danswerai.atlassian.net", wiki_base="https://danswerai.atlassian.net",
is_cloud=True,
) )
connector.load_credentials( connector.load_credentials(
{ {