mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-28 12:58:41 +02:00
Merge pull request #3359 from danswer-ai/conf-logging-filter
Added filter to slim connector and logging for space permissions
This commit is contained in:
@@ -15,6 +15,7 @@ from danswer.connectors.confluence.utils import attachment_to_content
|
|||||||
from danswer.connectors.confluence.utils import build_confluence_document_id
|
from danswer.connectors.confluence.utils import build_confluence_document_id
|
||||||
from danswer.connectors.confluence.utils import datetime_from_string
|
from danswer.connectors.confluence.utils import datetime_from_string
|
||||||
from danswer.connectors.confluence.utils import extract_text_from_confluence_html
|
from danswer.connectors.confluence.utils import extract_text_from_confluence_html
|
||||||
|
from danswer.connectors.confluence.utils import validate_attachment_filetype
|
||||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||||
from danswer.connectors.interfaces import GenerateSlimDocumentOutput
|
from danswer.connectors.interfaces import GenerateSlimDocumentOutput
|
||||||
from danswer.connectors.interfaces import LoadConnector
|
from danswer.connectors.interfaces import LoadConnector
|
||||||
@@ -276,9 +277,11 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
|
|||||||
):
|
):
|
||||||
# If the page has restrictions, add them to the perm_sync_data
|
# If the page has restrictions, add them to the perm_sync_data
|
||||||
# These will be used by doc_sync.py to sync permissions
|
# These will be used by doc_sync.py to sync permissions
|
||||||
perm_sync_data = {
|
page_restrictions = page.get("restrictions")
|
||||||
"restrictions": page.get("restrictions", {}),
|
page_space_key = page.get("space", {}).get("key")
|
||||||
"space_key": page.get("space", {}).get("key"),
|
page_perm_sync_data = {
|
||||||
|
"restrictions": page_restrictions or {},
|
||||||
|
"space_key": page_space_key,
|
||||||
}
|
}
|
||||||
|
|
||||||
doc_metadata_list.append(
|
doc_metadata_list.append(
|
||||||
@@ -288,7 +291,7 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
|
|||||||
page["_links"]["webui"],
|
page["_links"]["webui"],
|
||||||
self.is_cloud,
|
self.is_cloud,
|
||||||
),
|
),
|
||||||
perm_sync_data=perm_sync_data,
|
perm_sync_data=page_perm_sync_data,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
attachment_cql = f"type=attachment and container='{page['id']}'"
|
attachment_cql = f"type=attachment and container='{page['id']}'"
|
||||||
@@ -298,6 +301,21 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
|
|||||||
expand=restrictions_expand,
|
expand=restrictions_expand,
|
||||||
limit=_SLIM_DOC_BATCH_SIZE,
|
limit=_SLIM_DOC_BATCH_SIZE,
|
||||||
):
|
):
|
||||||
|
if not validate_attachment_filetype(attachment):
|
||||||
|
continue
|
||||||
|
attachment_restrictions = attachment.get("restrictions")
|
||||||
|
if not attachment_restrictions:
|
||||||
|
attachment_restrictions = page_restrictions
|
||||||
|
|
||||||
|
attachment_space_key = attachment.get("space", {}).get("key")
|
||||||
|
if not attachment_space_key:
|
||||||
|
attachment_space_key = page_space_key
|
||||||
|
|
||||||
|
attachment_perm_sync_data = {
|
||||||
|
"restrictions": attachment_restrictions or {},
|
||||||
|
"space_key": attachment_space_key,
|
||||||
|
}
|
||||||
|
|
||||||
doc_metadata_list.append(
|
doc_metadata_list.append(
|
||||||
SlimDocument(
|
SlimDocument(
|
||||||
id=build_confluence_document_id(
|
id=build_confluence_document_id(
|
||||||
@@ -305,7 +323,7 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
|
|||||||
attachment["_links"]["webui"],
|
attachment["_links"]["webui"],
|
||||||
self.is_cloud,
|
self.is_cloud,
|
||||||
),
|
),
|
||||||
perm_sync_data=perm_sync_data,
|
perm_sync_data=attachment_perm_sync_data,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if len(doc_metadata_list) > _SLIM_DOC_BATCH_SIZE:
|
if len(doc_metadata_list) > _SLIM_DOC_BATCH_SIZE:
|
||||||
|
@@ -177,19 +177,23 @@ def extract_text_from_confluence_html(
|
|||||||
return format_document_soup(soup)
|
return format_document_soup(soup)
|
||||||
|
|
||||||
|
|
||||||
def attachment_to_content(
|
def validate_attachment_filetype(attachment: dict[str, Any]) -> bool:
|
||||||
confluence_client: OnyxConfluence,
|
return attachment["metadata"]["mediaType"] not in [
|
||||||
attachment: dict[str, Any],
|
|
||||||
) -> str | None:
|
|
||||||
"""If it returns None, assume that we should skip this attachment."""
|
|
||||||
if attachment["metadata"]["mediaType"] in [
|
|
||||||
"image/jpeg",
|
"image/jpeg",
|
||||||
"image/png",
|
"image/png",
|
||||||
"image/gif",
|
"image/gif",
|
||||||
"image/svg+xml",
|
"image/svg+xml",
|
||||||
"video/mp4",
|
"video/mp4",
|
||||||
"video/quicktime",
|
"video/quicktime",
|
||||||
]:
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def attachment_to_content(
|
||||||
|
confluence_client: OnyxConfluence,
|
||||||
|
attachment: dict[str, Any],
|
||||||
|
) -> str | None:
|
||||||
|
"""If it returns None, assume that we should skip this attachment."""
|
||||||
|
if not validate_attachment_filetype(attachment):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
download_link = confluence_client.url + attachment["_links"]["download"]
|
download_link = confluence_client.url + attachment["_links"]["download"]
|
||||||
@@ -245,7 +249,7 @@ def build_confluence_document_id(
|
|||||||
return f"{base_url}{content_url}"
|
return f"{base_url}{content_url}"
|
||||||
|
|
||||||
|
|
||||||
def extract_referenced_attachment_names(page_text: str) -> list[str]:
|
def _extract_referenced_attachment_names(page_text: str) -> list[str]:
|
||||||
"""Parse a Confluence html page to generate a list of current
|
"""Parse a Confluence html page to generate a list of current
|
||||||
attachments in use
|
attachments in use
|
||||||
|
|
||||||
|
@@ -242,7 +242,9 @@ def _fetch_all_page_restrictions_for_space(
|
|||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
logger.warning(f"No permissions found for document {slim_doc.id}")
|
logger.warning(
|
||||||
|
f"No permissions found for document {slim_doc.id} in space {space_key}"
|
||||||
|
)
|
||||||
|
|
||||||
logger.debug("Finished fetching all page restrictions for space")
|
logger.debug("Finished fetching all page restrictions for space")
|
||||||
return document_restrictions
|
return document_restrictions
|
||||||
|
@@ -9,6 +9,7 @@ from danswer.connectors.confluence.connector import ConfluenceConnector
|
|||||||
def confluence_connector() -> ConfluenceConnector:
|
def confluence_connector() -> ConfluenceConnector:
|
||||||
connector = ConfluenceConnector(
|
connector = ConfluenceConnector(
|
||||||
wiki_base="https://danswerai.atlassian.net",
|
wiki_base="https://danswerai.atlassian.net",
|
||||||
|
is_cloud=True,
|
||||||
)
|
)
|
||||||
connector.load_credentials(
|
connector.load_credentials(
|
||||||
{
|
{
|
||||||
|
Reference in New Issue
Block a user