From 036648146ddeee3ffb7a0a847d8774b7f7d67c0c Mon Sep 17 00:00:00 2001 From: rkuo-danswer Date: Wed, 26 Mar 2025 17:35:14 -0700 Subject: [PATCH] possible fix for confluence query filter (#4280) * possible fix for confluence query filter * nuke the attachment filter query ... it doesn't work! --------- Co-authored-by: Richard Kuo (Onyx) --- .../onyx/connectors/confluence/connector.py | 19 +++---------------- .../onyx/file_processing/file_validation.py | 1 + 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/backend/onyx/connectors/confluence/connector.py b/backend/onyx/connectors/confluence/connector.py index bb9d2d10e1..b17c52fd1e 100644 --- a/backend/onyx/connectors/confluence/connector.py +++ b/backend/onyx/connectors/confluence/connector.py @@ -65,20 +65,6 @@ _RESTRICTIONS_EXPANSION_FIELDS = [ _SLIM_DOC_BATCH_SIZE = 5000 -_ATTACHMENT_EXTENSIONS_TO_FILTER_OUT = [ - "gif", - "mp4", - "mov", - "mp3", - "wav", -] -_FULL_EXTENSION_FILTER_STRING = "".join( - [ - f" and title!~'*.{extension}'" - for extension in _ATTACHMENT_EXTENSIONS_TO_FILTER_OUT - ] -) - ONE_HOUR = 3600 @@ -209,7 +195,6 @@ class ConfluenceConnector( def _construct_attachment_query(self, confluence_page_id: str) -> str: attachment_query = f"type=attachment and container='{confluence_page_id}'" attachment_query += self.cql_label_filter - attachment_query += _FULL_EXTENSION_FILTER_STRING return attachment_query def _get_comment_string_for_page_id(self, page_id: str) -> str: @@ -374,11 +359,13 @@ class ConfluenceConnector( if not validate_attachment_filetype( attachment, ): + logger.info(f"Skipping attachment: {attachment['title']}") continue + logger.info(f"Processing attachment: {attachment['title']}") + # Attempt to get textual content or image summarization: try: - logger.info(f"Processing attachment: {attachment['title']}") response = convert_attachment_to_content( confluence_client=self.confluence_client, attachment=attachment, diff --git a/backend/onyx/file_processing/file_validation.py b/backend/onyx/file_processing/file_validation.py index fa4df5a429..0041fd4a70 100644 --- a/backend/onyx/file_processing/file_validation.py +++ b/backend/onyx/file_processing/file_validation.py @@ -15,6 +15,7 @@ EXCLUDED_IMAGE_TYPES = [ "image/tiff", "image/gif", "image/svg+xml", + "image/avif", ]