diff --git a/backend/onyx/connectors/confluence/connector.py b/backend/onyx/connectors/confluence/connector.py index 76f694868..be006d877 100644 --- a/backend/onyx/connectors/confluence/connector.py +++ b/backend/onyx/connectors/confluence/connector.py @@ -65,20 +65,6 @@ _RESTRICTIONS_EXPANSION_FIELDS = [ _SLIM_DOC_BATCH_SIZE = 5000 -_ATTACHMENT_EXTENSIONS_TO_FILTER_OUT = [ - "gif", - "mp4", - "mov", - "mp3", - "wav", -] -_FULL_EXTENSION_FILTER_STRING = "".join( - [ - f" and title!~'*.{extension}'" - for extension in _ATTACHMENT_EXTENSIONS_TO_FILTER_OUT - ] -) - class ConfluenceConnector( LoadConnector, @@ -207,7 +193,6 @@ class ConfluenceConnector( def _construct_attachment_query(self, confluence_page_id: str) -> str: attachment_query = f"type=attachment and container='{confluence_page_id}'" attachment_query += self.cql_label_filter - attachment_query += _FULL_EXTENSION_FILTER_STRING return attachment_query def _get_comment_string_for_page_id(self, page_id: str) -> str: @@ -372,11 +357,13 @@ class ConfluenceConnector( if not validate_attachment_filetype( attachment, ): + logger.info(f"Skipping attachment: {attachment['title']}") continue + logger.info(f"Processing attachment: {attachment['title']}") + # Attempt to get textual content or image summarization: try: - logger.info(f"Processing attachment: {attachment['title']}") response = convert_attachment_to_content( confluence_client=self.confluence_client, attachment=attachment, diff --git a/backend/onyx/file_processing/file_validation.py b/backend/onyx/file_processing/file_validation.py index fa4df5a42..0041fd4a7 100644 --- a/backend/onyx/file_processing/file_validation.py +++ b/backend/onyx/file_processing/file_validation.py @@ -15,6 +15,7 @@ EXCLUDED_IMAGE_TYPES = [ "image/tiff", "image/gif", "image/svg+xml", + "image/avif", ]