diff --git a/backend/onyx/connectors/confluence/connector.py b/backend/onyx/connectors/confluence/connector.py index 5f9faddc2..05b2692a2 100644 --- a/backend/onyx/connectors/confluence/connector.py +++ b/backend/onyx/connectors/confluence/connector.py @@ -263,6 +263,7 @@ class ConfluenceConnector( result = process_attachment( self.confluence_client, attachment, + page_id, page_title, self.image_analysis_llm, ) @@ -366,6 +367,7 @@ class ConfluenceConnector( response = convert_attachment_to_content( confluence_client=self.confluence_client, attachment=attachment, + page_id=page["id"], page_context=confluence_xml, llm=self.image_analysis_llm, ) diff --git a/backend/onyx/connectors/confluence/onyx_confluence.py b/backend/onyx/connectors/confluence/onyx_confluence.py index 427866a78..aabe43194 100644 --- a/backend/onyx/connectors/confluence/onyx_confluence.py +++ b/backend/onyx/connectors/confluence/onyx_confluence.py @@ -1,4 +1,3 @@ -import io import json import time from collections.abc import Callable @@ -19,17 +18,11 @@ from requests import HTTPError from ee.onyx.configs.app_configs import OAUTH_CONFLUENCE_CLOUD_CLIENT_ID from ee.onyx.configs.app_configs import OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET -from onyx.configs.app_configs import ( - CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD, -) -from onyx.configs.app_configs import CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD from onyx.connectors.confluence.utils import _handle_http_error from onyx.connectors.confluence.utils import confluence_refresh_tokens from onyx.connectors.confluence.utils import get_start_param_from_url from onyx.connectors.confluence.utils import update_param_in_path -from onyx.connectors.confluence.utils import validate_attachment_filetype from onyx.connectors.interfaces import CredentialsProviderInterface -from onyx.file_processing.extract_file_text import extract_file_text from onyx.file_processing.html_utils import format_document_soup from onyx.redis.redis_pool import get_redis_client from onyx.utils.logger import setup_logger @@ -808,65 +801,6 @@ def _get_user(confluence_client: OnyxConfluence, user_id: str) -> str: return _USER_ID_TO_DISPLAY_NAME_CACHE.get(user_id) or _USER_NOT_FOUND -def attachment_to_content( - confluence_client: OnyxConfluence, - attachment: dict[str, Any], - parent_content_id: str | None = None, -) -> str | None: - """If it returns None, assume that we should skip this attachment.""" - if not validate_attachment_filetype(attachment): - return None - - if "api.atlassian.com" in confluence_client.url: - # https://developer.atlassian.com/cloud/confluence/rest/v1/api-group-content---attachments/#api-wiki-rest-api-content-id-child-attachment-attachmentid-download-get - if not parent_content_id: - logger.warning( - "parent_content_id is required to download attachments from Confluence Cloud!" - ) - return None - - download_link = ( - confluence_client.url - + f"/rest/api/content/{parent_content_id}/child/attachment/{attachment['id']}/download" - ) - else: - download_link = confluence_client.url + attachment["_links"]["download"] - - attachment_size = attachment["extensions"]["fileSize"] - if attachment_size > CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD: - logger.warning( - f"Skipping {download_link} due to size. " - f"size={attachment_size} " - f"threshold={CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD}" - ) - return None - - logger.info(f"_attachment_to_content - _session.get: link={download_link}") - - # why are we using session.get here? we probably won't retry these ... is that ok? - response = confluence_client._session.get(download_link) - if response.status_code != 200: - logger.warning( - f"Failed to fetch {download_link} with invalid status code {response.status_code}" - ) - return None - - extracted_text = extract_file_text( - io.BytesIO(response.content), - file_name=attachment["title"], - break_on_unprocessable=False, - ) - if len(extracted_text) > CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD: - logger.warning( - f"Skipping {download_link} due to char count. " - f"char count={len(extracted_text)} " - f"threshold={CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD}" - ) - return None - - return extracted_text - - def extract_text_from_confluence_html( confluence_client: OnyxConfluence, confluence_object: dict[str, Any], diff --git a/backend/onyx/connectors/confluence/utils.py b/backend/onyx/connectors/confluence/utils.py index 9bf1c82d0..d319f12ce 100644 --- a/backend/onyx/connectors/confluence/utils.py +++ b/backend/onyx/connectors/confluence/utils.py @@ -22,6 +22,7 @@ from sqlalchemy.orm import Session from onyx.configs.app_configs import ( CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD, ) +from onyx.configs.app_configs import CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD from onyx.configs.constants import FileOrigin if TYPE_CHECKING: @@ -84,25 +85,35 @@ class AttachmentProcessingResult(BaseModel): error: str | None = None -def _download_attachment( - confluence_client: "OnyxConfluence", attachment: dict[str, Any] -) -> bytes | None: - """ - Retrieves the raw bytes of an attachment from Confluence. Returns None on error. - """ - download_link = confluence_client.url + attachment["_links"]["download"] - resp = confluence_client._session.get(download_link) - if resp.status_code != 200: - logger.warning( - f"Failed to fetch {download_link} with status code {resp.status_code}" +def _make_attachment_link( + confluence_client: "OnyxConfluence", + attachment: dict[str, Any], + parent_content_id: str | None = None, +) -> str | None: + download_link = "" + + if "api.atlassian.com" in confluence_client.url: + # https://developer.atlassian.com/cloud/confluence/rest/v1/api-group-content---attachments/#api-wiki-rest-api-content-id-child-attachment-attachmentid-download-get + if not parent_content_id: + logger.warning( + "parent_content_id is required to download attachments from Confluence Cloud!" + ) + return None + + download_link = ( + confluence_client.url + + f"/rest/api/content/{parent_content_id}/child/attachment/{attachment['id']}/download" ) - return None - return resp.content + else: + download_link = confluence_client.url + attachment["_links"]["download"] + + return download_link def process_attachment( confluence_client: "OnyxConfluence", attachment: dict[str, Any], + parent_content_id: str | None, page_context: str, llm: LLM | None, ) -> AttachmentProcessingResult: @@ -122,11 +133,52 @@ def process_attachment( error=f"Unsupported file type: {media_type}", ) - # Download the attachment - raw_bytes = _download_attachment(confluence_client, attachment) - if raw_bytes is None: + attachment_link = _make_attachment_link( + confluence_client, attachment, parent_content_id + ) + if not attachment_link: return AttachmentProcessingResult( - text=None, file_name=None, error="Failed to download attachment" + text=None, file_name=None, error="Failed to make attachment link" + ) + + attachment_size = attachment["extensions"]["fileSize"] + + if not media_type.startswith("image/") or not llm: + if attachment_size > CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD: + logger.warning( + f"Skipping {attachment_link} due to size. " + f"size={attachment_size} " + f"threshold={CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD}" + ) + return AttachmentProcessingResult( + text=None, + file_name=None, + error=f"Attachment text too long: {attachment_size} chars", + ) + + logger.info( + f"Downloading attachment: " + f"title={attachment['title']} " + f"length={attachment_size} " + f"link={attachment_link}" + ) + + # Download the attachment + resp: requests.Response = confluence_client._session.get(attachment_link) + if resp.status_code != 200: + logger.warning( + f"Failed to fetch {attachment_link} with status code {resp.status_code}" + ) + return AttachmentProcessingResult( + text=None, + file_name=None, + error=f"Attachment download status code is {resp.status_code}", + ) + + raw_bytes = resp.content + if not raw_bytes: + return AttachmentProcessingResult( + text=None, file_name=None, error="attachment.content is None" ) # Process image attachments with LLM if available @@ -249,6 +301,7 @@ def _process_text_attachment( def convert_attachment_to_content( confluence_client: "OnyxConfluence", attachment: dict[str, Any], + page_id: str, page_context: str, llm: LLM | None, ) -> tuple[str | None, str | None] | None: @@ -266,7 +319,9 @@ def convert_attachment_to_content( ) return None - result = process_attachment(confluence_client, attachment, page_context, llm) + result = process_attachment( + confluence_client, attachment, page_id, page_context, llm + ) if result.error is not None: logger.warning( f"Attachment {attachment['title']} encountered error: {result.error}"