Confluence: Add page attachments indexing (#1617)

* Confluence: Add page attachments indexing
* used the centralized file processing to extract file content
* flipped input order for extract_file_text
* added bytes support for pdf converter
* brought out the io.BytesIO to the confluence connector

---------

Co-authored-by: Matthieu Boret <matthieu.boret@fr.clara.net>
2025-10-03 18:08:58 +02:00 · 2024-06-11 21:23:13 -04:00
parent 8c324f8f01
commit 486b0ecb31
1 changed files with 64 additions and 2 deletions
--- a/backend/danswer/connectors/confluence/connector.py
+++ b/backend/danswer/connectors/confluence/connector.py
@@ -1,3 +1,5 @@
+import io
+import os
 from collections.abc import Callable
 from collections.abc import Collection
 from datetime import datetime
@@ -27,6 +29,7 @@ from danswer.connectors.models import BasicExpertInfo
 from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
+from danswer.file_processing.extract_file_text import extract_file_text
 from danswer.file_processing.html_utils import format_document_soup
 from danswer.utils.logger import setup_logger

@@ -147,6 +150,24 @@ def parse_html_page(text: str, confluence_client: Confluence) -> str:
    return format_document_soup(soup)


+def get_used_attachments(text: str, confluence_client: Confluence) -> list[str]:
+    """Parse a Confluence HTML page and list the attachments it references.
+
+    Args:
+        text (str): The page content
+        confluence_client (Confluence): Confluence client (not read by this
+            function; kept so existing callers keep working)
+
+    Returns:
+        list[str]: Filenames of the attachments currently in use, in the
+            order their `ri:attachment` tags appear in the page
+    """
+    soup = bs4.BeautifulSoup(text, "html.parser")
+    # `find_all` is the supported spelling of the legacy `findAll` alias.
+    # `.get` skips an <ri:attachment> tag that carries no `ri:filename`
+    # attribute instead of aborting the whole page with a KeyError.
+    return [
+        filename
+        for attachment in soup.find_all("ri:attachment")
+        if (filename := attachment.attrs.get("ri:filename")) is not None
+    ]
+
+
 def _comment_dfs(
    comments_str: str,
    comment_pages: Collection[dict[str, Any]],
@@ -321,6 +342,43 @@ class ConfluenceConnector(LoadConnector, PollConnector):
            logger.exception("Ran into exception when fetching labels from Confluence")
            return []

+    def _fetch_attachments(
+        self, confluence_client: Confluence, page_id: str, files_in_used: list[str]
+    ) -> str:
+        """Download the attachments referenced by a page and extract their text.
+
+        JPEG/PNG attachments and attachments whose title is not listed in
+        `files_in_used` are skipped.
+
+        Args:
+            confluence_client: Client used to list and download attachments.
+            page_id: ID of the page whose attachments are fetched.
+            files_in_used: Filenames referenced in the page body.
+
+        Returns:
+            The extracted text of every processed attachment, joined with
+            newlines; an empty string when nothing was extracted.
+
+        Raises:
+            Exception: Whatever the listing, download or extraction raised,
+                re-raised only when `self.continue_on_failure` is falsy.
+        """
+        get_attachments_from_content = make_confluence_call_handle_rate_limit(
+            confluence_client.get_attachments_from_content
+        )
+
+        try:
+            # NOTE(review): only the first 500 attachments of a page are
+            # listed; anything beyond that is never requested.
+            attachments = get_attachments_from_content(
+                page_id, start=0, limit=500
+            )["results"]
+        except Exception as e:
+            if not self.continue_on_failure:
+                raise
+            logger.exception(
+                f"Ran into exception when fetching attachments from Confluence: {e}"
+            )
+            return ""
+
+        # Built once so the per-attachment membership test is O(1).
+        used_filenames = set(files_in_used)
+        files_attachment_content: list[str] = []
+
+        for attachment in attachments:
+            # The try is per attachment so that, with continue_on_failure set,
+            # one broken file does not discard the attachments after it.
+            try:
+                if attachment["metadata"]["mediaType"] in (
+                    "image/jpeg",
+                    "image/png",
+                ):
+                    continue
+
+                if attachment["title"] not in used_filenames:
+                    continue
+
+                download_link = (
+                    confluence_client.url + attachment["_links"]["download"]
+                )
+                response = confluence_client._session.get(download_link)
+
+                if response.status_code != 200:
+                    # Previously dropped without a trace; leave a breadcrumb.
+                    logger.warning(
+                        "Skipping attachment %s: download returned HTTP %s",
+                        attachment["title"],
+                        response.status_code,
+                    )
+                    continue
+
+                files_attachment_content.append(
+                    extract_file_text(
+                        attachment["title"], io.BytesIO(response.content)
+                    )
+                )
+            except Exception as e:
+                if not self.continue_on_failure:
+                    raise
+                logger.exception(
+                    f"Ran into exception when fetching attachments from Confluence: {e}"
+                )
+
+        return "\n".join(files_attachment_content)
+
    def _get_doc_batch(
        self, start_ind: int, time_filter: Callable[[datetime], bool] | None = None
    ) -> tuple[list[Document], int]:
@@ -366,6 +424,12 @@ class ConfluenceConnector(LoadConnector, PollConnector):
                    logger.debug("Page is empty, skipping: %s", page_url)
                    continue
                page_text = parse_html_page(page_html, self.confluence_client)
+
+                files_in_used = get_used_attachments(page_html, self.confluence_client)
+                attachment_text = self._fetch_attachments(
+                    self.confluence_client, page_id, files_in_used
+                )
+                page_text += attachment_text
                comments_text = self._fetch_comments(self.confluence_client, page_id)
                page_text += comments_text

@@ -423,8 +487,6 @@ class ConfluenceConnector(LoadConnector, PollConnector):


 if __name__ == "__main__":
-    import os
-
    connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
    connector.load_credentials(
        {