Confluence: Add page attachments indexing (#1617)

* Confluence: Add page attachments indexing * used the centralized file processing to extract file content * flipped input order for extract_file_text * added bytes support for pdf converter * brought out the io.BytesIO to the confluence connector --------- Co-authored-by: Matthieu Boret <matthieu.boret@fr.clara.net>
2025-10-10 05:05:34 +02:00 · 2024-06-11 21:23:13 -04:00
parent 8c324f8f01
commit 486b0ecb31
1 changed files with 64 additions and 2 deletions
--- a/backend/danswer/connectors/confluence/connector.py
+++ b/backend/danswer/connectors/confluence/connector.py
@@ -1,3 +1,5 @@
 import io
 import os
 from collections.abc import Callable
 from collections.abc import Collection
 from datetime import datetime
@@ -27,6 +29,7 @@ from danswer.connectors.models import BasicExpertInfo
 from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.file_processing.extract_file_text import extract_file_text
 from danswer.file_processing.html_utils import format_document_soup
 from danswer.utils.logger import setup_logger
@@ -147,6 +150,24 @@ def parse_html_page(text: str, confluence_client: Confluence) -> str:
    return format_document_soup(soup)
 def get_used_attachments(text: str, confluence_client: Confluence) -> list[str]:
    """Return the filenames of the attachments referenced in a Confluence page.

    Every ``ri:attachment`` element found in the page markup contributes its
    ``ri:filename`` attribute, in document order (duplicates are kept).
    Elements that carry no ``ri:filename`` attribute are skipped instead of
    raising ``KeyError``, so one malformed tag does not abort the whole page.

    Args:
        text (str): The page content
        confluence_client (Confluence): Confluence client; not used by this
            function, kept so the signature stays unchanged for callers

    Returns:
        list[str]: Filenames currently in use on the page
    """
    soup = bs4.BeautifulSoup(text, "html.parser")
    return [
        attachment.attrs["ri:filename"]
        for attachment in soup.find_all("ri:attachment")
        if "ri:filename" in attachment.attrs
    ]
 def _comment_dfs(
    comments_str: str,
    comment_pages: Collection[dict[str, Any]],
@@ -321,6 +342,43 @@ class ConfluenceConnector(LoadConnector, PollConnector):
            logger.exception("Ran into exception when fetching labels from Confluence")
            return []
    def _fetch_attachments(
        self, confluence_client: Confluence, page_id: str, files_in_used: list[str]
    ) -> str:
        """Download the attachments used by a page and return their extracted text.

        Lists up to 500 attachments of the page, skips JPEG/PNG images and any
        attachment whose title is not in ``files_in_used``, downloads the rest
        and passes each one (title plus an in-memory byte stream) to
        ``extract_file_text``.

        Args:
            confluence_client (Confluence): Client used to list and download
                the attachments
            page_id (str): Id of the page whose attachments are fetched
            files_in_used (list[str]): Attachment filenames referenced by the
                page; only these are downloaded

        Returns:
            str: The extracted texts joined with newlines; an empty string
                when nothing was extracted

        Raises:
            Exception: Any error raised while listing, downloading or
                extracting is re-raised when ``self.continue_on_failure`` is
                falsy; otherwise it is logged and processing continues.
        """
        get_attachments_from_content = make_confluence_call_handle_rate_limit(
            confluence_client.get_attachments_from_content
        )
        skipped_media_types = ("image/jpeg", "image/png")
        # Build the lookup once so the per-attachment filter is O(1).
        referenced_files = set(files_in_used)
        files_attachment_content: list[str] = []

        try:
            attachments_container = get_attachments_from_content(
                page_id, start=0, limit=500
            )
            attachments = attachments_container["results"]
        except Exception as e:
            if not self.continue_on_failure:
                raise
            logger.exception(
                f"Ran into exception when fetching attachments from Confluence: {e}"
            )
            return ""

        for attachment in attachments:
            # Errors are handled per attachment so that, when failures are
            # tolerated, one bad file does not discard the remaining ones.
            try:
                if attachment["metadata"]["mediaType"] in skipped_media_types:
                    continue
                title = attachment["title"]
                if title not in referenced_files:
                    continue

                download_link = confluence_client.url + attachment["_links"]["download"]
                response = confluence_client._session.get(download_link)
                if response.status_code != 200:
                    # Previously dropped silently; leave a trace for debugging.
                    logger.warning(
                        "Failed to download Confluence attachment '%s' "
                        "(status code %s), skipping",
                        title,
                        response.status_code,
                    )
                    continue

                extract = extract_file_text(title, io.BytesIO(response.content))
                files_attachment_content.append(extract)
            except Exception as e:
                if not self.continue_on_failure:
                    raise
                logger.exception(
                    f"Ran into exception when fetching attachments from Confluence: {e}"
                )

        return "\n".join(files_attachment_content)
    def _get_doc_batch(
        self, start_ind: int, time_filter: Callable[[datetime], bool] | None = None
    ) -> tuple[list[Document], int]:
@@ -366,6 +424,12 @@ class ConfluenceConnector(LoadConnector, PollConnector):
                    logger.debug("Page is empty, skipping: %s", page_url)
                    continue
                page_text = parse_html_page(page_html, self.confluence_client)
                files_in_used = get_used_attachments(page_html, self.confluence_client)
                attachment_text = self._fetch_attachments(
                    self.confluence_client, page_id, files_in_used
                )
                page_text += attachment_text
                comments_text = self._fetch_comments(self.confluence_client, page_id)
                page_text += comments_text
@@ -423,8 +487,6 @@ class ConfluenceConnector(LoadConnector, PollConnector):
 if __name__ == "__main__":
    import os
    connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
    connector.load_credentials(
        {