Confluence: Add page attachments indexing (#1617)

* Confluence: Add page attachments indexing

* used the centralized file processing to extract file content

* flipped input order for extract_file_text

* added bytes support for pdf converter

* brought out the io.BytesIO to the confluence connector

---------

Co-authored-by: Matthieu Boret <matthieu.boret@fr.clara.net>
This commit is contained in:
hagen-danswer
2024-06-11 21:23:13 -04:00
committed by GitHub
parent 8c324f8f01
commit 486b0ecb31

View File

@@ -1,3 +1,5 @@
import io
import os
from collections.abc import Callable
from collections.abc import Collection
from datetime import datetime
@@ -27,6 +29,7 @@ from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.extract_file_text import extract_file_text
from danswer.file_processing.html_utils import format_document_soup
from danswer.utils.logger import setup_logger
@@ -147,6 +150,24 @@ def parse_html_page(text: str, confluence_client: Confluence) -> str:
return format_document_soup(soup)
def get_used_attachments(text: str, confluence_client: Confluence) -> list[str]:
    """Parse a Confluence HTML page and list the attachments it references.

    Every ``ri:attachment`` tag found in the page contributes the value of
    its ``ri:filename`` attribute. Tags without that attribute are skipped
    rather than raising ``KeyError``, so one malformed reference cannot
    abort the processing of the whole page.

    Args:
        text (str): The page content
        confluence_client (Confluence): Confluence client (not read by this
            function; kept so the signature stays the same for callers)

    Returns:
        list[str]: Filenames currently in use, in document order
    """
    soup = bs4.BeautifulSoup(text, "html.parser")
    return [
        attachment.attrs["ri:filename"]
        for attachment in soup.find_all("ri:attachment")
        if "ri:filename" in attachment.attrs
    ]
def _comment_dfs(
comments_str: str,
comment_pages: Collection[dict[str, Any]],
@@ -321,6 +342,43 @@ class ConfluenceConnector(LoadConnector, PollConnector):
logger.exception("Ran into exception when fetching labels from Confluence")
return []
def _fetch_attachments(
    self, confluence_client: Confluence, page_id: str, files_in_used: list[str]
) -> str:
    """Download the attachments referenced by a page and extract their text.

    Attachments whose media type is JPEG or PNG are skipped, as are
    attachments whose title is not listed in ``files_in_used``.

    Args:
        confluence_client (Confluence): Confluence client used to list and
            download the attachments
        page_id (str): Id of the page whose attachments are fetched
        files_in_used (list[str]): Filenames referenced by the page content

    Returns:
        str: Extracted text of every processed attachment, joined with
            newlines ("" when nothing was extracted)

    Raises:
        Exception: Any error raised while listing, downloading or parsing
            an attachment is re-raised when ``self.continue_on_failure``
            is falsy; otherwise it is logged and processing continues.
    """
    # Set lookup avoids rescanning the filename list for every attachment.
    referenced_files = set(files_in_used)
    if not referenced_files:
        # Nothing on the page references an attachment, so every attachment
        # would be filtered out below; skip the API round-trip entirely.
        return ""

    get_attachments_from_content = make_confluence_call_handle_rate_limit(
        confluence_client.get_attachments_from_content
    )
    files_attachment_content: list = []

    try:
        # NOTE(review): only the first 500 attachments are requested; pages
        # with more than that would need pagination - confirm if relevant.
        attachments_container = get_attachments_from_content(
            page_id, start=0, limit=500
        )
        attachments = attachments_container["results"]
    except Exception as e:
        if not self.continue_on_failure:
            raise e
        logger.exception(
            f"Ran into exception when fetching attachments from Confluence: {e}"
        )
        return ""

    for attachment in attachments:
        # Failures are handled per attachment so that one broken file does
        # not discard the text of the attachments that follow it.
        try:
            if attachment["metadata"]["mediaType"] in ["image/jpeg", "image/png"]:
                continue

            if attachment["title"] not in referenced_files:
                continue

            download_link = confluence_client.url + attachment["_links"]["download"]
            # NOTE(review): this goes through the client's private session
            # and is not wrapped by the rate-limit handler used above.
            response = confluence_client._session.get(download_link)

            if response.status_code != 200:
                logger.warning(
                    f"Failed to download attachment '{attachment['title']}' "
                    f"from Confluence: HTTP {response.status_code}"
                )
                continue

            extract = extract_file_text(
                attachment["title"], io.BytesIO(response.content)
            )
            files_attachment_content.append(extract)
        except Exception as e:
            if not self.continue_on_failure:
                raise e
            logger.exception(
                f"Ran into exception when fetching attachments from Confluence: {e}"
            )

    return "\n".join(files_attachment_content)
def _get_doc_batch(
self, start_ind: int, time_filter: Callable[[datetime], bool] | None = None
) -> tuple[list[Document], int]:
@@ -366,6 +424,12 @@ class ConfluenceConnector(LoadConnector, PollConnector):
logger.debug("Page is empty, skipping: %s", page_url)
continue
page_text = parse_html_page(page_html, self.confluence_client)
files_in_used = get_used_attachments(page_html, self.confluence_client)
attachment_text = self._fetch_attachments(
self.confluence_client, page_id, files_in_used
)
page_text += attachment_text
comments_text = self._fetch_comments(self.confluence_client, page_id)
page_text += comments_text
@@ -423,8 +487,6 @@ class ConfluenceConnector(LoadConnector, PollConnector):
if __name__ == "__main__":
import os
connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
connector.load_credentials(
{