mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-10-10 05:05:34 +02:00
Confluence: Add page attachments indexing (#1617)
* Confluence: Add page attachments indexing * used the centralized file processing to extract file content * flipped input order for extract_file_text * added bytes support for pdf converter * brought out the io.BytesIO to the confluence connector --------- Co-authored-by: Matthieu Boret <matthieu.boret@fr.clara.net>
This commit is contained in:
@@ -1,3 +1,5 @@
|
|||||||
|
import io
|
||||||
|
import os
|
||||||
from collections.abc import Callable
|
from collections.abc import Callable
|
||||||
from collections.abc import Collection
|
from collections.abc import Collection
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
@@ -27,6 +29,7 @@ from danswer.connectors.models import BasicExpertInfo
|
|||||||
from danswer.connectors.models import ConnectorMissingCredentialError
|
from danswer.connectors.models import ConnectorMissingCredentialError
|
||||||
from danswer.connectors.models import Document
|
from danswer.connectors.models import Document
|
||||||
from danswer.connectors.models import Section
|
from danswer.connectors.models import Section
|
||||||
|
from danswer.file_processing.extract_file_text import extract_file_text
|
||||||
from danswer.file_processing.html_utils import format_document_soup
|
from danswer.file_processing.html_utils import format_document_soup
|
||||||
from danswer.utils.logger import setup_logger
|
from danswer.utils.logger import setup_logger
|
||||||
|
|
||||||
@@ -147,6 +150,24 @@ def parse_html_page(text: str, confluence_client: Confluence) -> str:
|
|||||||
return format_document_soup(soup)
|
return format_document_soup(soup)
|
||||||
|
|
||||||
|
|
||||||
|
def get_used_attachments(text: str, confluence_client: Confluence) -> list[str]:
    """Parse a Confluence html page and list the attachments it references.

    Every ``ri:attachment`` tag found in the page contributes the value of its
    ``ri:filename`` attribute. Tags that carry no ``ri:filename`` attribute are
    skipped rather than raising, so one malformed tag cannot abort the page.

    Args:
        text (str): The page content
        confluence_client (Confluence): Confluence client (not read by the
            parsing itself; kept so the signature stays unchanged)

    Returns:
        list[str]: Filenames currently in use, in document order
    """
    soup = bs4.BeautifulSoup(text, "html.parser")
    # `find_all` is the supported spelling; `findAll` is a deprecated alias.
    return [
        attachment.attrs["ri:filename"]
        for attachment in soup.find_all("ri:attachment")
        if "ri:filename" in attachment.attrs
    ]
|
||||||
|
|
||||||
|
|
||||||
def _comment_dfs(
|
def _comment_dfs(
|
||||||
comments_str: str,
|
comments_str: str,
|
||||||
comment_pages: Collection[dict[str, Any]],
|
comment_pages: Collection[dict[str, Any]],
|
||||||
@@ -321,6 +342,43 @@ class ConfluenceConnector(LoadConnector, PollConnector):
|
|||||||
logger.exception("Ran into exception when fetching labels from Confluence")
|
logger.exception("Ran into exception when fetching labels from Confluence")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
def _fetch_attachments(
    self, confluence_client: Confluence, page_id: str, files_in_used: list[str]
) -> str:
    """Download the attachments used by a page and return their extracted text.

    Lists the attachments of ``page_id`` (first 500 only; no further
    pagination is done), skips JPEG/PNG images and any attachment whose title
    is not in ``files_in_used``, downloads the rest through the client's
    session and runs each through ``extract_file_text``.

    Args:
        confluence_client (Confluence): Confluence client used for the
            listing call and for the download session
        page_id (str): Id of the page whose attachments are fetched
        files_in_used (list[str]): Filenames referenced by the page body

    Returns:
        str: Extracted text of every processed attachment, joined by
            newlines; empty string when nothing was extracted

    Raises:
        Exception: Any error from listing, downloading or extracting is
            re-raised when ``self.continue_on_failure`` is false. Otherwise
            it is logged; a listing failure yields an empty result and a
            per-attachment failure only skips that attachment.
    """
    get_attachments_from_content = make_confluence_call_handle_rate_limit(
        confluence_client.get_attachments_from_content
    )
    # Set membership keeps the per-attachment "is it used?" check O(1).
    used_filenames = set(files_in_used)
    files_attachment_content: list = []

    try:
        attachments_container = get_attachments_from_content(
            page_id, start=0, limit=500
        )
        attachments = attachments_container["results"]
    except Exception as e:
        if not self.continue_on_failure:
            raise
        logger.exception(
            f"Ran into exception when fetching attachments from Confluence: {e}"
        )
        return ""

    for attachment in attachments:
        # Each attachment is guarded on its own so that one broken file does
        # not discard the text of every attachment that follows it.
        try:
            if attachment["metadata"]["mediaType"] in ["image/jpeg", "image/png"]:
                continue

            if attachment["title"] not in used_filenames:
                continue

            download_link = confluence_client.url + attachment["_links"]["download"]
            response = confluence_client._session.get(download_link)

            if response.status_code != 200:
                # Previously skipped silently; surface it for debugging.
                logger.warning(
                    "Failed to download Confluence attachment '%s' (status %s)",
                    attachment["title"],
                    response.status_code,
                )
                continue

            extract = extract_file_text(
                attachment["title"], io.BytesIO(response.content)
            )
            files_attachment_content.append(extract)
        except Exception as e:
            if not self.continue_on_failure:
                raise
            logger.exception(
                f"Ran into exception when fetching attachments from Confluence: {e}"
            )

    return "\n".join(files_attachment_content)
|
||||||
|
|
||||||
def _get_doc_batch(
|
def _get_doc_batch(
|
||||||
self, start_ind: int, time_filter: Callable[[datetime], bool] | None = None
|
self, start_ind: int, time_filter: Callable[[datetime], bool] | None = None
|
||||||
) -> tuple[list[Document], int]:
|
) -> tuple[list[Document], int]:
|
||||||
@@ -366,6 +424,12 @@ class ConfluenceConnector(LoadConnector, PollConnector):
|
|||||||
logger.debug("Page is empty, skipping: %s", page_url)
|
logger.debug("Page is empty, skipping: %s", page_url)
|
||||||
continue
|
continue
|
||||||
page_text = parse_html_page(page_html, self.confluence_client)
|
page_text = parse_html_page(page_html, self.confluence_client)
|
||||||
|
|
||||||
|
files_in_used = get_used_attachments(page_html, self.confluence_client)
|
||||||
|
attachment_text = self._fetch_attachments(
|
||||||
|
self.confluence_client, page_id, files_in_used
|
||||||
|
)
|
||||||
|
page_text += attachment_text
|
||||||
comments_text = self._fetch_comments(self.confluence_client, page_id)
|
comments_text = self._fetch_comments(self.confluence_client, page_id)
|
||||||
page_text += comments_text
|
page_text += comments_text
|
||||||
|
|
||||||
@@ -423,8 +487,6 @@ class ConfluenceConnector(LoadConnector, PollConnector):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import os
|
|
||||||
|
|
||||||
connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
|
connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
|
||||||
connector.load_credentials(
|
connector.load_credentials(
|
||||||
{
|
{
|
||||||
|
Reference in New Issue
Block a user