Replace user id by the user display name in the exported Confluence page (#815)

Co-authored-by: Matthieu Boret <matthieu.boret@fr.clara.net>
This commit is contained in:
mattboret
2023-12-22 02:52:28 +01:00
committed by GitHub
parent 56406a0b53
commit 4d950aa60d

View File

@@ -2,10 +2,12 @@ from collections.abc import Callable
from collections.abc import Collection from collections.abc import Collection
from datetime import datetime from datetime import datetime
from datetime import timezone from datetime import timezone
from functools import lru_cache
from typing import Any from typing import Any
from typing import cast from typing import cast
from urllib.parse import urlparse from urllib.parse import urlparse
import bs4
from atlassian import Confluence # type:ignore from atlassian import Confluence # type:ignore
from requests import HTTPError from requests import HTTPError
@@ -13,7 +15,7 @@ from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic from danswer.connectors.cross_connector_utils.html_utils import format_document_soup
from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import PollConnector
@@ -85,6 +87,52 @@ def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, bool]:
return wiki_base, space, is_confluence_cloud return wiki_base, space, is_confluence_cloud
@lru_cache()
def _get_user(user_id: str, confluence_client: Confluence) -> str:
    """Get the Confluence display name for an account-id or userkey value.

    The caller passes either kind of identifier through the same parameter,
    so the id is first looked up as an account-id and, if that lookup fails,
    as a userkey. Results (including the fallback value) are memoized by
    ``lru_cache`` per ``(user_id, confluence_client)`` pair.

    Args:
        user_id (str): The user id (i.e: the account-id or userkey)
        confluence_client (Confluence): The Confluence Client

    Returns:
        str: The User Display Name. 'Unknown User' if the user is deactivated
            or not found
    """
    user_not_found = "Unknown User"
    # Order matters: the account-id lookup stays first so ids that resolved
    # before this fallback existed still resolve with a single request.
    lookup_names = (
        "get_user_details_by_accountid",
        "get_user_details_by_userkey",
    )
    failures: list[str] = []
    for lookup_name in lookup_names:
        try:
            # getattr lives inside the try so a client lacking one of the
            # lookups degrades to the fallback instead of raising.
            details = getattr(confluence_client, lookup_name)(user_id)
            return details.get("displayName", user_not_found)
        except Exception as e:
            # Best effort: a failed user lookup must not abort page indexing.
            failures.append(str(e))

    errors = "; ".join(failures)
    logger.warning(
        f"Unable to get the User Display Name with the id: '{user_id}' - {errors}"
    )
    return user_not_found
def parse_html_page(text: str, confluence_client: Confluence) -> str:
    """Parse a Confluence html page and replace each user reference by the
    real User Display Name.

    Every ``ri:user`` element carrying an ``ri:account-id`` or ``ri:userkey``
    attribute is replaced by the name returned from ``_get_user``. Elements
    with neither attribute are left untouched. The resulting tree is rendered
    with ``format_document_soup``.

    Args:
        text (str): The page content
        confluence_client (Confluence): Confluence client

    Returns:
        str: loaded and formatted Confluence page
    """
    soup = bs4.BeautifulSoup(text, "html.parser")
    for user in soup.find_all("ri:user"):
        # Prefer the account-id when the attribute exists, else the userkey.
        user_id = user.attrs.get("ri:account-id", user.attrs.get("ri:userkey"))
        if user_id is None:
            # No known identifier on this element: skip it rather than raise
            # KeyError and lose the whole page.
            continue
        user.replace_with(_get_user(user_id, confluence_client))
    return format_document_soup(soup)
def _comment_dfs( def _comment_dfs(
comments_str: str, comments_str: str,
comment_pages: Collection[dict[str, Any]], comment_pages: Collection[dict[str, Any]],
@@ -92,7 +140,9 @@ def _comment_dfs(
) -> str: ) -> str:
for comment_page in comment_pages: for comment_page in comment_pages:
comment_html = comment_page["body"]["storage"]["value"] comment_html = comment_page["body"]["storage"]["value"]
comments_str += "\nComment:\n" + parse_html_page_basic(comment_html) comments_str += "\nComment:\n" + parse_html_page(
comment_html, confluence_client
)
child_comment_pages = confluence_client.get_page_child_by_type( child_comment_pages = confluence_client.get_page_child_by_type(
comment_page["id"], comment_page["id"],
type="comment", type="comment",
@@ -283,7 +333,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):
logger.debug("Page is empty, skipping: %s", page_url) logger.debug("Page is empty, skipping: %s", page_url)
continue continue
page_text = ( page_text = (
page.get("title", "") + "\n" + parse_html_page_basic(page_html) page.get("title", "")
+ "\n"
+ parse_html_page(page_html, self.confluence_client)
) )
comments_text = self._fetch_comments(self.confluence_client, page_id) comments_text = self._fetch_comments(self.confluence_client, page_id)
page_text += comments_text page_text += comments_text