From ef2b4452011c027cab01835e24792a121d32e1c8 Mon Sep 17 00:00:00 2001
From: Chris Weaver <25087905+Weves@users.noreply.github.com>
Date: Tue, 24 Oct 2023 17:40:42 -0700
Subject: [PATCH] Support Confluence data center + allow for specifying labels to ignore (#624)

---
 backend/danswer/configs/app_configs.py        | 10 +++
 .../connectors/confluence/connector.py        | 86 ++++++++++++++++---
 .../docker_compose/docker-compose.dev.yml     |  4 +-
 .../app/admin/connectors/confluence/page.tsx  | 32 +++++--
 4 files changed, 114 insertions(+), 18 deletions(-)

diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py
index 97824541b..dc9be420b 100644
--- a/backend/danswer/configs/app_configs.py
+++ b/backend/danswer/configs/app_configs.py
@@ -109,9 +109,11 @@ POSTGRES_DB = os.environ.get("POSTGRES_DB") or "postgres"
 #####
 GOOGLE_DRIVE_INCLUDE_SHARED = False
 GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False
+
 FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
     "FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
 )
+
 # TODO these should be available for frontend configuration, via advanced options expandable
 WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
     "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer"
@@ -128,6 +130,14 @@ NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP = (
     == "true"
 )

+CONFLUENCE_CONNECTOR_LABELS_TO_SKIP = [
+    ignored_tag
+    for ignored_tag in os.environ.get("CONFLUENCE_CONNECTOR_LABELS_TO_SKIP", "").split(
+        ","
+    )
+    if ignored_tag
+]
+
 #####
 # Query Configs
 #####
diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py
index 634d2259a..d2ebda622 100644
--- a/backend/danswer/connectors/confluence/connector.py
+++ b/backend/danswer/connectors/confluence/connector.py
@@ -9,6 +9,7 @@ from urllib.parse import urlparse
 from atlassian import Confluence  # type:ignore
 from requests import HTTPError

+from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
 from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
@@ -30,17 +31,12 @@ logger = setup_logger()
 # 3. Segment into Sections for more accurate linking, can split by headers but make sure no text/ordering is lost


-def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str]:
+def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str]:
     """Sample
     https://danswer.atlassian.net/wiki/spaces/1234abcd/overview
-    wiki_base is danswer.atlassian.net/wiki
+    wiki_base is https://danswer.atlassian.net/wiki
     space is 1234abcd
     """
-    if ".atlassian.net/wiki/spaces/" not in wiki_url:
-        raise ValueError(
-            "Not a valid Confluence Wiki Link, unable to extract wiki base and space names"
-        )
-
     parsed_url = urlparse(wiki_url)
     wiki_base = (
         parsed_url.scheme
@@ -52,6 +48,42 @@ def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str]:
     return wiki_base, space


+def _extract_confluence_keys_from_datacenter_url(wiki_url: str) -> tuple[str, str]:
+    """Sample
+    https://danswer.ai/confluence/display/1234abcd/overview
+    wiki_base is https://danswer.ai/confluence
+    space is 1234abcd
+    """
+    # /display/ is always right before the space and at the end of the base URL
+    DISPLAY = "/display/"
+
+    parsed_url = urlparse(wiki_url)
+    wiki_base = (
+        parsed_url.scheme
+        + "://"
+        + parsed_url.netloc
+        + parsed_url.path.split(DISPLAY)[0]
+    )
+    space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
+    return wiki_base, space
+
+
+def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, bool]:
+    is_confluence_cloud = ".atlassian.net/wiki/spaces/" in wiki_url
+
+    try:
+        if is_confluence_cloud:
+            wiki_base, space = _extract_confluence_keys_from_cloud_url(wiki_url)
+        else:
+            wiki_base, space = _extract_confluence_keys_from_datacenter_url(wiki_url)
+    except Exception as e:
+        error_msg = f"Not a valid Confluence Wiki Link, unable to extract wiki base and space names. Exception: {e}"
+        logger.error(error_msg)
+        raise ValueError(error_msg) from e
+
+    return wiki_base, space, is_confluence_cloud
+
+
 def _comment_dfs(
     comments_str: str,
     comment_pages: Collection[dict[str, Any]],
@@ -79,10 +111,17 @@ class ConfluenceConnector(LoadConnector, PollConnector):
         wiki_page_url: str,
         batch_size: int = INDEX_BATCH_SIZE,
         continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
+        # if a page has one of the labels specified in this list, we will just
+        # skip it. This is generally used to avoid indexing extra-sensitive
+        # pages.
+        labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP,
     ) -> None:
         self.batch_size = batch_size
         self.continue_on_failure = continue_on_failure
-        self.wiki_base, self.space = extract_confluence_keys_from_url(wiki_page_url)
+        self.labels_to_skip = set(labels_to_skip)
+        self.wiki_base, self.space, self.is_cloud = extract_confluence_keys_from_url(
+            wiki_page_url
+        )
         self.confluence_client: Confluence | None = None

     def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
@@ -90,9 +129,10 @@ class ConfluenceConnector(LoadConnector, PollConnector):
         access_token = credentials["confluence_access_token"]
         self.confluence_client = Confluence(
             url=self.wiki_base,
-            username=username,
+            # passing in the username causes issues for Confluence Data Center
+            username=username if self.is_cloud else None,
             password=access_token,
-            cloud=True,
+            cloud=self.is_cloud,
         )
         return None

@@ -185,6 +225,17 @@ class ConfluenceConnector(LoadConnector, PollConnector):
             )
             return ""

+    def _fetch_labels(self, confluence_client: Confluence, page_id: str) -> list[str]:
+        try:
+            labels_response = confluence_client.get_page_labels(page_id)
+            return [label["name"] for label in labels_response["results"]]
+        except Exception:
+            if not self.continue_on_failure:
+                raise
+
+            logger.exception("Ran into exception when fetching labels from Confluence")
+            return []
+
     def _get_doc_batch(
         self, start_ind: int, time_filter: Callable[[datetime], bool] | None = None
     ) -> tuple[list[Document], int]:
@@ -200,6 +251,19 @@ class ConfluenceConnector(LoadConnector, PollConnector):
             last_modified = datetime.fromisoformat(last_modified_str)

             if time_filter is None or time_filter(last_modified):
+                page_id = page["id"]
+
+                # check for disallowed labels
+                if self.labels_to_skip:
+                    page_labels = self._fetch_labels(self.confluence_client, page_id)
+                    label_intersection = self.labels_to_skip.intersection(page_labels)
+                    if label_intersection:
+                        logger.info(
+                            f"Page with ID '{page_id}' has a label which has been "
+                            f"designated as disallowed: {label_intersection}. Skipping."
+ ) + continue + page_html = ( page["body"] .get("storage", page["body"].get("view", {})) @@ -212,7 +276,7 @@ class ConfluenceConnector(LoadConnector, PollConnector): page_text = ( page.get("title", "") + "\n" + parse_html_page_basic(page_html) ) - comments_text = self._fetch_comments(self.confluence_client, page["id"]) + comments_text = self._fetch_comments(self.confluence_client, page_id) page_text += comments_text doc_batch.append( diff --git a/deployment/docker_compose/docker-compose.dev.yml b/deployment/docker_compose/docker-compose.dev.yml index f9c24bf53..2f2e12422 100644 --- a/deployment/docker_compose/docker-compose.dev.yml +++ b/deployment/docker_compose/docker-compose.dev.yml @@ -74,8 +74,10 @@ services: - API_TYPE_OPENAI=${API_TYPE_OPENAI:-} - API_VERSION_OPENAI=${API_VERSION_OPENAI:-} - AZURE_DEPLOYMENT_ID=${AZURE_DEPLOYMENT_ID:-} - - CONTINUE_ON_CONNECTOR_FAILURE=${CONTINUE_ON_CONNECTOR_FAILURE:-} - NUM_INDEXING_WORKERS=${NUM_INDEXING_WORKERS:-} + # Connector Configs + - CONTINUE_ON_CONNECTOR_FAILURE=${CONTINUE_ON_CONNECTOR_FAILURE:-} + - CONFLUENCE_CONNECTOR_LABELS_TO_SKIP=${CONFLUENCE_CONNECTOR_LABELS_TO_SKIP:-} # Danswer SlackBot Configs - DANSWER_BOT_SLACK_APP_TOKEN=${DANSWER_BOT_SLACK_APP_TOKEN:-} - DANSWER_BOT_SLACK_BOT_TOKEN=${DANSWER_BOT_SLACK_BOT_TOKEN:-} diff --git a/web/src/app/admin/connectors/confluence/page.tsx b/web/src/app/admin/connectors/confluence/page.tsx index d309ada1f..2a38b23a9 100644 --- a/web/src/app/admin/connectors/confluence/page.tsx +++ b/web/src/app/admin/connectors/confluence/page.tsx @@ -20,17 +20,37 @@ import { ConnectorsTable } from "@/components/admin/connectors/table/ConnectorsT import { usePopup } from "@/components/admin/connectors/Popup"; import { usePublicCredentials } from "@/lib/hooks"; -// Copied from the `extract_confluence_keys_from_url` function -const extractSpaceFromUrl = (wikiUrl: string): string | null => { - if (!wikiUrl.includes(".atlassian.net/wiki/spaces/")) { - return null; - } - +const extractSpaceFromCloudUrl = (wikiUrl: string): string => { const parsedUrl = new URL(wikiUrl); const space = parsedUrl.pathname.split("/")[3]; return space; }; +const extractSpaceFromDataCenterUrl = (wikiUrl: string): string => { + const DISPLAY = "/display/"; + + const parsedUrl = new URL(wikiUrl); + const spaceSpecificSection = parsedUrl.pathname + .split(DISPLAY) + .slice(1) + .join(DISPLAY); + const space = spaceSpecificSection.split("/")[0]; + return space; +}; + +// Copied from the `extract_confluence_keys_from_url` function +const extractSpaceFromUrl = (wikiUrl: string): string | null => { + try { + if (wikiUrl.includes(".atlassian.net/wiki/spaces/")) { + return extractSpaceFromCloudUrl(wikiUrl); + } + return extractSpaceFromDataCenterUrl(wikiUrl); + } catch (e) { + console.log("Failed to extract space from url", e); + return null; + } +}; + const Main = () => { const { popup, setPopup } = usePopup();