mirror of https://github.com/danswer-ai/danswer.git
synced 2025-10-10 21:26:01 +02:00
Support Confluence data center + allow for specifying labels to ignore (#624)
@@ -109,9 +109,11 @@ POSTGRES_DB = os.environ.get("POSTGRES_DB") or "postgres"
 #####
 GOOGLE_DRIVE_INCLUDE_SHARED = False
 GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False
 
 FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
     "FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
 )
 
 # TODO these should be available for frontend configuration, via advanced options expandable
 WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
     "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer"
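All of these settings follow the same environment-variable-with-default pattern. A quick illustrative sketch of the two behaviors in play (names from the hunk above, values hypothetical):

import os

# unset variable -> the default argument is returned
path = os.environ.get("FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage")

# a set-but-empty variable returns "", not the default; that is why the
# POSTGRES_DB line in the hunk header uses `os.environ.get(...) or "postgres"`
classes = os.environ.get("WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer").split(",")
print(classes)  # ['sidebar', 'footer'] when the variable is unset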
@@ -128,6 +130,14 @@ NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP = (
     == "true"
 )
 
+CONFLUENCE_CONNECTOR_LABELS_TO_SKIP = [
+    ignored_tag
+    for ignored_tag in os.environ.get("CONFLUENCE_CONNECTOR_LABELS_TO_SKIP", "").split(
+        ","
+    )
+    if ignored_tag
+]
+
 #####
 # Query Configs
 #####
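The comprehension above splits the raw env string on commas and drops falsy entries, so an unset or empty variable yields an empty list rather than ['']. A minimal standalone sketch:

def parse_labels(raw: str) -> list[str]:
    # mirrors the comprehension in the hunk above
    return [tag for tag in raw.split(",") if tag]

print(parse_labels(""))                 # []
print(parse_labels("secret,internal"))  # ['secret', 'internal']
print(parse_labels("secret,"))          # ['secret'] -- empty trailing entry dropped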
@@ -9,6 +9,7 @@ from urllib.parse import urlparse
 from atlassian import Confluence  # type:ignore
 from requests import HTTPError
 
+from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
 from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
@@ -30,17 +31,12 @@ logger = setup_logger()
 # 3. Segment into Sections for more accurate linking, can split by headers but make sure no text/ordering is lost
 
 
-def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str]:
+def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str]:
     """Sample
     https://danswer.atlassian.net/wiki/spaces/1234abcd/overview
-    wiki_base is danswer.atlassian.net/wiki
+    wiki_base is https://danswer.atlassian.net/wiki
     space is 1234abcd
     """
-    if ".atlassian.net/wiki/spaces/" not in wiki_url:
-        raise ValueError(
-            "Not a valid Confluence Wiki Link, unable to extract wiki base and space names"
-        )
-
     parsed_url = urlparse(wiki_url)
     wiki_base = (
         parsed_url.scheme
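The hunk cuts off mid-expression, so the tail of the cloud extraction is not visible. Based on the docstring above and the data center variant below, it can be sketched roughly as follows (the `/spaces` split and the `[3]` index are reconstructions, not verbatim source):

from urllib.parse import urlparse

def extract_cloud_keys(wiki_url: str) -> tuple[str, str]:
    # e.g. https://danswer.atlassian.net/wiki/spaces/1234abcd/overview
    parsed_url = urlparse(wiki_url)
    wiki_base = (
        parsed_url.scheme
        + "://"
        + parsed_url.netloc
        + parsed_url.path.split("/spaces")[0]  # assumed: base ends before /spaces
    )
    space = parsed_url.path.split("/")[3]  # ['', 'wiki', 'spaces', '<space>', ...]
    return wiki_base, space

print(extract_cloud_keys("https://danswer.atlassian.net/wiki/spaces/1234abcd/overview"))
# ('https://danswer.atlassian.net/wiki', '1234abcd')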
@@ -52,6 +48,42 @@ def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str]:
     return wiki_base, space
 
 
+def _extract_confluence_keys_from_datacenter_url(wiki_url: str) -> tuple[str, str]:
+    """Sample
+    https://danswer.ai/confluence/display/1234abcd/overview
+    wiki_base is https://danswer.ai/confluence
+    space is 1234abcd
+    """
+    # /display/ is always right before the space and at the end of the base url
+    DISPLAY = "/display/"
+
+    parsed_url = urlparse(wiki_url)
+    wiki_base = (
+        parsed_url.scheme
+        + "://"
+        + parsed_url.netloc
+        + parsed_url.path.split(DISPLAY)[0]
+    )
+    space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
+    return wiki_base, space
+
+
+def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, bool]:
+    is_confluence_cloud = ".atlassian.net/wiki/spaces/" in wiki_url
+
+    try:
+        if is_confluence_cloud:
+            wiki_base, space = _extract_confluence_keys_from_cloud_url(wiki_url)
+        else:
+            wiki_base, space = _extract_confluence_keys_from_datacenter_url(wiki_url)
+    except Exception as e:
+        error_msg = f"Not a valid Confluence Wiki Link, unable to extract wiki base and space names. Exception: {e}"
+        logger.error(error_msg)
+        raise ValueError(error_msg)
+
+    return wiki_base, space, is_confluence_cloud
+
+
 def _comment_dfs(
     comments_str: str,
     comment_pages: Collection[dict[str, Any]],
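Tracing `_extract_confluence_keys_from_datacenter_url` against its own docstring sample, as a standalone rerun of the logic shown above:

from urllib.parse import urlparse

DISPLAY = "/display/"
url = "https://danswer.ai/confluence/display/1234abcd/overview"

parsed = urlparse(url)
wiki_base = parsed.scheme + "://" + parsed.netloc + parsed.path.split(DISPLAY)[0]
space = DISPLAY.join(parsed.path.split(DISPLAY)[1:]).split("/")[0]

print(wiki_base)  # https://danswer.ai/confluence
print(space)      # 1234abcd

The dispatcher then keys cloud vs. data center off the ".atlassian.net/wiki/spaces/" substring and returns the extra `is_confluence_cloud` boolean alongside the base and space.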
@@ -79,10 +111,17 @@ class ConfluenceConnector(LoadConnector, PollConnector):
         wiki_page_url: str,
         batch_size: int = INDEX_BATCH_SIZE,
         continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
+        # if a page has one of the labels specified in this list, we will just
+        # skip it. This is generally used to avoid indexing extra sensitive
+        # pages.
+        labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP,
     ) -> None:
         self.batch_size = batch_size
         self.continue_on_failure = continue_on_failure
-        self.wiki_base, self.space = extract_confluence_keys_from_url(wiki_page_url)
+        self.labels_to_skip = set(labels_to_skip)
+        self.wiki_base, self.space, self.is_cloud = extract_confluence_keys_from_url(
+            wiki_page_url
+        )
         self.confluence_client: Confluence | None = None
 
     def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
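Putting the new parameter to use, a sketch of direct instantiation (URL and labels are illustrative, not from the source):

# skip any page tagged "secret" or "do-not-index"
connector = ConfluenceConnector(
    wiki_page_url="https://danswer.atlassian.net/wiki/spaces/ENG/overview",
    labels_to_skip=["secret", "do-not-index"],
)
# this URL matches the cloud pattern, so connector.is_cloud is True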
@@ -90,9 +129,10 @@ class ConfluenceConnector(LoadConnector, PollConnector):
         access_token = credentials["confluence_access_token"]
         self.confluence_client = Confluence(
             url=self.wiki_base,
-            username=username,
+            # passing in username causes issues for Confluence data center
+            username=username if self.is_cloud else None,
             password=access_token,
-            cloud=True,
+            cloud=self.is_cloud,
         )
         return None
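For a data center deployment the same credentials dict now yields a client built with `cloud=False` and no username. A sketch of the call (the `confluence_username` key is an assumption; only the access-token lookup is visible in this hunk):

connector.load_credentials(
    {
        "confluence_username": "user@example.com",  # assumed key; dropped for data center
        "confluence_access_token": "<personal-access-token>",
    }
)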
@@ -185,6 +225,17 @@ class ConfluenceConnector(LoadConnector, PollConnector):
             )
             return ""
 
+    def _fetch_labels(self, confluence_client: Confluence, page_id: str) -> list[str]:
+        try:
+            labels_response = confluence_client.get_page_labels(page_id)
+            return [label["name"] for label in labels_response["results"]]
+        except Exception as e:
+            if not self.continue_on_failure:
+                raise e
+
+            logger.exception("Ran into exception when fetching labels from Confluence")
+            return []
+
     def _get_doc_batch(
         self, start_ind: int, time_filter: Callable[[datetime], bool] | None = None
     ) -> tuple[list[Document], int]:
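`get_page_labels` comes from the `atlassian-python-api` client; judging from the list comprehension, the response is expected to look roughly like this (shape inferred from the code above, not from library docs):

labels_response = {
    "results": [
        {"name": "secret"},
        {"name": "architecture"},
    ]
}
label_names = [label["name"] for label in labels_response["results"]]
print(label_names)  # ['secret', 'architecture']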
@@ -200,6 +251,19 @@ class ConfluenceConnector(LoadConnector, PollConnector):
             last_modified = datetime.fromisoformat(last_modified_str)
 
             if time_filter is None or time_filter(last_modified):
+                page_id = page["id"]
+
+                # check disallowed labels
+                if self.labels_to_skip:
+                    page_labels = self._fetch_labels(self.confluence_client, page_id)
+                    label_intersection = self.labels_to_skip.intersection(page_labels)
+                    if label_intersection:
+                        logger.info(
+                            f"Page with ID '{page_id}' has a label which has been "
+                            f"designated as disallowed: {label_intersection}. Skipping."
+                        )
+                        continue
+
                 page_html = (
                     page["body"]
                     .get("storage", page["body"].get("view", {}))
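The skip decision is a plain set intersection, which is why `labels_to_skip` was stored as a set in `__init__`. For example (illustrative values):

labels_to_skip = {"secret", "internal"}
page_labels = ["public", "secret"]  # set.intersection accepts any iterable

label_intersection = labels_to_skip.intersection(page_labels)
print(label_intersection)        # {'secret'}
print(bool(label_intersection))  # True -> the page is skipped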
@@ -212,7 +276,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
                 page_text = (
                     page.get("title", "") + "\n" + parse_html_page_basic(page_html)
                 )
-                comments_text = self._fetch_comments(self.confluence_client, page["id"])
+                comments_text = self._fetch_comments(self.confluence_client, page_id)
                 page_text += comments_text
 
                 doc_batch.append(
@@ -74,8 +74,10 @@ services:
       - API_TYPE_OPENAI=${API_TYPE_OPENAI:-}
       - API_VERSION_OPENAI=${API_VERSION_OPENAI:-}
       - AZURE_DEPLOYMENT_ID=${AZURE_DEPLOYMENT_ID:-}
-      - CONTINUE_ON_CONNECTOR_FAILURE=${CONTINUE_ON_CONNECTOR_FAILURE:-}
       - NUM_INDEXING_WORKERS=${NUM_INDEXING_WORKERS:-}
+      # Connector Configs
+      - CONTINUE_ON_CONNECTOR_FAILURE=${CONTINUE_ON_CONNECTOR_FAILURE:-}
+      - CONFLUENCE_CONNECTOR_LABELS_TO_SKIP=${CONFLUENCE_CONNECTOR_LABELS_TO_SKIP:-}
       # Danswer SlackBot Configs
       - DANSWER_BOT_SLACK_APP_TOKEN=${DANSWER_BOT_SLACK_APP_TOKEN:-}
       - DANSWER_BOT_SLACK_BOT_TOKEN=${DANSWER_BOT_SLACK_BOT_TOKEN:-}
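With the compose wiring above, the value can be supplied through the environment, for instance via a .env file read by docker compose (file name and values are illustrative):

# .env -- hypothetical values
CONTINUE_ON_CONNECTOR_FAILURE=true
CONFLUENCE_CONNECTOR_LABELS_TO_SKIP=secret,internal

The `${VAR:-}` syntax means an unset variable passes through as an empty string, which the comprehension in app_configs.py then turns into an empty skip list.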
@@ -20,17 +20,37 @@ import { ConnectorsTable } from "@/components/admin/connectors/table/ConnectorsT
 import { usePopup } from "@/components/admin/connectors/Popup";
 import { usePublicCredentials } from "@/lib/hooks";
 
-// Copied from the `extract_confluence_keys_from_url` function
-const extractSpaceFromUrl = (wikiUrl: string): string | null => {
-  if (!wikiUrl.includes(".atlassian.net/wiki/spaces/")) {
-    return null;
-  }
-
+const extractSpaceFromCloudUrl = (wikiUrl: string): string => {
   const parsedUrl = new URL(wikiUrl);
   const space = parsedUrl.pathname.split("/")[3];
   return space;
 };
 
+const extractSpaceFromDataCenterUrl = (wikiUrl: string): string => {
+  const DISPLAY = "/display/";
+
+  const parsedUrl = new URL(wikiUrl);
+  const spaceSpecificSection = parsedUrl.pathname
+    .split(DISPLAY)
+    .slice(1)
+    .join(DISPLAY);
+  const space = spaceSpecificSection.split("/")[0];
+  return space;
+};
+
+// Copied from the `extract_confluence_keys_from_url` function
+const extractSpaceFromUrl = (wikiUrl: string): string | null => {
+  try {
+    if (wikiUrl.includes(".atlassian.net/wiki/spaces/")) {
+      return extractSpaceFromCloudUrl(wikiUrl);
+    }
+    return extractSpaceFromDataCenterUrl(wikiUrl);
+  } catch (e) {
+    console.log("Failed to extract space from url", e);
+    return null;
+  }
+};
+
 const Main = () => {
   const { popup, setPopup } = usePopup();
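The frontend mirrors the backend dispatch: any URL that does not match the cloud pattern falls through to the data center path. For example, `extractSpaceFromUrl("https://danswer.atlassian.net/wiki/spaces/1234abcd/overview")` yields `"1234abcd"` via the cloud branch, and `extractSpaceFromUrl("https://danswer.ai/confluence/display/1234abcd/overview")` yields the same `"1234abcd"` via the `/display/` branch, while an unparseable string throws inside `new URL` and falls back to `null`.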