More explicit Confluence Connector (#2289)

2025-09-19 20:24:32 +02:00 · 2024-09-01 20:35:29 -07:00
parent f871b4c6eb
commit c122be2f6a
6 changed files with 236 additions and 98 deletions
--- a/.github/workflows/pr-python-connector-tests.yml
+++ b/.github/workflows/pr-python-connector-tests.yml
@@ -10,6 +10,9 @@ on:
 env:
  # Confluence
  CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
+  CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
+  CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
+  CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
  CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
  CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}

--- a/backend/alembic/versions/a3795dce87be_migration_confluence_to_be_explicit.py
+++ b/backend/alembic/versions/a3795dce87be_migration_confluence_to_be_explicit.py
@@ -0,0 +1,158 @@
+"""migration confluence to be explicit
+
+Revision ID: a3795dce87be
+Revises: 1f60f60c3401
+Create Date: 2024-09-01 13:52:12.006740
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+from sqlalchemy.sql import table, column
+
+revision = "a3795dce87be"
+down_revision = "1f60f60c3401"
+branch_labels = None
+depends_on = None
+
+
+def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, str, bool]:
+    from urllib.parse import urlparse
+
+    def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str, str]:
+        parsed_url = urlparse(wiki_url)
+        wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.split('/spaces')[0]}"
+        path_parts = parsed_url.path.split("/")
+        space = path_parts[3]
+        page_id = path_parts[5] if len(path_parts) > 5 else ""
+        return wiki_base, space, page_id
+
+    def _extract_confluence_keys_from_datacenter_url(
+        wiki_url: str,
+    ) -> tuple[str, str, str]:
+        DISPLAY = "/display/"
+        PAGE = "/pages/"
+        parsed_url = urlparse(wiki_url)
+        wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.split(DISPLAY)[0]}"
+        space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
+        page_id = ""
+        if (content := parsed_url.path.split(PAGE)) and len(content) > 1:
+            page_id = content[1]
+        return wiki_base, space, page_id
+
+    is_confluence_cloud = (
+        ".atlassian.net/wiki/spaces/" in wiki_url
+        or ".jira.com/wiki/spaces/" in wiki_url
+    )
+
+    if is_confluence_cloud:
+        wiki_base, space, page_id = _extract_confluence_keys_from_cloud_url(wiki_url)
+    else:
+        wiki_base, space, page_id = _extract_confluence_keys_from_datacenter_url(
+            wiki_url
+        )
+
+    return wiki_base, space, page_id, is_confluence_cloud
+
+
+def reconstruct_confluence_url(
+    wiki_base: str, space: str, page_id: str, is_cloud: bool
+) -> str:
+    if is_cloud:
+        url = f"{wiki_base}/spaces/{space}"
+        if page_id:
+            url += f"/pages/{page_id}"
+    else:
+        url = f"{wiki_base}/display/{space}"
+        if page_id:
+            url += f"/pages/{page_id}"
+    return url
+
+
+def upgrade() -> None:
+    connector = table(
+        "connector",
+        column("id", sa.Integer),
+        column("source", sa.String()),
+        column("input_type", sa.String()),
+        column("connector_specific_config", postgresql.JSONB),
+    )
+
+    # Fetch all Confluence connectors
+    connection = op.get_bind()
+    confluence_connectors = connection.execute(
+        sa.select(connector).where(
+            sa.and_(
+                connector.c.source == "CONFLUENCE", connector.c.input_type == "POLL"
+            )
+        )
+    ).fetchall()
+
+    for row in confluence_connectors:
+        config = row.connector_specific_config
+        wiki_page_url = config["wiki_page_url"]
+        wiki_base, space, page_id, is_cloud = extract_confluence_keys_from_url(
+            wiki_page_url
+        )
+
+        new_config = {
+            "wiki_base": wiki_base,
+            "space": space,
+            "page_id": page_id,
+            "is_cloud": is_cloud,
+        }
+
+        for key, value in config.items():
+            if key not in ["wiki_page_url"]:
+                new_config[key] = value
+
+        op.execute(
+            connector.update()
+            .where(connector.c.id == row.id)
+            .values(connector_specific_config=new_config)
+        )
+
+
+def downgrade() -> None:
+    connector = table(
+        "connector",
+        column("id", sa.Integer),
+        column("source", sa.String()),
+        column("input_type", sa.String()),
+        column("connector_specific_config", postgresql.JSONB),
+    )
+
+    confluence_connectors = (
+        op.get_bind()
+        .execute(
+            sa.select(connector).where(
+                connector.c.source == "CONFLUENCE", connector.c.input_type == "POLL"
+            )
+        )
+        .fetchall()
+    )
+
+    for row in confluence_connectors:
+        config = row.connector_specific_config
+        if all(key in config for key in ["wiki_base", "space", "is_cloud"]):
+            wiki_page_url = reconstruct_confluence_url(
+                config["wiki_base"],
+                config["space"],
+                config.get("page_id", ""),
+                config["is_cloud"],
+            )
+
+            new_config = {"wiki_page_url": wiki_page_url}
+            new_config.update(
+                {
+                    k: v
+                    for k, v in config.items()
+                    if k not in ["wiki_base", "space", "page_id", "is_cloud"]
+                }
+            )
+
+            op.execute(
+                connector.update()
+                .where(connector.c.id == row.id)
+                .values(connector_specific_config=new_config)
+            )
--- a/backend/danswer/connectors/confluence/connector.py
+++ b/backend/danswer/connectors/confluence/connector.py
@@ -7,7 +7,6 @@ from datetime import timezone
 from functools import lru_cache
 from typing import Any
 from typing import cast
-from urllib.parse import urlparse

 import bs4
 from atlassian import Confluence  # type:ignore
@@ -53,79 +52,6 @@ NO_PARENT_OR_NO_PERMISSIONS_ERROR_STR = (
 )


-def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str, str]:
-    """Sample
-    URL w/ page: https://danswer.atlassian.net/wiki/spaces/1234abcd/pages/5678efgh/overview
-    URL w/o page: https://danswer.atlassian.net/wiki/spaces/ASAM/overview
-
-    wiki_base is https://danswer.atlassian.net/wiki
-    space is 1234abcd
-    page_id is 5678efgh
-    """
-    parsed_url = urlparse(wiki_url)
-    wiki_base = (
-        parsed_url.scheme
-        + "://"
-        + parsed_url.netloc
-        + parsed_url.path.split("/spaces")[0]
-    )
-
-    path_parts = parsed_url.path.split("/")
-    space = path_parts[3]
-
-    page_id = path_parts[5] if len(path_parts) > 5 else ""
-    return wiki_base, space, page_id
-
-
-def _extract_confluence_keys_from_datacenter_url(wiki_url: str) -> tuple[str, str, str]:
-    """Sample
-    URL w/ page https://danswer.ai/confluence/display/1234abcd/pages/5678efgh/overview
-    URL w/o page https://danswer.ai/confluence/display/1234abcd/overview
-    wiki_base is https://danswer.ai/confluence
-    space is 1234abcd
-    page_id is 5678efgh
-    """
-    # /display/ is always right before the space and at the end of the base print()
-    DISPLAY = "/display/"
-    PAGE = "/pages/"
-
-    parsed_url = urlparse(wiki_url)
-    wiki_base = (
-        parsed_url.scheme
-        + "://"
-        + parsed_url.netloc
-        + parsed_url.path.split(DISPLAY)[0]
-    )
-    space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
-    page_id = ""
-    if (content := parsed_url.path.split(PAGE)) and len(content) > 1:
-        page_id = content[1]
-    return wiki_base, space, page_id
-
-
-def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, str, bool]:
-    is_confluence_cloud = (
-        ".atlassian.net/wiki/spaces/" in wiki_url
-        or ".jira.com/wiki/spaces/" in wiki_url
-    )
-
-    try:
-        if is_confluence_cloud:
-            wiki_base, space, page_id = _extract_confluence_keys_from_cloud_url(
-                wiki_url
-            )
-        else:
-            wiki_base, space, page_id = _extract_confluence_keys_from_datacenter_url(
-                wiki_url
-            )
-    except Exception as e:
-        error_msg = f"Not a valid Confluence Wiki Link, unable to extract wiki base, space, and page id. Exception: {e}"
-        logger.error(error_msg)
-        raise ValueError(error_msg)
-
-    return wiki_base, space, page_id, is_confluence_cloud
-
-
@lru_cache()
 def _get_user(user_id: str, confluence_client: Confluence) -> str:
    """Get Confluence Display Name based on the account-id or userkey value
@@ -372,7 +298,10 @@ class RecursiveIndexer:
 class ConfluenceConnector(LoadConnector, PollConnector):
    def __init__(
        self,
-        wiki_page_url: str,
+        wiki_base: str,
+        space: str,
+        is_cloud: bool,
+        page_id: str = "",
        index_recursively: bool = True,
        batch_size: int = INDEX_BATCH_SIZE,
        continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
@@ -386,15 +315,15 @@ class ConfluenceConnector(LoadConnector, PollConnector):
        self.labels_to_skip = set(labels_to_skip)
        self.recursive_indexer: RecursiveIndexer | None = None
        self.index_recursively = index_recursively
-        (
-            self.wiki_base,
-            self.space,
-            self.page_id,
-            self.is_cloud,
-        ) = extract_confluence_keys_from_url(wiki_page_url)
+
+        # Remove trailing slash from wiki_base if present
+        self.wiki_base = wiki_base.rstrip("/")
+        self.space = space
+        self.page_id = page_id
+
+        self.is_cloud = is_cloud

        self.space_level_scan = False
-
        self.confluence_client: Confluence | None = None

        if self.page_id is None or self.page_id == "":
@@ -414,7 +343,6 @@ class ConfluenceConnector(LoadConnector, PollConnector):
            username=username if self.is_cloud else None,
            password=access_token if self.is_cloud else None,
            token=access_token if not self.is_cloud else None,
-            cloud=self.is_cloud,
        )
        return None

@@ -866,7 +794,13 @@ class ConfluenceConnector(LoadConnector, PollConnector):


 if __name__ == "__main__":
-    connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
+    connector = ConfluenceConnector(
+        wiki_base=os.environ["CONFLUENCE_TEST_SPACE_URL"],
+        space=os.environ["CONFLUENCE_TEST_SPACE"],
+        is_cloud=os.environ.get("CONFLUENCE_IS_CLOUD", "true").lower() == "true",
+        page_id=os.environ.get("CONFLUENCE_TEST_PAGE_ID", ""),
+        index_recursively=True,
+    )
    connector.load_credentials(
        {
            "confluence_username": os.environ["CONFLUENCE_USER_NAME"],
--- a/backend/tests/daily/connectors/confluence/test_confluence_basic.py
+++ b/backend/tests/daily/connectors/confluence/test_confluence_basic.py
@@ -8,7 +8,13 @@ from danswer.connectors.confluence.connector import ConfluenceConnector

@pytest.fixture
 def confluence_connector() -> ConfluenceConnector:
-    connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
+    connector = ConfluenceConnector(
+        wiki_base=os.environ["CONFLUENCE_TEST_SPACE_URL"],
+        space=os.environ["CONFLUENCE_TEST_SPACE"],
+        is_cloud=os.environ.get("CONFLUENCE_IS_CLOUD", "true").lower() == "true",
+        page_id=os.environ.get("CONFLUENCE_TEST_PAGE_ID", ""),
+    )
+
    connector.load_credentials(
        {
            "confluence_username": os.environ["CONFLUENCE_USER_NAME"],
--- a/web/src/components/admin/connectors/ConnectorTitle.tsx
+++ b/web/src/components/admin/connectors/ConnectorTitle.tsx
@@ -48,10 +48,16 @@ export const ConnectorTitle = ({
    );
  } else if (connector.source === "confluence") {
    const typedConnector = connector as Connector<ConfluenceConfig>;
-    additionalMetadata.set(
-      "Wiki URL",
-      typedConnector.connector_specific_config.wiki_page_url
-    );
+    const wikiUrl = typedConnector.connector_specific_config.is_cloud
+      ? `${typedConnector.connector_specific_config.wiki_base}/wiki/spaces/${typedConnector.connector_specific_config.space}`
+      : `${typedConnector.connector_specific_config.wiki_base}/spaces/${typedConnector.connector_specific_config.space}`;
+    additionalMetadata.set("Wiki URL", wikiUrl);
+    if (typedConnector.connector_specific_config.page_id) {
+      additionalMetadata.set(
+        "Page ID",
+        typedConnector.connector_specific_config.page_id
+      );
+    }
  } else if (connector.source === "jira") {
    const typedConnector = connector as Connector<JiraConfig>;
    additionalMetadata.set(
--- a/web/src/lib/connectors/connectors.ts
+++ b/web/src/lib/connectors/connectors.ts
@@ -219,19 +219,37 @@ export const connectorConfigs: Record<ValidSources, ConnectionConfiguration> = {
  },
  confluence: {
    description: "Configure Confluence connector",
-    subtext: `Specify any link to a Confluence page below and click "Index" to Index. If the provided link is for an entire space, we will index the entire space. However, if you want to index a specific page, you can do so by entering the page's URL. 
-    
-For example, entering https://danswer.atlassian.net/wiki/spaces/Engineering/overview and clicking the Index button will index the whole Engineering Confluence space, but entering https://danswer.atlassian.net/wiki/spaces/Engineering/pages/164331/example+page will index that page (and optionally the page's children). 
+    subtext: `Specify the base URL of your Confluence instance, the space name, and optionally a specific page ID to index. If no page ID is provided, the entire space will be indexed.

-Selecting the "Index Recursively" checkbox will index the single page's children in addition to itself.`,
+For example, entering "https://pablosfsanchez.atlassian.net/wiki" as the Wiki Base URL, "KB" as the Space, and "164331" as the Page ID will index the specific page at https://pablosfsanchez.atlassian.net/wiki/spaces/KB/pages/164331/Page. If you leave the Page ID empty, it will index the entire KB space.
+
+Selecting the "Index Recursively" checkbox will index the specified page and all of its children.`,
    values: [
      {
        type: "text",
-        query: "Enter the wiki page URL:",
-        label: "Wiki Page URL",
-        name: "wiki_page_url",
+        query: "Enter the wiki base URL:",
+        label: "Wiki Base URL",
+        name: "wiki_base",
        optional: false,
-        description: "Enter any link to a Confluence space or Page",
+        description:
+          "The base URL of your Confluence instance (e.g., https://your-domain.atlassian.net/wiki)",
+      },
+      {
+        type: "text",
+        query: "Enter the space:",
+        label: "Space",
+        name: "space",
+        optional: false,
+        description: "The Confluence space name to index (e.g. `KB`)",
+      },
+      {
+        type: "text",
+        query: "Enter the page ID (optional):",
+        label: "Page ID",
+        name: "page_id",
+        optional: true,
+        description:
+          "Specific page ID to index  - leave empty to index the entire space (e.g. `131368`)",
      },
      {
        type: "checkbox",
@@ -241,6 +259,16 @@ Selecting the "Index Recursively" checkbox will index the single page's children
        name: "index_recursively",
        optional: false,
      },
+      {
+        type: "checkbox",
+        query: "Is this a Confluence Cloud instance?",
+        label: "Is Cloud",
+        name: "is_cloud",
+        optional: false,
+        default: true,
+        description:
+          "Check if this is a Confluence Cloud instance, uncheck for Confluence Server/Data Center",
+      },
    ],
  },
  jira: {
@@ -817,7 +845,10 @@ export interface GmailConfig {}
 export interface BookstackConfig {}

 export interface ConfluenceConfig {
-  wiki_page_url: string;
+  wiki_base: string;
+  space: string;
+  page_id?: string;
+  is_cloud?: boolean;
  index_recursively?: boolean;
 }