More explicit Confluence Connector (#2289)

This commit is contained in:
pablodanswer 2024-09-01 20:35:29 -07:00 committed by GitHub
parent f871b4c6eb
commit c122be2f6a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 236 additions and 98 deletions

View File

@ -10,6 +10,9 @@ on:
env:
# Confluence
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}

View File

@ -0,0 +1,158 @@
"""migration confluence to be explicit
Revision ID: a3795dce87be
Revises: 1f60f60c3401
Create Date: 2024-09-01 13:52:12.006740
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
from sqlalchemy.sql import table, column
revision = "a3795dce87be"
down_revision = "1f60f60c3401"
branch_labels = None
depends_on = None
def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, str, bool]:
from urllib.parse import urlparse
def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str, str]:
parsed_url = urlparse(wiki_url)
wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.split('/spaces')[0]}"
path_parts = parsed_url.path.split("/")
space = path_parts[3]
page_id = path_parts[5] if len(path_parts) > 5 else ""
return wiki_base, space, page_id
def _extract_confluence_keys_from_datacenter_url(
wiki_url: str,
) -> tuple[str, str, str]:
DISPLAY = "/display/"
PAGE = "/pages/"
parsed_url = urlparse(wiki_url)
wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.split(DISPLAY)[0]}"
space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
page_id = ""
if (content := parsed_url.path.split(PAGE)) and len(content) > 1:
page_id = content[1]
return wiki_base, space, page_id
is_confluence_cloud = (
".atlassian.net/wiki/spaces/" in wiki_url
or ".jira.com/wiki/spaces/" in wiki_url
)
if is_confluence_cloud:
wiki_base, space, page_id = _extract_confluence_keys_from_cloud_url(wiki_url)
else:
wiki_base, space, page_id = _extract_confluence_keys_from_datacenter_url(
wiki_url
)
return wiki_base, space, page_id, is_confluence_cloud
def reconstruct_confluence_url(
wiki_base: str, space: str, page_id: str, is_cloud: bool
) -> str:
if is_cloud:
url = f"{wiki_base}/spaces/{space}"
if page_id:
url += f"/pages/{page_id}"
else:
url = f"{wiki_base}/display/{space}"
if page_id:
url += f"/pages/{page_id}"
return url
def upgrade() -> None:
connector = table(
"connector",
column("id", sa.Integer),
column("source", sa.String()),
column("input_type", sa.String()),
column("connector_specific_config", postgresql.JSONB),
)
# Fetch all Confluence connectors
connection = op.get_bind()
confluence_connectors = connection.execute(
sa.select(connector).where(
sa.and_(
connector.c.source == "CONFLUENCE", connector.c.input_type == "POLL"
)
)
).fetchall()
for row in confluence_connectors:
config = row.connector_specific_config
wiki_page_url = config["wiki_page_url"]
wiki_base, space, page_id, is_cloud = extract_confluence_keys_from_url(
wiki_page_url
)
new_config = {
"wiki_base": wiki_base,
"space": space,
"page_id": page_id,
"is_cloud": is_cloud,
}
for key, value in config.items():
if key not in ["wiki_page_url"]:
new_config[key] = value
op.execute(
connector.update()
.where(connector.c.id == row.id)
.values(connector_specific_config=new_config)
)
def downgrade() -> None:
connector = table(
"connector",
column("id", sa.Integer),
column("source", sa.String()),
column("input_type", sa.String()),
column("connector_specific_config", postgresql.JSONB),
)
confluence_connectors = (
op.get_bind()
.execute(
sa.select(connector).where(
connector.c.source == "CONFLUENCE", connector.c.input_type == "POLL"
)
)
.fetchall()
)
for row in confluence_connectors:
config = row.connector_specific_config
if all(key in config for key in ["wiki_base", "space", "is_cloud"]):
wiki_page_url = reconstruct_confluence_url(
config["wiki_base"],
config["space"],
config.get("page_id", ""),
config["is_cloud"],
)
new_config = {"wiki_page_url": wiki_page_url}
new_config.update(
{
k: v
for k, v in config.items()
if k not in ["wiki_base", "space", "page_id", "is_cloud"]
}
)
op.execute(
connector.update()
.where(connector.c.id == row.id)
.values(connector_specific_config=new_config)
)

View File

@ -7,7 +7,6 @@ from datetime import timezone
from functools import lru_cache
from typing import Any
from typing import cast
from urllib.parse import urlparse
import bs4
from atlassian import Confluence # type:ignore
@ -53,79 +52,6 @@ NO_PARENT_OR_NO_PERMISSIONS_ERROR_STR = (
)
def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str, str]:
"""Sample
URL w/ page: https://danswer.atlassian.net/wiki/spaces/1234abcd/pages/5678efgh/overview
URL w/o page: https://danswer.atlassian.net/wiki/spaces/ASAM/overview
wiki_base is https://danswer.atlassian.net/wiki
space is 1234abcd
page_id is 5678efgh
"""
parsed_url = urlparse(wiki_url)
wiki_base = (
parsed_url.scheme
+ "://"
+ parsed_url.netloc
+ parsed_url.path.split("/spaces")[0]
)
path_parts = parsed_url.path.split("/")
space = path_parts[3]
page_id = path_parts[5] if len(path_parts) > 5 else ""
return wiki_base, space, page_id
def _extract_confluence_keys_from_datacenter_url(wiki_url: str) -> tuple[str, str, str]:
"""Sample
URL w/ page https://danswer.ai/confluence/display/1234abcd/pages/5678efgh/overview
URL w/o page https://danswer.ai/confluence/display/1234abcd/overview
wiki_base is https://danswer.ai/confluence
space is 1234abcd
page_id is 5678efgh
"""
# /display/ is always right before the space and at the end of the base print()
DISPLAY = "/display/"
PAGE = "/pages/"
parsed_url = urlparse(wiki_url)
wiki_base = (
parsed_url.scheme
+ "://"
+ parsed_url.netloc
+ parsed_url.path.split(DISPLAY)[0]
)
space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
page_id = ""
if (content := parsed_url.path.split(PAGE)) and len(content) > 1:
page_id = content[1]
return wiki_base, space, page_id
def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, str, bool]:
is_confluence_cloud = (
".atlassian.net/wiki/spaces/" in wiki_url
or ".jira.com/wiki/spaces/" in wiki_url
)
try:
if is_confluence_cloud:
wiki_base, space, page_id = _extract_confluence_keys_from_cloud_url(
wiki_url
)
else:
wiki_base, space, page_id = _extract_confluence_keys_from_datacenter_url(
wiki_url
)
except Exception as e:
error_msg = f"Not a valid Confluence Wiki Link, unable to extract wiki base, space, and page id. Exception: {e}"
logger.error(error_msg)
raise ValueError(error_msg)
return wiki_base, space, page_id, is_confluence_cloud
@lru_cache()
def _get_user(user_id: str, confluence_client: Confluence) -> str:
"""Get Confluence Display Name based on the account-id or userkey value
@ -372,7 +298,10 @@ class RecursiveIndexer:
class ConfluenceConnector(LoadConnector, PollConnector):
def __init__(
self,
wiki_page_url: str,
wiki_base: str,
space: str,
is_cloud: bool,
page_id: str = "",
index_recursively: bool = True,
batch_size: int = INDEX_BATCH_SIZE,
continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
@ -386,15 +315,15 @@ class ConfluenceConnector(LoadConnector, PollConnector):
self.labels_to_skip = set(labels_to_skip)
self.recursive_indexer: RecursiveIndexer | None = None
self.index_recursively = index_recursively
(
self.wiki_base,
self.space,
self.page_id,
self.is_cloud,
) = extract_confluence_keys_from_url(wiki_page_url)
# Remove trailing slash from wiki_base if present
self.wiki_base = wiki_base.rstrip("/")
self.space = space
self.page_id = page_id
self.is_cloud = is_cloud
self.space_level_scan = False
self.confluence_client: Confluence | None = None
if self.page_id is None or self.page_id == "":
@ -414,7 +343,6 @@ class ConfluenceConnector(LoadConnector, PollConnector):
username=username if self.is_cloud else None,
password=access_token if self.is_cloud else None,
token=access_token if not self.is_cloud else None,
cloud=self.is_cloud,
)
return None
@ -866,7 +794,13 @@ class ConfluenceConnector(LoadConnector, PollConnector):
if __name__ == "__main__":
connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
connector = ConfluenceConnector(
wiki_base=os.environ["CONFLUENCE_TEST_SPACE_URL"],
space=os.environ["CONFLUENCE_TEST_SPACE"],
is_cloud=os.environ.get("CONFLUENCE_IS_CLOUD", "true").lower() == "true",
page_id=os.environ.get("CONFLUENCE_TEST_PAGE_ID", ""),
index_recursively=True,
)
connector.load_credentials(
{
"confluence_username": os.environ["CONFLUENCE_USER_NAME"],

View File

@ -8,7 +8,13 @@ from danswer.connectors.confluence.connector import ConfluenceConnector
@pytest.fixture
def confluence_connector() -> ConfluenceConnector:
connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
connector = ConfluenceConnector(
wiki_base=os.environ["CONFLUENCE_TEST_SPACE_URL"],
space=os.environ["CONFLUENCE_TEST_SPACE"],
is_cloud=os.environ.get("CONFLUENCE_IS_CLOUD", "true").lower() == "true",
page_id=os.environ.get("CONFLUENCE_TEST_PAGE_ID", ""),
)
connector.load_credentials(
{
"confluence_username": os.environ["CONFLUENCE_USER_NAME"],

View File

@ -48,10 +48,16 @@ export const ConnectorTitle = ({
);
} else if (connector.source === "confluence") {
const typedConnector = connector as Connector<ConfluenceConfig>;
additionalMetadata.set(
"Wiki URL",
typedConnector.connector_specific_config.wiki_page_url
);
const wikiUrl = typedConnector.connector_specific_config.is_cloud
? `${typedConnector.connector_specific_config.wiki_base}/wiki/spaces/${typedConnector.connector_specific_config.space}`
: `${typedConnector.connector_specific_config.wiki_base}/spaces/${typedConnector.connector_specific_config.space}`;
additionalMetadata.set("Wiki URL", wikiUrl);
if (typedConnector.connector_specific_config.page_id) {
additionalMetadata.set(
"Page ID",
typedConnector.connector_specific_config.page_id
);
}
} else if (connector.source === "jira") {
const typedConnector = connector as Connector<JiraConfig>;
additionalMetadata.set(

View File

@ -219,19 +219,37 @@ export const connectorConfigs: Record<ValidSources, ConnectionConfiguration> = {
},
confluence: {
description: "Configure Confluence connector",
subtext: `Specify any link to a Confluence page below and click "Index" to Index. If the provided link is for an entire space, we will index the entire space. However, if you want to index a specific page, you can do so by entering the page's URL.
For example, entering https://danswer.atlassian.net/wiki/spaces/Engineering/overview and clicking the Index button will index the whole Engineering Confluence space, but entering https://danswer.atlassian.net/wiki/spaces/Engineering/pages/164331/example+page will index that page (and optionally the page's children).
subtext: `Specify the base URL of your Confluence instance, the space name, and optionally a specific page ID to index. If no page ID is provided, the entire space will be indexed.
Selecting the "Index Recursively" checkbox will index the single page's children in addition to itself.`,
For example, entering "https://pablosfsanchez.atlassian.net/wiki" as the Wiki Base URL, "KB" as the Space, and "164331" as the Page ID will index the specific page at https://pablosfsanchez.atlassian.net/wiki/spaces/KB/pages/164331/Page. If you leave the Page ID empty, it will index the entire KB space.
Selecting the "Index Recursively" checkbox will index the specified page and all of its children.`,
values: [
{
type: "text",
query: "Enter the wiki page URL:",
label: "Wiki Page URL",
name: "wiki_page_url",
query: "Enter the wiki base URL:",
label: "Wiki Base URL",
name: "wiki_base",
optional: false,
description: "Enter any link to a Confluence space or Page",
description:
"The base URL of your Confluence instance (e.g., https://your-domain.atlassian.net/wiki)",
},
{
type: "text",
query: "Enter the space:",
label: "Space",
name: "space",
optional: false,
description: "The Confluence space name to index (e.g. `KB`)",
},
{
type: "text",
query: "Enter the page ID (optional):",
label: "Page ID",
name: "page_id",
optional: true,
description:
"Specific page ID to index - leave empty to index the entire space (e.g. `131368`)",
},
{
type: "checkbox",
@ -241,6 +259,16 @@ Selecting the "Index Recursively" checkbox will index the single page's children
name: "index_recursively",
optional: false,
},
{
type: "checkbox",
query: "Is this a Confluence Cloud instance?",
label: "Is Cloud",
name: "is_cloud",
optional: false,
default: true,
description:
"Check if this is a Confluence Cloud instance, uncheck for Confluence Server/Data Center",
},
],
},
jira: {
@ -817,7 +845,10 @@ export interface GmailConfig {}
export interface BookstackConfig {}
export interface ConfluenceConfig {
wiki_page_url: string;
wiki_base: string;
space: string;
page_id?: string;
is_cloud?: boolean;
index_recursively?: boolean;
}