mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-04-08 03:48:14 +02:00
More explicit Confluence Connector (#2289)
This commit is contained in:
parent
f871b4c6eb
commit
c122be2f6a
@ -10,6 +10,9 @@ on:
|
||||
env:
|
||||
# Confluence
|
||||
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
|
||||
CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
|
||||
CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
|
||||
CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
|
||||
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
|
||||
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
|
||||
|
||||
|
@ -0,0 +1,158 @@
|
||||
"""migration confluence to be explicit
|
||||
|
||||
Revision ID: a3795dce87be
|
||||
Revises: 1f60f60c3401
|
||||
Create Date: 2024-09-01 13:52:12.006740
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
from sqlalchemy.sql import table, column
|
||||
|
||||
revision = "a3795dce87be"
|
||||
down_revision = "1f60f60c3401"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, str, bool]:
|
||||
from urllib.parse import urlparse
|
||||
|
||||
def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str, str]:
|
||||
parsed_url = urlparse(wiki_url)
|
||||
wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.split('/spaces')[0]}"
|
||||
path_parts = parsed_url.path.split("/")
|
||||
space = path_parts[3]
|
||||
page_id = path_parts[5] if len(path_parts) > 5 else ""
|
||||
return wiki_base, space, page_id
|
||||
|
||||
def _extract_confluence_keys_from_datacenter_url(
|
||||
wiki_url: str,
|
||||
) -> tuple[str, str, str]:
|
||||
DISPLAY = "/display/"
|
||||
PAGE = "/pages/"
|
||||
parsed_url = urlparse(wiki_url)
|
||||
wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.split(DISPLAY)[0]}"
|
||||
space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
|
||||
page_id = ""
|
||||
if (content := parsed_url.path.split(PAGE)) and len(content) > 1:
|
||||
page_id = content[1]
|
||||
return wiki_base, space, page_id
|
||||
|
||||
is_confluence_cloud = (
|
||||
".atlassian.net/wiki/spaces/" in wiki_url
|
||||
or ".jira.com/wiki/spaces/" in wiki_url
|
||||
)
|
||||
|
||||
if is_confluence_cloud:
|
||||
wiki_base, space, page_id = _extract_confluence_keys_from_cloud_url(wiki_url)
|
||||
else:
|
||||
wiki_base, space, page_id = _extract_confluence_keys_from_datacenter_url(
|
||||
wiki_url
|
||||
)
|
||||
|
||||
return wiki_base, space, page_id, is_confluence_cloud
|
||||
|
||||
|
||||
def reconstruct_confluence_url(
|
||||
wiki_base: str, space: str, page_id: str, is_cloud: bool
|
||||
) -> str:
|
||||
if is_cloud:
|
||||
url = f"{wiki_base}/spaces/{space}"
|
||||
if page_id:
|
||||
url += f"/pages/{page_id}"
|
||||
else:
|
||||
url = f"{wiki_base}/display/{space}"
|
||||
if page_id:
|
||||
url += f"/pages/{page_id}"
|
||||
return url
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
connector = table(
|
||||
"connector",
|
||||
column("id", sa.Integer),
|
||||
column("source", sa.String()),
|
||||
column("input_type", sa.String()),
|
||||
column("connector_specific_config", postgresql.JSONB),
|
||||
)
|
||||
|
||||
# Fetch all Confluence connectors
|
||||
connection = op.get_bind()
|
||||
confluence_connectors = connection.execute(
|
||||
sa.select(connector).where(
|
||||
sa.and_(
|
||||
connector.c.source == "CONFLUENCE", connector.c.input_type == "POLL"
|
||||
)
|
||||
)
|
||||
).fetchall()
|
||||
|
||||
for row in confluence_connectors:
|
||||
config = row.connector_specific_config
|
||||
wiki_page_url = config["wiki_page_url"]
|
||||
wiki_base, space, page_id, is_cloud = extract_confluence_keys_from_url(
|
||||
wiki_page_url
|
||||
)
|
||||
|
||||
new_config = {
|
||||
"wiki_base": wiki_base,
|
||||
"space": space,
|
||||
"page_id": page_id,
|
||||
"is_cloud": is_cloud,
|
||||
}
|
||||
|
||||
for key, value in config.items():
|
||||
if key not in ["wiki_page_url"]:
|
||||
new_config[key] = value
|
||||
|
||||
op.execute(
|
||||
connector.update()
|
||||
.where(connector.c.id == row.id)
|
||||
.values(connector_specific_config=new_config)
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
connector = table(
|
||||
"connector",
|
||||
column("id", sa.Integer),
|
||||
column("source", sa.String()),
|
||||
column("input_type", sa.String()),
|
||||
column("connector_specific_config", postgresql.JSONB),
|
||||
)
|
||||
|
||||
confluence_connectors = (
|
||||
op.get_bind()
|
||||
.execute(
|
||||
sa.select(connector).where(
|
||||
connector.c.source == "CONFLUENCE", connector.c.input_type == "POLL"
|
||||
)
|
||||
)
|
||||
.fetchall()
|
||||
)
|
||||
|
||||
for row in confluence_connectors:
|
||||
config = row.connector_specific_config
|
||||
if all(key in config for key in ["wiki_base", "space", "is_cloud"]):
|
||||
wiki_page_url = reconstruct_confluence_url(
|
||||
config["wiki_base"],
|
||||
config["space"],
|
||||
config.get("page_id", ""),
|
||||
config["is_cloud"],
|
||||
)
|
||||
|
||||
new_config = {"wiki_page_url": wiki_page_url}
|
||||
new_config.update(
|
||||
{
|
||||
k: v
|
||||
for k, v in config.items()
|
||||
if k not in ["wiki_base", "space", "page_id", "is_cloud"]
|
||||
}
|
||||
)
|
||||
|
||||
op.execute(
|
||||
connector.update()
|
||||
.where(connector.c.id == row.id)
|
||||
.values(connector_specific_config=new_config)
|
||||
)
|
@ -7,7 +7,6 @@ from datetime import timezone
|
||||
from functools import lru_cache
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import bs4
|
||||
from atlassian import Confluence # type:ignore
|
||||
@ -53,79 +52,6 @@ NO_PARENT_OR_NO_PERMISSIONS_ERROR_STR = (
|
||||
)
|
||||
|
||||
|
||||
def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str, str]:
|
||||
"""Sample
|
||||
URL w/ page: https://danswer.atlassian.net/wiki/spaces/1234abcd/pages/5678efgh/overview
|
||||
URL w/o page: https://danswer.atlassian.net/wiki/spaces/ASAM/overview
|
||||
|
||||
wiki_base is https://danswer.atlassian.net/wiki
|
||||
space is 1234abcd
|
||||
page_id is 5678efgh
|
||||
"""
|
||||
parsed_url = urlparse(wiki_url)
|
||||
wiki_base = (
|
||||
parsed_url.scheme
|
||||
+ "://"
|
||||
+ parsed_url.netloc
|
||||
+ parsed_url.path.split("/spaces")[0]
|
||||
)
|
||||
|
||||
path_parts = parsed_url.path.split("/")
|
||||
space = path_parts[3]
|
||||
|
||||
page_id = path_parts[5] if len(path_parts) > 5 else ""
|
||||
return wiki_base, space, page_id
|
||||
|
||||
|
||||
def _extract_confluence_keys_from_datacenter_url(wiki_url: str) -> tuple[str, str, str]:
|
||||
"""Sample
|
||||
URL w/ page https://danswer.ai/confluence/display/1234abcd/pages/5678efgh/overview
|
||||
URL w/o page https://danswer.ai/confluence/display/1234abcd/overview
|
||||
wiki_base is https://danswer.ai/confluence
|
||||
space is 1234abcd
|
||||
page_id is 5678efgh
|
||||
"""
|
||||
# /display/ is always right before the space and at the end of the base print()
|
||||
DISPLAY = "/display/"
|
||||
PAGE = "/pages/"
|
||||
|
||||
parsed_url = urlparse(wiki_url)
|
||||
wiki_base = (
|
||||
parsed_url.scheme
|
||||
+ "://"
|
||||
+ parsed_url.netloc
|
||||
+ parsed_url.path.split(DISPLAY)[0]
|
||||
)
|
||||
space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
|
||||
page_id = ""
|
||||
if (content := parsed_url.path.split(PAGE)) and len(content) > 1:
|
||||
page_id = content[1]
|
||||
return wiki_base, space, page_id
|
||||
|
||||
|
||||
def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, str, bool]:
|
||||
is_confluence_cloud = (
|
||||
".atlassian.net/wiki/spaces/" in wiki_url
|
||||
or ".jira.com/wiki/spaces/" in wiki_url
|
||||
)
|
||||
|
||||
try:
|
||||
if is_confluence_cloud:
|
||||
wiki_base, space, page_id = _extract_confluence_keys_from_cloud_url(
|
||||
wiki_url
|
||||
)
|
||||
else:
|
||||
wiki_base, space, page_id = _extract_confluence_keys_from_datacenter_url(
|
||||
wiki_url
|
||||
)
|
||||
except Exception as e:
|
||||
error_msg = f"Not a valid Confluence Wiki Link, unable to extract wiki base, space, and page id. Exception: {e}"
|
||||
logger.error(error_msg)
|
||||
raise ValueError(error_msg)
|
||||
|
||||
return wiki_base, space, page_id, is_confluence_cloud
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def _get_user(user_id: str, confluence_client: Confluence) -> str:
|
||||
"""Get Confluence Display Name based on the account-id or userkey value
|
||||
@ -372,7 +298,10 @@ class RecursiveIndexer:
|
||||
class ConfluenceConnector(LoadConnector, PollConnector):
|
||||
def __init__(
|
||||
self,
|
||||
wiki_page_url: str,
|
||||
wiki_base: str,
|
||||
space: str,
|
||||
is_cloud: bool,
|
||||
page_id: str = "",
|
||||
index_recursively: bool = True,
|
||||
batch_size: int = INDEX_BATCH_SIZE,
|
||||
continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
|
||||
@ -386,15 +315,15 @@ class ConfluenceConnector(LoadConnector, PollConnector):
|
||||
self.labels_to_skip = set(labels_to_skip)
|
||||
self.recursive_indexer: RecursiveIndexer | None = None
|
||||
self.index_recursively = index_recursively
|
||||
(
|
||||
self.wiki_base,
|
||||
self.space,
|
||||
self.page_id,
|
||||
self.is_cloud,
|
||||
) = extract_confluence_keys_from_url(wiki_page_url)
|
||||
|
||||
# Remove trailing slash from wiki_base if present
|
||||
self.wiki_base = wiki_base.rstrip("/")
|
||||
self.space = space
|
||||
self.page_id = page_id
|
||||
|
||||
self.is_cloud = is_cloud
|
||||
|
||||
self.space_level_scan = False
|
||||
|
||||
self.confluence_client: Confluence | None = None
|
||||
|
||||
if self.page_id is None or self.page_id == "":
|
||||
@ -414,7 +343,6 @@ class ConfluenceConnector(LoadConnector, PollConnector):
|
||||
username=username if self.is_cloud else None,
|
||||
password=access_token if self.is_cloud else None,
|
||||
token=access_token if not self.is_cloud else None,
|
||||
cloud=self.is_cloud,
|
||||
)
|
||||
return None
|
||||
|
||||
@ -866,7 +794,13 @@ class ConfluenceConnector(LoadConnector, PollConnector):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
|
||||
connector = ConfluenceConnector(
|
||||
wiki_base=os.environ["CONFLUENCE_TEST_SPACE_URL"],
|
||||
space=os.environ["CONFLUENCE_TEST_SPACE"],
|
||||
is_cloud=os.environ.get("CONFLUENCE_IS_CLOUD", "true").lower() == "true",
|
||||
page_id=os.environ.get("CONFLUENCE_TEST_PAGE_ID", ""),
|
||||
index_recursively=True,
|
||||
)
|
||||
connector.load_credentials(
|
||||
{
|
||||
"confluence_username": os.environ["CONFLUENCE_USER_NAME"],
|
||||
|
@ -8,7 +8,13 @@ from danswer.connectors.confluence.connector import ConfluenceConnector
|
||||
|
||||
@pytest.fixture
|
||||
def confluence_connector() -> ConfluenceConnector:
|
||||
connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
|
||||
connector = ConfluenceConnector(
|
||||
wiki_base=os.environ["CONFLUENCE_TEST_SPACE_URL"],
|
||||
space=os.environ["CONFLUENCE_TEST_SPACE"],
|
||||
is_cloud=os.environ.get("CONFLUENCE_IS_CLOUD", "true").lower() == "true",
|
||||
page_id=os.environ.get("CONFLUENCE_TEST_PAGE_ID", ""),
|
||||
)
|
||||
|
||||
connector.load_credentials(
|
||||
{
|
||||
"confluence_username": os.environ["CONFLUENCE_USER_NAME"],
|
||||
|
@ -48,10 +48,16 @@ export const ConnectorTitle = ({
|
||||
);
|
||||
} else if (connector.source === "confluence") {
|
||||
const typedConnector = connector as Connector<ConfluenceConfig>;
|
||||
additionalMetadata.set(
|
||||
"Wiki URL",
|
||||
typedConnector.connector_specific_config.wiki_page_url
|
||||
);
|
||||
const wikiUrl = typedConnector.connector_specific_config.is_cloud
|
||||
? `${typedConnector.connector_specific_config.wiki_base}/wiki/spaces/${typedConnector.connector_specific_config.space}`
|
||||
: `${typedConnector.connector_specific_config.wiki_base}/spaces/${typedConnector.connector_specific_config.space}`;
|
||||
additionalMetadata.set("Wiki URL", wikiUrl);
|
||||
if (typedConnector.connector_specific_config.page_id) {
|
||||
additionalMetadata.set(
|
||||
"Page ID",
|
||||
typedConnector.connector_specific_config.page_id
|
||||
);
|
||||
}
|
||||
} else if (connector.source === "jira") {
|
||||
const typedConnector = connector as Connector<JiraConfig>;
|
||||
additionalMetadata.set(
|
||||
|
@ -219,19 +219,37 @@ export const connectorConfigs: Record<ValidSources, ConnectionConfiguration> = {
|
||||
},
|
||||
confluence: {
|
||||
description: "Configure Confluence connector",
|
||||
subtext: `Specify any link to a Confluence page below and click "Index" to Index. If the provided link is for an entire space, we will index the entire space. However, if you want to index a specific page, you can do so by entering the page's URL.
|
||||
|
||||
For example, entering https://danswer.atlassian.net/wiki/spaces/Engineering/overview and clicking the Index button will index the whole Engineering Confluence space, but entering https://danswer.atlassian.net/wiki/spaces/Engineering/pages/164331/example+page will index that page (and optionally the page's children).
|
||||
subtext: `Specify the base URL of your Confluence instance, the space name, and optionally a specific page ID to index. If no page ID is provided, the entire space will be indexed.
|
||||
|
||||
Selecting the "Index Recursively" checkbox will index the single page's children in addition to itself.`,
|
||||
For example, entering "https://pablosfsanchez.atlassian.net/wiki" as the Wiki Base URL, "KB" as the Space, and "164331" as the Page ID will index the specific page at https://pablosfsanchez.atlassian.net/wiki/spaces/KB/pages/164331/Page. If you leave the Page ID empty, it will index the entire KB space.
|
||||
|
||||
Selecting the "Index Recursively" checkbox will index the specified page and all of its children.`,
|
||||
values: [
|
||||
{
|
||||
type: "text",
|
||||
query: "Enter the wiki page URL:",
|
||||
label: "Wiki Page URL",
|
||||
name: "wiki_page_url",
|
||||
query: "Enter the wiki base URL:",
|
||||
label: "Wiki Base URL",
|
||||
name: "wiki_base",
|
||||
optional: false,
|
||||
description: "Enter any link to a Confluence space or Page",
|
||||
description:
|
||||
"The base URL of your Confluence instance (e.g., https://your-domain.atlassian.net/wiki)",
|
||||
},
|
||||
{
|
||||
type: "text",
|
||||
query: "Enter the space:",
|
||||
label: "Space",
|
||||
name: "space",
|
||||
optional: false,
|
||||
description: "The Confluence space name to index (e.g. `KB`)",
|
||||
},
|
||||
{
|
||||
type: "text",
|
||||
query: "Enter the page ID (optional):",
|
||||
label: "Page ID",
|
||||
name: "page_id",
|
||||
optional: true,
|
||||
description:
|
||||
"Specific page ID to index - leave empty to index the entire space (e.g. `131368`)",
|
||||
},
|
||||
{
|
||||
type: "checkbox",
|
||||
@ -241,6 +259,16 @@ Selecting the "Index Recursively" checkbox will index the single page's children
|
||||
name: "index_recursively",
|
||||
optional: false,
|
||||
},
|
||||
{
|
||||
type: "checkbox",
|
||||
query: "Is this a Confluence Cloud instance?",
|
||||
label: "Is Cloud",
|
||||
name: "is_cloud",
|
||||
optional: false,
|
||||
default: true,
|
||||
description:
|
||||
"Check if this is a Confluence Cloud instance, uncheck for Confluence Server/Data Center",
|
||||
},
|
||||
],
|
||||
},
|
||||
jira: {
|
||||
@ -817,7 +845,10 @@ export interface GmailConfig {}
|
||||
export interface BookstackConfig {}
|
||||
|
||||
export interface ConfluenceConfig {
|
||||
wiki_page_url: string;
|
||||
wiki_base: string;
|
||||
space: string;
|
||||
page_id?: string;
|
||||
is_cloud?: boolean;
|
||||
index_recursively?: boolean;
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user