mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-25 15:30:59 +02:00
More explicit Confluence Connector (#2289)
This commit is contained in:
parent
f871b4c6eb
commit
c122be2f6a
@ -10,6 +10,9 @@ on:
|
|||||||
env:
|
env:
|
||||||
# Confluence
|
# Confluence
|
||||||
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
|
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
|
||||||
|
CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
|
||||||
|
CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
|
||||||
|
CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
|
||||||
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
|
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
|
||||||
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
|
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
|
||||||
|
|
||||||
|
@ -0,0 +1,158 @@
|
|||||||
|
"""migration confluence to be explicit
|
||||||
|
|
||||||
|
Revision ID: a3795dce87be
|
||||||
|
Revises: 1f60f60c3401
|
||||||
|
Create Date: 2024-09-01 13:52:12.006740
|
||||||
|
|
||||||
|
"""
|
||||||
|
from alembic import op
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from sqlalchemy.dialects import postgresql
|
||||||
|
from sqlalchemy.sql import table, column
|
||||||
|
|
||||||
|
revision = "a3795dce87be"
|
||||||
|
down_revision = "1f60f60c3401"
|
||||||
|
branch_labels = None
|
||||||
|
depends_on = None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, str, bool]:
    """Split a legacy Confluence page/space URL into its explicit parts.

    A URL is treated as Confluence Cloud when it contains
    ``.atlassian.net/wiki/spaces/`` or ``.jira.com/wiki/spaces/``; every other
    URL is parsed with the Data Center (``/display/``) layout.

    Args:
        wiki_url: The URL previously stored as ``wiki_page_url``.

    Returns:
        ``(wiki_base, space, page_id, is_cloud)``. ``page_id`` is ``""`` when
        the URL does not point at a specific page.
    """
    # Imported locally so the migration module stays self-contained.
    from urllib.parse import urlparse

    def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str, str]:
        # Layout: <scheme>://<host>/wiki/spaces/<space>[/pages/<page_id>/...]
        parsed_url = urlparse(wiki_url)
        wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.split('/spaces')[0]}"
        # path_parts == ["", "wiki", "spaces", <space>, "pages", <page_id>, ...]
        path_parts = parsed_url.path.split("/")
        space = path_parts[3]
        page_id = path_parts[5] if len(path_parts) > 5 else ""
        return wiki_base, space, page_id

    def _extract_confluence_keys_from_datacenter_url(
        wiki_url: str,
    ) -> tuple[str, str, str]:
        # Layout: <scheme>://<host>[/<prefix>]/display/<space>[/pages/<page_id>/...]
        DISPLAY = "/display/"
        PAGE = "/pages/"
        parsed_url = urlparse(wiki_url)
        base_path, _, after_display = parsed_url.path.partition(DISPLAY)
        wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}{base_path}"
        space = after_display.split("/")[0]
        # Keep only the segment right after "/pages/"; anything that follows
        # (e.g. the page title slug) is not part of the page id.
        _, found_page, after_page = parsed_url.path.partition(PAGE)
        page_id = after_page.split("/")[0] if found_page else ""
        return wiki_base, space, page_id

    is_confluence_cloud = (
        ".atlassian.net/wiki/spaces/" in wiki_url
        or ".jira.com/wiki/spaces/" in wiki_url
    )

    if is_confluence_cloud:
        wiki_base, space, page_id = _extract_confluence_keys_from_cloud_url(wiki_url)
    else:
        wiki_base, space, page_id = _extract_confluence_keys_from_datacenter_url(
            wiki_url
        )

    return wiki_base, space, page_id, is_confluence_cloud
|
||||||
|
|
||||||
|
|
||||||
|
def reconstruct_confluence_url(
    wiki_base: str, space: str, page_id: str, is_cloud: bool
) -> str:
    """Rebuild a legacy ``wiki_page_url`` from the explicit connector fields.

    Args:
        wiki_base: Base URL of the Confluence instance; a trailing slash is
            ignored so the result never contains ``//`` after the base.
        space: Space key placed after ``/spaces/`` (cloud) or ``/display/``
            (non-cloud).
        page_id: Page id appended as ``/pages/<page_id>``; omitted when empty.
        is_cloud: Selects the ``/spaces/`` layout when true, ``/display/``
            otherwise.

    Returns:
        The reconstructed URL.
    """
    # The two layouts differ only in the segment that precedes the space key.
    space_segment = "spaces" if is_cloud else "display"
    url = f"{wiki_base.rstrip('/')}/{space_segment}/{space}"
    if page_id:
        url += f"/pages/{page_id}"
    return url
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Rewrite Confluence connector configs from one URL to explicit fields.

    For every ``connector`` row with ``source == "CONFLUENCE"`` and
    ``input_type == "POLL"``, the ``wiki_page_url`` entry of
    ``connector_specific_config`` is replaced by ``wiki_base``, ``space``,
    ``page_id`` and ``is_cloud``. All other config keys are carried over.
    Rows without a ``wiki_page_url`` entry are left untouched so that one
    unexpected row cannot abort the whole migration.
    """
    # Lightweight table description; only the columns used below are declared.
    connector = table(
        "connector",
        column("id", sa.Integer),
        column("source", sa.String()),
        column("input_type", sa.String()),
        column("connector_specific_config", postgresql.JSONB),
    )

    # Fetch all Confluence connectors
    connection = op.get_bind()
    confluence_connectors = connection.execute(
        sa.select(connector).where(
            sa.and_(
                connector.c.source == "CONFLUENCE", connector.c.input_type == "POLL"
            )
        )
    ).fetchall()

    for row in confluence_connectors:
        config = row.connector_specific_config
        # Nothing to convert (empty config or already in the explicit format).
        if not config or "wiki_page_url" not in config:
            continue

        wiki_base, space, page_id, is_cloud = extract_confluence_keys_from_url(
            config["wiki_page_url"]
        )

        new_config = {
            "wiki_base": wiki_base,
            "space": space,
            "page_id": page_id,
            "is_cloud": is_cloud,
        }
        # Carry over every remaining setting; existing keys win on a name clash,
        # matching the original copy order.
        new_config.update(
            {key: value for key, value in config.items() if key != "wiki_page_url"}
        )

        op.execute(
            connector.update()
            .where(connector.c.id == row.id)
            .values(connector_specific_config=new_config)
        )
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Collapse explicit Confluence connector fields back into one URL.

    For every ``connector`` row with ``source == "CONFLUENCE"`` and
    ``input_type == "POLL"`` whose config has ``wiki_base``, ``space`` and
    ``is_cloud``, those keys (plus ``page_id``) are replaced by a single
    ``wiki_page_url``. Rows missing any of the three required keys are left
    as they are.
    """
    required_keys = ["wiki_base", "space", "is_cloud"]
    explicit_keys = ["wiki_base", "space", "page_id", "is_cloud"]

    # Lightweight table description; only the columns used below are declared.
    connector = table(
        "connector",
        column("id", sa.Integer),
        column("source", sa.String()),
        column("input_type", sa.String()),
        column("connector_specific_config", postgresql.JSONB),
    )

    query = sa.select(connector).where(
        connector.c.source == "CONFLUENCE", connector.c.input_type == "POLL"
    )
    rows = op.get_bind().execute(query).fetchall()

    for row in rows:
        config = row.connector_specific_config
        if any(key not in config for key in required_keys):
            continue

        legacy_config = {
            "wiki_page_url": reconstruct_confluence_url(
                config["wiki_base"],
                config["space"],
                config.get("page_id", ""),
                config["is_cloud"],
            )
        }
        # Copy over everything that is not one of the explicit fields.
        for key, value in config.items():
            if key not in explicit_keys:
                legacy_config[key] = value

        statement = (
            connector.update()
            .where(connector.c.id == row.id)
            .values(connector_specific_config=legacy_config)
        )
        op.execute(statement)
|
@ -7,7 +7,6 @@ from datetime import timezone
|
|||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from typing import cast
|
from typing import cast
|
||||||
from urllib.parse import urlparse
|
|
||||||
|
|
||||||
import bs4
|
import bs4
|
||||||
from atlassian import Confluence # type:ignore
|
from atlassian import Confluence # type:ignore
|
||||||
@ -53,79 +52,6 @@ NO_PARENT_OR_NO_PERMISSIONS_ERROR_STR = (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str, str]:
|
|
||||||
"""Sample
|
|
||||||
URL w/ page: https://danswer.atlassian.net/wiki/spaces/1234abcd/pages/5678efgh/overview
|
|
||||||
URL w/o page: https://danswer.atlassian.net/wiki/spaces/ASAM/overview
|
|
||||||
|
|
||||||
wiki_base is https://danswer.atlassian.net/wiki
|
|
||||||
space is 1234abcd
|
|
||||||
page_id is 5678efgh
|
|
||||||
"""
|
|
||||||
parsed_url = urlparse(wiki_url)
|
|
||||||
wiki_base = (
|
|
||||||
parsed_url.scheme
|
|
||||||
+ "://"
|
|
||||||
+ parsed_url.netloc
|
|
||||||
+ parsed_url.path.split("/spaces")[0]
|
|
||||||
)
|
|
||||||
|
|
||||||
path_parts = parsed_url.path.split("/")
|
|
||||||
space = path_parts[3]
|
|
||||||
|
|
||||||
page_id = path_parts[5] if len(path_parts) > 5 else ""
|
|
||||||
return wiki_base, space, page_id
|
|
||||||
|
|
||||||
|
|
||||||
def _extract_confluence_keys_from_datacenter_url(wiki_url: str) -> tuple[str, str, str]:
|
|
||||||
"""Sample
|
|
||||||
URL w/ page https://danswer.ai/confluence/display/1234abcd/pages/5678efgh/overview
|
|
||||||
URL w/o page https://danswer.ai/confluence/display/1234abcd/overview
|
|
||||||
wiki_base is https://danswer.ai/confluence
|
|
||||||
space is 1234abcd
|
|
||||||
page_id is 5678efgh
|
|
||||||
"""
|
|
||||||
# /display/ is always right before the space and at the end of the base print()
|
|
||||||
DISPLAY = "/display/"
|
|
||||||
PAGE = "/pages/"
|
|
||||||
|
|
||||||
parsed_url = urlparse(wiki_url)
|
|
||||||
wiki_base = (
|
|
||||||
parsed_url.scheme
|
|
||||||
+ "://"
|
|
||||||
+ parsed_url.netloc
|
|
||||||
+ parsed_url.path.split(DISPLAY)[0]
|
|
||||||
)
|
|
||||||
space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
|
|
||||||
page_id = ""
|
|
||||||
if (content := parsed_url.path.split(PAGE)) and len(content) > 1:
|
|
||||||
page_id = content[1]
|
|
||||||
return wiki_base, space, page_id
|
|
||||||
|
|
||||||
|
|
||||||
def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, str, bool]:
|
|
||||||
is_confluence_cloud = (
|
|
||||||
".atlassian.net/wiki/spaces/" in wiki_url
|
|
||||||
or ".jira.com/wiki/spaces/" in wiki_url
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
if is_confluence_cloud:
|
|
||||||
wiki_base, space, page_id = _extract_confluence_keys_from_cloud_url(
|
|
||||||
wiki_url
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
wiki_base, space, page_id = _extract_confluence_keys_from_datacenter_url(
|
|
||||||
wiki_url
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
error_msg = f"Not a valid Confluence Wiki Link, unable to extract wiki base, space, and page id. Exception: {e}"
|
|
||||||
logger.error(error_msg)
|
|
||||||
raise ValueError(error_msg)
|
|
||||||
|
|
||||||
return wiki_base, space, page_id, is_confluence_cloud
|
|
||||||
|
|
||||||
|
|
||||||
@lru_cache()
|
@lru_cache()
|
||||||
def _get_user(user_id: str, confluence_client: Confluence) -> str:
|
def _get_user(user_id: str, confluence_client: Confluence) -> str:
|
||||||
"""Get Confluence Display Name based on the account-id or userkey value
|
"""Get Confluence Display Name based on the account-id or userkey value
|
||||||
@ -372,7 +298,10 @@ class RecursiveIndexer:
|
|||||||
class ConfluenceConnector(LoadConnector, PollConnector):
|
class ConfluenceConnector(LoadConnector, PollConnector):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
wiki_page_url: str,
|
wiki_base: str,
|
||||||
|
space: str,
|
||||||
|
is_cloud: bool,
|
||||||
|
page_id: str = "",
|
||||||
index_recursively: bool = True,
|
index_recursively: bool = True,
|
||||||
batch_size: int = INDEX_BATCH_SIZE,
|
batch_size: int = INDEX_BATCH_SIZE,
|
||||||
continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
|
continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
|
||||||
@ -386,15 +315,15 @@ class ConfluenceConnector(LoadConnector, PollConnector):
|
|||||||
self.labels_to_skip = set(labels_to_skip)
|
self.labels_to_skip = set(labels_to_skip)
|
||||||
self.recursive_indexer: RecursiveIndexer | None = None
|
self.recursive_indexer: RecursiveIndexer | None = None
|
||||||
self.index_recursively = index_recursively
|
self.index_recursively = index_recursively
|
||||||
(
|
|
||||||
self.wiki_base,
|
# Remove trailing slash from wiki_base if present
|
||||||
self.space,
|
self.wiki_base = wiki_base.rstrip("/")
|
||||||
self.page_id,
|
self.space = space
|
||||||
self.is_cloud,
|
self.page_id = page_id
|
||||||
) = extract_confluence_keys_from_url(wiki_page_url)
|
|
||||||
|
self.is_cloud = is_cloud
|
||||||
|
|
||||||
self.space_level_scan = False
|
self.space_level_scan = False
|
||||||
|
|
||||||
self.confluence_client: Confluence | None = None
|
self.confluence_client: Confluence | None = None
|
||||||
|
|
||||||
if self.page_id is None or self.page_id == "":
|
if self.page_id is None or self.page_id == "":
|
||||||
@ -414,7 +343,6 @@ class ConfluenceConnector(LoadConnector, PollConnector):
|
|||||||
username=username if self.is_cloud else None,
|
username=username if self.is_cloud else None,
|
||||||
password=access_token if self.is_cloud else None,
|
password=access_token if self.is_cloud else None,
|
||||||
token=access_token if not self.is_cloud else None,
|
token=access_token if not self.is_cloud else None,
|
||||||
cloud=self.is_cloud,
|
|
||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -866,7 +794,13 @@ class ConfluenceConnector(LoadConnector, PollConnector):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
|
connector = ConfluenceConnector(
|
||||||
|
wiki_base=os.environ["CONFLUENCE_TEST_SPACE_URL"],
|
||||||
|
space=os.environ["CONFLUENCE_TEST_SPACE"],
|
||||||
|
is_cloud=os.environ.get("CONFLUENCE_IS_CLOUD", "true").lower() == "true",
|
||||||
|
page_id=os.environ.get("CONFLUENCE_TEST_PAGE_ID", ""),
|
||||||
|
index_recursively=True,
|
||||||
|
)
|
||||||
connector.load_credentials(
|
connector.load_credentials(
|
||||||
{
|
{
|
||||||
"confluence_username": os.environ["CONFLUENCE_USER_NAME"],
|
"confluence_username": os.environ["CONFLUENCE_USER_NAME"],
|
||||||
|
@ -8,7 +8,13 @@ from danswer.connectors.confluence.connector import ConfluenceConnector
|
|||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def confluence_connector() -> ConfluenceConnector:
|
def confluence_connector() -> ConfluenceConnector:
|
||||||
connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
|
connector = ConfluenceConnector(
|
||||||
|
wiki_base=os.environ["CONFLUENCE_TEST_SPACE_URL"],
|
||||||
|
space=os.environ["CONFLUENCE_TEST_SPACE"],
|
||||||
|
is_cloud=os.environ.get("CONFLUENCE_IS_CLOUD", "true").lower() == "true",
|
||||||
|
page_id=os.environ.get("CONFLUENCE_TEST_PAGE_ID", ""),
|
||||||
|
)
|
||||||
|
|
||||||
connector.load_credentials(
|
connector.load_credentials(
|
||||||
{
|
{
|
||||||
"confluence_username": os.environ["CONFLUENCE_USER_NAME"],
|
"confluence_username": os.environ["CONFLUENCE_USER_NAME"],
|
||||||
|
@ -48,10 +48,16 @@ export const ConnectorTitle = ({
|
|||||||
);
|
);
|
||||||
} else if (connector.source === "confluence") {
|
} else if (connector.source === "confluence") {
|
||||||
const typedConnector = connector as Connector<ConfluenceConfig>;
|
const typedConnector = connector as Connector<ConfluenceConfig>;
|
||||||
additionalMetadata.set(
|
const wikiUrl = typedConnector.connector_specific_config.is_cloud
|
||||||
"Wiki URL",
|
? `${typedConnector.connector_specific_config.wiki_base}/wiki/spaces/${typedConnector.connector_specific_config.space}`
|
||||||
typedConnector.connector_specific_config.wiki_page_url
|
: `${typedConnector.connector_specific_config.wiki_base}/spaces/${typedConnector.connector_specific_config.space}`;
|
||||||
);
|
additionalMetadata.set("Wiki URL", wikiUrl);
|
||||||
|
if (typedConnector.connector_specific_config.page_id) {
|
||||||
|
additionalMetadata.set(
|
||||||
|
"Page ID",
|
||||||
|
typedConnector.connector_specific_config.page_id
|
||||||
|
);
|
||||||
|
}
|
||||||
} else if (connector.source === "jira") {
|
} else if (connector.source === "jira") {
|
||||||
const typedConnector = connector as Connector<JiraConfig>;
|
const typedConnector = connector as Connector<JiraConfig>;
|
||||||
additionalMetadata.set(
|
additionalMetadata.set(
|
||||||
|
@ -219,19 +219,37 @@ export const connectorConfigs: Record<ValidSources, ConnectionConfiguration> = {
|
|||||||
},
|
},
|
||||||
confluence: {
|
confluence: {
|
||||||
description: "Configure Confluence connector",
|
description: "Configure Confluence connector",
|
||||||
subtext: `Specify any link to a Confluence page below and click "Index" to Index. If the provided link is for an entire space, we will index the entire space. However, if you want to index a specific page, you can do so by entering the page's URL.
|
subtext: `Specify the base URL of your Confluence instance, the space name, and optionally a specific page ID to index. If no page ID is provided, the entire space will be indexed.
|
||||||
|
|
||||||
For example, entering https://danswer.atlassian.net/wiki/spaces/Engineering/overview and clicking the Index button will index the whole Engineering Confluence space, but entering https://danswer.atlassian.net/wiki/spaces/Engineering/pages/164331/example+page will index that page (and optionally the page's children).
|
|
||||||
|
|
||||||
Selecting the "Index Recursively" checkbox will index the single page's children in addition to itself.`,
|
For example, entering "https://pablosfsanchez.atlassian.net/wiki" as the Wiki Base URL, "KB" as the Space, and "164331" as the Page ID will index the specific page at https://pablosfsanchez.atlassian.net/wiki/spaces/KB/pages/164331/Page. If you leave the Page ID empty, it will index the entire KB space.
|
||||||
|
|
||||||
|
Selecting the "Index Recursively" checkbox will index the specified page and all of its children.`,
|
||||||
values: [
|
values: [
|
||||||
{
|
{
|
||||||
type: "text",
|
type: "text",
|
||||||
query: "Enter the wiki page URL:",
|
query: "Enter the wiki base URL:",
|
||||||
label: "Wiki Page URL",
|
label: "Wiki Base URL",
|
||||||
name: "wiki_page_url",
|
name: "wiki_base",
|
||||||
optional: false,
|
optional: false,
|
||||||
description: "Enter any link to a Confluence space or Page",
|
description:
|
||||||
|
"The base URL of your Confluence instance (e.g., https://your-domain.atlassian.net/wiki)",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
type: "text",
|
||||||
|
query: "Enter the space:",
|
||||||
|
label: "Space",
|
||||||
|
name: "space",
|
||||||
|
optional: false,
|
||||||
|
description: "The Confluence space name to index (e.g. `KB`)",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
type: "text",
|
||||||
|
query: "Enter the page ID (optional):",
|
||||||
|
label: "Page ID",
|
||||||
|
name: "page_id",
|
||||||
|
optional: true,
|
||||||
|
description:
|
||||||
|
"Specific page ID to index - leave empty to index the entire space (e.g. `131368`)",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
type: "checkbox",
|
type: "checkbox",
|
||||||
@ -241,6 +259,16 @@ Selecting the "Index Recursively" checkbox will index the single page's children
|
|||||||
name: "index_recursively",
|
name: "index_recursively",
|
||||||
optional: false,
|
optional: false,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
type: "checkbox",
|
||||||
|
query: "Is this a Confluence Cloud instance?",
|
||||||
|
label: "Is Cloud",
|
||||||
|
name: "is_cloud",
|
||||||
|
optional: false,
|
||||||
|
default: true,
|
||||||
|
description:
|
||||||
|
"Check if this is a Confluence Cloud instance, uncheck for Confluence Server/Data Center",
|
||||||
|
},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
jira: {
|
jira: {
|
||||||
@ -817,7 +845,10 @@ export interface GmailConfig {}
|
|||||||
export interface BookstackConfig {}
|
export interface BookstackConfig {}
|
||||||
|
|
||||||
export interface ConfluenceConfig {
|
export interface ConfluenceConfig {
|
||||||
wiki_page_url: string;
|
wiki_base: string;
|
||||||
|
space: string;
|
||||||
|
page_id?: string;
|
||||||
|
is_cloud?: boolean;
|
||||||
index_recursively?: boolean;
|
index_recursively?: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user