mirror of https://github.com/danswer-ai/danswer.git
synced 2025-10-10 21:26:01 +02:00
Support Confluence data center + allow for specifying labels to ignore (#624)
@@ -109,9 +109,11 @@ POSTGRES_DB = os.environ.get("POSTGRES_DB") or "postgres"
 #####
 GOOGLE_DRIVE_INCLUDE_SHARED = False
 GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False
 
 FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
     "FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
 )
 
 # TODO these should be available for frontend configuration, via advanced options expandable
 WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
     "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer"
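All of these settings follow the same environment-variable-with-default pattern. A quick illustrative sketch of the two behaviors in play (names from the hunk above, values hypothetical):

import os

# unset variable -> the default argument is returned
path = os.environ.get("FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage")

# a set-but-empty variable returns "", not the default; that is why the
# POSTGRES_DB line in the hunk header uses `os.environ.get(...) or "postgres"`
classes = os.environ.get("WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer").split(",")
print(classes)  # ['sidebar', 'footer'] when the variable is unset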
@@ -128,6 +130,14 @@ NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP = (
     == "true"
 )
 
+CONFLUENCE_CONNECTOR_LABELS_TO_SKIP = [
+    ignored_tag
+    for ignored_tag in os.environ.get("CONFLUENCE_CONNECTOR_LABELS_TO_SKIP", "").split(
+        ","
+    )
+    if ignored_tag
+]
+
 #####
 # Query Configs
 #####
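The comprehension above splits the raw env string on commas and drops falsy entries, so an unset or empty variable yields an empty list rather than ['']. A minimal standalone sketch:

def parse_labels(raw: str) -> list[str]:
    # mirrors the comprehension in the hunk above
    return [tag for tag in raw.split(",") if tag]

print(parse_labels(""))                 # []
print(parse_labels("secret,internal"))  # ['secret', 'internal']
print(parse_labels("secret,"))          # ['secret'] -- empty trailing entry dropped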
@@ -9,6 +9,7 @@ from urllib.parse import urlparse
 from atlassian import Confluence  # type:ignore
 from requests import HTTPError
 
+from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
 from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
@@ -30,17 +31,12 @@ logger = setup_logger()
 # 3. Segment into Sections for more accurate linking, can split by headers but make sure no text/ordering is lost
 
 
-def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str]:
+def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str]:
     """Sample
     https://danswer.atlassian.net/wiki/spaces/1234abcd/overview
-    wiki_base is danswer.atlassian.net/wiki
+    wiki_base is https://danswer.atlassian.net/wiki
     space is 1234abcd
     """
-    if ".atlassian.net/wiki/spaces/" not in wiki_url:
-        raise ValueError(
-            "Not a valid Confluence Wiki Link, unable to extract wiki base and space names"
-        )
-
     parsed_url = urlparse(wiki_url)
     wiki_base = (
         parsed_url.scheme
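The hunk cuts off mid-expression, so the tail of the cloud extraction is not visible. Based on the docstring above and the data center variant below, it can be sketched roughly as follows (the `/spaces` split and the `[3]` index are reconstructions, not verbatim source):

from urllib.parse import urlparse

def extract_cloud_keys(wiki_url: str) -> tuple[str, str]:
    # e.g. https://danswer.atlassian.net/wiki/spaces/1234abcd/overview
    parsed_url = urlparse(wiki_url)
    wiki_base = (
        parsed_url.scheme
        + "://"
        + parsed_url.netloc
        + parsed_url.path.split("/spaces")[0]  # assumed: base ends before /spaces
    )
    space = parsed_url.path.split("/")[3]  # ['', 'wiki', 'spaces', '<space>', ...]
    return wiki_base, space

print(extract_cloud_keys("https://danswer.atlassian.net/wiki/spaces/1234abcd/overview"))
# ('https://danswer.atlassian.net/wiki', '1234abcd')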
@@ -52,6 +48,42 @@ def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str]:
     return wiki_base, space
 
 
+def _extract_confluence_keys_from_datacenter_url(wiki_url: str) -> tuple[str, str]:
+    """Sample
+    https://danswer.ai/confluence/display/1234abcd/overview
+    wiki_base is https://danswer.ai/confluence
+    space is 1234abcd
+    """
+    # /display/ is always right before the space and at the end of the base url
+    DISPLAY = "/display/"
+
+    parsed_url = urlparse(wiki_url)
+    wiki_base = (
+        parsed_url.scheme
+        + "://"
+        + parsed_url.netloc
+        + parsed_url.path.split(DISPLAY)[0]
+    )
+    space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
+    return wiki_base, space
+
+
+def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, bool]:
+    is_confluence_cloud = ".atlassian.net/wiki/spaces/" in wiki_url
+
+    try:
+        if is_confluence_cloud:
+            wiki_base, space = _extract_confluence_keys_from_cloud_url(wiki_url)
+        else:
+            wiki_base, space = _extract_confluence_keys_from_datacenter_url(wiki_url)
+    except Exception as e:
+        error_msg = f"Not a valid Confluence Wiki Link, unable to extract wiki base and space names. Exception: {e}"
+        logger.error(error_msg)
+        raise ValueError(error_msg)
+
+    return wiki_base, space, is_confluence_cloud
+
+
 def _comment_dfs(
     comments_str: str,
     comment_pages: Collection[dict[str, Any]],
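Tracing `_extract_confluence_keys_from_datacenter_url` against its own docstring sample, as a standalone rerun of the logic shown above:

from urllib.parse import urlparse

DISPLAY = "/display/"
url = "https://danswer.ai/confluence/display/1234abcd/overview"

parsed = urlparse(url)
wiki_base = parsed.scheme + "://" + parsed.netloc + parsed.path.split(DISPLAY)[0]
space = DISPLAY.join(parsed.path.split(DISPLAY)[1:]).split("/")[0]

print(wiki_base)  # https://danswer.ai/confluence
print(space)      # 1234abcd

The dispatcher then keys cloud vs. data center off the ".atlassian.net/wiki/spaces/" substring and returns the extra `is_confluence_cloud` boolean alongside the base and space.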
@@ -79,10 +111,17 @@ class ConfluenceConnector(LoadConnector, PollConnector):
         wiki_page_url: str,
         batch_size: int = INDEX_BATCH_SIZE,
         continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
+        # if a page has one of the labels specified in this list, we will just
+        # skip it. This is generally used to avoid indexing extra sensitive
+        # pages.
+        labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP,
     ) -> None:
         self.batch_size = batch_size
         self.continue_on_failure = continue_on_failure
-        self.wiki_base, self.space = extract_confluence_keys_from_url(wiki_page_url)
+        self.labels_to_skip = set(labels_to_skip)
+        self.wiki_base, self.space, self.is_cloud = extract_confluence_keys_from_url(
+            wiki_page_url
+        )
         self.confluence_client: Confluence | None = None
 
     def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
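Putting the new parameter to use, a sketch of direct instantiation (URL and labels are illustrative, not from the source):

# skip any page tagged "secret" or "do-not-index"
connector = ConfluenceConnector(
    wiki_page_url="https://danswer.atlassian.net/wiki/spaces/ENG/overview",
    labels_to_skip=["secret", "do-not-index"],
)
# this URL matches the cloud pattern, so connector.is_cloud is True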
@@ -90,9 +129,10 @@ class ConfluenceConnector(LoadConnector, PollConnector):
         access_token = credentials["confluence_access_token"]
         self.confluence_client = Confluence(
             url=self.wiki_base,
-            username=username,
+            # passing in username causes issues for Confluence data center
+            username=username if self.is_cloud else None,
             password=access_token,
-            cloud=True,
+            cloud=self.is_cloud,
         )
         return None
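For a data center deployment the same credentials dict now yields a client built with `cloud=False` and no username. A sketch of the call (the `confluence_username` key is an assumption; only the access-token lookup is visible in this hunk):

connector.load_credentials(
    {
        "confluence_username": "user@example.com",  # assumed key; dropped for data center
        "confluence_access_token": "<personal-access-token>",
    }
)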
@@ -185,6 +225,17 @@ class ConfluenceConnector(LoadConnector, PollConnector):
             )
             return ""
 
+    def _fetch_labels(self, confluence_client: Confluence, page_id: str) -> list[str]:
+        try:
+            labels_response = confluence_client.get_page_labels(page_id)
+            return [label["name"] for label in labels_response["results"]]
+        except Exception as e:
+            if not self.continue_on_failure:
+                raise e
+
+            logger.exception("Ran into exception when fetching labels from Confluence")
+            return []
+
     def _get_doc_batch(
         self, start_ind: int, time_filter: Callable[[datetime], bool] | None = None
     ) -> tuple[list[Document], int]:
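`get_page_labels` comes from the `atlassian-python-api` client; judging from the list comprehension, the response is expected to look roughly like this (shape inferred from the code above, not from library docs):

labels_response = {
    "results": [
        {"name": "secret"},
        {"name": "architecture"},
    ]
}
label_names = [label["name"] for label in labels_response["results"]]
print(label_names)  # ['secret', 'architecture']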
@@ -200,6 +251,19 @@ class ConfluenceConnector(LoadConnector, PollConnector):
             last_modified = datetime.fromisoformat(last_modified_str)
 
             if time_filter is None or time_filter(last_modified):
+                page_id = page["id"]
+
+                # check disallowed labels
+                if self.labels_to_skip:
+                    page_labels = self._fetch_labels(self.confluence_client, page_id)
+                    label_intersection = self.labels_to_skip.intersection(page_labels)
+                    if label_intersection:
+                        logger.info(
+                            f"Page with ID '{page_id}' has a label which has been "
+                            f"designated as disallowed: {label_intersection}. Skipping."
+                        )
+                        continue
+
                 page_html = (
                     page["body"]
                     .get("storage", page["body"].get("view", {}))
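The skip decision is a plain set intersection, which is why `labels_to_skip` was stored as a set in `__init__`. For example (illustrative values):

labels_to_skip = {"secret", "internal"}
page_labels = ["public", "secret"]  # set.intersection accepts any iterable

label_intersection = labels_to_skip.intersection(page_labels)
print(label_intersection)        # {'secret'}
print(bool(label_intersection))  # True -> the page is skipped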
@@ -212,7 +276,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
                 page_text = (
                     page.get("title", "") + "\n" + parse_html_page_basic(page_html)
                 )
-                comments_text = self._fetch_comments(self.confluence_client, page["id"])
+                comments_text = self._fetch_comments(self.confluence_client, page_id)
                 page_text += comments_text
 
                 doc_batch.append(
@@ -74,8 +74,10 @@ services:
       - API_TYPE_OPENAI=${API_TYPE_OPENAI:-}
       - API_VERSION_OPENAI=${API_VERSION_OPENAI:-}
       - AZURE_DEPLOYMENT_ID=${AZURE_DEPLOYMENT_ID:-}
-      - CONTINUE_ON_CONNECTOR_FAILURE=${CONTINUE_ON_CONNECTOR_FAILURE:-}
       - NUM_INDEXING_WORKERS=${NUM_INDEXING_WORKERS:-}
+      # Connector Configs
+      - CONTINUE_ON_CONNECTOR_FAILURE=${CONTINUE_ON_CONNECTOR_FAILURE:-}
+      - CONFLUENCE_CONNECTOR_LABELS_TO_SKIP=${CONFLUENCE_CONNECTOR_LABELS_TO_SKIP:-}
       # Danswer SlackBot Configs
       - DANSWER_BOT_SLACK_APP_TOKEN=${DANSWER_BOT_SLACK_APP_TOKEN:-}
       - DANSWER_BOT_SLACK_BOT_TOKEN=${DANSWER_BOT_SLACK_BOT_TOKEN:-}
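With the compose wiring above, the value can be supplied through the environment, for instance via a .env file read by docker compose (file name and values are illustrative):

# .env -- hypothetical values
CONTINUE_ON_CONNECTOR_FAILURE=true
CONFLUENCE_CONNECTOR_LABELS_TO_SKIP=secret,internal

The `${VAR:-}` syntax means an unset variable passes through as an empty string, which the comprehension in app_configs.py then turns into an empty skip list.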
@@ -20,17 +20,37 @@ import { ConnectorsTable } from "@/components/admin/connectors/table/ConnectorsT
 import { usePopup } from "@/components/admin/connectors/Popup";
 import { usePublicCredentials } from "@/lib/hooks";
 
-// Copied from the `extract_confluence_keys_from_url` function
-const extractSpaceFromUrl = (wikiUrl: string): string | null => {
-  if (!wikiUrl.includes(".atlassian.net/wiki/spaces/")) {
-    return null;
-  }
-
+const extractSpaceFromCloudUrl = (wikiUrl: string): string => {
   const parsedUrl = new URL(wikiUrl);
   const space = parsedUrl.pathname.split("/")[3];
   return space;
 };
 
+const extractSpaceFromDataCenterUrl = (wikiUrl: string): string => {
+  const DISPLAY = "/display/";
+
+  const parsedUrl = new URL(wikiUrl);
+  const spaceSpecificSection = parsedUrl.pathname
+    .split(DISPLAY)
+    .slice(1)
+    .join(DISPLAY);
+  const space = spaceSpecificSection.split("/")[0];
+  return space;
+};
+
+// Copied from the `extract_confluence_keys_from_url` function
+const extractSpaceFromUrl = (wikiUrl: string): string | null => {
+  try {
+    if (wikiUrl.includes(".atlassian.net/wiki/spaces/")) {
+      return extractSpaceFromCloudUrl(wikiUrl);
+    }
+    return extractSpaceFromDataCenterUrl(wikiUrl);
+  } catch (e) {
+    console.log("Failed to extract space from url", e);
+    return null;
+  }
+};
+
 const Main = () => {
   const { popup, setPopup } = usePopup();
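The frontend mirrors the backend dispatch: any URL that does not match the cloud pattern falls through to the data center path. For example, `extractSpaceFromUrl("https://danswer.atlassian.net/wiki/spaces/1234abcd/overview")` yields `"1234abcd"` via the cloud branch, and `extractSpaceFromUrl("https://danswer.ai/confluence/display/1234abcd/overview")` yields the same `"1234abcd"` via the `/display/` branch, while an unparseable string throws inside `new URL` and falls back to `null`.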