Support Confluence data center + allow for specifying labels to ignore (#624)
This commit is contained in:
parent 17bd68be4c, commit ef2b445201
@@ -109,9 +109,11 @@ POSTGRES_DB = os.environ.get("POSTGRES_DB") or "postgres"
 #####
 GOOGLE_DRIVE_INCLUDE_SHARED = False
 GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False
+
 FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
     "FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
 )
+
 # TODO these should be available for frontend configuration, via advanced options expandable
 WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
     "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer"
@@ -128,6 +130,14 @@ NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP = (
     == "true"
 )
+
+CONFLUENCE_CONNECTOR_LABELS_TO_SKIP = [
+    ignored_tag
+    for ignored_tag in os.environ.get("CONFLUENCE_CONNECTOR_LABELS_TO_SKIP", "").split(
+        ","
+    )
+    if ignored_tag
+]

 #####
 # Query Configs
 #####
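The list comprehension above turns the comma-separated CONFLUENCE_CONNECTOR_LABELS_TO_SKIP environment variable into a list of non-empty tags. Note that it does not strip whitespace, so values should be written without spaces around the commas. A minimal standalone sketch of the same parsing (the example value is hypothetical):

    import os

    # Hypothetical value, for illustration only
    os.environ["CONFLUENCE_CONNECTOR_LABELS_TO_SKIP"] = "secret,internal"

    labels = [
        tag
        for tag in os.environ.get("CONFLUENCE_CONNECTOR_LABELS_TO_SKIP", "").split(",")
        if tag
    ]
    print(labels)  # ['secret', 'internal']
    # An unset variable parses to [] because "".split(",") == [""] and
    # the trailing `if tag` filter drops the empty string.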
@@ -9,6 +9,7 @@ from urllib.parse import urlparse
 from atlassian import Confluence  # type:ignore
 from requests import HTTPError

+from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
 from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
@@ -30,17 +31,12 @@ logger = setup_logger()
 # 3. Segment into Sections for more accurate linking, can split by headers but make sure no text/ordering is lost


-def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str]:
+def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str]:
     """Sample
     https://danswer.atlassian.net/wiki/spaces/1234abcd/overview
-    wiki_base is danswer.atlassian.net/wiki
+    wiki_base is https://danswer.atlassian.net/wiki
     space is 1234abcd
     """
-    if ".atlassian.net/wiki/spaces/" not in wiki_url:
-        raise ValueError(
-            "Not a valid Confluence Wiki Link, unable to extract wiki base and space names"
-        )
-
     parsed_url = urlparse(wiki_url)
     wiki_base = (
         parsed_url.scheme
@@ -52,6 +48,42 @@ def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str]:
     return wiki_base, space


+def _extract_confluence_keys_from_datacenter_url(wiki_url: str) -> tuple[str, str]:
+    """Sample
+    https://danswer.ai/confluence/display/1234abcd/overview
+    wiki_base is https://danswer.ai/confluence
+    space is 1234abcd
+    """
+    # /display/ is always right before the space and at the end of the base url
+    DISPLAY = "/display/"
+
+    parsed_url = urlparse(wiki_url)
+    wiki_base = (
+        parsed_url.scheme
+        + "://"
+        + parsed_url.netloc
+        + parsed_url.path.split(DISPLAY)[0]
+    )
+    space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
+    return wiki_base, space
+
+
+def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, bool]:
+    is_confluence_cloud = ".atlassian.net/wiki/spaces/" in wiki_url
+
+    try:
+        if is_confluence_cloud:
+            wiki_base, space = _extract_confluence_keys_from_cloud_url(wiki_url)
+        else:
+            wiki_base, space = _extract_confluence_keys_from_datacenter_url(wiki_url)
+    except Exception as e:
+        error_msg = f"Not a valid Confluence Wiki Link, unable to extract wiki base and space names. Exception: {e}"
+        logger.error(error_msg)
+        raise ValueError(error_msg)
+
+    return wiki_base, space, is_confluence_cloud
+
+
 def _comment_dfs(
     comments_str: str,
     comment_pages: Collection[dict[str, Any]],
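For reference, a rough sketch of how the data center helper decomposes a URL, using the sample from its own docstring (the danswer.ai address is the docstring's example, not a real wiki):

    from urllib.parse import urlparse

    url = "https://danswer.ai/confluence/display/1234abcd/overview"
    DISPLAY = "/display/"

    parsed = urlparse(url)
    # Everything before "/display/" belongs to the wiki base
    wiki_base = parsed.scheme + "://" + parsed.netloc + parsed.path.split(DISPLAY)[0]
    # The first path segment after "/display/" is the space key
    space = DISPLAY.join(parsed.path.split(DISPLAY)[1:]).split("/")[0]

    print(wiki_base)  # https://danswer.ai/confluence
    print(space)      # 1234abcd
    # Per the cloud helper's docstring, the dispatcher would return
    # ("https://danswer.atlassian.net/wiki", "1234abcd", True) for a cloud URL.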
@@ -79,10 +111,17 @@ class ConfluenceConnector(LoadConnector, PollConnector):
         wiki_page_url: str,
         batch_size: int = INDEX_BATCH_SIZE,
         continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
+        # if a page has one of the labels specified in this list, we will just
+        # skip it. This is generally used to avoid indexing extra sensitive
+        # pages.
+        labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP,
     ) -> None:
         self.batch_size = batch_size
         self.continue_on_failure = continue_on_failure
-        self.wiki_base, self.space = extract_confluence_keys_from_url(wiki_page_url)
+        self.labels_to_skip = set(labels_to_skip)
+        self.wiki_base, self.space, self.is_cloud = extract_confluence_keys_from_url(
+            wiki_page_url
+        )
         self.confluence_client: Confluence | None = None

     def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
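Callers can now rely on the environment-derived default or pass an explicit list per instance. A usage sketch, assuming the module path of this connector file and placeholder values for the URL and labels:

    # Assumed import path; URL and labels are placeholders
    from danswer.connectors.confluence.connector import ConfluenceConnector

    connector = ConfluenceConnector(
        wiki_page_url="https://danswer.atlassian.net/wiki/spaces/1234abcd/overview",
        labels_to_skip=["secret", "do-not-index"],
    )
    # The ".atlassian.net/wiki/spaces/" URL marks a cloud deployment, so
    # connector.is_cloud is True and connector.labels_to_skip is the set
    # {"secret", "do-not-index"}.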
@@ -90,9 +129,10 @@ class ConfluenceConnector(LoadConnector, PollConnector):
         access_token = credentials["confluence_access_token"]
         self.confluence_client = Confluence(
             url=self.wiki_base,
-            username=username,
+            # passing in username causes issues for Confluence data center
+            username=username if self.is_cloud else None,
             password=access_token,
-            cloud=True,
+            cloud=self.is_cloud,
         )
         return None
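The practical effect is that one credentials dict yields two different client configurations. A sketch, continuing from the connector above and assuming a confluence_username key that the truncated line just before this hunk presumably reads:

    # Assumed key names; only confluence_access_token is visible in the hunk
    credentials = {
        "confluence_username": "user@example.com",
        "confluence_access_token": "api-token",
    }
    connector.load_credentials(credentials)
    # Cloud:       Confluence(url=wiki_base, username="user@example.com",
    #                         password="api-token", cloud=True)
    # Data center: Confluence(url=wiki_base, username=None,
    #                         password="api-token", cloud=False)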
@@ -185,6 +225,17 @@ class ConfluenceConnector(LoadConnector, PollConnector):
             )
             return ""

+    def _fetch_labels(self, confluence_client: Confluence, page_id: str) -> list[str]:
+        try:
+            labels_response = confluence_client.get_page_labels(page_id)
+            return [label["name"] for label in labels_response["results"]]
+        except Exception as e:
+            if not self.continue_on_failure:
+                raise e
+
+            logger.exception("Ran into exception when fetching labels from Confluence")
+            return []
+
     def _get_doc_batch(
         self, start_ind: int, time_filter: Callable[[datetime], bool] | None = None
     ) -> tuple[list[Document], int]:
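get_page_labels is the atlassian-python-api wrapper around Confluence's page labels endpoint; the parsing above assumes the payload's usual shape, roughly as follows (field values are illustrative, not from the source):

    # Assumed response shape, paraphrased from the Confluence REST API
    labels_response = {
        "results": [
            {"prefix": "global", "name": "secret", "id": "101"},
            {"prefix": "global", "name": "hr", "id": "102"},
        ],
        "size": 2,
    }
    label_names = [label["name"] for label in labels_response["results"]]
    print(label_names)  # ['secret', 'hr']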
@@ -200,6 +251,19 @@ class ConfluenceConnector(LoadConnector, PollConnector):
             last_modified = datetime.fromisoformat(last_modified_str)

             if time_filter is None or time_filter(last_modified):
+                page_id = page["id"]
+
+                # check disallowed labels
+                if self.labels_to_skip:
+                    page_labels = self._fetch_labels(self.confluence_client, page_id)
+                    label_intersection = self.labels_to_skip.intersection(page_labels)
+                    if label_intersection:
+                        logger.info(
+                            f"Page with ID '{page_id}' has a label which has been "
+                            f"designated as disallowed: {label_intersection}. Skipping."
+                        )
+                        continue
+
                 page_html = (
                     page["body"]
                     .get("storage", page["body"].get("view", {}))
@@ -212,7 +276,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
                 page_text = (
                     page.get("title", "") + "\n" + parse_html_page_basic(page_html)
                 )
-                comments_text = self._fetch_comments(self.confluence_client, page["id"])
+                comments_text = self._fetch_comments(self.confluence_client, page_id)
                 page_text += comments_text

                 doc_batch.append(
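The skip decision is a plain set intersection, so a single matching label is enough to drop a page:

    labels_to_skip = {"secret", "internal"}
    page_labels = ["hr", "secret"]  # labels fetched for some page

    label_intersection = labels_to_skip.intersection(page_labels)
    print(label_intersection)  # {'secret'} -> truthy, so the page is skipped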
@@ -74,8 +74,10 @@ services:
       - API_TYPE_OPENAI=${API_TYPE_OPENAI:-}
       - API_VERSION_OPENAI=${API_VERSION_OPENAI:-}
       - AZURE_DEPLOYMENT_ID=${AZURE_DEPLOYMENT_ID:-}
-      - CONTINUE_ON_CONNECTOR_FAILURE=${CONTINUE_ON_CONNECTOR_FAILURE:-}
       - NUM_INDEXING_WORKERS=${NUM_INDEXING_WORKERS:-}
+      # Connector Configs
+      - CONTINUE_ON_CONNECTOR_FAILURE=${CONTINUE_ON_CONNECTOR_FAILURE:-}
+      - CONFLUENCE_CONNECTOR_LABELS_TO_SKIP=${CONFLUENCE_CONNECTOR_LABELS_TO_SKIP:-}
       # Danswer SlackBot Configs
       - DANSWER_BOT_SLACK_APP_TOKEN=${DANSWER_BOT_SLACK_APP_TOKEN:-}
       - DANSWER_BOT_SLACK_BOT_TOKEN=${DANSWER_BOT_SLACK_BOT_TOKEN:-}
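With this wiring, the label list can be supplied from the shell or a .env file next to the compose file; for example (hypothetical values):

    # .env -- values are placeholders
    CONTINUE_ON_CONNECTOR_FAILURE=true
    CONFLUENCE_CONNECTOR_LABELS_TO_SKIP=secret,internal

Note the absence of spaces around the commas, matching the backend parsing, which does not strip whitespace.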
@@ -20,17 +20,37 @@ import { ConnectorsTable } from "@/components/admin/connectors/table/ConnectorsT
 import { usePopup } from "@/components/admin/connectors/Popup";
 import { usePublicCredentials } from "@/lib/hooks";

-// Copied from the `extract_confluence_keys_from_url` function
-const extractSpaceFromUrl = (wikiUrl: string): string | null => {
-  if (!wikiUrl.includes(".atlassian.net/wiki/spaces/")) {
-    return null;
-  }
-
+const extractSpaceFromCloudUrl = (wikiUrl: string): string => {
   const parsedUrl = new URL(wikiUrl);
   const space = parsedUrl.pathname.split("/")[3];
   return space;
 };

+const extractSpaceFromDataCenterUrl = (wikiUrl: string): string => {
+  const DISPLAY = "/display/";
+
+  const parsedUrl = new URL(wikiUrl);
+  const spaceSpecificSection = parsedUrl.pathname
+    .split(DISPLAY)
+    .slice(1)
+    .join(DISPLAY);
+  const space = spaceSpecificSection.split("/")[0];
+  return space;
+};
+
+// Copied from the `extract_confluence_keys_from_url` function
+const extractSpaceFromUrl = (wikiUrl: string): string | null => {
+  try {
+    if (wikiUrl.includes(".atlassian.net/wiki/spaces/")) {
+      return extractSpaceFromCloudUrl(wikiUrl);
+    }
+    return extractSpaceFromDataCenterUrl(wikiUrl);
+  } catch (e) {
+    console.log("Failed to extract space from url", e);
+    return null;
+  }
+};
+
 const Main = () => {
   const { popup, setPopup } = usePopup();
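A quick sketch of what the reworked frontend dispatcher returns for each URL style, assuming the helpers above are in scope (sample domains are taken from the backend docstrings):

    // Cloud: the space key is the fourth path segment ("/wiki/spaces/<space>/...")
    extractSpaceFromUrl("https://danswer.atlassian.net/wiki/spaces/1234abcd/overview");
    // -> "1234abcd"

    // Data center: the space key is the first segment after "/display/"
    extractSpaceFromUrl("https://danswer.ai/confluence/display/1234abcd/overview");
    // -> "1234abcd"

    // Anything the URL constructor rejects is caught and reported as null
    extractSpaceFromUrl("not-a-url");
    // -> null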