Support Confluence Data Center + allow specifying labels to ignore (#624)

Chris Weaver 2023-10-24 17:40:42 -07:00 committed by GitHub
parent 17bd68be4c
commit ef2b445201
4 changed files with 114 additions and 18 deletions


@@ -109,9 +109,11 @@ POSTGRES_DB = os.environ.get("POSTGRES_DB") or "postgres"
#####
GOOGLE_DRIVE_INCLUDE_SHARED = False
GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False
FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
    "FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
)
# TODO these should be available for frontend configuration, via advanced options expandable
WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
    "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer"
@@ -128,6 +130,14 @@ NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP = (
== "true"
)
CONFLUENCE_CONNECTOR_LABELS_TO_SKIP = [
    ignored_tag
    for ignored_tag in os.environ.get("CONFLUENCE_CONNECTOR_LABELS_TO_SKIP", "").split(
        ","
    )
    if ignored_tag
]
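
This comprehension just splits the raw environment value on commas and drops empty entries. A minimal sketch of the resulting behavior (the label values are made up for illustration):

import os

os.environ["CONFLUENCE_CONNECTOR_LABELS_TO_SKIP"] = "secret,internal"  # hypothetical labels

labels = [
    ignored_tag
    for ignored_tag in os.environ.get("CONFLUENCE_CONNECTOR_LABELS_TO_SKIP", "").split(",")
    if ignored_tag
]
print(labels)  # ['secret', 'internal']; an unset or empty variable yields []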
#####
# Query Configs
#####


@@ -9,6 +9,7 @@ from urllib.parse import urlparse
from atlassian import Confluence # type:ignore
from requests import HTTPError
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
@@ -30,17 +31,12 @@ logger = setup_logger()
# 3. Segment into Sections for more accurate linking, can split by headers but make sure no text/ordering is lost
-def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str]:
+def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str]:
"""Sample
https://danswer.atlassian.net/wiki/spaces/1234abcd/overview
wiki_base is danswer.atlassian.net/wiki
wiki_base is https://danswer.atlassian.net/wiki
space is 1234abcd
"""
if ".atlassian.net/wiki/spaces/" not in wiki_url:
raise ValueError(
"Not a valid Confluence Wiki Link, unable to extract wiki base and space names"
)
    parsed_url = urlparse(wiki_url)
    wiki_base = (
        parsed_url.scheme
@@ -52,6 +48,42 @@ def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str]:
    return wiki_base, space

def _extract_confluence_keys_from_datacenter_url(wiki_url: str) -> tuple[str, str]:
    """Sample
    https://danswer.ai/confluence/display/1234abcd/overview
    wiki_base is https://danswer.ai/confluence
    space is 1234abcd
    """
    # /display/ is always right before the space and at the end of the base url
    DISPLAY = "/display/"

    parsed_url = urlparse(wiki_url)
    wiki_base = (
        parsed_url.scheme
        + "://"
        + parsed_url.netloc
        + parsed_url.path.split(DISPLAY)[0]
    )
    space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
    return wiki_base, space

def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, bool]:
    is_confluence_cloud = ".atlassian.net/wiki/spaces/" in wiki_url

    try:
        if is_confluence_cloud:
            wiki_base, space = _extract_confluence_keys_from_cloud_url(wiki_url)
        else:
            wiki_base, space = _extract_confluence_keys_from_datacenter_url(wiki_url)
    except Exception as e:
        error_msg = f"Not a valid Confluence Wiki Link, unable to extract wiki base and space names. Exception: {e}"
        logger.error(error_msg)
        raise ValueError(error_msg)

    return wiki_base, space, is_confluence_cloud
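
A quick usage sketch with the two URL shapes from the docstrings above (the module path is assumed; 1234abcd is the sample space key, not a real one):

from danswer.connectors.confluence.connector import extract_confluence_keys_from_url

# Cloud URLs are recognized by the ".atlassian.net/wiki/spaces/" substring
extract_confluence_keys_from_url(
    "https://danswer.atlassian.net/wiki/spaces/1234abcd/overview"
)  # -> ("https://danswer.atlassian.net/wiki", "1234abcd", True)

# Anything else is treated as a data center URL, keyed off "/display/"
extract_confluence_keys_from_url(
    "https://danswer.ai/confluence/display/1234abcd/overview"
)  # -> ("https://danswer.ai/confluence", "1234abcd", False)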

def _comment_dfs(
    comments_str: str,
    comment_pages: Collection[dict[str, Any]],
@@ -79,10 +111,17 @@ class ConfluenceConnector(LoadConnector, PollConnector):
        wiki_page_url: str,
        batch_size: int = INDEX_BATCH_SIZE,
        continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
        # if a page has one of the labels specified in this list, we will just
        # skip it. This is generally used to avoid indexing extra sensitive
        # pages.
        labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP,
    ) -> None:
        self.batch_size = batch_size
        self.continue_on_failure = continue_on_failure
-        self.wiki_base, self.space = extract_confluence_keys_from_url(wiki_page_url)
+        self.labels_to_skip = set(labels_to_skip)
+        self.wiki_base, self.space, self.is_cloud = extract_confluence_keys_from_url(
+            wiki_page_url
+        )
        self.confluence_client: Confluence | None = None

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
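
A minimal instantiation sketch showing the new labels_to_skip argument (the space URL and label names here are hypothetical):

connector = ConfluenceConnector(
    wiki_page_url="https://danswer.atlassian.net/wiki/spaces/ENG/overview",  # hypothetical space
    labels_to_skip=["secret", "internal-only"],  # hypothetical labels
)
# connector.is_cloud is True here; a data center URL such as
# "https://wiki.example.com/confluence/display/ENG/overview" would set it to False.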
@@ -90,9 +129,10 @@ class ConfluenceConnector(LoadConnector, PollConnector):
        access_token = credentials["confluence_access_token"]
        self.confluence_client = Confluence(
            url=self.wiki_base,
-            username=username,
+            # passing in username causes issues for Confluence data center
+            username=username if self.is_cloud else None,
            password=access_token,
-            cloud=True,
+            cloud=self.is_cloud,
        )
        return None
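
A sketch of how invoking this might look (only "confluence_access_token" is visible in this hunk; the username key name is an assumption for illustration):

connector.load_credentials(
    {
        "confluence_username": "user@example.com",  # assumed key name; unused for data center
        "confluence_access_token": "<api-or-personal-access-token>",
    }
)
# With is_cloud=False the client authenticates with the token alone, which is
# why the username is omitted for data center per the comment above.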
@@ -185,6 +225,17 @@ class ConfluenceConnector(LoadConnector, PollConnector):
            )
            return ""

    def _fetch_labels(self, confluence_client: Confluence, page_id: str) -> list[str]:
        try:
            labels_response = confluence_client.get_page_labels(page_id)
            return [label["name"] for label in labels_response["results"]]
        except Exception as e:
            if not self.continue_on_failure:
                raise e
            logger.exception("Ran into exception when fetching labels from Confluence")
        return []
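
The comprehension above assumes a response shaped like the Confluence REST labels payload; roughly (abbreviated, with illustrative values):

labels_response = {
    "results": [
        {"prefix": "global", "name": "secret", "id": "12345"},
        {"prefix": "global", "name": "internal", "id": "12346"},
    ],
    "start": 0,
    "limit": 200,
    "size": 2,
}
[label["name"] for label in labels_response["results"]]  # -> ["secret", "internal"]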

    def _get_doc_batch(
        self, start_ind: int, time_filter: Callable[[datetime], bool] | None = None
    ) -> tuple[list[Document], int]:
@@ -200,6 +251,19 @@ class ConfluenceConnector(LoadConnector, PollConnector):
            last_modified = datetime.fromisoformat(last_modified_str)

            if time_filter is None or time_filter(last_modified):
                page_id = page["id"]

                # check disallowed labels
                if self.labels_to_skip:
                    page_labels = self._fetch_labels(self.confluence_client, page_id)
                    label_intersection = self.labels_to_skip.intersection(page_labels)
                    if label_intersection:
                        logger.info(
                            f"Page with ID '{page_id}' has a label which has been "
                            f"designated as disallowed: {label_intersection}. Skipping."
                        )
                        continue

                page_html = (
                    page["body"]
                    .get("storage", page["body"].get("view", {}))
@@ -212,7 +276,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
                page_text = (
                    page.get("title", "") + "\n" + parse_html_page_basic(page_html)
                )
-                comments_text = self._fetch_comments(self.confluence_client, page["id"])
+                comments_text = self._fetch_comments(self.confluence_client, page_id)
                page_text += comments_text

                doc_batch.append(


@@ -74,8 +74,10 @@ services:
      - API_TYPE_OPENAI=${API_TYPE_OPENAI:-}
      - API_VERSION_OPENAI=${API_VERSION_OPENAI:-}
      - AZURE_DEPLOYMENT_ID=${AZURE_DEPLOYMENT_ID:-}
-     - CONTINUE_ON_CONNECTOR_FAILURE=${CONTINUE_ON_CONNECTOR_FAILURE:-}
      - NUM_INDEXING_WORKERS=${NUM_INDEXING_WORKERS:-}
+     # Connector Configs
+     - CONTINUE_ON_CONNECTOR_FAILURE=${CONTINUE_ON_CONNECTOR_FAILURE:-}
+     - CONFLUENCE_CONNECTOR_LABELS_TO_SKIP=${CONFLUENCE_CONNECTOR_LABELS_TO_SKIP:-}
      # Danswer SlackBot Configs
      - DANSWER_BOT_SLACK_APP_TOKEN=${DANSWER_BOT_SLACK_APP_TOKEN:-}
      - DANSWER_BOT_SLACK_BOT_TOKEN=${DANSWER_BOT_SLACK_BOT_TOKEN:-}
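
With the compose wiring above, the new option can then be set from the host environment or a .env file, e.g. (label names hypothetical):

CONFLUENCE_CONNECTOR_LABELS_TO_SKIP=secret,internal-only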


@@ -20,17 +20,37 @@ import { ConnectorsTable } from "@/components/admin/connectors/table/ConnectorsT
import { usePopup } from "@/components/admin/connectors/Popup";
import { usePublicCredentials } from "@/lib/hooks";
-// Copied from the `extract_confluence_keys_from_url` function
-const extractSpaceFromUrl = (wikiUrl: string): string | null => {
-  if (!wikiUrl.includes(".atlassian.net/wiki/spaces/")) {
-    return null;
-  }
+const extractSpaceFromCloudUrl = (wikiUrl: string): string => {
  const parsedUrl = new URL(wikiUrl);
  const space = parsedUrl.pathname.split("/")[3];
  return space;
};

const extractSpaceFromDataCenterUrl = (wikiUrl: string): string => {
  const DISPLAY = "/display/";

  const parsedUrl = new URL(wikiUrl);
  const spaceSpecificSection = parsedUrl.pathname
    .split(DISPLAY)
    .slice(1)
    .join(DISPLAY);
  const space = spaceSpecificSection.split("/")[0];
  return space;
};

// Copied from the `extract_confluence_keys_from_url` function
const extractSpaceFromUrl = (wikiUrl: string): string | null => {
  try {
    if (wikiUrl.includes(".atlassian.net/wiki/spaces/")) {
      return extractSpaceFromCloudUrl(wikiUrl);
    }
    return extractSpaceFromDataCenterUrl(wikiUrl);
  } catch (e) {
    console.log("Failed to extract space from url", e);
    return null;
  }
};

const Main = () => {
  const { popup, setPopup } = usePopup();