DAN-19 Confluence Connector Backend for Public Docs (#73)

By "public" we mean docs that are accessible to an admin account whose API key has been set up in Danswer. In other words, there is no OAuth support yet for individual users to add their own docs.
Yuhong Sun 2023-05-21 13:27:37 -07:00 committed by GitHub
parent 7559ba6e9d
commit 6d7e7d5b71
8 changed files with 151 additions and 3 deletions

backend/danswer/configs/app_configs.py

@@ -82,6 +82,11 @@ GOOGLE_DRIVE_INCLUDE_SHARED = False
GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN", "")
# example: username@companyemail.com
CONFLUENCE_USERNAME = os.environ.get("CONFLUENCE_USERNAME", "")
# https://id.atlassian.com/manage-profile/security/api-tokens
CONFLUENCE_ACCESS_TOKEN = os.environ.get("CONFLUENCE_ACCESS_TOKEN", "")
#####
# Query Configs
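
For local testing, both new settings can be exported before the backend starts. A minimal sketch (the username and token values here are hypothetical; real tokens come from the Atlassian URL above):

    import os

    # Hypothetical credentials for a local run; generate a real token at
    # https://id.atlassian.com/manage-profile/security/api-tokens
    os.environ["CONFLUENCE_USERNAME"] = "admin@companyemail.com"
    os.environ["CONFLUENCE_ACCESS_TOKEN"] = "my-api-token"

    # Set these before danswer.configs.app_configs is imported, since the
    # module reads os.environ.get(...) once at import time
    from danswer.configs.app_configs import CONFLUENCE_ACCESS_TOKEN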

backend/danswer/configs/constants.py

@@ -13,6 +13,7 @@ ALLOWED_USERS = "allowed_users"
ALLOWED_GROUPS = "allowed_groups"
NO_AUTH_USER = "FooBarUser" # TODO rework this temporary solution
OPENAI_API_KEY_STORAGE_KEY = "openai_api_key"
HTML_SEPARATOR = "\n"
class DocumentSource(str, Enum):
@@ -20,3 +21,4 @@ class DocumentSource(str, Enum):
WEB = "web"
GOOGLE_DRIVE = "google_drive"
GITHUB = "github"
CONFLUENCE = "confluence"
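The new HTML_SEPARATOR is passed straight to BeautifulSoup's get_text, so every text node lands on its own line. A quick standalone check (made-up HTML snippet):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<h1>Title</h1><p>First</p><p>Second</p>", "html.parser")
    # get_text joins all text nodes with the given separator
    assert soup.get_text("\n") == "Title\nFirst\nSecond"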

backend/danswer/connectors/confluence/connector.py

@@ -0,0 +1,119 @@
from collections.abc import Generator
from typing import Any
from urllib.parse import urlparse
from atlassian import Confluence # type:ignore
from bs4 import BeautifulSoup
from danswer.configs.app_configs import CONFLUENCE_ACCESS_TOKEN
from danswer.configs.app_configs import CONFLUENCE_USERNAME
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import HTML_SEPARATOR
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document
from danswer.connectors.models import Section

# Potential Improvements
# 1. If a wiki page is given instead of a space, search the page's children
#    instead of indexing everything in the space
# 2. Include attachments, etc.
# 3. Segment pages into Sections for more accurate linking; can split by
#    headers, but make sure no text/ordering is lost


def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str]:
    """Sample
    https://danswer.atlassian.net/wiki/spaces/1234abcd/overview
    wiki_base is https://danswer.atlassian.net/wiki
    space is 1234abcd
    """
    if ".atlassian.net/wiki/spaces/" not in wiki_url:
        raise ValueError(
            "Not a valid Confluence Wiki Link, unable to extract wiki base and space names"
        )

    parsed_url = urlparse(wiki_url)
    wiki_base = (
        parsed_url.scheme
        + "://"
        + parsed_url.netloc
        + parsed_url.path.split("/spaces")[0]
    )
    space = parsed_url.path.split("/")[3]
    return wiki_base, space


class ConfluenceConnector(LoadConnector):
    def __init__(
        self,
        wiki_page_url: str,
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        self.batch_size = batch_size
        self.wiki_base, self.space = extract_confluence_keys_from_url(wiki_page_url)
        self.confluence_client = Confluence(
            url=self.wiki_base,
            username=CONFLUENCE_USERNAME,
            password=CONFLUENCE_ACCESS_TOKEN,
            cloud=True,
        )

    def _comment_dfs(
        self, comments_str: str, comment_pages: Generator[dict[str, Any], None, None]
    ) -> str:
        # Depth-first walk: append each comment's text, then recurse into its
        # child comments so threads stay in reading order
        for comment_page in comment_pages:
            comment_html = comment_page["body"]["storage"]["value"]
            soup = BeautifulSoup(comment_html, "html.parser")
            comments_str += "\nComment:\n" + soup.get_text(HTML_SEPARATOR)
            child_comment_pages = self.confluence_client.get_page_child_by_type(
                comment_page["id"],
                type="comment",
                start=None,
                limit=None,
                expand="body.storage.value",
            )
            comments_str = self._comment_dfs(comments_str, child_comment_pages)
        return comments_str

    def load_from_state(self) -> Generator[list[Document], None, None]:
        start_ind = 0
        while True:
            doc_batch: list[Document] = []

            batch = self.confluence_client.get_all_pages_from_space(
                self.space,
                start=start_ind,
                limit=self.batch_size,
                expand="body.storage.value",
            )

            for page in batch:
                page_html = page["body"]["storage"]["value"]
                soup = BeautifulSoup(page_html, "html.parser")
                page_text = page.get("title", "") + "\n" + soup.get_text(HTML_SEPARATOR)

                comment_pages = self.confluence_client.get_page_child_by_type(
                    page["id"],
                    type="comment",
                    start=None,
                    limit=None,
                    expand="body.storage.value",
                )
                comments_text = self._comment_dfs("", comment_pages)
                page_text += comments_text

                page_url = self.wiki_base + page["_links"]["webui"]

                doc_batch.append(
                    Document(
                        id=page_url,
                        sections=[Section(link=page_url, text=page_text)],
                        source=DocumentSource.CONFLUENCE,
                        semantic_identifier=page["title"],
                        metadata={},
                    )
                )

            # Yield once per page of API results; the guard avoids emitting an
            # empty batch (or re-yielding the final one) once the space is
            # exhausted
            if doc_batch:
                yield doc_batch

            start_ind += len(batch)
            if len(batch) < self.batch_size:
                break
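
Not part of the diff, but a sketch of how the new connector can be exercised end to end (the wiki URL is made up; CONFLUENCE_USERNAME and CONFLUENCE_ACCESS_TOKEN must be set as above):

    from danswer.connectors.confluence.connector import (
        ConfluenceConnector,
        extract_confluence_keys_from_url,
    )

    # URL parsing: the wiki base and space key come straight out of the link
    base, space = extract_confluence_keys_from_url(
        "https://danswer.atlassian.net/wiki/spaces/1234abcd/overview"
    )
    assert base == "https://danswer.atlassian.net/wiki"
    assert space == "1234abcd"

    # Batched indexing: each yielded Document carries one Section whose
    # link points back to the source page
    connector = ConfluenceConnector("https://danswer.atlassian.net/wiki/spaces/1234abcd")
    for doc_batch in connector.load_from_state():
        for doc in doc_batch:
            print(doc.semantic_identifier, doc.sections[0].link)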

backend/danswer/connectors/factory.py

@@ -3,6 +3,7 @@ from collections.abc import Generator
from typing import Any
from danswer.configs.constants import DocumentSource
from danswer.connectors.confluence.connector import ConfluenceConnector
from danswer.connectors.github.connector import GithubConnector
from danswer.connectors.google_drive.connector import GoogleDriveConnector
from danswer.connectors.interfaces import BaseConnector
@@ -34,6 +35,8 @@ def build_connector(
        connector = GithubConnector(**connector_specific_config)
    elif source == DocumentSource.WEB:
        connector = WebConnector(**connector_specific_config)
    elif source == DocumentSource.CONFLUENCE:
        connector = ConfluenceConnector(**connector_specific_config)
    else:
        raise ConnectorMissingException(f"Connector not found for source={source}")
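
For reference, the connector_specific_config keys have to line up with the connector's __init__ kwargs, so a Confluence source would be built roughly like this (hypothetical call; only the argument shapes are taken from this diff):

    connector = build_connector(
        DocumentSource.CONFLUENCE,
        {"wiki_page_url": "https://danswer.atlassian.net/wiki/spaces/fakespace"},
    )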

backend/danswer/connectors/web/connector.py

@@ -9,6 +9,7 @@ import requests
from bs4 import BeautifulSoup
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import HTML_SEPARATOR
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document
from danswer.connectors.models import Section
@@ -17,7 +18,6 @@ from playwright.sync_api import sync_playwright
from PyPDF2 import PdfReader
logger = setup_logger()
TAG_SEPARATOR = "\n"
def is_valid_url(url: str) -> bool:
@@ -138,7 +138,7 @@ class WebConnector(LoadConnector):
                ]:
                    [tag.extract() for tag in soup.find_all(undesired_tag)]
                page_text = soup.get_text(TAG_SEPARATOR)
                page_text = soup.get_text(HTML_SEPARATOR)
                doc_batch.append(
                    Document(

backend/requirements/default.txt

@@ -1,5 +1,6 @@
alembic==1.10.4
asyncpg==0.27.0
atlassian-python-api==3.37.0
beautifulsoup4==4.12.0
fastapi==0.95.0
fastapi-users==11.0.0

View File

@@ -1,3 +1,4 @@
# This file is only for development purposes
import argparse
from itertools import chain
@@ -5,6 +6,7 @@ from danswer.chunking.chunk import Chunker
from danswer.chunking.chunk import DefaultChunker
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
from danswer.connectors.confluence.connector import ConfluenceConnector
from danswer.connectors.github.connector import GithubConnector
from danswer.connectors.google_drive.connector import GoogleDriveConnector
from danswer.connectors.google_drive.connector_auth import backend_get_credentials
@@ -91,11 +93,22 @@ def load_github_batch(owner: str, repo: str, qdrant_collection: str) -> None:
    )


def load_confluence_batch(confluence_wiki_url: str, qdrant_collection: str) -> None:
    logger.info("Loading documents from Confluence.")
    load_batch(
        ConfluenceConnector(confluence_wiki_url, batch_size=INDEX_BATCH_SIZE),
        DefaultChunker(),
        DefaultEmbedder(),
        QdrantDatastore(collection=qdrant_collection),
    )


class BatchLoadingArgs(argparse.Namespace):
    website_url: str
    github_owner: str
    github_repo: str
    slack_export_dir: str
    confluence_link: str
    qdrant_collection: str
    rebuild_index: bool
@@ -118,6 +131,10 @@ if __name__ == "__main__":
"--slack-export-dir",
default="~/Downloads/test-slack-export",
)
parser.add_argument(
"--confluence_link",
default="https://danswer.atlassian.net/wiki/spaces/fakespace",
)
parser.add_argument(
"--qdrant-collection",
default=QDRANT_DEFAULT_COLLECTION,
@@ -133,6 +150,7 @@ recreate_collection(args.qdrant_collection)
        recreate_collection(args.qdrant_collection)

    # load_slack_batch(args.slack_export_dir, args.qdrant_collection)
    load_web_batch(args.website_url, args.qdrant_collection)
    # load_web_batch(args.website_url, args.qdrant_collection)
    # load_google_drive_batch(args.qdrant_collection)
    # load_github_batch(args.github_owner, args.github_repo, args.qdrant_collection)
    load_confluence_batch(args.confluence_link, args.qdrant_collection)
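
With the dev script wired up, a Confluence-only run boils down to the following (same fake space URL as the new argparse default; collection name comes from the existing QDRANT_DEFAULT_COLLECTION):

    load_confluence_batch(
        "https://danswer.atlassian.net/wiki/spaces/fakespace",
        QDRANT_DEFAULT_COLLECTION,
    )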