DAN-19 Confluence Connector Backend for Public Docs (#73)
By "public" we mean an admin account with an API key has been set up in Danswer; there is no support yet for OAuth, so individual users cannot add their own docs.
Parent: 7559ba6e9d
Commit: 6d7e7d5b71
@@ -82,6 +82,11 @@ GOOGLE_DRIVE_INCLUDE_SHARED = False
 GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN", "")
 
+# example: username@companyemail.com
+CONFLUENCE_USERNAME = os.environ.get("CONFLUENCE_USERNAME", "")
+# https://id.atlassian.com/manage-profile/security/api-tokens
+CONFLUENCE_ACCESS_TOKEN = os.environ.get("CONFLUENCE_ACCESS_TOKEN", "")
+
 
 
 #####
 # Query Configs
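These values are read from the environment at import time, so they must be set before the Danswer process starts. A minimal sanity-check sketch; the username and token values below are placeholders, not real credentials:

import os

# Illustrative placeholders only; in practice, export these in the shell
# or container environment before Danswer starts.
os.environ["CONFLUENCE_USERNAME"] = "someone@companyemail.com"
os.environ["CONFLUENCE_ACCESS_TOKEN"] = "api-token-from-atlassian"

# app_configs reads os.environ at import time, so the import must come after
# the variables are set (or they must already be in the real environment).
from danswer.configs.app_configs import CONFLUENCE_ACCESS_TOKEN
from danswer.configs.app_configs import CONFLUENCE_USERNAME

assert CONFLUENCE_USERNAME and CONFLUENCE_ACCESS_TOKEN, "Confluence credentials missing"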
@@ -13,6 +13,7 @@ ALLOWED_USERS = "allowed_users"
 ALLOWED_GROUPS = "allowed_groups"
 NO_AUTH_USER = "FooBarUser"  # TODO rework this temporary solution
 OPENAI_API_KEY_STORAGE_KEY = "openai_api_key"
+HTML_SEPARATOR = "\n"
 
 
 class DocumentSource(str, Enum):
@@ -20,3 +21,4 @@ class DocumentSource(str, Enum):
     WEB = "web"
     GOOGLE_DRIVE = "google_drive"
     GITHUB = "github"
+    CONFLUENCE = "confluence"
backend/danswer/connectors/confluence/__init__.py (new file, 0 lines)
backend/danswer/connectors/confluence/connector.py (new file, 119 lines)
@@ -0,0 +1,119 @@
from collections.abc import Generator
from typing import Any
from urllib.parse import urlparse

from atlassian import Confluence  # type:ignore
from bs4 import BeautifulSoup
from danswer.configs.app_configs import CONFLUENCE_ACCESS_TOKEN
from danswer.configs.app_configs import CONFLUENCE_USERNAME
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import HTML_SEPARATOR
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document
from danswer.connectors.models import Section

# Potential Improvements
# 1. If a wiki page is given instead of a space, search that page's children instead of indexing the whole space
# 2. Include attachments, etc.
# 3. Segment pages into Sections for more accurate linking; could split by headers, but make sure no text/ordering is lost


def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str]:
    """Sample
    https://danswer.atlassian.net/wiki/spaces/1234abcd/overview
    wiki_base is danswer.atlassian.net/wiki
    space is 1234abcd
    """
    if ".atlassian.net/wiki/spaces/" not in wiki_url:
        raise ValueError(
            "Not a valid Confluence Wiki Link, unable to extract wiki base and space names"
        )

    parsed_url = urlparse(wiki_url)
    wiki_base = (
        parsed_url.scheme
        + "://"
        + parsed_url.netloc
        + parsed_url.path.split("/spaces")[0]
    )
    space = parsed_url.path.split("/")[3]
    return wiki_base, space


class ConfluenceConnector(LoadConnector):
    def __init__(
        self,
        wiki_page_url: str,
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        self.batch_size = batch_size
        self.wiki_base, self.space = extract_confluence_keys_from_url(wiki_page_url)
        self.confluence_client = Confluence(
            url=self.wiki_base,
            username=CONFLUENCE_USERNAME,
            password=CONFLUENCE_ACCESS_TOKEN,
            cloud=True,
        )

    def _comment_dfs(
        self, comments_str: str, comment_pages: Generator[dict[str, Any], None, None]
    ) -> str:
        """Depth-first walk of the comment tree, appending each comment's text."""
        for comment_page in comment_pages:
            comment_html = comment_page["body"]["storage"]["value"]
            soup = BeautifulSoup(comment_html, "html.parser")
            comments_str += "\nComment:\n" + soup.get_text(HTML_SEPARATOR)
            child_comment_pages = self.confluence_client.get_page_child_by_type(
                comment_page["id"],
                type="comment",
                start=None,
                limit=None,
                expand="body.storage.value",
            )
            comments_str = self._comment_dfs(comments_str, child_comment_pages)
        return comments_str

    def load_from_state(self) -> Generator[list[Document], None, None]:
        start_ind = 0
        while True:
            doc_batch: list[Document] = []

            batch = self.confluence_client.get_all_pages_from_space(
                self.space,
                start=start_ind,
                limit=self.batch_size,
                expand="body.storage.value",
            )

            for page in batch:
                page_html = page["body"]["storage"]["value"]
                soup = BeautifulSoup(page_html, "html.parser")
                page_text = page.get("title", "") + "\n" + soup.get_text(HTML_SEPARATOR)
                comment_pages = self.confluence_client.get_page_child_by_type(
                    page["id"],
                    type="comment",
                    start=None,
                    limit=None,
                    expand="body.storage.value",
                )
                comments_text = self._comment_dfs("", comment_pages)
                page_text += comments_text

                page_url = self.wiki_base + page["_links"]["webui"]

                doc_batch.append(
                    Document(
                        id=page_url,
                        sections=[Section(link=page_url, text=page_text)],
                        source=DocumentSource.CONFLUENCE,
                        semantic_identifier=page["title"],
                        metadata={},
                    )
                )

            # Yield only non-empty batches, each exactly once; a short batch
            # from the API means the space has been fully paged through.
            if doc_batch:
                yield doc_batch

            start_ind += len(batch)
            if len(batch) < self.batch_size:
                break
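A short usage sketch of the new connector, assuming the Confluence env vars above are set; the space URL is the docstring's sample and is hypothetical:

from danswer.connectors.confluence.connector import ConfluenceConnector
from danswer.connectors.confluence.connector import extract_confluence_keys_from_url

# URL parsing matches the docstring's example
wiki_base, space = extract_confluence_keys_from_url(
    "https://danswer.atlassian.net/wiki/spaces/1234abcd/overview"
)
assert wiki_base == "https://danswer.atlassian.net/wiki"
assert space == "1234abcd"

# load_from_state pages through the space in batches; each Document carries
# the page text plus the recursively gathered comment thread.
connector = ConfluenceConnector(
    "https://danswer.atlassian.net/wiki/spaces/1234abcd/overview"
)
for doc_batch in connector.load_from_state():
    for doc in doc_batch:
        print(doc.semantic_identifier)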
@@ -3,6 +3,7 @@ from collections.abc import Generator
 from typing import Any
 
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.confluence.connector import ConfluenceConnector
 from danswer.connectors.github.connector import GithubConnector
 from danswer.connectors.google_drive.connector import GoogleDriveConnector
 from danswer.connectors.interfaces import BaseConnector
@@ -34,6 +35,8 @@ def build_connector(
         connector = GithubConnector(**connector_specific_config)
     elif source == DocumentSource.WEB:
         connector = WebConnector(**connector_specific_config)
+    elif source == DocumentSource.CONFLUENCE:
+        connector = ConfluenceConnector(**connector_specific_config)
     else:
         raise ConnectorMissingException(f"Connector not found for source={source}")
 
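A hedged sketch of routing a Confluence config through the factory; build_connector's full signature isn't visible in this diff, so the call below assumes only the two parameters the hunk shows:

from danswer.configs.constants import DocumentSource
from danswer.connectors.factory import build_connector

# connector_specific_config is unpacked into the connector's __init__,
# so for Confluence it must carry wiki_page_url.
connector = build_connector(
    source=DocumentSource.CONFLUENCE,
    connector_specific_config={
        "wiki_page_url": "https://danswer.atlassian.net/wiki/spaces/1234abcd/overview"
    },
)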
@@ -9,6 +9,7 @@ import requests
 from bs4 import BeautifulSoup
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.configs.constants import HTML_SEPARATOR
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
@@ -17,7 +18,6 @@ from playwright.sync_api import sync_playwright
 from PyPDF2 import PdfReader
 
 logger = setup_logger()
-TAG_SEPARATOR = "\n"
 
 
 def is_valid_url(url: str) -> bool:
@@ -138,7 +138,7 @@ class WebConnector(LoadConnector):
             ]:
                 [tag.extract() for tag in soup.find_all(undesired_tag)]
 
-            page_text = soup.get_text(TAG_SEPARATOR)
+            page_text = soup.get_text(HTML_SEPARATOR)
 
             doc_batch.append(
                 Document(
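The web connector now shares HTML_SEPARATOR ("\n") instead of keeping its own TAG_SEPARATOR. A tiny standalone illustration of what the separator argument to BeautifulSoup's get_text does:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<h1>Title</h1><p>First</p><p>Second</p>", "html.parser")

# Without a separator, adjacent elements' text runs together.
print(soup.get_text())      # TitleFirstSecond

# With "\n" (HTML_SEPARATOR), each text fragment lands on its own line.
print(soup.get_text("\n"))  # Title / First / Second on separate lines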
@@ -1,5 +1,6 @@
 alembic==1.10.4
 asyncpg==0.27.0
+atlassian-python-api==3.37.0
 beautifulsoup4==4.12.0
 fastapi==0.95.0
 fastapi-users==11.0.0
@@ -1,3 +1,4 @@
 # This file is only for development purposes
 import argparse
 from itertools import chain
 
@@ -5,6 +6,7 @@ from danswer.chunking.chunk import Chunker
 from danswer.chunking.chunk import DefaultChunker
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
+from danswer.connectors.confluence.connector import ConfluenceConnector
 from danswer.connectors.github.connector import GithubConnector
 from danswer.connectors.google_drive.connector import GoogleDriveConnector
 from danswer.connectors.google_drive.connector_auth import backend_get_credentials
@@ -91,11 +93,22 @@ def load_github_batch(owner: str, repo: str, qdrant_collection: str) -> None:
     )
 
 
+def load_confluence_batch(confluence_wiki_url: str, qdrant_collection: str) -> None:
+    logger.info("Loading documents from Confluence.")
+    load_batch(
+        ConfluenceConnector(confluence_wiki_url, batch_size=INDEX_BATCH_SIZE),
+        DefaultChunker(),
+        DefaultEmbedder(),
+        QdrantDatastore(collection=qdrant_collection),
+    )
+
+
 class BatchLoadingArgs(argparse.Namespace):
     website_url: str
     github_owner: str
     github_repo: str
     slack_export_dir: str
+    confluence_link: str
     qdrant_collection: str
     rebuild_index: bool
 
@@ -118,6 +131,10 @@ if __name__ == "__main__":
         "--slack-export-dir",
         default="~/Downloads/test-slack-export",
     )
+    parser.add_argument(
+        "--confluence_link",
+        default="https://danswer.atlassian.net/wiki/spaces/fakespace",
+    )
     parser.add_argument(
         "--qdrant-collection",
         default=QDRANT_DEFAULT_COLLECTION,
@@ -133,6 +150,7 @@ if __name__ == "__main__":
     recreate_collection(args.qdrant_collection)
 
     # load_slack_batch(args.slack_export_dir, args.qdrant_collection)
-    load_web_batch(args.website_url, args.qdrant_collection)
+    # load_web_batch(args.website_url, args.qdrant_collection)
     # load_google_drive_batch(args.qdrant_collection)
     # load_github_batch(args.github_owner, args.github_repo, args.qdrant_collection)
+    load_confluence_batch(args.confluence_link, args.qdrant_collection)
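With these changes the development loading script can index a space directly: pass --confluence_link with a real space URL (the default points at a placeholder fakespace) together with the usual --qdrant-collection.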