mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-10-10 21:26:01 +02:00
Added BookStack connector code
Got to the point of working sync for shelves, books, chapters and pages.
This commit is contained in:
52
backend/danswer/connectors/bookstack/client.py
Normal file
52
backend/danswer/connectors/bookstack/client.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
import requests
|
||||||
|
|
||||||
|
class BookStackClientRequestFailedError(ConnectionError):
    """Error describing a failed BookStack client request.

    The HTTP status and error text are embedded in the exception message and
    are also kept as attributes so callers can branch on them without parsing
    the message.

    Args:
        status: HTTP status code of the failed response.
        error: Human-readable description of the failure.
    """

    def __init__(self, status: int, error: str) -> None:
        super().__init__(
            f"BookStack Client request failed with status {status}: {error}"
        )
        # Keep the raw values available for programmatic handling.
        self.status = status
        self.error = error
|
||||||
|
|
||||||
|
class BookStackApiClient:
    """Small HTTP client for issuing GET requests against a BookStack instance.

    Requests are authenticated with an ``Authorization: Token <id>:<secret>``
    header and sent to ``<base_url>/api/<endpoint>``.

    Args:
        base_url: Root URL of the BookStack instance (trailing slash optional).
        token_id: API token id.
        token_secret: API token secret.
    """

    def __init__(
        self,
        base_url: str,
        token_id: str,
        token_secret: str,
    ) -> None:
        self.base_url = base_url
        self.token_id = token_id
        self.token_secret = token_secret

    def get(self, endpoint: str, params: dict[str, str]) -> dict:
        """Send a GET request to an API endpoint and return the decoded JSON body.

        Args:
            endpoint: API path relative to ``/api/``; a leading slash is optional.
            params: Query-string parameters to send with the request.

        Returns:
            The decoded JSON payload, or an empty dict when the body is not
            valid JSON.

        Raises:
            BookStackClientRequestFailedError: If the response status is 300 or
                higher. The message carries the API's ``error.message`` when
                present, otherwise the HTTP reason phrase.
        """
        url = self._build_url(endpoint)
        headers = self._build_headers()
        response = requests.get(url, headers=headers, params=params)

        # Only a decode failure should fall back to an empty payload; anything
        # else (e.g. KeyboardInterrupt) must propagate.
        try:
            payload = response.json()
        except ValueError:
            payload = {}

        if response.status_code >= 300:
            error = response.reason
            # Error bodies are not guaranteed to be ``{"error": {"message": ...}}``;
            # guard each level so a malformed body cannot mask the HTTP failure.
            error_info = payload.get("error") if isinstance(payload, dict) else None
            if isinstance(error_info, dict) and error_info.get("message"):
                error = error_info["message"]
            raise BookStackClientRequestFailedError(response.status_code, error)

        return payload

    def _build_headers(self) -> dict[str, str]:
        """Return the authentication and content-negotiation request headers."""
        return {
            "Authorization": f"Token {self.token_id}:{self.token_secret}",
            "Accept": "application/json",
        }

    def _build_url(self, endpoint: str) -> str:
        """Return the absolute API URL for ``endpoint`` (joined under ``/api/``)."""
        return self.base_url.rstrip("/") + "/api/" + endpoint.lstrip("/")

    def build_app_url(self, endpoint: str) -> str:
        """Return the absolute application (non-API) URL for ``endpoint``."""
        return self.base_url.rstrip("/") + "/" + endpoint.lstrip("/")
|
@@ -1,11 +1,9 @@
|
|||||||
|
import html
|
||||||
|
import time
|
||||||
from collections.abc import Callable
|
from collections.abc import Callable
|
||||||
from collections.abc import Generator
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from datetime import timezone
|
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from urllib.parse import urlparse
|
|
||||||
|
|
||||||
from atlassian import Confluence # type:ignore
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||||
from danswer.configs.constants import DocumentSource
|
from danswer.configs.constants import DocumentSource
|
||||||
@@ -14,105 +12,151 @@ from danswer.connectors.interfaces import GenerateDocumentsOutput
|
|||||||
from danswer.connectors.interfaces import LoadConnector
|
from danswer.connectors.interfaces import LoadConnector
|
||||||
from danswer.connectors.interfaces import PollConnector
|
from danswer.connectors.interfaces import PollConnector
|
||||||
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
|
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
|
||||||
|
from danswer.connectors.bookstack.client import BookStackApiClient
|
||||||
from danswer.connectors.models import Document
|
from danswer.connectors.models import Document
|
||||||
from danswer.connectors.models import Section
|
from danswer.connectors.models import Section
|
||||||
|
|
||||||
|
|
||||||
class BookstackClientNotSetUpError(PermissionError):
|
class BookstackClientNotSetUpError(PermissionError):
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
super().__init__(
|
super().__init__(
|
||||||
"Confluence Client is not set up, was load_credentials called?"
|
"BookStack Client is not set up, was load_credentials called?"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class BookstackConnector(LoadConnector, PollConnector):
|
class BookstackConnector(LoadConnector, PollConnector):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
batch_size: int = INDEX_BATCH_SIZE,
|
batch_size: int = INDEX_BATCH_SIZE,
|
||||||
) -> None:
|
) -> None:
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
|
self.bookstack_client: BookStackApiClient | None = None
|
||||||
|
|
||||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||||
base_url = credentials["bookstack_base_url"]
|
self.bookstack_client = BookStackApiClient(
|
||||||
api_token_id = credentials["bookstack_api_token_id"]
|
base_url=credentials["bookstack_base_url"],
|
||||||
api_token_secret = credentials["bookstack_api_token_secret"]
|
token_id=credentials["bookstack_api_token_id"],
|
||||||
|
token_secret=credentials["bookstack_api_token_secret"],
|
||||||
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _get_doc_batch(
|
def _get_doc_batch(
|
||||||
self, start_ind: int, time_filter: Callable[[datetime], bool] | None = None
|
self,
|
||||||
|
endpoint: str,
|
||||||
|
transformer: Callable[[dict], Document],
|
||||||
|
start_ind: int,
|
||||||
|
start: SecondsSinceUnixEpoch | None = None,
|
||||||
|
end: SecondsSinceUnixEpoch | None = None,
|
||||||
) -> tuple[list[Document], int]:
|
) -> tuple[list[Document], int]:
|
||||||
doc_batch: list[Document] = []
|
doc_batch: list[Document] = []
|
||||||
|
|
||||||
if self.confluence_client is None:
|
params = {
|
||||||
raise BookstackClientNotSetUpError()
|
"count": str(self.batch_size),
|
||||||
|
"offset": str(start_ind),
|
||||||
|
"sort": "+id"
|
||||||
|
}
|
||||||
|
|
||||||
batch = self.confluence_client.get_all_pages_from_space(
|
if start:
|
||||||
self.space,
|
params["filter[updated_at:gte]"] = datetime.utcfromtimestamp(start).strftime('%Y-%m-%d %H:%M:%S')
|
||||||
start=start_ind,
|
|
||||||
limit=self.batch_size,
|
|
||||||
expand="body.storage.value,version",
|
|
||||||
)
|
|
||||||
|
|
||||||
for page in batch:
|
if end:
|
||||||
last_modified_str = page["version"]["when"]
|
params["filter[updated_at:lte]"] = datetime.utcfromtimestamp(end).strftime('%Y-%m-%d %H:%M:%S')
|
||||||
last_modified = datetime.fromisoformat(last_modified_str)
|
|
||||||
|
|
||||||
if time_filter is None or time_filter(last_modified):
|
batch = self.bookstack_client.get(endpoint, params=params).get("data", [])
|
||||||
page_html = page["body"]["storage"]["value"]
|
for item in batch:
|
||||||
soup = BeautifulSoup(page_html, "html.parser")
|
doc_batch.append(transformer(item))
|
||||||
page_text = page.get("title", "") + "\n" + soup.get_text(HTML_SEPARATOR)
|
|
||||||
comment_pages = self.confluence_client.get_page_child_by_type(
|
|
||||||
page["id"],
|
|
||||||
type="comment",
|
|
||||||
start=None,
|
|
||||||
limit=None,
|
|
||||||
expand="body.storage.value",
|
|
||||||
)
|
|
||||||
comments_text = _comment_dfs("", comment_pages, self.confluence_client)
|
|
||||||
page_text += comments_text
|
|
||||||
|
|
||||||
page_url = self.wiki_base + page["_links"]["webui"]
|
|
||||||
|
|
||||||
doc_batch.append(
|
|
||||||
Document(
|
|
||||||
id=page_url,
|
|
||||||
sections=[Section(link=page_url, text=page_text)],
|
|
||||||
source=DocumentSource.CONFLUENCE,
|
|
||||||
semantic_identifier=page["title"],
|
|
||||||
metadata={},
|
|
||||||
)
|
|
||||||
)
|
|
||||||
return doc_batch, len(batch)
|
return doc_batch, len(batch)
|
||||||
|
|
||||||
|
def _book_to_document(self, book: dict):
    """Convert a BookStack book record into a ``Document``.

    The book's application URL (built from its ``slug``) serves as both the
    document id and the section link. The section text is the book name
    followed by its description on a new line.
    """
    book_url = self.bookstack_client.build_app_url("/books/" + book.get("slug"))
    name_part = book.get("name", "")
    body = name_part + "\n" + book.get("description", "")
    section = Section(link=book_url, text=body)
    label = "Book: " + book.get("name")
    details = {
        "type": "book",
        "updated_at": book.get("updated_at")
    }
    return Document(
        id=book_url,
        sections=[section],
        source=DocumentSource.BOOKSTACK,
        semantic_identifier=label,
        metadata=details,
    )
|
||||||
|
|
||||||
|
def _chapter_to_document(self, chapter: dict):
    """Convert a BookStack chapter record into a ``Document``.

    The chapter URL is built from the parent book's ``book_slug`` and the
    chapter's own ``slug``; it serves as both the document id and the section
    link. The section text is the chapter name followed by its description.
    """
    chapter_path = "/books/" + chapter.get("book_slug") + "/chapter/" + chapter.get("slug")
    chapter_url = self.bookstack_client.build_app_url(chapter_path)
    name_part = chapter.get("name", "")
    body = name_part + "\n" + chapter.get("description", "")
    section = Section(link=chapter_url, text=body)
    label = "Chapter: " + chapter.get("name")
    details = {
        "type": "chapter",
        "updated_at": chapter.get("updated_at")
    }
    return Document(
        id=chapter_url,
        sections=[section],
        source=DocumentSource.BOOKSTACK,
        semantic_identifier=label,
        metadata=details,
    )
|
||||||
|
|
||||||
|
def _shelf_to_document(self, shelf: dict):
    """Convert a BookStack shelf record into a ``Document``.

    The shelf's application URL (built from its ``slug``) serves as both the
    document id and the section link. The section text is the shelf name
    followed by its description on a new line.
    """
    shelf_url = self.bookstack_client.build_app_url("/shelves/" + shelf.get("slug"))
    name_part = shelf.get("name", "")
    body = name_part + "\n" + shelf.get("description", "")
    section = Section(link=shelf_url, text=body)
    label = "Shelf: " + shelf.get("name")
    details = {
        "type": "shelf",
        "updated_at": shelf.get("updated_at")
    }
    return Document(
        id=shelf_url,
        sections=[section],
        source=DocumentSource.BOOKSTACK,
        semantic_identifier=label,
        metadata=details,
    )
|
||||||
|
|
||||||
|
def _page_to_document(self, page: dict):
    """Convert a BookStack page listing entry into a ``Document``.

    The individual page record is fetched by id; its ``name``, ``slug``,
    ``html`` and ``updated_at`` fields are used below. The page name is
    prepended to the HTML as an ``<h1>`` heading before the markup is reduced
    to plain text.
    """
    client = self.bookstack_client
    detail = client.get("/pages/" + str(page.get("id")), {})
    page_path = "/books/" + page.get("book_slug") + "/page/" + detail.get("slug")
    page_url = client.build_app_url(page_path)

    # Escape the name so it is treated as text, not markup, inside the heading.
    heading = "<h1>" + html.escape(detail.get("name")) + "</h1>"
    markup = heading + detail.get("html")
    parsed = BeautifulSoup(markup, "html.parser")
    body = parsed.get_text(HTML_SEPARATOR)

    # NOTE(review): short pause after each page fetch — presumably to throttle
    # requests against the BookStack API; confirm the intended rate limit.
    time.sleep(0.1)

    label = "Page: " + detail.get("name")
    details = {
        "type": "page",
        "updated_at": detail.get("updated_at")
    }
    return Document(
        id=page_url,
        sections=[Section(link=page_url, text=body)],
        source=DocumentSource.BOOKSTACK,
        semantic_identifier=label,
        metadata=details,
    )
|
||||||
|
|
||||||
def load_from_state(self) -> GenerateDocumentsOutput:
|
def load_from_state(self) -> GenerateDocumentsOutput:
|
||||||
if self.confluence_client is None:
|
if self.bookstack_client is None:
|
||||||
raise BookstackClientNotSetUpError()
|
raise BookstackClientNotSetUpError()
|
||||||
|
|
||||||
start_ind = 0
|
return self.poll_source(None, None)
|
||||||
while True:
|
|
||||||
doc_batch, num_pages = self._get_doc_batch(start_ind)
|
|
||||||
start_ind += num_pages
|
|
||||||
if doc_batch:
|
|
||||||
yield doc_batch
|
|
||||||
|
|
||||||
if num_pages < self.batch_size:
|
|
||||||
break
|
|
||||||
|
|
||||||
def poll_source(
|
def poll_source(
|
||||||
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
|
self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
|
||||||
) -> GenerateDocumentsOutput:
|
) -> GenerateDocumentsOutput:
|
||||||
if self.confluence_client is None:
|
if self.bookstack_client is None:
|
||||||
raise BookstackClientNotSetUpError()
|
raise BookstackClientNotSetUpError()
|
||||||
|
|
||||||
start_time = datetime.fromtimestamp(start, tz=timezone.utc)
|
transform_by_endpoint: dict[str, Callable[[dict], Document]] = {
|
||||||
end_time = datetime.fromtimestamp(end, tz=timezone.utc)
|
"/books": self._book_to_document,
|
||||||
|
"/chapters": self._chapter_to_document,
|
||||||
|
"/shelves": self._shelf_to_document,
|
||||||
|
"/pages": self._page_to_document,
|
||||||
|
}
|
||||||
|
|
||||||
|
for endpoint, transform in transform_by_endpoint.items():
|
||||||
start_ind = 0
|
start_ind = 0
|
||||||
while True:
|
while True:
|
||||||
doc_batch, num_pages = self._get_doc_batch(
|
doc_batch, num_results = self._get_doc_batch(endpoint, transform, start_ind, start, end)
|
||||||
start_ind, time_filter=lambda t: start_time <= t <= end_time
|
start_ind += num_results
|
||||||
)
|
|
||||||
start_ind += num_pages
|
|
||||||
if doc_batch:
|
if doc_batch:
|
||||||
yield doc_batch
|
yield doc_batch
|
||||||
|
|
||||||
if num_pages < self.batch_size:
|
if num_results < self.batch_size:
|
||||||
break
|
break
|
||||||
|
else:
|
||||||
|
time.sleep(0.2)
|
||||||
|
@@ -9,7 +9,7 @@ import {
|
|||||||
BookstackCredentialJson,
|
BookstackCredentialJson,
|
||||||
BookstackConfig,
|
BookstackConfig,
|
||||||
Credential,
|
Credential,
|
||||||
ConnectorIndexingStatus, ConfluenceConfig,
|
ConnectorIndexingStatus,
|
||||||
} from "@/lib/types";
|
} from "@/lib/types";
|
||||||
import useSWR, { useSWRConfig } from "swr";
|
import useSWR, { useSWRConfig } from "swr";
|
||||||
import { fetcher } from "@/lib/fetcher";
|
import { fetcher } from "@/lib/fetcher";
|
||||||
|
Reference in New Issue
Block a user