From 44f905ef80a6b254f050a0d473af5fab6f5160f1 Mon Sep 17 00:00:00 2001 From: Dan Brown Date: Thu, 6 Jul 2023 14:56:28 +0100 Subject: [PATCH] Added BookStack connector code Got to the point of working sync for shelves, books, chapters and pages. --- .../danswer/connectors/bookstack/client.py | 52 +++++ .../danswer/connectors/bookstack/connector.py | 180 +++++++++++------- .../app/admin/connectors/bookstack/page.tsx | 2 +- 3 files changed, 165 insertions(+), 69 deletions(-) create mode 100644 backend/danswer/connectors/bookstack/client.py diff --git a/backend/danswer/connectors/bookstack/client.py b/backend/danswer/connectors/bookstack/client.py new file mode 100644 index 000000000000..7cc38427f5c8 --- /dev/null +++ b/backend/danswer/connectors/bookstack/client.py @@ -0,0 +1,52 @@ +import requests + +class BookStackClientRequestFailedError(ConnectionError): + def __init__(self, status: int, error: str) -> None: + super().__init__( + "BookStack Client request failed with status {status}: {error}".format(status=status, error=error) + ) + +class BookStackApiClient: + + def __init__( + self, + base_url: str, + token_id: str, + token_secret: str, + ) -> None: + self.base_url = base_url + self.token_id = token_id + self.token_secret = token_secret + + def get(self, endpoint: str, params: dict[str, str]): + url: str = self._build_url(endpoint) + headers = self._build_headers() + response = requests.get(url, headers=headers, params=params) + + try: + json = response.json() + except: + json = {} + pass + + if response.status_code >= 300: + error = response.reason + response_error = json.get("error", {}).get("message", "") + if response_error: + error = response_error + raise BookStackClientRequestFailedError(response.status_code, error) + + return json + + def _build_headers(self): + auth = 'Token ' + self.token_id + ':' + self.token_secret + return { + 'Authorization': auth, + 'Accept': 'application/json', + } + + def _build_url(self, endpoint: str): + return self.base_url.rstrip('/') + '/api/' + endpoint.lstrip('/') + + def build_app_url(self, endpoint: str): + return self.base_url.rstrip('/') + '/' + endpoint.lstrip('/') diff --git a/backend/danswer/connectors/bookstack/connector.py b/backend/danswer/connectors/bookstack/connector.py index 9a6dee65f1f4..bab007875010 100644 --- a/backend/danswer/connectors/bookstack/connector.py +++ b/backend/danswer/connectors/bookstack/connector.py @@ -1,11 +1,9 @@ +import html +import time from collections.abc import Callable -from collections.abc import Generator from datetime import datetime -from datetime import timezone from typing import Any -from urllib.parse import urlparse -from atlassian import Confluence # type:ignore from bs4 import BeautifulSoup from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource @@ -14,105 +12,151 @@ from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch +from danswer.connectors.bookstack.client import BookStackApiClient from danswer.connectors.models import Document from danswer.connectors.models import Section + class BookstackClientNotSetUpError(PermissionError): def __init__(self) -> None: super().__init__( - "Confluence Client is not set up, was load_credentials called?" + "BookStack Client is not set up, was load_credentials called?" ) + class BookstackConnector(LoadConnector, PollConnector): def __init__( self, batch_size: int = INDEX_BATCH_SIZE, ) -> None: self.batch_size = batch_size + self.bookstack_client: BookStackApiClient | None = None def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: - base_url = credentials["bookstack_base_url"] - api_token_id = credentials["bookstack_api_token_id"] - api_token_secret = credentials["bookstack_api_token_secret"] + self.bookstack_client = BookStackApiClient( + base_url=credentials["bookstack_base_url"], + token_id=credentials["bookstack_api_token_id"], + token_secret=credentials["bookstack_api_token_secret"], + ) return None def _get_doc_batch( - self, start_ind: int, time_filter: Callable[[datetime], bool] | None = None + self, + endpoint: str, + transformer: Callable[[dict], Document], + start_ind: int, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, ) -> tuple[list[Document], int]: doc_batch: list[Document] = [] - if self.confluence_client is None: - raise BookstackClientNotSetUpError() + params = { + "count": str(self.batch_size), + "offset": str(start_ind), + "sort": "+id" + } - batch = self.confluence_client.get_all_pages_from_space( - self.space, - start=start_ind, - limit=self.batch_size, - expand="body.storage.value,version", - ) + if start: + params["filter[updated_at:gte]"] = datetime.utcfromtimestamp(start).strftime('%Y-%m-%d %H:%M:%S') - for page in batch: - last_modified_str = page["version"]["when"] - last_modified = datetime.fromisoformat(last_modified_str) + if end: + params["filter[updated_at:lte]"] = datetime.utcfromtimestamp(end).strftime('%Y-%m-%d %H:%M:%S') - if time_filter is None or time_filter(last_modified): - page_html = page["body"]["storage"]["value"] - soup = BeautifulSoup(page_html, "html.parser") - page_text = page.get("title", "") + "\n" + soup.get_text(HTML_SEPARATOR) - comment_pages = self.confluence_client.get_page_child_by_type( - page["id"], - type="comment", - start=None, - limit=None, - expand="body.storage.value", - ) - comments_text = _comment_dfs("", comment_pages, self.confluence_client) - page_text += comments_text + batch = self.bookstack_client.get(endpoint, params=params).get("data", []) + for item in batch: + doc_batch.append(transformer(item)) - page_url = self.wiki_base + page["_links"]["webui"] - - doc_batch.append( - Document( - id=page_url, - sections=[Section(link=page_url, text=page_text)], - source=DocumentSource.CONFLUENCE, - semantic_identifier=page["title"], - metadata={}, - ) - ) return doc_batch, len(batch) + def _book_to_document(self, book: dict): + url = self.bookstack_client.build_app_url("/books/" + book.get("slug")) + text = book.get("name", "") + "\n" + book.get("description", "") + return Document( + id=url, + sections=[Section(link=url, text=text)], + source=DocumentSource.BOOKSTACK, + semantic_identifier="Book: " + book.get("name"), + metadata={ + "type": "book", + "updated_at": book.get("updated_at") + }, + ) + + def _chapter_to_document(self, chapter: dict): + url = self.bookstack_client.build_app_url("/books/" + chapter.get("book_slug") + "/chapter/" + chapter.get("slug")) + text = chapter.get("name", "") + "\n" + chapter.get("description", "") + return Document( + id=url, + sections=[Section(link=url, text=text)], + source=DocumentSource.BOOKSTACK, + semantic_identifier="Chapter: " + chapter.get("name"), + metadata={ + "type": "chapter", + "updated_at": chapter.get("updated_at") + }, + ) + + def _shelf_to_document(self, shelf: dict): + url = self.bookstack_client.build_app_url("/shelves/" + shelf.get("slug")) + text = shelf.get("name", "") + "\n" + shelf.get("description", "") + return Document( + id=url, + sections=[Section(link=url, text=text)], + source=DocumentSource.BOOKSTACK, + semantic_identifier="Shelf: " + shelf.get("name"), + metadata={ + "type": "shelf", + "updated_at": shelf.get("updated_at") + }, + ) + + def _page_to_document(self, page: dict): + page_id = str(page.get("id")) + page_data = self.bookstack_client.get("/pages/" + page_id, {}) + url = self.bookstack_client.build_app_url("/books/" + page.get("book_slug") + "/page/" + page_data.get("slug")) + page_html = "

" + html.escape(page_data.get("name")) + "

" + page_data.get("html") + soup = BeautifulSoup(page_html, "html.parser") + text = soup.get_text(HTML_SEPARATOR) + time.sleep(0.1) + return Document( + id=url, + sections=[Section(link=url, text=text)], + source=DocumentSource.BOOKSTACK, + semantic_identifier="Page: " + page_data.get("name"), + metadata={ + "type": "page", + "updated_at": page_data.get("updated_at") + }, + ) + def load_from_state(self) -> GenerateDocumentsOutput: - if self.confluence_client is None: + if self.bookstack_client is None: raise BookstackClientNotSetUpError() - start_ind = 0 - while True: - doc_batch, num_pages = self._get_doc_batch(start_ind) - start_ind += num_pages - if doc_batch: - yield doc_batch - - if num_pages < self.batch_size: - break + return self.poll_source(None, None) def poll_source( - self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch + self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None ) -> GenerateDocumentsOutput: - if self.confluence_client is None: + if self.bookstack_client is None: raise BookstackClientNotSetUpError() - start_time = datetime.fromtimestamp(start, tz=timezone.utc) - end_time = datetime.fromtimestamp(end, tz=timezone.utc) + transform_by_endpoint: dict[str, Callable[[dict], Document]] = { + "/books": self._book_to_document, + "/chapters": self._chapter_to_document, + "/shelves": self._shelf_to_document, + "/pages": self._page_to_document, + } - start_ind = 0 - while True: - doc_batch, num_pages = self._get_doc_batch( - start_ind, time_filter=lambda t: start_time <= t <= end_time - ) - start_ind += num_pages - if doc_batch: - yield doc_batch + for endpoint, transform in transform_by_endpoint.items(): + start_ind = 0 + while True: + doc_batch, num_results = self._get_doc_batch(endpoint, transform, start_ind, start, end) + start_ind += num_results + if doc_batch: + yield doc_batch - if num_pages < self.batch_size: - break + if num_results < self.batch_size: + break + else: + time.sleep(0.2) diff --git a/web/src/app/admin/connectors/bookstack/page.tsx b/web/src/app/admin/connectors/bookstack/page.tsx index 2c38bfe6a9c5..7c730d3d7a34 100644 --- a/web/src/app/admin/connectors/bookstack/page.tsx +++ b/web/src/app/admin/connectors/bookstack/page.tsx @@ -9,7 +9,7 @@ import { BookstackCredentialJson, BookstackConfig, Credential, - ConnectorIndexingStatus, ConfluenceConfig, + ConnectorIndexingStatus, } from "@/lib/types"; import useSWR, { useSWRConfig } from "swr"; import { fetcher } from "@/lib/fetcher";