From bfde5fd809894ef96c859167af0bf887503d6b92 Mon Sep 17 00:00:00 2001 From: Dan Brown <ssddanbrown@googlemail.com> Date: Thu, 6 Jul 2023 10:50:27 +0100 Subject: [PATCH 1/6] Got basic bookstack connector setup UI/backend working --- backend/danswer/configs/constants.py | 1 + .../danswer/connectors/bookstack/__init__.py | 0 .../danswer/connectors/bookstack/connector.py | 118 +++++++++ backend/danswer/connectors/factory.py | 2 + .../app/admin/connectors/bookstack/page.tsx | 243 ++++++++++++++++++ web/src/app/admin/layout.tsx | 10 + web/src/components/icons/icons.tsx | 8 + web/src/components/source.tsx | 7 + web/src/lib/types.ts | 10 + 9 files changed, 399 insertions(+) create mode 100644 backend/danswer/connectors/bookstack/__init__.py create mode 100644 backend/danswer/connectors/bookstack/connector.py create mode 100644 web/src/app/admin/connectors/bookstack/page.tsx diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py index cb73f8146..9ede56eed 100644 --- a/backend/danswer/configs/constants.py +++ b/backend/danswer/configs/constants.py @@ -21,6 +21,7 @@ class DocumentSource(str, Enum): WEB = "web" GOOGLE_DRIVE = "google_drive" GITHUB = "github" + BOOKSTACK = "bookstack" CONFLUENCE = "confluence" SLAB = "slab" JIRA = "jira" diff --git a/backend/danswer/connectors/bookstack/__init__.py b/backend/danswer/connectors/bookstack/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/danswer/connectors/bookstack/connector.py b/backend/danswer/connectors/bookstack/connector.py new file mode 100644 index 000000000..9a6dee65f --- /dev/null +++ b/backend/danswer/connectors/bookstack/connector.py @@ -0,0 +1,118 @@ +from collections.abc import Callable +from collections.abc import Generator +from datetime import datetime +from datetime import timezone +from typing import Any +from urllib.parse import urlparse + +from atlassian import Confluence # type:ignore +from bs4 import BeautifulSoup +from danswer.configs.app_configs import INDEX_BATCH_SIZE +from danswer.configs.constants import DocumentSource +from danswer.configs.constants import HTML_SEPARATOR +from danswer.connectors.interfaces import GenerateDocumentsOutput +from danswer.connectors.interfaces import LoadConnector +from danswer.connectors.interfaces import PollConnector +from danswer.connectors.interfaces import SecondsSinceUnixEpoch +from danswer.connectors.models import Document +from danswer.connectors.models import Section + +class BookstackClientNotSetUpError(PermissionError): + def __init__(self) -> None: + super().__init__( + "Confluence Client is not set up, was load_credentials called?" + ) + +class BookstackConnector(LoadConnector, PollConnector): + def __init__( + self, + batch_size: int = INDEX_BATCH_SIZE, + ) -> None: + self.batch_size = batch_size + + def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: + base_url = credentials["bookstack_base_url"] + api_token_id = credentials["bookstack_api_token_id"] + api_token_secret = credentials["bookstack_api_token_secret"] + return None + + def _get_doc_batch( + self, start_ind: int, time_filter: Callable[[datetime], bool] | None = None + ) -> tuple[list[Document], int]: + doc_batch: list[Document] = [] + + if self.confluence_client is None: + raise BookstackClientNotSetUpError() + + batch = self.confluence_client.get_all_pages_from_space( + self.space, + start=start_ind, + limit=self.batch_size, + expand="body.storage.value,version", + ) + + for page in batch: + last_modified_str = page["version"]["when"] + last_modified = datetime.fromisoformat(last_modified_str) + + if time_filter is None or time_filter(last_modified): + page_html = page["body"]["storage"]["value"] + soup = BeautifulSoup(page_html, "html.parser") + page_text = page.get("title", "") + "\n" + soup.get_text(HTML_SEPARATOR) + comment_pages = self.confluence_client.get_page_child_by_type( + page["id"], + type="comment", + start=None, + limit=None, + expand="body.storage.value", + ) + comments_text = _comment_dfs("", comment_pages, self.confluence_client) + page_text += comments_text + + page_url = self.wiki_base + page["_links"]["webui"] + + doc_batch.append( + Document( + id=page_url, + sections=[Section(link=page_url, text=page_text)], + source=DocumentSource.CONFLUENCE, + semantic_identifier=page["title"], + metadata={}, + ) + ) + return doc_batch, len(batch) + + def load_from_state(self) -> GenerateDocumentsOutput: + if self.confluence_client is None: + raise BookstackClientNotSetUpError() + + start_ind = 0 + while True: + doc_batch, num_pages = self._get_doc_batch(start_ind) + start_ind += num_pages + if doc_batch: + yield doc_batch + + if num_pages < self.batch_size: + break + + def poll_source( + self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch + ) -> GenerateDocumentsOutput: + if self.confluence_client is None: + raise BookstackClientNotSetUpError() + + start_time = datetime.fromtimestamp(start, tz=timezone.utc) + end_time = datetime.fromtimestamp(end, tz=timezone.utc) + + start_ind = 0 + while True: + doc_batch, num_pages = self._get_doc_batch( + start_ind, time_filter=lambda t: start_time <= t <= end_time + ) + start_ind += num_pages + if doc_batch: + yield doc_batch + + if num_pages < self.batch_size: + break diff --git a/backend/danswer/connectors/factory.py b/backend/danswer/connectors/factory.py index abacb0174..0cce12848 100644 --- a/backend/danswer/connectors/factory.py +++ b/backend/danswer/connectors/factory.py @@ -2,6 +2,7 @@ from typing import Any from typing import Type from danswer.configs.constants import DocumentSource +from danswer.connectors.bookstack.connector import BookstackConnector from danswer.connectors.confluence.connector import ConfluenceConnector from danswer.connectors.danswer_jira.connector import JiraConnector from danswer.connectors.file.connector import LocalFileConnector @@ -37,6 +38,7 @@ def identify_connector_class( }, DocumentSource.GITHUB: GithubConnector, DocumentSource.GOOGLE_DRIVE: GoogleDriveConnector, + DocumentSource.BOOKSTACK: BookstackConnector, DocumentSource.CONFLUENCE: ConfluenceConnector, DocumentSource.JIRA: JiraConnector, DocumentSource.SLAB: SlabConnector, diff --git a/web/src/app/admin/connectors/bookstack/page.tsx b/web/src/app/admin/connectors/bookstack/page.tsx new file mode 100644 index 000000000..2c38bfe6a --- /dev/null +++ b/web/src/app/admin/connectors/bookstack/page.tsx @@ -0,0 +1,243 @@ +"use client"; + +import * as Yup from "yup"; +import { BookstackIcon, TrashIcon } from "@/components/icons/icons"; +import { TextFormField } from "@/components/admin/connectors/Field"; +import { HealthCheckBanner } from "@/components/health/healthcheck"; +import { CredentialForm } from "@/components/admin/connectors/CredentialForm"; +import { + BookstackCredentialJson, + BookstackConfig, + Credential, + ConnectorIndexingStatus, ConfluenceConfig, +} from "@/lib/types"; +import useSWR, { useSWRConfig } from "swr"; +import { fetcher } from "@/lib/fetcher"; +import { LoadingAnimation } from "@/components/Loading"; +import { deleteCredential, linkCredential } from "@/lib/credential"; +import { ConnectorForm } from "@/components/admin/connectors/ConnectorForm"; +import { ConnectorsTable } from "@/components/admin/connectors/table/ConnectorsTable"; +import { usePopup } from "@/components/admin/connectors/Popup"; + +const Main = () => { + const { popup, setPopup } = usePopup(); + + const { mutate } = useSWRConfig(); + const { + data: connectorIndexingStatuses, + isLoading: isConnectorIndexingStatusesLoading, + error: isConnectorIndexingStatusesError, + } = useSWR<ConnectorIndexingStatus<any>[]>( + "/api/manage/admin/connector/indexing-status", + fetcher + ); + const { + data: credentialsData, + isLoading: isCredentialsLoading, + error: isCredentialsError, + } = useSWR<Credential<BookstackCredentialJson>[]>( + "/api/manage/credential", + fetcher + ); + + if ( + (!connectorIndexingStatuses && isConnectorIndexingStatusesLoading) || + (!credentialsData && isCredentialsLoading) + ) { + return <LoadingAnimation text="Loading" />; + } + + if (isConnectorIndexingStatusesError || !connectorIndexingStatuses) { + return <div>Failed to load connectors</div>; + } + + if (isCredentialsError || !credentialsData) { + return <div>Failed to load credentials</div>; + } + + const bookstackConnectorIndexingStatuses = connectorIndexingStatuses.filter( + (connectorIndexingStatus) => + connectorIndexingStatus.connector.source === "bookstack" + ); + const bookstackCredential = credentialsData.filter( + (credential) => credential.credential_json?.bookstack_api_token_id + )[0]; + + return ( + <> + {popup} + <h2 className="font-bold mb-2 mt-6 ml-auto mr-auto"> + Step 1: Provide your access token + </h2> + + {bookstackCredential ? ( + <> + <div className="flex mb-1 text-sm"> + <p className="my-auto">Existing API Token: </p> + <p className="ml-1 italic my-auto max-w-md"> + {bookstackCredential.credential_json?.bookstack_api_token_id} + </p> + <button + className="ml-1 hover:bg-gray-700 rounded-full p-1" + onClick={async () => { + if (bookstackConnectorIndexingStatuses.length > 0) { + setPopup({ + type: "error", + message: + "Must delete all connectors before deleting credentials", + }); + return; + } + await deleteCredential(bookstackCredential.id); + mutate("/api/manage/credential"); + }} + > + <TrashIcon /> + </button> + </div> + </> + ) : ( + <> + <p className="text-sm"> + To get started you'll need API token details for your BookStack instance. + You can get these by editing your (or another) user account in BookStack + and creating a token via the "API Tokens" section at the bottom. + Your user account will require to be assigned a BookStack role which + has the "Access system API" system permission assigned. + </p> + <div className="border-solid border-gray-600 border rounded-md p-6 mt-2 mb-4"> + <CredentialForm<BookstackCredentialJson> + formBody={ + <> + <TextFormField name="bookstack_base_url" label="Instance Base URL:" /> + <TextFormField name="bookstack_api_token_id" label="API Token ID:" /> + <TextFormField + name="bookstack_api_token_secret" + label="API Token Secret:" + type="password" + /> + </> + } + validationSchema={Yup.object().shape({ + bookstack_base_url: Yup.string().required( + "Please enter the base URL for your BookStack instance" + ), + bookstack_api_token_id: Yup.string().required( + "Please enter your BookStack API token ID" + ), + bookstack_api_token_secret: Yup.string().required( + "Please enter your BookStack API token secret" + ), + })} + initialValues={{ + bookstack_base_url: "", + bookstack_api_token_id: "", + bookstack_api_token_secret: "", + }} + onSubmit={(isSuccess) => { + if (isSuccess) { + mutate("/api/manage/credential"); + mutate("/api/manage/admin/connector/indexing-status"); + } + }} + /> + </div> + </> + )} + + {bookstackConnectorIndexingStatuses.length > 0 && ( + <> + <h2 className="font-bold mb-2 mt-6 ml-auto mr-auto"> + BookStack indexing status + </h2> + <p className="text-sm mb-2"> + The latest page, chapter, book and shelf changes are fetched + every 10 minutes. + </p> + <div className="mb-2"> + <ConnectorsTable<BookstackConfig, BookstackCredentialJson> + connectorIndexingStatuses={ + bookstackConnectorIndexingStatuses + } + liveCredential={bookstackCredential} + getCredential={(credential) => { + return ( + <div> + <p> + {credential.credential_json.bookstack_api_token_id} + </p> + </div> + ); + }} + onCredentialLink={async (connectorId) => { + if (bookstackCredential) { + await linkCredential( + connectorId, + bookstackCredential.id + ); + mutate("/api/manage/admin/connector/indexing-status"); + } + }} + onUpdate={() => + mutate("/api/manage/admin/connector/indexing-status") + } + /> + </div> + </> + )} + + <div className="border-solid border-gray-600 border rounded-md p-6 mt-4"> + <h2 className="font-bold mb-3">Setup Connector</h2> + <ConnectorForm<BookstackConfig> + nameBuilder={(values) => + `BookStackConnector` + } + source="bookstack" + inputType="load_state" + formBody={ + <> + </> + } + validationSchema={Yup.object().shape({ + })} + initialValues={{ + }} + refreshFreq={10 * 60} // 10 minutes + onSubmit={async (isSuccess, responseJson) => { + if (isSuccess && responseJson) { + await linkCredential( + responseJson.id, + bookstackCredential.id + ); + mutate("/api/manage/admin/connector/indexing-status"); + } + }} + /> + </div> + + {!bookstackCredential && ( + <> + <p className="text-sm mb-4"> + Please provide your API details in Step 1 first! Once done with that, + you'll be able to see indexing status. + </p> + </> + )} + </> + ); +}; + +export default function Page() { + return ( + <div className="mx-auto container"> + <div className="mb-4"> + <HealthCheckBanner /> + </div> + <div className="border-solid border-gray-600 border-b mb-4 pb-2 flex"> + <BookstackIcon size="32" /> + <h1 className="text-3xl font-bold pl-2">BookStack</h1> + </div> + <Main /> + </div> + ); +} diff --git a/web/src/app/admin/layout.tsx b/web/src/app/admin/layout.tsx index 3a0f4c42b..e8e89db62 100644 --- a/web/src/app/admin/layout.tsx +++ b/web/src/app/admin/layout.tsx @@ -7,6 +7,7 @@ import { GoogleDriveIcon, SlackIcon, KeyIcon, + BookstackIcon, ConfluenceIcon, FileIcon, JiraIcon, @@ -83,6 +84,15 @@ export default async function AdminLayout({ ), link: "/admin/connectors/google-drive", }, + { + name: ( + <div className="flex"> + <BookstackIcon size="16" /> + <div className="ml-1">BookStack</div> + </div> + ), + link: "/admin/connectors/bookstack", + }, { name: ( <div className="flex"> diff --git a/web/src/components/icons/icons.tsx b/web/src/components/icons/icons.tsx index c6c6cdcc7..b8593b7f4 100644 --- a/web/src/components/icons/icons.tsx +++ b/web/src/components/icons/icons.tsx @@ -13,6 +13,7 @@ import { Brain, } from "@phosphor-icons/react"; import { + SiBookstack, SiConfluence, SiGithub, SiGoogledrive, @@ -113,6 +114,13 @@ export const GoogleDriveIcon = ({ return <SiGoogledrive size={size} className={className} />; }; +export const BookstackIcon = ({ + size = "16", + className = defaultTailwindCSS, +}: IconProps) => { + return <SiBookstack size={size} className={className} />; +}; + export const ConfluenceIcon = ({ size = "16", className = defaultTailwindCSS, diff --git a/web/src/components/source.tsx b/web/src/components/source.tsx index b9ef9e094..360ef6677 100644 --- a/web/src/components/source.tsx +++ b/web/src/components/source.tsx @@ -1,5 +1,6 @@ import { ValidSources } from "@/lib/types"; import { + BookstackIcon, ConfluenceIcon, FileIcon, GithubIcon, @@ -48,6 +49,12 @@ export const getSourceMetadata = (sourceType: ValidSources): SourceMetadata => { displayName: "Github PRs", adminPageLink: "/admin/connectors/github", }; + case "bookstack": + return { + icon: BookstackIcon, + displayName: "BookStack", + adminPageLink: "/admin/connectors/bookstack", + }; case "confluence": return { icon: ConfluenceIcon, diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts index 9c3453275..68274bb50 100644 --- a/web/src/lib/types.ts +++ b/web/src/lib/types.ts @@ -12,6 +12,7 @@ export type ValidSources = | "github" | "slack" | "google_drive" + | "bookstack" | "confluence" | "jira" | "slab" @@ -44,6 +45,9 @@ export interface GithubConfig { repo_name: string; } +export interface BookstackConfig { +} + export interface ConfluenceConfig { wiki_page_url: string; } @@ -90,6 +94,12 @@ export interface GithubCredentialJson { github_access_token: string; } +export interface BookstackCredentialJson { + bookstack_base_url: string; + bookstack_api_token_id: string; + bookstack_api_token_secret: string; +} + export interface ConfluenceCredentialJson { confluence_username: string; confluence_access_token: string; From 44f905ef80a6b254f050a0d473af5fab6f5160f1 Mon Sep 17 00:00:00 2001 From: Dan Brown <ssddanbrown@googlemail.com> Date: Thu, 6 Jul 2023 14:56:28 +0100 Subject: [PATCH 2/6] Added BookStack connector code Got to the point of working sync for shelves, books, chapters and pages. --- .../danswer/connectors/bookstack/client.py | 52 +++++ .../danswer/connectors/bookstack/connector.py | 180 +++++++++++------- .../app/admin/connectors/bookstack/page.tsx | 2 +- 3 files changed, 165 insertions(+), 69 deletions(-) create mode 100644 backend/danswer/connectors/bookstack/client.py diff --git a/backend/danswer/connectors/bookstack/client.py b/backend/danswer/connectors/bookstack/client.py new file mode 100644 index 000000000..7cc38427f --- /dev/null +++ b/backend/danswer/connectors/bookstack/client.py @@ -0,0 +1,52 @@ +import requests + +class BookStackClientRequestFailedError(ConnectionError): + def __init__(self, status: int, error: str) -> None: + super().__init__( + "BookStack Client request failed with status {status}: {error}".format(status=status, error=error) + ) + +class BookStackApiClient: + + def __init__( + self, + base_url: str, + token_id: str, + token_secret: str, + ) -> None: + self.base_url = base_url + self.token_id = token_id + self.token_secret = token_secret + + def get(self, endpoint: str, params: dict[str, str]): + url: str = self._build_url(endpoint) + headers = self._build_headers() + response = requests.get(url, headers=headers, params=params) + + try: + json = response.json() + except: + json = {} + pass + + if response.status_code >= 300: + error = response.reason + response_error = json.get("error", {}).get("message", "") + if response_error: + error = response_error + raise BookStackClientRequestFailedError(response.status_code, error) + + return json + + def _build_headers(self): + auth = 'Token ' + self.token_id + ':' + self.token_secret + return { + 'Authorization': auth, + 'Accept': 'application/json', + } + + def _build_url(self, endpoint: str): + return self.base_url.rstrip('/') + '/api/' + endpoint.lstrip('/') + + def build_app_url(self, endpoint: str): + return self.base_url.rstrip('/') + '/' + endpoint.lstrip('/') diff --git a/backend/danswer/connectors/bookstack/connector.py b/backend/danswer/connectors/bookstack/connector.py index 9a6dee65f..bab007875 100644 --- a/backend/danswer/connectors/bookstack/connector.py +++ b/backend/danswer/connectors/bookstack/connector.py @@ -1,11 +1,9 @@ +import html +import time from collections.abc import Callable -from collections.abc import Generator from datetime import datetime -from datetime import timezone from typing import Any -from urllib.parse import urlparse -from atlassian import Confluence # type:ignore from bs4 import BeautifulSoup from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource @@ -14,105 +12,151 @@ from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch +from danswer.connectors.bookstack.client import BookStackApiClient from danswer.connectors.models import Document from danswer.connectors.models import Section + class BookstackClientNotSetUpError(PermissionError): def __init__(self) -> None: super().__init__( - "Confluence Client is not set up, was load_credentials called?" + "BookStack Client is not set up, was load_credentials called?" ) + class BookstackConnector(LoadConnector, PollConnector): def __init__( self, batch_size: int = INDEX_BATCH_SIZE, ) -> None: self.batch_size = batch_size + self.bookstack_client: BookStackApiClient | None = None def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: - base_url = credentials["bookstack_base_url"] - api_token_id = credentials["bookstack_api_token_id"] - api_token_secret = credentials["bookstack_api_token_secret"] + self.bookstack_client = BookStackApiClient( + base_url=credentials["bookstack_base_url"], + token_id=credentials["bookstack_api_token_id"], + token_secret=credentials["bookstack_api_token_secret"], + ) return None def _get_doc_batch( - self, start_ind: int, time_filter: Callable[[datetime], bool] | None = None + self, + endpoint: str, + transformer: Callable[[dict], Document], + start_ind: int, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, ) -> tuple[list[Document], int]: doc_batch: list[Document] = [] - if self.confluence_client is None: - raise BookstackClientNotSetUpError() + params = { + "count": str(self.batch_size), + "offset": str(start_ind), + "sort": "+id" + } - batch = self.confluence_client.get_all_pages_from_space( - self.space, - start=start_ind, - limit=self.batch_size, - expand="body.storage.value,version", - ) + if start: + params["filter[updated_at:gte]"] = datetime.utcfromtimestamp(start).strftime('%Y-%m-%d %H:%M:%S') - for page in batch: - last_modified_str = page["version"]["when"] - last_modified = datetime.fromisoformat(last_modified_str) + if end: + params["filter[updated_at:lte]"] = datetime.utcfromtimestamp(end).strftime('%Y-%m-%d %H:%M:%S') - if time_filter is None or time_filter(last_modified): - page_html = page["body"]["storage"]["value"] - soup = BeautifulSoup(page_html, "html.parser") - page_text = page.get("title", "") + "\n" + soup.get_text(HTML_SEPARATOR) - comment_pages = self.confluence_client.get_page_child_by_type( - page["id"], - type="comment", - start=None, - limit=None, - expand="body.storage.value", - ) - comments_text = _comment_dfs("", comment_pages, self.confluence_client) - page_text += comments_text + batch = self.bookstack_client.get(endpoint, params=params).get("data", []) + for item in batch: + doc_batch.append(transformer(item)) - page_url = self.wiki_base + page["_links"]["webui"] - - doc_batch.append( - Document( - id=page_url, - sections=[Section(link=page_url, text=page_text)], - source=DocumentSource.CONFLUENCE, - semantic_identifier=page["title"], - metadata={}, - ) - ) return doc_batch, len(batch) + def _book_to_document(self, book: dict): + url = self.bookstack_client.build_app_url("/books/" + book.get("slug")) + text = book.get("name", "") + "\n" + book.get("description", "") + return Document( + id=url, + sections=[Section(link=url, text=text)], + source=DocumentSource.BOOKSTACK, + semantic_identifier="Book: " + book.get("name"), + metadata={ + "type": "book", + "updated_at": book.get("updated_at") + }, + ) + + def _chapter_to_document(self, chapter: dict): + url = self.bookstack_client.build_app_url("/books/" + chapter.get("book_slug") + "/chapter/" + chapter.get("slug")) + text = chapter.get("name", "") + "\n" + chapter.get("description", "") + return Document( + id=url, + sections=[Section(link=url, text=text)], + source=DocumentSource.BOOKSTACK, + semantic_identifier="Chapter: " + chapter.get("name"), + metadata={ + "type": "chapter", + "updated_at": chapter.get("updated_at") + }, + ) + + def _shelf_to_document(self, shelf: dict): + url = self.bookstack_client.build_app_url("/shelves/" + shelf.get("slug")) + text = shelf.get("name", "") + "\n" + shelf.get("description", "") + return Document( + id=url, + sections=[Section(link=url, text=text)], + source=DocumentSource.BOOKSTACK, + semantic_identifier="Shelf: " + shelf.get("name"), + metadata={ + "type": "shelf", + "updated_at": shelf.get("updated_at") + }, + ) + + def _page_to_document(self, page: dict): + page_id = str(page.get("id")) + page_data = self.bookstack_client.get("/pages/" + page_id, {}) + url = self.bookstack_client.build_app_url("/books/" + page.get("book_slug") + "/page/" + page_data.get("slug")) + page_html = "<h1>" + html.escape(page_data.get("name")) + "</h1>" + page_data.get("html") + soup = BeautifulSoup(page_html, "html.parser") + text = soup.get_text(HTML_SEPARATOR) + time.sleep(0.1) + return Document( + id=url, + sections=[Section(link=url, text=text)], + source=DocumentSource.BOOKSTACK, + semantic_identifier="Page: " + page_data.get("name"), + metadata={ + "type": "page", + "updated_at": page_data.get("updated_at") + }, + ) + def load_from_state(self) -> GenerateDocumentsOutput: - if self.confluence_client is None: + if self.bookstack_client is None: raise BookstackClientNotSetUpError() - start_ind = 0 - while True: - doc_batch, num_pages = self._get_doc_batch(start_ind) - start_ind += num_pages - if doc_batch: - yield doc_batch - - if num_pages < self.batch_size: - break + return self.poll_source(None, None) def poll_source( - self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch + self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None ) -> GenerateDocumentsOutput: - if self.confluence_client is None: + if self.bookstack_client is None: raise BookstackClientNotSetUpError() - start_time = datetime.fromtimestamp(start, tz=timezone.utc) - end_time = datetime.fromtimestamp(end, tz=timezone.utc) + transform_by_endpoint: dict[str, Callable[[dict], Document]] = { + "/books": self._book_to_document, + "/chapters": self._chapter_to_document, + "/shelves": self._shelf_to_document, + "/pages": self._page_to_document, + } - start_ind = 0 - while True: - doc_batch, num_pages = self._get_doc_batch( - start_ind, time_filter=lambda t: start_time <= t <= end_time - ) - start_ind += num_pages - if doc_batch: - yield doc_batch + for endpoint, transform in transform_by_endpoint.items(): + start_ind = 0 + while True: + doc_batch, num_results = self._get_doc_batch(endpoint, transform, start_ind, start, end) + start_ind += num_results + if doc_batch: + yield doc_batch - if num_pages < self.batch_size: - break + if num_results < self.batch_size: + break + else: + time.sleep(0.2) diff --git a/web/src/app/admin/connectors/bookstack/page.tsx b/web/src/app/admin/connectors/bookstack/page.tsx index 2c38bfe6a..7c730d3d7 100644 --- a/web/src/app/admin/connectors/bookstack/page.tsx +++ b/web/src/app/admin/connectors/bookstack/page.tsx @@ -9,7 +9,7 @@ import { BookstackCredentialJson, BookstackConfig, Credential, - ConnectorIndexingStatus, ConfluenceConfig, + ConnectorIndexingStatus, } from "@/lib/types"; import useSWR, { useSWRConfig } from "swr"; import { fetcher } from "@/lib/fetcher"; From f587161577f0c3efab032d19fd5121a8b3f88a5d Mon Sep 17 00:00:00 2001 From: Dan Brown <ssddanbrown@googlemail.com> Date: Thu, 6 Jul 2023 16:02:53 +0100 Subject: [PATCH 3/6] Added bookstack to filters, changed inputType --- web/src/app/admin/connectors/bookstack/page.tsx | 2 +- web/src/components/search/Filters.tsx | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/web/src/app/admin/connectors/bookstack/page.tsx b/web/src/app/admin/connectors/bookstack/page.tsx index 7c730d3d7..80f846d6c 100644 --- a/web/src/app/admin/connectors/bookstack/page.tsx +++ b/web/src/app/admin/connectors/bookstack/page.tsx @@ -193,7 +193,7 @@ const Main = () => { `BookStackConnector` } source="bookstack" - inputType="load_state" + inputType="poll" formBody={ <> </> diff --git a/web/src/components/search/Filters.tsx b/web/src/components/search/Filters.tsx index 852c7fdd1..431e1a727 100644 --- a/web/src/components/search/Filters.tsx +++ b/web/src/components/search/Filters.tsx @@ -7,6 +7,7 @@ import { Source } from "@/lib/search/interfaces"; const sources: Source[] = [ { displayName: "Google Drive", internalName: "google_drive" }, { displayName: "Slack", internalName: "slack" }, + { displayName: "BookStack", internalName: "bookstack" }, { displayName: "Confluence", internalName: "confluence" }, { displayName: "Jira", internalName: "jira" }, { displayName: "Slab", internalName: "slab" }, From 104a248b112a7a6384ac32fb3c597f55be1ae8e7 Mon Sep 17 00:00:00 2001 From: Dan Brown <ssddanbrown@googlemail.com> Date: Thu, 6 Jul 2023 16:35:21 +0100 Subject: [PATCH 4/6] Cleaned up bookstack connector admin panel Also fixed ESLint issues --- .../app/admin/connectors/bookstack/page.tsx | 75 ++++++++++--------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/web/src/app/admin/connectors/bookstack/page.tsx b/web/src/app/admin/connectors/bookstack/page.tsx index 80f846d6c..3d798f635 100644 --- a/web/src/app/admin/connectors/bookstack/page.tsx +++ b/web/src/app/admin/connectors/bookstack/page.tsx @@ -67,7 +67,7 @@ const Main = () => { <> {popup} <h2 className="font-bold mb-2 mt-6 ml-auto mr-auto"> - Step 1: Provide your access token + Step 1: Provide your API details </h2> {bookstackCredential ? ( @@ -99,11 +99,11 @@ const Main = () => { ) : ( <> <p className="text-sm"> - To get started you'll need API token details for your BookStack instance. + To get started you'll need API token details for your BookStack instance. You can get these by editing your (or another) user account in BookStack - and creating a token via the "API Tokens" section at the bottom. + and creating a token via the 'API Tokens' section at the bottom. Your user account will require to be assigned a BookStack role which - has the "Access system API" system permission assigned. + has the 'Access system API' system permission assigned. </p> <div className="border-solid border-gray-600 border rounded-md p-6 mt-2 mb-4"> <CredentialForm<BookstackCredentialJson> @@ -145,7 +145,7 @@ const Main = () => { </> )} - {bookstackConnectorIndexingStatuses.length > 0 && ( + {bookstackConnectorIndexingStatuses.length > 0 ? ( <> <h2 className="font-bold mb-2 mt-6 ml-auto mr-auto"> BookStack indexing status @@ -184,42 +184,47 @@ const Main = () => { /> </div> </> + ) : ( + <> + <div className="border-solid border-gray-600 border rounded-md p-6 mt-4"> + <h2 className="font-bold mb-3">Create Connection</h2> + <p className="text-sm mb-4"> + Press connect below to start the connection to your BookStack instance. + </p> + <ConnectorForm<BookstackConfig> + nameBuilder={(values) => + `BookStackConnector` + } + source="bookstack" + inputType="poll" + formBody={ + <> + </> + } + validationSchema={Yup.object().shape({ + })} + initialValues={{ + }} + refreshFreq={10 * 60} // 10 minutes + onSubmit={async (isSuccess, responseJson) => { + if (isSuccess && responseJson) { + await linkCredential( + responseJson.id, + bookstackCredential.id + ); + mutate("/api/manage/admin/connector/indexing-status"); + } + }} + /> + </div> + </> )} - <div className="border-solid border-gray-600 border rounded-md p-6 mt-4"> - <h2 className="font-bold mb-3">Setup Connector</h2> - <ConnectorForm<BookstackConfig> - nameBuilder={(values) => - `BookStackConnector` - } - source="bookstack" - inputType="poll" - formBody={ - <> - </> - } - validationSchema={Yup.object().shape({ - })} - initialValues={{ - }} - refreshFreq={10 * 60} // 10 minutes - onSubmit={async (isSuccess, responseJson) => { - if (isSuccess && responseJson) { - await linkCredential( - responseJson.id, - bookstackCredential.id - ); - mutate("/api/manage/admin/connector/indexing-status"); - } - }} - /> - </div> - {!bookstackCredential && ( <> <p className="text-sm mb-4"> Please provide your API details in Step 1 first! Once done with that, - you'll be able to see indexing status. + you'll be able to start the connection then see indexing status. </p> </> )} From 019e474a4e1dbccfaff876b7db26d70f1a237cdd Mon Sep 17 00:00:00 2001 From: Dan Brown <ssddanbrown@googlemail.com> Date: Thu, 6 Jul 2023 17:04:31 +0100 Subject: [PATCH 5/6] BookStack connector: Changed to use id-based document ids --- backend/danswer/connectors/bookstack/connector.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/danswer/connectors/bookstack/connector.py b/backend/danswer/connectors/bookstack/connector.py index bab007875..22637cfed 100644 --- a/backend/danswer/connectors/bookstack/connector.py +++ b/backend/danswer/connectors/bookstack/connector.py @@ -72,7 +72,7 @@ class BookstackConnector(LoadConnector, PollConnector): url = self.bookstack_client.build_app_url("/books/" + book.get("slug")) text = book.get("name", "") + "\n" + book.get("description", "") return Document( - id=url, + id="book:" + str(book.get("id")), sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, semantic_identifier="Book: " + book.get("name"), @@ -86,7 +86,7 @@ class BookstackConnector(LoadConnector, PollConnector): url = self.bookstack_client.build_app_url("/books/" + chapter.get("book_slug") + "/chapter/" + chapter.get("slug")) text = chapter.get("name", "") + "\n" + chapter.get("description", "") return Document( - id=url, + id="chapter:" + str(chapter.get("id")), sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, semantic_identifier="Chapter: " + chapter.get("name"), @@ -100,7 +100,7 @@ class BookstackConnector(LoadConnector, PollConnector): url = self.bookstack_client.build_app_url("/shelves/" + shelf.get("slug")) text = shelf.get("name", "") + "\n" + shelf.get("description", "") return Document( - id=url, + id="shelf:" + str(shelf.get("id")), sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, semantic_identifier="Shelf: " + shelf.get("name"), @@ -119,7 +119,7 @@ class BookstackConnector(LoadConnector, PollConnector): text = soup.get_text(HTML_SEPARATOR) time.sleep(0.1) return Document( - id=url, + id="page:" + page_id, sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, semantic_identifier="Page: " + page_data.get("name"), From 148d9c358f491e15b1c9a72703263162015cd2d3 Mon Sep 17 00:00:00 2001 From: Dan Brown <ssddanbrown@googlemail.com> Date: Thu, 6 Jul 2023 17:24:04 +0100 Subject: [PATCH 6/6] Fixed incorrect active panel in BookStack connector --- web/src/app/admin/connectors/bookstack/page.tsx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/web/src/app/admin/connectors/bookstack/page.tsx b/web/src/app/admin/connectors/bookstack/page.tsx index 3d798f635..564493f95 100644 --- a/web/src/app/admin/connectors/bookstack/page.tsx +++ b/web/src/app/admin/connectors/bookstack/page.tsx @@ -145,7 +145,7 @@ const Main = () => { </> )} - {bookstackConnectorIndexingStatuses.length > 0 ? ( + {bookstackConnectorIndexingStatuses.length > 0 && ( <> <h2 className="font-bold mb-2 mt-6 ml-auto mr-auto"> BookStack indexing status @@ -184,7 +184,9 @@ const Main = () => { /> </div> </> - ) : ( + )} + + {bookstackCredential && bookstackConnectorIndexingStatuses.length === 0 && ( <> <div className="border-solid border-gray-600 border rounded-md p-6 mt-4"> <h2 className="font-bold mb-3">Create Connection</h2>