import html import time from collections.abc import Callable from datetime import datetime from typing import Any from onyx.configs.app_configs import INDEX_BATCH_SIZE from onyx.configs.constants import DocumentSource from onyx.connectors.bookstack.client import BookStackApiClient from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc from onyx.connectors.interfaces import GenerateDocumentsOutput from onyx.connectors.interfaces import LoadConnector from onyx.connectors.interfaces import PollConnector from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document from onyx.connectors.models import Section from onyx.file_processing.html_utils import parse_html_page_basic class BookstackConnector(LoadConnector, PollConnector): def __init__( self, batch_size: int = INDEX_BATCH_SIZE, ) -> None: self.batch_size = batch_size self.bookstack_client: BookStackApiClient | None = None def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: self.bookstack_client = BookStackApiClient( base_url=credentials["bookstack_base_url"], token_id=credentials["bookstack_api_token_id"], token_secret=credentials["bookstack_api_token_secret"], ) return None @staticmethod def _get_doc_batch( batch_size: int, bookstack_client: BookStackApiClient, endpoint: str, transformer: Callable[[BookStackApiClient, dict], Document], start_ind: int, start: SecondsSinceUnixEpoch | None = None, end: SecondsSinceUnixEpoch | None = None, ) -> tuple[list[Document], int]: params = { "count": str(batch_size), "offset": str(start_ind), "sort": "+id", } if start: params["filter[updated_at:gte]"] = datetime.utcfromtimestamp( start ).strftime("%Y-%m-%d %H:%M:%S") if end: params["filter[updated_at:lte]"] = datetime.utcfromtimestamp(end).strftime( "%Y-%m-%d %H:%M:%S" ) batch = bookstack_client.get(endpoint, params=params).get("data", []) doc_batch = [transformer(bookstack_client, item) for item in batch] return doc_batch, len(batch) @staticmethod def _book_to_document( bookstack_client: BookStackApiClient, book: dict[str, Any] ) -> Document: url = bookstack_client.build_app_url("/books/" + str(book.get("slug"))) title = str(book.get("name", "")) text = book.get("name", "") + "\n" + book.get("description", "") updated_at_str = ( str(book.get("updated_at")) if book.get("updated_at") is not None else None ) return Document( id="book__" + str(book.get("id")), sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, semantic_identifier="Book: " + title, title=title, doc_updated_at=time_str_to_utc(updated_at_str) if updated_at_str is not None else None, metadata={"type": "book"}, ) @staticmethod def _chapter_to_document( bookstack_client: BookStackApiClient, chapter: dict[str, Any] ) -> Document: url = bookstack_client.build_app_url( "/books/" + str(chapter.get("book_slug")) + "/chapter/" + str(chapter.get("slug")) ) title = str(chapter.get("name", "")) text = chapter.get("name", "") + "\n" + chapter.get("description", "") updated_at_str = ( str(chapter.get("updated_at")) if chapter.get("updated_at") is not None else None ) return Document( id="chapter__" + str(chapter.get("id")), sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, semantic_identifier="Chapter: " + title, title=title, doc_updated_at=time_str_to_utc(updated_at_str) if updated_at_str is not None else None, metadata={"type": "chapter"}, ) @staticmethod def _shelf_to_document( bookstack_client: BookStackApiClient, shelf: dict[str, Any] ) -> Document: url = bookstack_client.build_app_url("/shelves/" + str(shelf.get("slug"))) title = str(shelf.get("name", "")) text = shelf.get("name", "") + "\n" + shelf.get("description", "") updated_at_str = ( str(shelf.get("updated_at")) if shelf.get("updated_at") is not None else None ) return Document( id="shelf:" + str(shelf.get("id")), sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, semantic_identifier="Shelf: " + title, title=title, doc_updated_at=time_str_to_utc(updated_at_str) if updated_at_str is not None else None, metadata={"type": "shelf"}, ) @staticmethod def _page_to_document( bookstack_client: BookStackApiClient, page: dict[str, Any] ) -> Document: page_id = str(page.get("id")) title = str(page.get("name", "")) page_data = bookstack_client.get("/pages/" + page_id, {}) url = bookstack_client.build_app_url( "/books/" + str(page.get("book_slug")) + "/page/" + str(page_data.get("slug")) ) page_html = "

" + html.escape(title) + "

" + str(page_data.get("html")) text = parse_html_page_basic(page_html) updated_at_str = ( str(page_data.get("updated_at")) if page_data.get("updated_at") is not None else None ) time.sleep(0.1) return Document( id="page:" + page_id, sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, semantic_identifier="Page: " + str(title), title=str(title), doc_updated_at=time_str_to_utc(updated_at_str) if updated_at_str is not None else None, metadata={"type": "page"}, ) def load_from_state(self) -> GenerateDocumentsOutput: if self.bookstack_client is None: raise ConnectorMissingCredentialError("Bookstack") return self.poll_source(None, None) def poll_source( self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None ) -> GenerateDocumentsOutput: if self.bookstack_client is None: raise ConnectorMissingCredentialError("Bookstack") transform_by_endpoint: dict[ str, Callable[[BookStackApiClient, dict], Document] ] = { "/books": self._book_to_document, "/chapters": self._chapter_to_document, "/shelves": self._shelf_to_document, "/pages": self._page_to_document, } for endpoint, transform in transform_by_endpoint.items(): start_ind = 0 while True: doc_batch, num_results = self._get_doc_batch( batch_size=self.batch_size, bookstack_client=self.bookstack_client, endpoint=endpoint, transformer=transform, start_ind=start_ind, start=start, end=end, ) start_ind += num_results if doc_batch: yield doc_batch if num_results < self.batch_size: break else: time.sleep(0.2)