import html import time from collections.abc import Callable from datetime import datetime from typing import Any from onyx.configs.app_configs import INDEX_BATCH_SIZE from onyx.configs.constants import DocumentSource from onyx.connectors.bookstack.client import BookStackApiClient from onyx.connectors.bookstack.client import BookStackClientRequestFailedError from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc from onyx.connectors.exceptions import ConnectorValidationError from onyx.connectors.exceptions import CredentialExpiredError from onyx.connectors.exceptions import InsufficientPermissionsError from onyx.connectors.interfaces import GenerateDocumentsOutput from onyx.connectors.interfaces import LoadConnector from onyx.connectors.interfaces import PollConnector from onyx.connectors.interfaces import SecondsSinceUnixEpoch from onyx.connectors.models import ConnectorMissingCredentialError from onyx.connectors.models import Document from onyx.connectors.models import Section from onyx.file_processing.html_utils import parse_html_page_basic class BookstackConnector(LoadConnector, PollConnector): def __init__( self, batch_size: int = INDEX_BATCH_SIZE, ) -> None: self.batch_size = batch_size self.bookstack_client: BookStackApiClient | None = None def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: self.bookstack_client = BookStackApiClient( base_url=credentials["bookstack_base_url"], token_id=credentials["bookstack_api_token_id"], token_secret=credentials["bookstack_api_token_secret"], ) return None @staticmethod def _get_doc_batch( batch_size: int, bookstack_client: BookStackApiClient, endpoint: str, transformer: Callable[[BookStackApiClient, dict], Document], start_ind: int, start: SecondsSinceUnixEpoch | None = None, end: SecondsSinceUnixEpoch | None = None, ) -> tuple[list[Document], int]: params = { "count": str(batch_size), "offset": str(start_ind), "sort": "+id", } if start: params["filter[updated_at:gte]"] = datetime.utcfromtimestamp( start ).strftime("%Y-%m-%d %H:%M:%S") if end: params["filter[updated_at:lte]"] = datetime.utcfromtimestamp(end).strftime( "%Y-%m-%d %H:%M:%S" ) batch = bookstack_client.get(endpoint, params=params).get("data", []) doc_batch = [transformer(bookstack_client, item) for item in batch] return doc_batch, len(batch) @staticmethod def _book_to_document( bookstack_client: BookStackApiClient, book: dict[str, Any] ) -> Document: url = bookstack_client.build_app_url("/books/" + str(book.get("slug"))) title = str(book.get("name", "")) text = book.get("name", "") + "\n" + book.get("description", "") updated_at_str = ( str(book.get("updated_at")) if book.get("updated_at") is not None else None ) return Document( id="book__" + str(book.get("id")), sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, semantic_identifier="Book: " + title, title=title, doc_updated_at=time_str_to_utc(updated_at_str) if updated_at_str is not None else None, metadata={"type": "book"}, ) @staticmethod def _chapter_to_document( bookstack_client: BookStackApiClient, chapter: dict[str, Any] ) -> Document: url = bookstack_client.build_app_url( "/books/" + str(chapter.get("book_slug")) + "/chapter/" + str(chapter.get("slug")) ) title = str(chapter.get("name", "")) text = chapter.get("name", "") + "\n" + chapter.get("description", "") updated_at_str = ( str(chapter.get("updated_at")) if chapter.get("updated_at") is not None else None ) return Document( id="chapter__" + str(chapter.get("id")), sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, semantic_identifier="Chapter: " + title, title=title, doc_updated_at=time_str_to_utc(updated_at_str) if updated_at_str is not None else None, metadata={"type": "chapter"}, ) @staticmethod def _shelf_to_document( bookstack_client: BookStackApiClient, shelf: dict[str, Any] ) -> Document: url = bookstack_client.build_app_url("/shelves/" + str(shelf.get("slug"))) title = str(shelf.get("name", "")) text = shelf.get("name", "") + "\n" + shelf.get("description", "") updated_at_str = ( str(shelf.get("updated_at")) if shelf.get("updated_at") is not None else None ) return Document( id="shelf:" + str(shelf.get("id")), sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, semantic_identifier="Shelf: " + title, title=title, doc_updated_at=time_str_to_utc(updated_at_str) if updated_at_str is not None else None, metadata={"type": "shelf"}, ) @staticmethod def _page_to_document( bookstack_client: BookStackApiClient, page: dict[str, Any] ) -> Document: page_id = str(page.get("id")) title = str(page.get("name", "")) page_data = bookstack_client.get("/pages/" + page_id, {}) url = bookstack_client.build_app_url( "/books/" + str(page.get("book_slug")) + "/page/" + str(page_data.get("slug")) ) page_html = "