mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-05-19 00:00:37 +02:00
217 lines
7.8 KiB
Python
217 lines
7.8 KiB
Python
import html
|
|
import time
|
|
from collections.abc import Callable
|
|
from datetime import datetime
|
|
from typing import Any
|
|
|
|
from onyx.configs.app_configs import INDEX_BATCH_SIZE
|
|
from onyx.configs.constants import DocumentSource
|
|
from onyx.connectors.bookstack.client import BookStackApiClient
|
|
from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
|
|
from onyx.connectors.interfaces import GenerateDocumentsOutput
|
|
from onyx.connectors.interfaces import LoadConnector
|
|
from onyx.connectors.interfaces import PollConnector
|
|
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
|
from onyx.connectors.models import ConnectorMissingCredentialError
|
|
from onyx.connectors.models import Document
|
|
from onyx.connectors.models import Section
|
|
from onyx.file_processing.html_utils import parse_html_page_basic
|
|
|
|
|
|
class BookstackConnector(LoadConnector, PollConnector):
    """Connector that converts BookStack books, chapters, shelves and pages
    into ``Document`` objects.

    Credentials must be supplied through ``load_credentials`` before
    ``load_from_state`` or ``poll_source`` is used; otherwise those methods
    raise ``ConnectorMissingCredentialError``.
    """

    def __init__(
        self,
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        """Store the page size used for every listing request.

        Args:
            batch_size: Number of items requested per listing call, and
                therefore the maximum size of each yielded document batch.
        """
        self.batch_size = batch_size
        # Stays None until load_credentials() has been called.
        self.bookstack_client: BookStackApiClient | None = None

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """Create the API client from the supplied credentials.

        Args:
            credentials: Must contain ``bookstack_base_url``,
                ``bookstack_api_token_id`` and ``bookstack_api_token_secret``.

        Returns:
            Always ``None``.

        Raises:
            KeyError: If one of the required credential keys is missing.
        """
        self.bookstack_client = BookStackApiClient(
            base_url=credentials["bookstack_base_url"],
            token_id=credentials["bookstack_api_token_id"],
            token_secret=credentials["bookstack_api_token_secret"],
        )
        return None

    @staticmethod
    def _format_filter_time(timestamp: SecondsSinceUnixEpoch) -> str:
        """Render an epoch timestamp as a UTC ``YYYY-MM-DD HH:MM:SS`` string."""
        # Function-scope import: the module only imports ``datetime`` itself.
        from datetime import timezone

        # datetime.utcfromtimestamp() is deprecated since Python 3.12; the
        # timezone-aware call below formats to exactly the same string.
        return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime(
            "%Y-%m-%d %H:%M:%S"
        )

    @staticmethod
    def _parse_updated_at(item: dict[str, Any]) -> datetime | None:
        """Return the parsed ``updated_at`` of *item*, or ``None`` if absent."""
        raw_updated_at = item.get("updated_at")
        if raw_updated_at is None:
            return None
        return time_str_to_utc(str(raw_updated_at))

    @staticmethod
    def _name_and_description(item: dict[str, Any]) -> str:
        """Join the ``name`` and ``description`` of *item* with a newline.

        A field that is missing *or* explicitly ``None`` contributes an empty
        string, so a null description no longer raises ``TypeError``.
        """
        name = str(item.get("name") or "")
        description = str(item.get("description") or "")
        return name + "\n" + description

    @staticmethod
    def _get_doc_batch(
        batch_size: int,
        bookstack_client: BookStackApiClient,
        endpoint: str,
        transformer: Callable[[BookStackApiClient, dict], Document],
        start_ind: int,
        start: SecondsSinceUnixEpoch | None = None,
        end: SecondsSinceUnixEpoch | None = None,
    ) -> tuple[list[Document], int]:
        """Fetch one page of items from *endpoint* and transform them.

        Args:
            batch_size: Value sent as the ``count`` request parameter.
            bookstack_client: Client used to perform the request.
            endpoint: Listing endpoint path, e.g. ``"/books"``.
            transformer: Converts one raw item dict into a ``Document``.
            start_ind: Value sent as the ``offset`` request parameter.
            start: If truthy, lower bound sent as ``filter[updated_at:gte]``.
            end: If truthy, upper bound sent as ``filter[updated_at:lte]``.

        Returns:
            The transformed documents and the number of raw items returned
            by the request.
        """
        params = {
            "count": str(batch_size),
            "offset": str(start_ind),
            "sort": "+id",
        }

        # Truthiness (not ``is not None``) is kept on purpose so that a 0
        # epoch keeps meaning "no bound", exactly as before.
        if start:
            params["filter[updated_at:gte]"] = BookstackConnector._format_filter_time(
                start
            )

        if end:
            params["filter[updated_at:lte]"] = BookstackConnector._format_filter_time(
                end
            )

        batch = bookstack_client.get(endpoint, params=params).get("data", [])
        doc_batch = [transformer(bookstack_client, item) for item in batch]

        return doc_batch, len(batch)

    @staticmethod
    def _book_to_document(
        bookstack_client: BookStackApiClient, book: dict[str, Any]
    ) -> Document:
        """Build a ``Document`` from a raw book item (name + description)."""
        url = bookstack_client.build_app_url("/books/" + str(book.get("slug")))
        title = str(book.get("name", ""))
        text = BookstackConnector._name_and_description(book)
        return Document(
            id="book__" + str(book.get("id")),
            sections=[Section(link=url, text=text)],
            source=DocumentSource.BOOKSTACK,
            semantic_identifier="Book: " + title,
            title=title,
            doc_updated_at=BookstackConnector._parse_updated_at(book),
            metadata={"type": "book"},
        )

    @staticmethod
    def _chapter_to_document(
        bookstack_client: BookStackApiClient, chapter: dict[str, Any]
    ) -> Document:
        """Build a ``Document`` from a raw chapter item (name + description)."""
        url = bookstack_client.build_app_url(
            "/books/"
            + str(chapter.get("book_slug"))
            + "/chapter/"
            + str(chapter.get("slug"))
        )
        title = str(chapter.get("name", ""))
        text = BookstackConnector._name_and_description(chapter)
        return Document(
            id="chapter__" + str(chapter.get("id")),
            sections=[Section(link=url, text=text)],
            source=DocumentSource.BOOKSTACK,
            semantic_identifier="Chapter: " + title,
            title=title,
            doc_updated_at=BookstackConnector._parse_updated_at(chapter),
            metadata={"type": "chapter"},
        )

    @staticmethod
    def _shelf_to_document(
        bookstack_client: BookStackApiClient, shelf: dict[str, Any]
    ) -> Document:
        """Build a ``Document`` from a raw shelf item (name + description)."""
        url = bookstack_client.build_app_url("/shelves/" + str(shelf.get("slug")))
        title = str(shelf.get("name", ""))
        text = BookstackConnector._name_and_description(shelf)
        return Document(
            # NOTE(review): shelves and pages use a ":" id separator while
            # books and chapters use "__". Left untouched because changing an
            # id presumably changes document identity downstream -- confirm
            # before unifying.
            id="shelf:" + str(shelf.get("id")),
            sections=[Section(link=url, text=text)],
            source=DocumentSource.BOOKSTACK,
            semantic_identifier="Shelf: " + title,
            title=title,
            doc_updated_at=BookstackConnector._parse_updated_at(shelf),
            metadata={"type": "shelf"},
        )

    @staticmethod
    def _page_to_document(
        bookstack_client: BookStackApiClient, page: dict[str, Any]
    ) -> Document:
        """Build a ``Document`` from a raw page listing item.

        Performs one extra request (``/pages/<id>``) to obtain the page's
        slug, HTML and ``updated_at``, then sleeps briefly before returning.
        """
        page_id = str(page.get("id"))
        title = str(page.get("name", ""))
        page_data = bookstack_client.get("/pages/" + page_id, {})
        url = bookstack_client.build_app_url(
            "/books/"
            + str(page.get("book_slug"))
            + "/page/"
            + str(page_data.get("slug"))
        )
        # Fall back to "" so a missing/null html field does not inject the
        # literal text "None" into the indexed content.
        body_html = str(page_data.get("html") or "")
        page_html = "<h1>" + html.escape(title) + "</h1>" + body_html
        text = parse_html_page_basic(page_html)
        updated_at = BookstackConnector._parse_updated_at(page_data)
        # Short pause after each per-page detail request (presumably to avoid
        # hammering the API -- one extra request is made per page).
        time.sleep(0.1)
        return Document(
            id="page:" + page_id,
            sections=[Section(link=url, text=text)],
            source=DocumentSource.BOOKSTACK,
            semantic_identifier="Page: " + title,
            title=title,
            doc_updated_at=updated_at,
            metadata={"type": "page"},
        )

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Yield every document, i.e. poll without any time bounds.

        Raises:
            ConnectorMissingCredentialError: If credentials were not loaded.
                Raised immediately on call, not on first iteration.
        """
        if self.bookstack_client is None:
            raise ConnectorMissingCredentialError("Bookstack")

        return self.poll_source(None, None)

    def poll_source(
        self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
    ) -> GenerateDocumentsOutput:
        """Yield batches of documents for books, chapters, shelves and pages.

        Each endpoint is paged through with ``self.batch_size`` items per
        request; *start* and *end* are forwarded as ``updated_at`` bounds.

        Raises:
            ConnectorMissingCredentialError: If credentials were not loaded.
                As this is a generator, it is raised on first iteration.
        """
        if self.bookstack_client is None:
            raise ConnectorMissingCredentialError("Bookstack")

        transform_by_endpoint: dict[
            str, Callable[[BookStackApiClient, dict], Document]
        ] = {
            "/books": self._book_to_document,
            "/chapters": self._chapter_to_document,
            "/shelves": self._shelf_to_document,
            "/pages": self._page_to_document,
        }

        for endpoint, transform in transform_by_endpoint.items():
            start_ind = 0
            while True:
                doc_batch, num_results = self._get_doc_batch(
                    batch_size=self.batch_size,
                    bookstack_client=self.bookstack_client,
                    endpoint=endpoint,
                    transformer=transform,
                    start_ind=start_ind,
                    start=start,
                    end=end,
                )
                start_ind += num_results
                if doc_batch:
                    yield doc_batch

                # A short page means the endpoint is exhausted.
                if num_results < self.batch_size:
                    break
                else:
                    # Pause between consecutive listing requests.
                    time.sleep(0.2)
|