danswer/backend/onyx/connectors/gitbook/connector.py

from datetime import datetime
from datetime import timezone
from typing import Any
from urllib.parse import urljoin

import requests

from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.constants import DocumentSource
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger


logger = setup_logger()

GITBOOK_API_BASE = "https://api.gitbook.com/v1/"


class GitbookApiClient:
    def __init__(self, access_token: str) -> None:
        self.access_token = access_token

    def get(self, endpoint: str, params: dict[str, Any] | None = None) -> Any:
        headers = {
            "Authorization": f"Bearer {self.access_token}",
            "Content-Type": "application/json",
        }

        url = urljoin(GITBOOK_API_BASE, endpoint.lstrip("/"))
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        return response.json()

    def get_page_content(self, space_id: str, page_id: str) -> dict[str, Any]:
        return self.get(f"/spaces/{space_id}/content/page/{page_id}")


def _extract_text_from_document(document: dict[str, Any]) -> str:
    """Extract text content from GitBook document structure by parsing the document nodes
    into markdown format."""

    def parse_leaf(leaf: dict[str, Any]) -> str:
        text = leaf.get("text", "")
        leaf.get("marks", [])
        return text

    def parse_text_node(node: dict[str, Any]) -> str:
        text = ""
        for leaf in node.get("leaves", []):
            text += parse_leaf(leaf)
        return text

    def parse_block_node(node: dict[str, Any]) -> str:
        block_type = node.get("type", "")
        result = ""

        if block_type == "heading-1":
            text = "".join(parse_text_node(n) for n in node.get("nodes", []))
            result = f"# {text}\n\n"

        elif block_type == "heading-2":
            text = "".join(parse_text_node(n) for n in node.get("nodes", []))
            result = f"## {text}\n\n"

        elif block_type == "heading-3":
            text = "".join(parse_text_node(n) for n in node.get("nodes", []))
            result = f"### {text}\n\n"

        elif block_type == "heading-4":
            text = "".join(parse_text_node(n) for n in node.get("nodes", []))
            result = f"#### {text}\n\n"

        elif block_type == "heading-5":
            text = "".join(parse_text_node(n) for n in node.get("nodes", []))
            result = f"##### {text}\n\n"

        elif block_type == "heading-6":
            text = "".join(parse_text_node(n) for n in node.get("nodes", []))
            result = f"###### {text}\n\n"

        elif block_type == "list-unordered":
            for list_item in node.get("nodes", []):
                paragraph = list_item.get("nodes", [])[0]
                text = "".join(parse_text_node(n) for n in paragraph.get("nodes", []))
                result += f"* {text}\n"
            result += "\n"

        elif block_type == "paragraph":
            text = "".join(parse_text_node(n) for n in node.get("nodes", []))
            result = f"{text}\n\n"

        elif block_type == "list-tasks":
            for task_item in node.get("nodes", []):
                checked = task_item.get("data", {}).get("checked", False)
                paragraph = task_item.get("nodes", [])[0]
                text = "".join(parse_text_node(n) for n in paragraph.get("nodes", []))
                checkbox = "[x]" if checked else "[ ]"
                result += f"- {checkbox} {text}\n"
            result += "\n"

        elif block_type == "code":
            for code_line in node.get("nodes", []):
                if code_line.get("type") == "code-line":
                    text = "".join(
                        parse_text_node(n) for n in code_line.get("nodes", [])
                    )
                    result += f"{text}\n"
            result += "\n"

        elif block_type == "blockquote":
            for quote_node in node.get("nodes", []):
                if quote_node.get("type") == "paragraph":
                    text = "".join(
                        parse_text_node(n) for n in quote_node.get("nodes", [])
                    )
                    result += f"> {text}\n"
            result += "\n"

        elif block_type == "table":
            records = node.get("data", {}).get("records", {})
            definition = node.get("data", {}).get("definition", {})
            view = node.get("data", {}).get("view", {})

            columns = view.get("columns", [])

            header_cells = []
            for col_id in columns:
                col_def = definition.get(col_id, {})
                header_cells.append(col_def.get("title", ""))

            result = "| " + " | ".join(header_cells) + " |\n"
            result += "|" + "---|" * len(header_cells) + "\n"

            sorted_records = sorted(
                records.items(), key=lambda x: x[1].get("orderIndex", "")
            )

            for record_id, record_data in sorted_records:
                values = record_data.get("values", {})
                row_cells = []
                for col_id in columns:
                    fragment_id = values.get(col_id, "")
                    fragment_text = ""
                    for fragment in node.get("fragments", []):
                        if fragment.get("fragment") == fragment_id:
                            for frag_node in fragment.get("nodes", []):
                                if frag_node.get("type") == "paragraph":
                                    fragment_text = "".join(
                                        parse_text_node(n)
                                        for n in frag_node.get("nodes", [])
                                    )
                                    break
                    row_cells.append(fragment_text)
                result += "| " + " | ".join(row_cells) + " |\n"

            result += "\n"
        return result

    if not document or "document" not in document:
        return ""

    markdown = ""
    nodes = document["document"].get("nodes", [])

    for node in nodes:
        markdown += parse_block_node(node)

    return markdown


def _convert_page_to_document(
    client: GitbookApiClient, space_id: str, page: dict[str, Any]
) -> Document:
    page_id = page["id"]
    page_content = client.get_page_content(space_id, page_id)

    return Document(
        id=f"gitbook-{space_id}-{page_id}",
        sections=[
            TextSection(
                link=page.get("urls", {}).get("app", ""),
                text=_extract_text_from_document(page_content),
            )
        ],
        source=DocumentSource.GITBOOK,
        semantic_identifier=page.get("title", ""),
        doc_updated_at=datetime.fromisoformat(page["updatedAt"]).replace(
            tzinfo=timezone.utc
        ),
        metadata={
            "path": page.get("path", ""),
            "type": page.get("type", ""),
            "kind": page.get("kind", ""),
        },
    )


class GitbookConnector(LoadConnector, PollConnector):
    def __init__(
        self,
        space_id: str,
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        self.space_id = space_id
        self.batch_size = batch_size
        self.access_token: str | None = None
        self.client: GitbookApiClient | None = None

    def load_credentials(self, credentials: dict[str, Any]) -> None:
        access_token = credentials.get("gitbook_api_key")
        if not access_token:
            raise ConnectorMissingCredentialError("GitBook access token")
        self.access_token = access_token
        self.client = GitbookApiClient(access_token)

    def _fetch_all_pages(
        self,
        start: datetime | None = None,
        end: datetime | None = None,
    ) -> GenerateDocumentsOutput:
        if not self.client:
            raise ConnectorMissingCredentialError("GitBook")

        try:
            content = self.client.get(f"/spaces/{self.space_id}/content/pages")
            pages: list[dict[str, Any]] = content.get("pages", [])
            current_batch: list[Document] = []

            logger.info(f"Found {len(pages)} root pages.")
            logger.info(
                f"First 20 Page Ids: {[page.get('id', 'Unknown') for page in pages[:20]]}"
            )

            while pages:
                page = pages.pop(0)

                updated_at_raw = page.get("updatedAt")
                if updated_at_raw is None:
                    # if updatedAt is not present, that means the page has never been edited
                    continue

                updated_at = datetime.fromisoformat(updated_at_raw)
                if start and updated_at < start:
                    continue
                if end and updated_at > end:
                    continue

                current_batch.append(
                    _convert_page_to_document(self.client, self.space_id, page)
                )

                if len(current_batch) >= self.batch_size:
                    yield current_batch
                    current_batch = []

                pages.extend(page.get("pages", []))

            if current_batch:
                yield current_batch

        except requests.RequestException as e:
            logger.error(f"Error fetching GitBook content: {str(e)}")
            raise

    def load_from_state(self) -> GenerateDocumentsOutput:
        return self._fetch_all_pages()

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
        end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
        return self._fetch_all_pages(start_datetime, end_datetime)


if __name__ == "__main__":
    import os

    connector = GitbookConnector(
        space_id=os.environ["GITBOOK_SPACE_ID"],
    )
    connector.load_credentials({"gitbook_api_key": os.environ["GITBOOK_API_KEY"]})
    document_batches = connector.load_from_state()
    print(next(document_batches))