Gitbook connector (#3991)

* add parser * add tests
2025-09-18 11:34:12 +02:00 · 2025-02-14 07:28:05 +05:30
parent b70db15622
commit 7f4d1f27a0
10 changed files with 418 additions and 0 deletions
--- a/backend/onyx/configs/constants.py
+++ b/backend/onyx/configs/constants.py
@@ -125,6 +125,7 @@ class DocumentSource(str, Enum):
    GMAIL = "gmail"
    REQUESTTRACKER = "requesttracker"
    GITHUB = "github"
+    GITBOOK = "gitbook"
    GITLAB = "gitlab"
    GURU = "guru"
    BOOKSTACK = "bookstack"
--- a/backend/onyx/connectors/factory.py
+++ b/backend/onyx/connectors/factory.py
@@ -20,6 +20,7 @@ from onyx.connectors.egnyte.connector import EgnyteConnector
 from onyx.connectors.file.connector import LocalFileConnector
 from onyx.connectors.fireflies.connector import FirefliesConnector
 from onyx.connectors.freshdesk.connector import FreshdeskConnector
+from onyx.connectors.gitbook.connector import GitbookConnector
 from onyx.connectors.github.connector import GithubConnector
 from onyx.connectors.gitlab.connector import GitlabConnector
 from onyx.connectors.gmail.connector import GmailConnector
@@ -71,6 +72,7 @@ def identify_connector_class(
        DocumentSource.GITHUB: GithubConnector,
        DocumentSource.GMAIL: GmailConnector,
        DocumentSource.GITLAB: GitlabConnector,
+        DocumentSource.GITBOOK: GitbookConnector,
        DocumentSource.GOOGLE_DRIVE: GoogleDriveConnector,
        DocumentSource.BOOKSTACK: BookstackConnector,
        DocumentSource.CONFLUENCE: ConfluenceConnector,
--- a/backend/onyx/connectors/gitbook/__init__.py
+++ b/backend/onyx/connectors/gitbook/__init__.py
--- a/backend/onyx/connectors/gitbook/connector.py
+++ b/backend/onyx/connectors/gitbook/connector.py
@@ -0,0 +1,281 @@
+import time
+from datetime import datetime
+from datetime import timezone
+from typing import Any
+from urllib.parse import urljoin
+
+import requests
+
+from onyx.configs.app_configs import INDEX_BATCH_SIZE
+from onyx.configs.constants import DocumentSource
+from onyx.connectors.interfaces import GenerateDocumentsOutput
+from onyx.connectors.interfaces import LoadConnector
+from onyx.connectors.interfaces import PollConnector
+from onyx.connectors.interfaces import SecondsSinceUnixEpoch
+from onyx.connectors.models import ConnectorMissingCredentialError
+from onyx.connectors.models import Document
+from onyx.connectors.models import Section
+from onyx.utils.logger import setup_logger
+
+
+# Module-level logger obtained from the project's logging helper.
+logger = setup_logger()
+
+# Root URL that endpoint paths are joined onto with urljoin. The trailing
+# slash matters: without it urljoin would replace the final "v1" segment
+# instead of appending the endpoint to it.
+GITBOOK_API_BASE = "https://api.gitbook.com/v1/"
+
+
+class GitbookApiClient:
+    """Minimal authenticated client for the GitBook REST API.
+
+    Args:
+        access_token: Bearer token sent with every request.
+        timeout: Seconds to wait for a response before the request is
+            aborted. Without a timeout a stalled connection would block
+            the caller indefinitely.
+    """
+
+    def __init__(self, access_token: str, timeout: float = 30.0) -> None:
+        self.access_token = access_token
+        self.timeout = timeout
+
+    def get(self, endpoint: str, params: dict[str, Any] | None = None) -> Any:
+        """Send a GET request and return the decoded JSON body.
+
+        Args:
+            endpoint: Path relative to ``GITBOOK_API_BASE``; a leading slash
+                is tolerated.
+            params: Optional query-string parameters.
+
+        Raises:
+            requests.HTTPError: If the response status is 4xx or 5xx.
+            requests.Timeout: If no response arrives within ``self.timeout``.
+        """
+        headers = {
+            "Authorization": f"Bearer {self.access_token}",
+            "Content-Type": "application/json",
+        }
+
+        # A leading slash would make urljoin discard the "/v1/" prefix of
+        # the base URL, so it is stripped before joining.
+        url = urljoin(GITBOOK_API_BASE, endpoint.lstrip("/"))
+        response = requests.get(
+            url, headers=headers, params=params, timeout=self.timeout
+        )
+        response.raise_for_status()
+        return response.json()
+
+    def get_page_content(self, space_id: str, page_id: str) -> dict[str, Any]:
+        """Fetch the content payload of one page in the given space."""
+        return self.get(f"/spaces/{space_id}/content/page/{page_id}")
+
+
+def _extract_text_from_document(document: dict[str, Any]) -> str:
+    """Render a GitBook page-content payload as markdown-style text.
+
+    Only the top-level nodes under ``document["document"]["nodes"]`` are
+    rendered. Handled block types are headings (levels 1-6), paragraphs,
+    unordered lists, task lists, code blocks, blockquotes and tables; any
+    other block type contributes nothing to the output.
+
+    Args:
+        document: Page-content payload; expected to hold a ``"document"``
+            mapping with a ``"nodes"`` list.
+
+    Returns:
+        The rendered text, or ``""`` when the payload is empty or has no
+        usable ``"document"`` entry.
+    """
+
+    # "heading-1" .. "heading-6" -> "#" .. "######"
+    heading_prefixes = {f"heading-{level}": "#" * level for level in range(1, 7)}
+
+    def inline_text(node: dict[str, Any]) -> str:
+        """Concatenate the text of every leaf under each child of *node*."""
+        return "".join(
+            leaf.get("text", "")
+            for child in node.get("nodes", [])
+            for leaf in child.get("leaves", [])
+        )
+
+    def first_child_text(item: dict[str, Any]) -> str:
+        """Return the text of an item's first child, or "" if it has none."""
+        children = item.get("nodes", [])
+        # An empty list/task item must not crash the whole page conversion.
+        return inline_text(children[0]) if children else ""
+
+    def fragment_text(table: dict[str, Any], fragment_id: Any) -> str:
+        """Return the first-paragraph text of the fragment a cell refers to."""
+        text = ""
+        for fragment in table.get("fragments", []):
+            if fragment.get("fragment") != fragment_id:
+                continue
+            for child in fragment.get("nodes", []):
+                if child.get("type") == "paragraph":
+                    text = inline_text(child)
+                    break
+        return text
+
+    def render_table(table: dict[str, Any]) -> str:
+        """Render a table node as a pipe-delimited markdown table."""
+        data = table.get("data", {})
+        records = data.get("records", {})
+        definition = data.get("definition", {})
+        columns = data.get("view", {}).get("columns", [])
+
+        header_cells = [
+            definition.get(col_id, {}).get("title", "") for col_id in columns
+        ]
+        lines = [
+            "| " + " | ".join(header_cells) + " |\n",
+            "|" + "---|" * len(header_cells) + "\n",
+        ]
+
+        # Rows are emitted in the order given by each record's orderIndex.
+        ordered_records = sorted(
+            records.values(), key=lambda record: record.get("orderIndex", "")
+        )
+        for record in ordered_records:
+            values = record.get("values", {})
+            row_cells = [
+                fragment_text(table, values.get(col_id, "")) for col_id in columns
+            ]
+            lines.append("| " + " | ".join(row_cells) + " |\n")
+
+        lines.append("\n")
+        return "".join(lines)
+
+    def parse_block_node(node: dict[str, Any]) -> str:
+        """Render one top-level block; unknown block types yield ""."""
+        block_type = node.get("type", "")
+        children = node.get("nodes", [])
+
+        if block_type in heading_prefixes:
+            return f"{heading_prefixes[block_type]} {inline_text(node)}\n\n"
+
+        if block_type == "paragraph":
+            return f"{inline_text(node)}\n\n"
+
+        if block_type == "list-unordered":
+            items = [f"* {first_child_text(item)}\n" for item in children]
+            return "".join(items) + "\n"
+
+        if block_type == "list-tasks":
+            tasks = []
+            for item in children:
+                checked = item.get("data", {}).get("checked", False)
+                checkbox = "[x]" if checked else "[ ]"
+                tasks.append(f"- {checkbox} {first_child_text(item)}\n")
+            return "".join(tasks) + "\n"
+
+        if block_type == "code":
+            code_lines = [
+                f"{inline_text(line)}\n"
+                for line in children
+                if line.get("type") == "code-line"
+            ]
+            return "".join(code_lines) + "\n"
+
+        if block_type == "blockquote":
+            quoted = [
+                f"> {inline_text(quote)}\n"
+                for quote in children
+                if quote.get("type") == "paragraph"
+            ]
+            return "".join(quoted) + "\n"
+
+        if block_type == "table":
+            return render_table(node)
+
+        return ""
+
+    body = document.get("document") if document else None
+    if not body:
+        return ""
+
+    return "".join(parse_block_node(node) for node in body.get("nodes", []))
+
+
+def _convert_page_to_document(
+    client: GitbookApiClient, space_id: str, page: dict[str, Any]
+) -> Document:
+    """Fetch one page's content and wrap it in a ``Document``.
+
+    Args:
+        client: API client used to download the page content.
+        space_id: Identifier of the space the page belongs to.
+        page: Page entry from the space listing; must contain ``"id"`` and
+            ``"updatedAt"``.
+
+    Returns:
+        A ``Document`` with a single section holding the rendered page text.
+
+    Raises:
+        KeyError: If ``page`` lacks ``"id"`` or ``"updatedAt"``.
+        requests.RequestException: If downloading the page content fails.
+    """
+    page_id = page["id"]
+    page_content = client.get_page_content(space_id, page_id)
+
+    updated_at = datetime.fromisoformat(page["updatedAt"])
+    if updated_at.tzinfo is None:
+        # No offset in the timestamp: treat it as UTC.
+        updated_at = updated_at.replace(tzinfo=timezone.utc)
+    else:
+        # Convert rather than relabel, so a non-UTC offset keeps pointing
+        # at the same instant.
+        updated_at = updated_at.astimezone(timezone.utc)
+
+    return Document(
+        id=f"gitbook-{space_id}-{page_id}",
+        sections=[
+            Section(
+                link=page.get("urls", {}).get("app", ""),
+                text=_extract_text_from_document(page_content),
+            )
+        ],
+        source=DocumentSource.GITBOOK,
+        semantic_identifier=page.get("title", ""),
+        doc_updated_at=updated_at,
+        metadata={
+            "path": page.get("path", ""),
+            "type": page.get("type", ""),
+            "kind": page.get("kind", ""),
+        },
+    )
+
+
+class GitbookConnector(LoadConnector, PollConnector):
+    """Connector that indexes the pages of a single GitBook space.
+
+    Supports a full load (``load_from_state``) and time-windowed polling
+    (``poll_source``); both delegate to ``_fetch_all_pages``.
+
+    Args:
+        space_id: Identifier of the GitBook space to index.
+        batch_size: Maximum number of documents per yielded batch.
+    """
+
+    def __init__(
+        self,
+        space_id: str,
+        batch_size: int = INDEX_BATCH_SIZE,
+    ) -> None:
+        self.space_id = space_id
+        self.batch_size = batch_size
+        # Both stay None until load_credentials() has been called.
+        self.access_token: str | None = None
+        self.client: GitbookApiClient | None = None
+
+    def load_credentials(self, credentials: dict[str, Any]) -> None:
+        """Read ``gitbook_api_key`` from *credentials* and build the client.
+
+        Raises:
+            ConnectorMissingCredentialError: If the key is missing or empty.
+        """
+        access_token = credentials.get("gitbook_api_key")
+        if not access_token:
+            raise ConnectorMissingCredentialError("GitBook access token")
+        self.access_token = access_token
+        self.client = GitbookApiClient(access_token)
+
+    def _fetch_all_pages(
+        self,
+        start: datetime | None = None,
+        end: datetime | None = None,
+    ) -> GenerateDocumentsOutput:
+        """Yield batches of documents for pages updated within the window.
+
+        Args:
+            start: Inclusive lower bound on a page's update time, or None
+                for no lower bound.
+            end: Inclusive upper bound on a page's update time, or None
+                for no upper bound.
+
+        Yields:
+            Lists of at most ``batch_size`` documents.
+
+        Raises:
+            ConnectorMissingCredentialError: If credentials were not loaded.
+            requests.RequestException: If a GitBook request fails; the error
+                is logged and re-raised.
+        """
+        if not self.client:
+            raise ConnectorMissingCredentialError("GitBook")
+
+        try:
+            content = self.client.get(f"/spaces/{self.space_id}/content")
+            pages = content.get("pages", [])
+
+            current_batch: list[Document] = []
+            for page in pages:
+                updated_at = datetime.fromisoformat(page["updatedAt"])
+                if updated_at.tzinfo is None:
+                    # poll_source passes timezone-aware bounds; comparing a
+                    # naive value against them would raise TypeError.
+                    updated_at = updated_at.replace(tzinfo=timezone.utc)
+
+                # The listing is not known to be ordered by update time, so
+                # an out-of-window page is skipped instead of ending the
+                # scan; stopping early could drop newer pages further down.
+                if start is not None and updated_at < start:
+                    continue
+                if end is not None and updated_at > end:
+                    continue
+
+                current_batch.append(
+                    _convert_page_to_document(self.client, self.space_id, page)
+                )
+
+                if len(current_batch) >= self.batch_size:
+                    yield current_batch
+                    current_batch = []
+                    time.sleep(0.1)  # Rate limiting
+
+            if current_batch:
+                yield current_batch
+
+        except requests.RequestException as e:
+            logger.error(f"Error fetching GitBook content: {str(e)}")
+            raise
+
+    def load_from_state(self) -> GenerateDocumentsOutput:
+        """Yield every page of the space, with no time filtering."""
+        return self._fetch_all_pages()
+
+    def poll_source(
+        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
+    ) -> GenerateDocumentsOutput:
+        """Yield pages updated between *start* and *end* (epoch seconds)."""
+        start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
+        end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
+        return self._fetch_all_pages(start_datetime, end_datetime)
+
+
+if __name__ == "__main__":
+    # Manual smoke test: reads the space id and API key from the
+    # environment and prints the first batch of documents.
+    import os
+
+    connector = GitbookConnector(space_id=os.environ["GITBOOK_SPACE_ID"])
+    api_credentials = {"gitbook_api_key": os.environ["GITBOOK_API_KEY"]}
+    connector.load_credentials(api_credentials)
+    first_batch = next(connector.load_from_state())
+    print(first_batch)
--- a/backend/tests/daily/connectors/gitbook/test_gitbook_connector.py
+++ b/backend/tests/daily/connectors/gitbook/test_gitbook_connector.py
@@ -0,0 +1,81 @@
+import os
+import time
+
+import pytest
+
+from onyx.configs.constants import DocumentSource
+from onyx.connectors.gitbook.connector import GitbookConnector
+
+
+@pytest.fixture
+def gitbook_connector() -> GitbookConnector:
+    """Build a connector from GITBOOK_SPACE_ID / GITBOOK_API_KEY env vars."""
+    gitbook = GitbookConnector(space_id=os.environ["GITBOOK_SPACE_ID"])
+    api_credentials = {"gitbook_api_key": os.environ["GITBOOK_API_KEY"]}
+    gitbook.load_credentials(api_credentials)
+    return gitbook
+
+
+def test_gitbook_connector_basic(gitbook_connector: GitbookConnector) -> None:
+    """Check the first loaded document of the test space, then poll once."""
+    # First batch from a full load must contain at least one document.
+    first_batch = next(gitbook_connector.load_from_state())
+    assert len(first_batch) > 0
+
+    doc = first_batch[0]
+
+    # Identity of the document
+    assert doc.id.startswith("gitbook-")
+    assert doc.semantic_identifier == "Acme Corp Internal Handbook"
+    assert doc.source == DocumentSource.GITBOOK
+
+    # Metadata keys the connector always sets
+    for metadata_key in ("path", "type", "kind"):
+        assert metadata_key in doc.metadata
+
+    # Each page is converted into exactly one section
+    assert len(doc.sections) == 1
+    section = doc.sections[0]
+    content = section.text
+
+    expected_snippets = (
+        # list item and blockquote
+        "* Fruit Shopping List:",
+        "> test quote it doesn't mean anything",
+        # headings
+        "# Heading 1",
+        "## Heading 2",
+        "### Heading 3",
+        # task list
+        "- [ ] Uncompleted Task",
+        "- [x] Completed Task",
+        # table rows
+        "| ethereum | 10 | 3000 |",
+        "| bitcoin | 2 | 98000 |",
+        # paragraph text
+        "New York City comprises 5 boroughs",
+        "Empire State Building",
+        # code block (a couple of distinctive fragments)
+        "function fizzBuzz(n)",
+        'res.push("FizzBuzz")',
+    )
+    for snippet in expected_snippets:
+        assert snippet in content
+
+    assert section.link  # Should have a URL
+
+    # Polling from the epoch up to now must return at least one document.
+    now = time.time()
+    poll_batch = next(gitbook_connector.poll_source(0, now))
+    assert len(poll_batch) > 0