From 7f4d1f27a00a0bec214d795df383f1e1b4f33e0e Mon Sep 17 00:00:00 2001
From: neo773 <62795688+neo773@users.noreply.github.com>
Date: Fri, 14 Feb 2025 07:28:05 +0530
Subject: [PATCH] Gitbook connector (#3991)
* add parser
* add tests
---
backend/onyx/configs/constants.py | 1 +
backend/onyx/connectors/factory.py | 2 +
backend/onyx/connectors/gitbook/__init__.py | 0
backend/onyx/connectors/gitbook/connector.py | 281 ++++++++++++++++++
.../gitbook/test_gitbook_connector.py | 81 +++++
web/src/components/icons/icons.tsx | 18 ++
web/src/lib/connectors/connectors.tsx | 14 +
web/src/lib/connectors/credentials.ts | 13 +
web/src/lib/sources.ts | 7 +
web/src/lib/types.ts | 1 +
10 files changed, 418 insertions(+)
create mode 100644 backend/onyx/connectors/gitbook/__init__.py
create mode 100644 backend/onyx/connectors/gitbook/connector.py
create mode 100644 backend/tests/daily/connectors/gitbook/test_gitbook_connector.py
diff --git a/backend/onyx/configs/constants.py b/backend/onyx/configs/constants.py
index a2d951365..139f00fdf 100644
--- a/backend/onyx/configs/constants.py
+++ b/backend/onyx/configs/constants.py
@@ -125,6 +125,7 @@ class DocumentSource(str, Enum):
GMAIL = "gmail"
REQUESTTRACKER = "requesttracker"
GITHUB = "github"
+ GITBOOK = "gitbook"
GITLAB = "gitlab"
GURU = "guru"
BOOKSTACK = "bookstack"
diff --git a/backend/onyx/connectors/factory.py b/backend/onyx/connectors/factory.py
index c7bbc3d70..d204f3c0b 100644
--- a/backend/onyx/connectors/factory.py
+++ b/backend/onyx/connectors/factory.py
@@ -20,6 +20,7 @@ from onyx.connectors.egnyte.connector import EgnyteConnector
from onyx.connectors.file.connector import LocalFileConnector
from onyx.connectors.fireflies.connector import FirefliesConnector
from onyx.connectors.freshdesk.connector import FreshdeskConnector
+from onyx.connectors.gitbook.connector import GitbookConnector
from onyx.connectors.github.connector import GithubConnector
from onyx.connectors.gitlab.connector import GitlabConnector
from onyx.connectors.gmail.connector import GmailConnector
@@ -71,6 +72,7 @@ def identify_connector_class(
DocumentSource.GITHUB: GithubConnector,
DocumentSource.GMAIL: GmailConnector,
DocumentSource.GITLAB: GitlabConnector,
+ DocumentSource.GITBOOK: GitbookConnector,
DocumentSource.GOOGLE_DRIVE: GoogleDriveConnector,
DocumentSource.BOOKSTACK: BookstackConnector,
DocumentSource.CONFLUENCE: ConfluenceConnector,
diff --git a/backend/onyx/connectors/gitbook/__init__.py b/backend/onyx/connectors/gitbook/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/backend/onyx/connectors/gitbook/connector.py b/backend/onyx/connectors/gitbook/connector.py
new file mode 100644
index 000000000..7f9d82a57
--- /dev/null
+++ b/backend/onyx/connectors/gitbook/connector.py
@@ -0,0 +1,281 @@
+import time
+from datetime import datetime
+from datetime import timezone
+from typing import Any
+from urllib.parse import urljoin
+
+import requests
+
+from onyx.configs.app_configs import INDEX_BATCH_SIZE
+from onyx.configs.constants import DocumentSource
+from onyx.connectors.interfaces import GenerateDocumentsOutput
+from onyx.connectors.interfaces import LoadConnector
+from onyx.connectors.interfaces import PollConnector
+from onyx.connectors.interfaces import SecondsSinceUnixEpoch
+from onyx.connectors.models import ConnectorMissingCredentialError
+from onyx.connectors.models import Document
+from onyx.connectors.models import Section
+from onyx.utils.logger import setup_logger
+
+
+logger = setup_logger()
+
+GITBOOK_API_BASE = "https://api.gitbook.com/v1/"
+
+
+class GitbookApiClient:
+    """Minimal read-only client for the GitBook REST API.
+
+    Every request carries the bearer token given at construction time and is
+    resolved relative to ``GITBOOK_API_BASE``.
+    """
+
+    def __init__(self, access_token: str, timeout: float = 30.0) -> None:
+        """Store the bearer token and the per-request timeout (seconds)."""
+        self.access_token = access_token
+        self.timeout = timeout
+
+    def get(self, endpoint: str, params: dict[str, Any] | None = None) -> Any:
+        """GET ``endpoint`` and return the decoded JSON body.
+
+        Raises:
+            requests.HTTPError: the server answered with a 4xx/5xx status.
+            requests.Timeout: no answer arrived within ``self.timeout``.
+        """
+        headers = {
+            "Authorization": f"Bearer {self.access_token}",
+            "Content-Type": "application/json",
+        }
+
+        # Drop the leading slash so urljoin appends to the base path instead
+        # of replacing it.
+        url = urljoin(GITBOOK_API_BASE, endpoint.lstrip("/"))
+        # Without an explicit timeout `requests` waits indefinitely on a
+        # stalled connection, which would hang the whole indexing run.
+        response = requests.get(
+            url, headers=headers, params=params, timeout=self.timeout
+        )
+        response.raise_for_status()
+        return response.json()
+
+    def get_page_content(self, space_id: str, page_id: str) -> dict[str, Any]:
+        """Fetch the content payload of one page of a space."""
+        return self.get(f"/spaces/{space_id}/content/page/{page_id}")
+
+
+def _extract_text_from_document(document: dict[str, Any]) -> str:
+    """Render a GitBook page-content payload as markdown-style text.
+
+    The top-level block nodes under ``document["document"]["nodes"]`` are
+    rendered in order. Headings (levels 1-6), paragraphs, unordered lists,
+    task lists, code blocks, blockquotes and tables are supported; any other
+    block type contributes nothing.
+
+    Returns:
+        The concatenated text, or ``""`` when ``document`` is empty or has no
+        ``"document"`` key.
+    """
+
+    heading_prefixes = {f"heading-{level}": "#" * level for level in range(1, 7)}
+
+    def parse_text_node(node: dict[str, Any]) -> str:
+        # Only the raw text of each leaf is kept; inline marks are ignored.
+        return "".join(leaf.get("text", "") for leaf in node.get("leaves", []))
+
+    def inline_text(node: dict[str, Any]) -> str:
+        # Text of a block = text of all of its direct child text nodes.
+        return "".join(parse_text_node(child) for child in node.get("nodes", []))
+
+    def first_child_text(item: dict[str, Any]) -> str:
+        # A list item keeps its text in its first child block. An item with
+        # no children renders as empty text instead of raising IndexError.
+        children = item.get("nodes", [])
+        return inline_text(children[0]) if children else ""
+
+    def parse_table(node: dict[str, Any]) -> str:
+        data = node.get("data", {})
+        records = data.get("records", {})
+        definition = data.get("definition", {})
+        columns = data.get("view", {}).get("columns", [])
+        fragments = node.get("fragments", [])
+
+        def cell_text(fragment_id: Any) -> str:
+            # A cell value is the id of a fragment; the cell's text is the
+            # first paragraph of that fragment.
+            text = ""
+            for fragment in fragments:
+                if fragment.get("fragment") != fragment_id:
+                    continue
+                for frag_node in fragment.get("nodes", []):
+                    if frag_node.get("type") == "paragraph":
+                        text = inline_text(frag_node)
+                        break
+            return text
+
+        header_cells = [
+            definition.get(col_id, {}).get("title", "") for col_id in columns
+        ]
+        lines = [
+            "| " + " | ".join(header_cells) + " |",
+            "|" + "---|" * len(header_cells),
+        ]
+
+        ordered_records = sorted(
+            records.values(), key=lambda record: record.get("orderIndex", "")
+        )
+        for record in ordered_records:
+            values = record.get("values", {})
+            row_cells = [cell_text(values.get(col_id, "")) for col_id in columns]
+            lines.append("| " + " | ".join(row_cells) + " |")
+
+        return "\n".join(lines) + "\n\n"
+
+    def parse_block_node(node: dict[str, Any]) -> str:
+        block_type = node.get("type", "")
+        children = node.get("nodes", [])
+
+        if block_type in heading_prefixes:
+            return f"{heading_prefixes[block_type]} {inline_text(node)}\n\n"
+
+        if block_type == "paragraph":
+            return f"{inline_text(node)}\n\n"
+
+        if block_type == "list-unordered":
+            items = [f"* {first_child_text(item)}\n" for item in children]
+            return "".join(items) + "\n"
+
+        if block_type == "list-tasks":
+            tasks = []
+            for item in children:
+                checked = item.get("data", {}).get("checked", False)
+                checkbox = "[x]" if checked else "[ ]"
+                tasks.append(f"- {checkbox} {first_child_text(item)}\n")
+            return "".join(tasks) + "\n"
+
+        if block_type == "code":
+            code_lines = [
+                f"{inline_text(line)}\n"
+                for line in children
+                if line.get("type") == "code-line"
+            ]
+            return "".join(code_lines) + "\n"
+
+        if block_type == "blockquote":
+            quoted = [
+                f"> {inline_text(child)}\n"
+                for child in children
+                if child.get("type") == "paragraph"
+            ]
+            return "".join(quoted) + "\n"
+
+        if block_type == "table":
+            return parse_table(node)
+
+        # Unsupported block types are skipped.
+        return ""
+
+    if not document or "document" not in document:
+        return ""
+
+    nodes = document["document"].get("nodes", [])
+    return "".join(parse_block_node(node) for node in nodes)
+
+
+def _convert_page_to_document(
+    client: GitbookApiClient, space_id: str, page: dict[str, Any]
+) -> Document:
+    """Build a ``Document`` for one page entry of a space's content listing.
+
+    The page body is fetched through ``client`` and rendered to text; title,
+    link, path, type and kind are taken from the listing entry itself.
+
+    Raises:
+        KeyError: ``page`` has no ``"id"`` or ``"updatedAt"`` key.
+        requests.RequestException: fetching the page content failed.
+    """
+    page_id = page["id"]
+    page_content = client.get_page_content(space_id, page_id)
+
+    return Document(
+        id=f"gitbook-{space_id}-{page_id}",
+        sections=[
+            Section(
+                link=page.get("urls", {}).get("app", ""),
+                text=_extract_text_from_document(page_content),
+            )
+        ],
+        source=DocumentSource.GITBOOK,
+        semantic_identifier=page.get("title", ""),
+        doc_updated_at=_parse_gitbook_timestamp(page["updatedAt"]),
+        metadata={
+            "path": page.get("path", ""),
+            "type": page.get("type", ""),
+            "kind": page.get("kind", ""),
+        },
+    )
+
+
+def _parse_gitbook_timestamp(raw: str) -> datetime:
+    """Parse an ISO-8601 timestamp into a timezone-aware UTC datetime."""
+    # `datetime.fromisoformat` accepts a trailing "Z" only from Python 3.11 on.
+    if raw.endswith("Z"):
+        raw = raw[:-1] + "+00:00"
+    parsed = datetime.fromisoformat(raw)
+    if parsed.tzinfo is None:
+        # No offset in the string: interpret the wall-clock value as UTC.
+        return parsed.replace(tzinfo=timezone.utc)
+    # An explicit offset has to be converted; overwriting tzinfo would keep
+    # the wall-clock digits and shift the actual instant.
+    return parsed.astimezone(timezone.utc)
+
+
+class GitbookConnector(LoadConnector, PollConnector):
+    """Connector that turns the pages of one GitBook space into documents.
+
+    ``load_from_state`` yields every listed page; ``poll_source`` yields only
+    pages whose ``updatedAt`` falls inside the requested window.
+    """
+
+    def __init__(
+        self,
+        space_id: str,
+        batch_size: int = INDEX_BATCH_SIZE,
+    ) -> None:
+        self.space_id = space_id
+        self.batch_size = batch_size
+        # Both stay None until load_credentials() succeeds.
+        self.access_token: str | None = None
+        self.client: GitbookApiClient | None = None
+
+    def load_credentials(self, credentials: dict[str, Any]) -> None:
+        """Read ``gitbook_api_key`` from ``credentials`` and create the client.
+
+        Raises:
+            ConnectorMissingCredentialError: the key is absent or empty.
+        """
+        access_token = credentials.get("gitbook_api_key")
+        if not access_token:
+            raise ConnectorMissingCredentialError("GitBook access token")
+        self.access_token = access_token
+        self.client = GitbookApiClient(access_token)
+
+    @staticmethod
+    def _parse_updated_at(raw: str) -> datetime:
+        """Parse a page's ``updatedAt`` into a timezone-aware UTC datetime."""
+        # `datetime.fromisoformat` accepts a trailing "Z" only from 3.11 on.
+        if raw.endswith("Z"):
+            raw = raw[:-1] + "+00:00"
+        parsed = datetime.fromisoformat(raw)
+        if parsed.tzinfo is None:
+            return parsed.replace(tzinfo=timezone.utc)
+        return parsed.astimezone(timezone.utc)
+
+    def _fetch_all_pages(
+        self,
+        start: datetime | None = None,
+        end: datetime | None = None,
+    ) -> GenerateDocumentsOutput:
+        """Yield batches of documents for the pages listed for the space.
+
+        Pages updated before ``start`` or after ``end`` are skipped when the
+        respective bound is given.
+
+        Raises:
+            ConnectorMissingCredentialError: credentials were never loaded.
+            requests.RequestException: an API call failed (logged, re-raised).
+        """
+        if not self.client:
+            raise ConnectorMissingCredentialError("GitBook")
+
+        try:
+            content = self.client.get(f"/spaces/{self.space_id}/content")
+            # NOTE(review): only the entries of the top-level "pages" list are
+            # visited; confirm whether nested child pages need walking too.
+            pages = content.get("pages", [])
+
+            current_batch: list[Document] = []
+            for page in pages:
+                # Normalised to aware UTC so the comparison with the aware
+                # poll bounds cannot raise TypeError.
+                updated_at = self._parse_updated_at(page["updatedAt"])
+
+                # Nothing here establishes that the listing is sorted by
+                # update time, so an out-of-window page must be skipped on
+                # its own; stopping would drop every page listed after it.
+                if start and updated_at < start:
+                    continue
+                if end and updated_at > end:
+                    continue
+
+                current_batch.append(
+                    _convert_page_to_document(self.client, self.space_id, page)
+                )
+
+                if len(current_batch) >= self.batch_size:
+                    yield current_batch
+                    current_batch = []
+                    time.sleep(0.1)  # Rate limiting
+
+            if current_batch:
+                yield current_batch
+
+        except requests.RequestException as e:
+            logger.error(f"Error fetching GitBook content: {str(e)}")
+            raise
+
+    def load_from_state(self) -> GenerateDocumentsOutput:
+        """Yield every listed page of the space, without time filtering."""
+        return self._fetch_all_pages()
+
+    def poll_source(
+        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
+    ) -> GenerateDocumentsOutput:
+        """Yield the pages updated between ``start`` and ``end`` (epoch secs)."""
+        start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
+        end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
+        return self._fetch_all_pages(start_datetime, end_datetime)
+
+
+if __name__ == "__main__":
+    import os
+
+    # Manual smoke run: index the space named in the environment and print
+    # the first batch of documents.
+    gitbook = GitbookConnector(space_id=os.environ["GITBOOK_SPACE_ID"])
+    gitbook.load_credentials({"gitbook_api_key": os.environ["GITBOOK_API_KEY"]})
+    first_batch = next(gitbook.load_from_state())
+    print(first_batch)
diff --git a/backend/tests/daily/connectors/gitbook/test_gitbook_connector.py b/backend/tests/daily/connectors/gitbook/test_gitbook_connector.py
new file mode 100644
index 000000000..4d3b0cefb
--- /dev/null
+++ b/backend/tests/daily/connectors/gitbook/test_gitbook_connector.py
@@ -0,0 +1,81 @@
+import os
+import time
+
+import pytest
+
+from onyx.configs.constants import DocumentSource
+from onyx.connectors.gitbook.connector import GitbookConnector
+
+
+@pytest.fixture
+def gitbook_connector() -> GitbookConnector:
+    """Connector for the space and API key named in the environment."""
+    instance = GitbookConnector(space_id=os.environ["GITBOOK_SPACE_ID"])
+    api_key = os.environ["GITBOOK_API_KEY"]
+    instance.load_credentials({"gitbook_api_key": api_key})
+    return instance
+
+
+def test_gitbook_connector_basic(gitbook_connector: GitbookConnector) -> None:
+    """Check the first indexed document of the test space, then poll it."""
+    first_batch = next(gitbook_connector.load_from_state())
+    assert len(first_batch) > 0
+
+    doc = first_batch[0]
+
+    # Identity of the document
+    assert doc.id.startswith("gitbook-")
+    assert doc.semantic_identifier == "Acme Corp Internal Handbook"
+    assert doc.source == DocumentSource.GITBOOK
+
+    # Metadata keys set by the connector
+    for metadata_key in ("path", "type", "kind"):
+        assert metadata_key in doc.metadata
+
+    # Exactly one section carrying the rendered page text
+    assert len(doc.sections) == 1
+    section = doc.sections[0]
+    content = section.text
+
+    expected_snippets = (
+        # list and blockquote
+        "* Fruit Shopping List:",
+        "> test quote it doesn't mean anything",
+        # headings
+        "# Heading 1",
+        "## Heading 2",
+        "### Heading 3",
+        # task list
+        "- [ ] Uncompleted Task",
+        "- [x] Completed Task",
+        # table rows
+        "| ethereum | 10 | 3000 |",
+        "| bitcoin | 2 | 98000 |",
+        # paragraphs
+        "New York City comprises 5 boroughs",
+        "Empire State Building",
+        # code block
+        "function fizzBuzz(n)",
+        'res.push("FizzBuzz")',
+    )
+    for snippet in expected_snippets:
+        assert snippet in content
+
+    assert section.link  # Should have a URL
+
+    # Polling from the epoch until now must return at least one document
+    now = time.time()
+    poll_batch = next(gitbook_connector.poll_source(0, now))
+    assert len(poll_batch) > 0
diff --git a/web/src/components/icons/icons.tsx b/web/src/components/icons/icons.tsx
index 923e1606c..b67830111 100644
--- a/web/src/components/icons/icons.tsx
+++ b/web/src/components/icons/icons.tsx
@@ -2889,6 +2889,24 @@ export const AirtableIcon = ({
return ;
};
+export const GitbookIcon = ({
+ size = 16,
+ className = defaultTailwindCSS,
+}: IconProps) => (
+
+);
+
export const PinnedIcon = ({
size = 16,
className = defaultTailwindCSS,
diff --git a/web/src/lib/connectors/connectors.tsx b/web/src/lib/connectors/connectors.tsx
index 58958c742..936383f28 100644
--- a/web/src/lib/connectors/connectors.tsx
+++ b/web/src/lib/connectors/connectors.tsx
@@ -227,6 +227,20 @@ export const connectorConfigs: Record<
],
advanced_values: [],
},
+ gitbook: {
+ description: "Configure GitBook connector",
+ values: [
+ {
+ type: "text",
+ query: "Enter the space ID:",
+ label: "Space ID",
+ name: "space_id",
+ optional: false,
+ description: "The ID of the GitBook space to index.",
+ },
+ ],
+ advanced_values: [],
+ },
google_drive: {
description: "Configure Google Drive connector",
values: [
diff --git a/web/src/lib/connectors/credentials.ts b/web/src/lib/connectors/credentials.ts
index b32d9b54d..22482f0fa 100644
--- a/web/src/lib/connectors/credentials.ts
+++ b/web/src/lib/connectors/credentials.ts
@@ -30,6 +30,11 @@ export interface GithubCredentialJson {
github_access_token: string;
}
+// Credential fields collected for the GitBook connector.
+// NOTE(review): the backend connector added in this patch reads only
+// `gitbook_api_key` from the credential payload and receives the space ID
+// through its own `space_id` connector setting; confirm that
+// `gitbook_space_id` is really needed as a credential field.
+export interface GitbookCredentialJson {
+ gitbook_space_id: string;
+ gitbook_api_key: string;
+}
+
export interface GitlabCredentialJson {
gitlab_url: string;
gitlab_access_token: string;
@@ -344,6 +349,10 @@ export const credentialTemplates: Record = {
// NOTE: These are Special Cases
google_drive: { google_tokens: "" } as GoogleDriveCredentialJson,
gmail: { google_tokens: "" } as GmailCredentialJson,
+ gitbook: {
+ gitbook_space_id: "",
+ gitbook_api_key: "",
+ } as GitbookCredentialJson,
};
export const credentialDisplayNames: Record = {
@@ -474,6 +483,10 @@ export const credentialDisplayNames: Record = {
// Fireflies
fireflies_api_key: "Fireflies API Key",
+
+ // GitBook
+ gitbook_space_id: "GitBook Space ID",
+ gitbook_api_key: "GitBook API Key",
};
export function getDisplayNameForCredentialKey(key: string): string {
diff --git a/web/src/lib/sources.ts b/web/src/lib/sources.ts
index a75f8aeb9..53fb130b1 100644
--- a/web/src/lib/sources.ts
+++ b/web/src/lib/sources.ts
@@ -43,6 +43,7 @@ import {
AirtableIcon,
GlobeIcon2,
FileIcon2,
+ GitbookIcon,
} from "@/components/icons/icons";
import { ValidSources } from "./types";
import {
@@ -321,6 +322,12 @@ export const SOURCE_METADATA_MAP: SourceMap = {
category: SourceCategory.Other,
docs: "https://docs.onyx.app/connectors/airtable",
},
+ gitbook: {
+ icon: GitbookIcon,
+ displayName: "GitBook",
+ category: SourceCategory.CodeRepository,
+ docs: "https://docs.onyx.app/connectors/gitbook",
+ },
// currently used for the Internet Search tool docs, which is why
// a globe is used
not_applicable: {
diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts
index 8ad658fb0..a250df10f 100644
--- a/web/src/lib/types.ts
+++ b/web/src/lib/types.ts
@@ -350,6 +350,7 @@ export enum ValidSources {
Fireflies = "fireflies",
Egnyte = "egnyte",
Airtable = "airtable",
+ Gitbook = "gitbook",
}
export const validAutoSyncSources = [