From 7f4d1f27a00a0bec214d795df383f1e1b4f33e0e Mon Sep 17 00:00:00 2001 From: neo773 <62795688+neo773@users.noreply.github.com> Date: Fri, 14 Feb 2025 07:28:05 +0530 Subject: [PATCH] Gitbook connector (#3991) * add parser * add tests --- backend/onyx/configs/constants.py | 1 + backend/onyx/connectors/factory.py | 2 + backend/onyx/connectors/gitbook/__init__.py | 0 backend/onyx/connectors/gitbook/connector.py | 281 ++++++++++++++++++ .../gitbook/test_gitbook_connector.py | 81 +++++ web/src/components/icons/icons.tsx | 18 ++ web/src/lib/connectors/connectors.tsx | 14 + web/src/lib/connectors/credentials.ts | 13 + web/src/lib/sources.ts | 7 + web/src/lib/types.ts | 1 + 10 files changed, 418 insertions(+) create mode 100644 backend/onyx/connectors/gitbook/__init__.py create mode 100644 backend/onyx/connectors/gitbook/connector.py create mode 100644 backend/tests/daily/connectors/gitbook/test_gitbook_connector.py diff --git a/backend/onyx/configs/constants.py b/backend/onyx/configs/constants.py index a2d951365..139f00fdf 100644 --- a/backend/onyx/configs/constants.py +++ b/backend/onyx/configs/constants.py @@ -125,6 +125,7 @@ class DocumentSource(str, Enum): GMAIL = "gmail" REQUESTTRACKER = "requesttracker" GITHUB = "github" + GITBOOK = "gitbook" GITLAB = "gitlab" GURU = "guru" BOOKSTACK = "bookstack" diff --git a/backend/onyx/connectors/factory.py b/backend/onyx/connectors/factory.py index c7bbc3d70..d204f3c0b 100644 --- a/backend/onyx/connectors/factory.py +++ b/backend/onyx/connectors/factory.py @@ -20,6 +20,7 @@ from onyx.connectors.egnyte.connector import EgnyteConnector from onyx.connectors.file.connector import LocalFileConnector from onyx.connectors.fireflies.connector import FirefliesConnector from onyx.connectors.freshdesk.connector import FreshdeskConnector +from onyx.connectors.gitbook.connector import GitbookConnector from onyx.connectors.github.connector import GithubConnector from onyx.connectors.gitlab.connector import GitlabConnector from 
"""GitBook connector: indexes the pages of one GitBook space as markdown."""
import time
from collections.abc import Iterator
from datetime import datetime
from datetime import timezone
from typing import Any
from urllib.parse import urljoin

import requests

from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.constants import DocumentSource
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.utils.logger import setup_logger


logger = setup_logger()

GITBOOK_API_BASE = "https://api.gitbook.com/v1/"

# `requests` never times out on its own; without a limit a stalled connection
# would block the indexing run forever.
_DEFAULT_REQUEST_TIMEOUT_SECONDS = 60

# Block type -> markdown heading prefix ("heading-1" -> "#", ... "heading-6").
_HEADING_PREFIXES = {f"heading-{level}": "#" * level for level in range(1, 7)}


class GitbookApiClient:
    """Minimal read-only client for the GitBook REST API."""

    def __init__(
        self,
        access_token: str,
        timeout: float = _DEFAULT_REQUEST_TIMEOUT_SECONDS,
    ) -> None:
        """Store the bearer token and the per-request timeout (seconds)."""
        self.access_token = access_token
        self.timeout = timeout

    def get(self, endpoint: str, params: dict[str, Any] | None = None) -> Any:
        """GET `endpoint` (relative to GITBOOK_API_BASE) and return the JSON body.

        Raises:
            requests.RequestException: on connection errors, timeouts, or a
                non-2xx response (via `raise_for_status`).
        """
        headers = {
            "Authorization": f"Bearer {self.access_token}",
            "Content-Type": "application/json",
        }

        # The leading slash is stripped so urljoin appends to the "/v1/" base
        # path instead of replacing it.
        url = urljoin(GITBOOK_API_BASE, endpoint.lstrip("/"))
        response = requests.get(
            url, headers=headers, params=params, timeout=self.timeout
        )
        response.raise_for_status()
        return response.json()

    def get_page_content(self, space_id: str, page_id: str) -> dict[str, Any]:
        """Fetch a single page of a space, including its document tree."""
        return self.get(f"/spaces/{space_id}/content/page/{page_id}")


def _inline_text(node: dict[str, Any]) -> str:
    """Return the concatenated leaf text of `node`'s inline children.

    Children that carry `leaves` contribute their leaf texts. Children without
    `leaves` (inline wrappers such as links) are descended into so that their
    text is not lost.
    """
    parts: list[str] = []
    for child in node.get("nodes", []):
        if "leaves" in child:
            parts.extend(leaf.get("text", "") for leaf in child["leaves"])
        else:
            parts.append(_inline_text(child))
    return "".join(parts)


def _first_child_text(item: dict[str, Any]) -> str:
    """Return the text of a list item's first child, or "" if it has none."""
    children = item.get("nodes", [])
    return _inline_text(children[0]) if children else ""


def _render_table(node: dict[str, Any]) -> str:
    """Render a GitBook table block as a markdown table."""
    data = node.get("data", {})
    records = data.get("records", {})
    definition = data.get("definition", {})
    columns = data.get("view", {}).get("columns", [])

    # Cell values reference fragments by id. Index the fragments once instead
    # of rescanning all of them for every cell.
    text_by_fragment: dict[Any, str] = {}
    for fragment in node.get("fragments", []):
        for frag_node in fragment.get("nodes", []):
            if frag_node.get("type") == "paragraph":
                text_by_fragment[fragment.get("fragment")] = _inline_text(frag_node)
                break

    header_cells = [definition.get(col_id, {}).get("title", "") for col_id in columns]
    lines = [
        "| " + " | ".join(header_cells) + " |\n",
        "|" + "---|" * len(header_cells) + "\n",
    ]

    ordered_records = sorted(
        records.values(), key=lambda record: record.get("orderIndex", "")
    )
    for record in ordered_records:
        values = record.get("values", {})
        row_cells = []
        for col_id in columns:
            fragment_id = values.get(col_id, "")
            # Only string values can be fragment references; anything else
            # (and any unknown id) renders as an empty cell.
            if isinstance(fragment_id, str):
                row_cells.append(text_by_fragment.get(fragment_id, ""))
            else:
                row_cells.append("")
        lines.append("| " + " | ".join(row_cells) + " |\n")

    lines.append("\n")
    return "".join(lines)


def _render_block(node: dict[str, Any]) -> str:
    """Render one top-level block node as markdown ("" for unknown types)."""
    block_type = node.get("type", "")
    children = node.get("nodes", [])

    heading_prefix = _HEADING_PREFIXES.get(block_type)
    if heading_prefix is not None:
        return f"{heading_prefix} {_inline_text(node)}\n\n"

    if block_type == "paragraph":
        return f"{_inline_text(node)}\n\n"

    if block_type == "list-unordered":
        return "".join(f"* {_first_child_text(item)}\n" for item in children) + "\n"

    if block_type == "list-ordered":
        return (
            "".join(
                f"{number}. {_first_child_text(item)}\n"
                for number, item in enumerate(children, start=1)
            )
            + "\n"
        )

    if block_type == "list-tasks":
        rendered = []
        for task_item in children:
            checked = task_item.get("data", {}).get("checked", False)
            checkbox = "[x]" if checked else "[ ]"
            rendered.append(f"- {checkbox} {_first_child_text(task_item)}\n")
        return "".join(rendered) + "\n"

    if block_type == "code":
        return (
            "".join(
                f"{_inline_text(code_line)}\n"
                for code_line in children
                if code_line.get("type") == "code-line"
            )
            + "\n"
        )

    if block_type == "blockquote":
        return (
            "".join(
                f"> {_inline_text(quote_node)}\n"
                for quote_node in children
                if quote_node.get("type") == "paragraph"
            )
            + "\n"
        )

    if block_type == "table":
        return _render_table(node)

    return ""


def _extract_text_from_document(document: dict[str, Any]) -> str:
    """Convert a GitBook page payload into markdown text.

    Args:
        document: page payload whose "document" entry holds the node tree.

    Returns:
        Markdown for the headings, paragraphs, lists, task lists, code blocks,
        blockquotes and tables found at the top level of the tree; "" when the
        payload is empty or has no "document" entry. Unknown block types are
        skipped.
    """
    if not document or "document" not in document:
        return ""

    nodes = document["document"].get("nodes", [])
    return "".join(_render_block(node) for node in nodes)


def _parse_timestamp(raw: str) -> datetime:
    """Parse an ISO-8601 timestamp into a timezone-aware UTC datetime."""
    # `datetime.fromisoformat` only accepts the "Z" suffix from Python 3.11 on.
    if raw.endswith("Z"):
        raw = raw[:-1] + "+00:00"
    parsed = datetime.fromisoformat(raw)
    if parsed.tzinfo is None:
        # Naive timestamps are taken to be UTC, as the original code did.
        return parsed.replace(tzinfo=timezone.utc)
    # Convert rather than overwrite, so a real offset is not discarded.
    return parsed.astimezone(timezone.utc)


def _page_updated_at(page: dict[str, Any]) -> datetime | None:
    """Return the page's `updatedAt` as aware UTC, or None when it is absent."""
    raw = page.get("updatedAt")
    return _parse_timestamp(raw) if raw else None


def _iter_pages(pages: list[dict[str, Any]]) -> Iterator[dict[str, Any]]:
    """Yield every page depth-first, parents before their children.

    NOTE(review): child pages are assumed to be nested under a "pages" key;
    pages without that key simply yield no children. Confirm against the API.
    """
    for page in pages:
        yield page
        yield from _iter_pages(page.get("pages") or [])


def _convert_page_to_document(
    client: GitbookApiClient, space_id: str, page: dict[str, Any]
) -> Document:
    """Fetch the content of `page` and wrap it in a single-section Document.

    Raises:
        requests.RequestException: if fetching the page content fails.
    """
    page_id = page["id"]
    page_content = client.get_page_content(space_id, page_id)

    return Document(
        id=f"gitbook-{space_id}-{page_id}",
        sections=[
            Section(
                link=page.get("urls", {}).get("app", ""),
                text=_extract_text_from_document(page_content),
            )
        ],
        source=DocumentSource.GITBOOK,
        semantic_identifier=page.get("title", ""),
        doc_updated_at=_page_updated_at(page),
        metadata={
            "path": page.get("path", ""),
            "type": page.get("type", ""),
            "kind": page.get("kind", ""),
        },
    )


class GitbookConnector(LoadConnector, PollConnector):
    """Load/poll connector that indexes the pages of one GitBook space."""

    def __init__(
        self,
        space_id: str,
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        self.space_id = space_id
        self.batch_size = batch_size
        # Both stay None until load_credentials() is called.
        self.access_token: str | None = None
        self.client: GitbookApiClient | None = None

    def load_credentials(self, credentials: dict[str, Any]) -> None:
        """Read `gitbook_api_key` from `credentials` and build the API client.

        Raises:
            ConnectorMissingCredentialError: if the key is missing or empty.
        """
        access_token = credentials.get("gitbook_api_key")
        if not access_token:
            raise ConnectorMissingCredentialError("GitBook access token")
        self.access_token = access_token
        self.client = GitbookApiClient(access_token)

    def _fetch_all_pages(
        self,
        start: datetime | None = None,
        end: datetime | None = None,
    ) -> GenerateDocumentsOutput:
        """Yield batches of Documents for pages updated within [start, end].

        Args:
            start: inclusive lower bound on `updatedAt`; None means unbounded.
            end: inclusive upper bound on `updatedAt`; None means unbounded.

        Pages without an `updatedAt` value are indexed on an unbounded run and
        skipped whenever a bound is given, since they cannot be placed in the
        window.

        Raises:
            ConnectorMissingCredentialError: if credentials were never loaded.
            requests.RequestException: on any API failure (logged, re-raised).
        """
        if not self.client:
            raise ConnectorMissingCredentialError("GitBook")

        try:
            content = self.client.get(f"/spaces/{self.space_id}/content")

            current_batch: list[Document] = []
            for page in _iter_pages(content.get("pages", [])):
                updated_at = _page_updated_at(page)

                if updated_at is None:
                    if start is not None or end is not None:
                        continue
                # The listing is not requested in modification order, so an
                # out-of-window page must be skipped individually; stopping at
                # the first old page would drop newer pages listed after it.
                elif start is not None and updated_at < start:
                    continue
                elif end is not None and updated_at > end:
                    continue

                current_batch.append(
                    _convert_page_to_document(self.client, self.space_id, page)
                )

                if len(current_batch) >= self.batch_size:
                    yield current_batch
                    current_batch = []
                    time.sleep(0.1)  # Rate limiting

            if current_batch:
                yield current_batch

        except requests.RequestException as e:
            logger.error(f"Error fetching GitBook content: {str(e)}")
            raise

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Index every page of the space."""
        return self._fetch_all_pages()

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        """Index the pages updated between the epoch seconds `start` and `end`."""
        start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
        end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
        return self._fetch_all_pages(start_datetime, end_datetime)


if __name__ == "__main__":
    import os

    connector = GitbookConnector(
        space_id=os.environ["GITBOOK_SPACE_ID"],
    )
    connector.load_credentials({"gitbook_api_key": os.environ["GITBOOK_API_KEY"]})
    document_batches = connector.load_from_state()
    print(next(document_batches))
doc.id.startswith("gitbook-") + assert doc.semantic_identifier == "Acme Corp Internal Handbook" + assert doc.source == DocumentSource.GITBOOK + + # Metadata checks + assert "path" in doc.metadata + assert "type" in doc.metadata + assert "kind" in doc.metadata + + # Section checks + assert len(doc.sections) == 1 + section = doc.sections[0] + + # Content specific checks + content = section.text + + # Check for specific content elements + assert "* Fruit Shopping List:" in content + assert "> test quote it doesn't mean anything" in content + + # Check headings + assert "# Heading 1" in content + assert "## Heading 2" in content + assert "### Heading 3" in content + + # Check task list + assert "- [ ] Uncompleted Task" in content + assert "- [x] Completed Task" in content + + # Check table content + assert "| ethereum | 10 | 3000 |" in content + assert "| bitcoin | 2 | 98000 |" in content + + # Check paragraph content + assert "New York City comprises 5 boroughs" in content + assert "Empire State Building" in content + + # Check code block (just verify presence of some unique code elements) + assert "function fizzBuzz(n)" in content + assert 'res.push("FizzBuzz")' in content + + assert section.link # Should have a URL + + # Time-based polling test + current_time = time.time() + poll_docs = gitbook_connector.poll_source(0, current_time) + poll_batch = next(poll_docs) + assert len(poll_batch) > 0 diff --git a/web/src/components/icons/icons.tsx b/web/src/components/icons/icons.tsx index 923e1606c..b67830111 100644 --- a/web/src/components/icons/icons.tsx +++ b/web/src/components/icons/icons.tsx @@ -2889,6 +2889,24 @@ export const AirtableIcon = ({ return ; }; +export const GitbookIcon = ({ + size = 16, + className = defaultTailwindCSS, +}: IconProps) => ( + + + +); + export const PinnedIcon = ({ size = 16, className = defaultTailwindCSS, diff --git a/web/src/lib/connectors/connectors.tsx b/web/src/lib/connectors/connectors.tsx index 58958c742..936383f28 100644 --- 
a/web/src/lib/connectors/connectors.tsx +++ b/web/src/lib/connectors/connectors.tsx @@ -227,6 +227,20 @@ export const connectorConfigs: Record< ], advanced_values: [], }, + gitbook: { + description: "Configure GitBook connector", + values: [ + { + type: "text", + query: "Enter the space ID:", + label: "Space ID", + name: "space_id", + optional: false, + description: "The ID of the GitBook space to index.", + }, + ], + advanced_values: [], + }, google_drive: { description: "Configure Google Drive connector", values: [ diff --git a/web/src/lib/connectors/credentials.ts b/web/src/lib/connectors/credentials.ts index b32d9b54d..22482f0fa 100644 --- a/web/src/lib/connectors/credentials.ts +++ b/web/src/lib/connectors/credentials.ts @@ -30,6 +30,11 @@ export interface GithubCredentialJson { github_access_token: string; } +export interface GitbookCredentialJson { + gitbook_space_id: string; + gitbook_api_key: string; +} + export interface GitlabCredentialJson { gitlab_url: string; gitlab_access_token: string; @@ -344,6 +349,10 @@ export const credentialTemplates: Record = { // NOTE: These are Special Cases google_drive: { google_tokens: "" } as GoogleDriveCredentialJson, gmail: { google_tokens: "" } as GmailCredentialJson, + gitbook: { + gitbook_space_id: "", + gitbook_api_key: "", + } as GitbookCredentialJson, }; export const credentialDisplayNames: Record = { @@ -474,6 +483,10 @@ export const credentialDisplayNames: Record = { // Fireflies fireflies_api_key: "Fireflies API Key", + + // GitBook + gitbook_space_id: "GitBook Space ID", + gitbook_api_key: "GitBook API Key", }; export function getDisplayNameForCredentialKey(key: string): string { diff --git a/web/src/lib/sources.ts b/web/src/lib/sources.ts index a75f8aeb9..53fb130b1 100644 --- a/web/src/lib/sources.ts +++ b/web/src/lib/sources.ts @@ -43,6 +43,7 @@ import { AirtableIcon, GlobeIcon2, FileIcon2, + GitbookIcon, } from "@/components/icons/icons"; import { ValidSources } from "./types"; import { @@ -321,6 +322,12 
@@ export const SOURCE_METADATA_MAP: SourceMap = { category: SourceCategory.Other, docs: "https://docs.onyx.app/connectors/airtable", }, + gitbook: { + icon: GitbookIcon, + displayName: "GitBook", + category: SourceCategory.CodeRepository, + docs: "https://docs.onyx.app/connectors/gitbook", + }, // currently used for the Internet Search tool docs, which is why // a globe is used not_applicable: { diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts index 8ad658fb0..a250df10f 100644 --- a/web/src/lib/types.ts +++ b/web/src/lib/types.ts @@ -350,6 +350,7 @@ export enum ValidSources { Fireflies = "fireflies", Egnyte = "egnyte", Airtable = "airtable", + Gitbook = "gitbook", } export const validAutoSyncSources = [