Gitbook connector (#3991)

* add parser

* add tests
This commit is contained in:
neo773 2025-02-14 07:28:05 +05:30 committed by GitHub
parent b70db15622
commit 7f4d1f27a0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 418 additions and 0 deletions

View File

@ -125,6 +125,7 @@ class DocumentSource(str, Enum):
GMAIL = "gmail"
REQUESTTRACKER = "requesttracker"
GITHUB = "github"
GITBOOK = "gitbook"
GITLAB = "gitlab"
GURU = "guru"
BOOKSTACK = "bookstack"

View File

@ -20,6 +20,7 @@ from onyx.connectors.egnyte.connector import EgnyteConnector
from onyx.connectors.file.connector import LocalFileConnector
from onyx.connectors.fireflies.connector import FirefliesConnector
from onyx.connectors.freshdesk.connector import FreshdeskConnector
from onyx.connectors.gitbook.connector import GitbookConnector
from onyx.connectors.github.connector import GithubConnector
from onyx.connectors.gitlab.connector import GitlabConnector
from onyx.connectors.gmail.connector import GmailConnector
@ -71,6 +72,7 @@ def identify_connector_class(
DocumentSource.GITHUB: GithubConnector,
DocumentSource.GMAIL: GmailConnector,
DocumentSource.GITLAB: GitlabConnector,
DocumentSource.GITBOOK: GitbookConnector,
DocumentSource.GOOGLE_DRIVE: GoogleDriveConnector,
DocumentSource.BOOKSTACK: BookstackConnector,
DocumentSource.CONFLUENCE: ConfluenceConnector,

View File

@ -0,0 +1,281 @@
import time
from datetime import datetime
from datetime import timezone
from typing import Any
from urllib.parse import urljoin
import requests
from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.constants import DocumentSource
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.utils.logger import setup_logger
# Module-level logger used to report GitBook request failures.
logger = setup_logger()
# Root URL that every GitBook endpoint is resolved against. The trailing slash
# is significant: urljoin() replaces the last path segment of a base URL that
# does not end in "/", which would drop the "v1" prefix from resolved URLs.
GITBOOK_API_BASE = "https://api.gitbook.com/v1/"
class GitbookApiClient:
def __init__(self, access_token: str) -> None:
self.access_token = access_token
def get(self, endpoint: str, params: dict[str, Any] | None = None) -> Any:
headers = {
"Authorization": f"Bearer {self.access_token}",
"Content-Type": "application/json",
}
url = urljoin(GITBOOK_API_BASE, endpoint.lstrip("/"))
response = requests.get(url, headers=headers, params=params)
response.raise_for_status()
return response.json()
def get_page_content(self, space_id: str, page_id: str) -> dict[str, Any]:
return self.get(f"/spaces/{space_id}/content/page/{page_id}")
def _extract_text_from_document(document: dict[str, Any]) -> str:
"""Extract text content from GitBook document structure by parsing the document nodes
into markdown format."""
def parse_leaf(leaf):
text = leaf.get("text", "")
leaf.get("marks", [])
return text
def parse_text_node(node):
text = ""
for leaf in node.get("leaves", []):
text += parse_leaf(leaf)
return text
def parse_block_node(node):
block_type = node.get("type", "")
result = ""
if block_type == "heading-1":
text = "".join(parse_text_node(n) for n in node.get("nodes", []))
result = f"# {text}\n\n"
elif block_type == "heading-2":
text = "".join(parse_text_node(n) for n in node.get("nodes", []))
result = f"## {text}\n\n"
elif block_type == "heading-3":
text = "".join(parse_text_node(n) for n in node.get("nodes", []))
result = f"### {text}\n\n"
elif block_type == "heading-4":
text = "".join(parse_text_node(n) for n in node.get("nodes", []))
result = f"#### {text}\n\n"
elif block_type == "heading-5":
text = "".join(parse_text_node(n) for n in node.get("nodes", []))
result = f"##### {text}\n\n"
elif block_type == "heading-6":
text = "".join(parse_text_node(n) for n in node.get("nodes", []))
result = f"###### {text}\n\n"
elif block_type == "list-unordered":
for list_item in node.get("nodes", []):
paragraph = list_item.get("nodes", [])[0]
text = "".join(parse_text_node(n) for n in paragraph.get("nodes", []))
result += f"* {text}\n"
result += "\n"
elif block_type == "paragraph":
text = "".join(parse_text_node(n) for n in node.get("nodes", []))
result = f"{text}\n\n"
elif block_type == "list-tasks":
for task_item in node.get("nodes", []):
checked = task_item.get("data", {}).get("checked", False)
paragraph = task_item.get("nodes", [])[0]
text = "".join(parse_text_node(n) for n in paragraph.get("nodes", []))
checkbox = "[x]" if checked else "[ ]"
result += f"- {checkbox} {text}\n"
result += "\n"
elif block_type == "code":
for code_line in node.get("nodes", []):
if code_line.get("type") == "code-line":
text = "".join(
parse_text_node(n) for n in code_line.get("nodes", [])
)
result += f"{text}\n"
result += "\n"
elif block_type == "blockquote":
for quote_node in node.get("nodes", []):
if quote_node.get("type") == "paragraph":
text = "".join(
parse_text_node(n) for n in quote_node.get("nodes", [])
)
result += f"> {text}\n"
result += "\n"
elif block_type == "table":
records = node.get("data", {}).get("records", {})
definition = node.get("data", {}).get("definition", {})
view = node.get("data", {}).get("view", {})
columns = view.get("columns", [])
header_cells = []
for col_id in columns:
col_def = definition.get(col_id, {})
header_cells.append(col_def.get("title", ""))
result = "| " + " | ".join(header_cells) + " |\n"
result += "|" + "---|" * len(header_cells) + "\n"
sorted_records = sorted(
records.items(), key=lambda x: x[1].get("orderIndex", "")
)
for record_id, record_data in sorted_records:
values = record_data.get("values", {})
row_cells = []
for col_id in columns:
fragment_id = values.get(col_id, "")
fragment_text = ""
for fragment in node.get("fragments", []):
if fragment.get("fragment") == fragment_id:
for frag_node in fragment.get("nodes", []):
if frag_node.get("type") == "paragraph":
fragment_text = "".join(
parse_text_node(n)
for n in frag_node.get("nodes", [])
)
break
row_cells.append(fragment_text)
result += "| " + " | ".join(row_cells) + " |\n"
result += "\n"
return result
if not document or "document" not in document:
return ""
markdown = ""
nodes = document["document"].get("nodes", [])
for node in nodes:
markdown += parse_block_node(node)
return markdown
def _convert_page_to_document(
    client: GitbookApiClient, space_id: str, page: dict[str, Any]
) -> Document:
    """Fetch a page's content and wrap it in an indexable ``Document``.

    Args:
        client: Authenticated API client used to download the page content.
        space_id: ID of the space the page belongs to.
        page: Page entry from the space listing; must contain ``"id"`` and
            ``"updatedAt"``.

    Returns:
        A ``Document`` with a single section holding the extracted page text.

    Raises:
        KeyError: If ``page`` lacks ``"id"`` or ``"updatedAt"``.
        ValueError: If ``"updatedAt"`` is not an ISO-8601 timestamp.
    """
    page_id = page["id"]
    page_content = client.get_page_content(space_id, page_id)

    # A trailing "Z" is only understood by fromisoformat() from Python 3.11 on,
    # so it is rewritten as an explicit UTC offset first.
    updated_at = datetime.fromisoformat(page["updatedAt"].replace("Z", "+00:00"))
    if updated_at.tzinfo is None:
        # No offset supplied: treat the timestamp as UTC.
        updated_at = updated_at.replace(tzinfo=timezone.utc)
    else:
        # Convert rather than overwrite the offset, so a timestamp carrying a
        # non-UTC offset still denotes the same instant.
        updated_at = updated_at.astimezone(timezone.utc)

    return Document(
        id=f"gitbook-{space_id}-{page_id}",
        sections=[
            Section(
                link=page.get("urls", {}).get("app", ""),
                text=_extract_text_from_document(page_content),
            )
        ],
        source=DocumentSource.GITBOOK,
        semantic_identifier=page.get("title", ""),
        doc_updated_at=updated_at,
        metadata={
            "path": page.get("path", ""),
            "type": page.get("type", ""),
            "kind": page.get("kind", ""),
        },
    )
class GitbookConnector(LoadConnector, PollConnector):
    """Connector that indexes the pages of a single GitBook space."""

    def __init__(
        self,
        space_id: str,
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        """Remember the target space and batch size; no network access happens here.

        Args:
            space_id: ID of the GitBook space to index.
            batch_size: Maximum number of documents yielded per batch.
        """
        self.space_id = space_id
        self.batch_size = batch_size
        self.access_token: str | None = None
        self.client: GitbookApiClient | None = None

    def load_credentials(self, credentials: dict[str, Any]) -> None:
        """Read the API key from ``credentials`` and build the API client.

        Raises:
            ConnectorMissingCredentialError: If ``"gitbook_api_key"`` is missing
                or empty.
        """
        access_token = credentials.get("gitbook_api_key")
        if not access_token:
            raise ConnectorMissingCredentialError("GitBook access token")
        self.access_token = access_token
        self.client = GitbookApiClient(access_token)

    @staticmethod
    def _parse_updated_at(raw: str) -> datetime:
        """Parse a page's ``updatedAt`` value into a timezone-aware UTC datetime."""
        # A trailing "Z" is only understood by fromisoformat() from Python 3.11
        # on, so it is rewritten as an explicit UTC offset first.
        parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
        if parsed.tzinfo is None:
            # Naive values are treated as UTC so they can be compared with the
            # timezone-aware bounds built in poll_source().
            return parsed.replace(tzinfo=timezone.utc)
        return parsed.astimezone(timezone.utc)

    def _fetch_all_pages(
        self,
        start: datetime | None = None,
        end: datetime | None = None,
    ) -> GenerateDocumentsOutput:
        """Yield batches of documents for pages updated within ``[start, end]``.

        Args:
            start: Lower bound on a page's update time; ``None`` disables it.
            end: Upper bound on a page's update time; ``None`` disables it.

        Raises:
            ConnectorMissingCredentialError: If credentials were never loaded.
            requests.RequestException: If a GitBook request fails (logged, then
                re-raised).
        """
        if not self.client:
            raise ConnectorMissingCredentialError("GitBook")

        try:
            content = self.client.get(f"/spaces/{self.space_id}/content")
            pages = content.get("pages", [])
            current_batch: list[Document] = []

            for page in pages:
                updated_at = self._parse_updated_at(page["updatedAt"])

                # Skip out-of-window pages individually. Nothing here sorts the
                # listing by update time, so stopping at the first old page
                # could silently drop newer pages that come after it.
                if start and updated_at < start:
                    continue
                if end and updated_at > end:
                    continue

                current_batch.append(
                    _convert_page_to_document(self.client, self.space_id, page)
                )

                if len(current_batch) >= self.batch_size:
                    yield current_batch
                    current_batch = []
                    time.sleep(0.1)  # Rate limiting

            # Flush whatever is left over after the last full batch.
            if current_batch:
                yield current_batch

        except requests.RequestException as e:
            logger.error(f"Error fetching GitBook content: {str(e)}")
            raise

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Index every page of the space, with no time filtering."""
        return self._fetch_all_pages()

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        """Index only pages updated between ``start`` and ``end`` (epoch seconds)."""
        start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
        end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
        return self._fetch_all_pages(start_datetime, end_datetime)
if __name__ == "__main__":
    # Manual smoke test: index the space named by the environment variables
    # and print the first batch of documents.
    import os

    target_space = os.environ["GITBOOK_SPACE_ID"]
    smoke_connector = GitbookConnector(space_id=target_space)

    api_key = os.environ["GITBOOK_API_KEY"]
    smoke_connector.load_credentials({"gitbook_api_key": api_key})

    first_batch = next(smoke_connector.load_from_state())
    print(first_batch)

View File

@ -0,0 +1,81 @@
import os
import time
import pytest
from onyx.configs.constants import DocumentSource
from onyx.connectors.gitbook.connector import GitbookConnector
@pytest.fixture
def gitbook_connector() -> GitbookConnector:
    """Build a GitbookConnector configured from GITBOOK_SPACE_ID / GITBOOK_API_KEY."""
    space_id = os.environ["GITBOOK_SPACE_ID"]
    configured = GitbookConnector(space_id=space_id)

    credentials = {"gitbook_api_key": os.environ["GITBOOK_API_KEY"]}
    configured.load_credentials(credentials)
    return configured
def test_gitbook_connector_basic(gitbook_connector: GitbookConnector) -> None:
    """End-to-end check of a full load and a poll against the live test space."""
    batches = gitbook_connector.load_from_state()

    # The first batch must contain at least one document.
    first_batch = next(batches)
    assert len(first_batch) > 0

    # Identity of the first document.
    first_doc = first_batch[0]
    assert first_doc.id.startswith("gitbook-")
    assert first_doc.semantic_identifier == "Acme Corp Internal Handbook"
    assert first_doc.source == DocumentSource.GITBOOK

    # Metadata keys populated by the connector.
    for metadata_key in ("path", "type", "kind"):
        assert metadata_key in first_doc.metadata

    # Exactly one section carries the page text.
    assert len(first_doc.sections) == 1
    only_section = first_doc.sections[0]
    body = only_section.text

    expected_fragments = (
        # List item and blockquote
        "* Fruit Shopping List:",
        "> test quote it doesn't mean anything",
        # Headings
        "# Heading 1",
        "## Heading 2",
        "### Heading 3",
        # Task list
        "- [ ] Uncompleted Task",
        "- [x] Completed Task",
        # Table rows
        "| ethereum | 10 | 3000 |",
        "| bitcoin | 2 | 98000 |",
        # Paragraph text
        "New York City comprises 5 boroughs",
        "Empire State Building",
        # Code block (a few distinctive snippets)
        "function fizzBuzz(n)",
        'res.push("FizzBuzz")',
    )
    for fragment in expected_fragments:
        assert fragment in body

    # The section should link back to the page.
    assert only_section.link

    # Polling from the epoch until now must also return documents.
    now = time.time()
    polled_batches = gitbook_connector.poll_source(0, now)
    polled_first = next(polled_batches)
    assert len(polled_first) > 0

View File

@ -2889,6 +2889,24 @@ export const AirtableIcon = ({
return <LogoIcon size={size} className={className} src={airtableIcon} />;
};
/**
 * GitBook logo rendered as an inline SVG.
 *
 * The icon is drawn from a single path inside a 65x65 viewBox and scaled to
 * `size` pixels (default 16) through the inline `style` width/height.
 *
 * NOTE(review): the `w-[...] h-[...]` class names are assembled at runtime from
 * `size`; Tailwind presumably cannot generate such dynamic classes, in which
 * case only the inline `style` actually sizes the icon. Confirm before relying
 * on them.
 * NOTE(review): the path fill is hard-coded to "#F2F7F7", a very light colour;
 * verify it stays visible on light backgrounds.
 */
export const GitbookIcon = ({
size = 16,
className = defaultTailwindCSS,
}: IconProps) => (
<svg
style={{ width: `${size}px`, height: `${size}px` }}
className={`w-[${size}px] h-[${size}px] ` + className}
xmlns="http://www.w3.org/2000/svg"
viewBox="0 0 65 65"
fill="none"
>
<path
d="M27.3964 34.2196C30.5255 36.0256 32.09 36.9286 33.8083 36.9301C35.5265 36.9316 37.0926 36.0313 40.2249 34.2308L60.1914 22.7535C61.0927 22.2354 61.6484 21.275 61.6484 20.2353C61.6484 19.1956 61.0927 18.2352 60.1914 17.7171L40.2177 6.2356C37.0888 4.43701 35.5243 3.53772 33.8078 3.53839C32.0912 3.53906 30.5275 4.43957 27.4 6.24059L10.2293 16.1286C10.102 16.2019 10.0384 16.2386 9.97908 16.2733C4.11371 19.7069 0.489892 25.9755 0.441438 32.7718C0.440948 32.8405 0.440948 32.9139 0.440948 33.0608C0.440948 33.2074 0.440948 33.2808 0.441437 33.3494C0.489785 40.1381 4.10552 46.4008 9.96044 49.8371C10.0196 49.8718 10.0832 49.9085 10.2102 49.9819L20.9659 56.1919C27.2332 59.8104 30.3668 61.6197 33.8081 61.6209C37.2493 61.622 40.3842 59.8149 46.6539 56.2005L58.008 49.6552C61.1474 47.8454 62.7171 46.9406 63.579 45.4488C64.4409 43.957 64.4409 42.1452 64.4409 38.5215V31.5212C64.4409 30.516 63.8965 29.5896 63.0182 29.1004C62.1684 28.6271 61.1325 28.6341 60.2891 29.1189L37.0074 42.5019C35.4454 43.3998 34.6643 43.8488 33.8073 43.8491C32.9502 43.8493 32.1689 43.4008 30.6063 42.5039L14.8487 33.4587C14.0594 33.0056 13.6647 32.779 13.3477 32.7381C12.625 32.6448 11.9301 33.0497 11.6548 33.7244C11.5341 34.0203 11.5365 34.4754 11.5414 35.3855C11.545 36.0555 11.5468 36.3905 11.6094 36.6987C11.7497 37.3888 12.1127 38.0136 12.6428 38.4772C12.8795 38.6842 13.1696 38.8517 13.75 39.1866L30.5974 48.9103C32.1641 49.8145 32.9474 50.2666 33.8075 50.2669C34.6677 50.2671 35.4513 49.8154 37.0184 48.9121L57.6684 37.0086C58.2037 36.7 58.4714 36.5457 58.6721 36.6617C58.8727 36.7777 58.8727 37.0866 58.8727 37.7045V40.8796C58.8727 41.7856 58.8727 42.2385 58.6572 42.6115C58.4418 42.9844 58.0493 43.2106 57.2644 43.6631L40.2322 53.4811C37.0966 55.2886 35.5288 56.1923 33.8079 56.1915C32.0869 56.1907 30.5199 55.2856 27.386 53.4752L11.4509 44.2702C11.4003 44.2409 11.375 44.2263 11.3514 44.2125C8.01023 42.2601 5.94859 38.6883 5.92925 34.8185C5.92912 34.7912 5.92912 34.762 5.92912 34.7035V31.7889C5.92912 29.6526 
7.06689 27.678 8.91513 26.6067C10.5483 25.6601 12.5628 25.6582 14.1977 26.6018L27.3964 34.2196Z"
fill="#F2F7F7"
/>
</svg>
);
export const PinnedIcon = ({
size = 16,
className = defaultTailwindCSS,

View File

@ -227,6 +227,20 @@ export const connectorConfigs: Record<
],
advanced_values: [],
},
gitbook: {
description: "Configure GitBook connector",
values: [
{
type: "text",
query: "Enter the space ID:",
label: "Space ID",
name: "space_id",
optional: false,
description: "The ID of the GitBook space to index.",
},
],
advanced_values: [],
},
google_drive: {
description: "Configure Google Drive connector",
values: [

View File

@ -30,6 +30,11 @@ export interface GithubCredentialJson {
github_access_token: string;
}
/**
 * Credential fields collected for the GitBook connector.
 */
export interface GitbookCredentialJson {
// NOTE(review): the space ID is also collected as the connector config field
// `space_id`, and the backend credential loader appears to read only
// `gitbook_api_key` — confirm whether this field is still needed here.
gitbook_space_id: string;
// API token for GitBook; presumably sent as the bearer token by the backend.
gitbook_api_key: string;
}
export interface GitlabCredentialJson {
gitlab_url: string;
gitlab_access_token: string;
@ -344,6 +349,10 @@ export const credentialTemplates: Record<ValidSources, any> = {
// NOTE: These are Special Cases
google_drive: { google_tokens: "" } as GoogleDriveCredentialJson,
gmail: { google_tokens: "" } as GmailCredentialJson,
gitbook: {
gitbook_space_id: "",
gitbook_api_key: "",
} as GitbookCredentialJson,
};
export const credentialDisplayNames: Record<string, string> = {
@ -474,6 +483,10 @@ export const credentialDisplayNames: Record<string, string> = {
// Fireflies
fireflies_api_key: "Fireflies API Key",
// GitBook
gitbook_space_id: "GitBook Space ID",
gitbook_api_key: "GitBook API Key",
};
export function getDisplayNameForCredentialKey(key: string): string {

View File

@ -43,6 +43,7 @@ import {
AirtableIcon,
GlobeIcon2,
FileIcon2,
GitbookIcon,
} from "@/components/icons/icons";
import { ValidSources } from "./types";
import {
@ -321,6 +322,12 @@ export const SOURCE_METADATA_MAP: SourceMap = {
category: SourceCategory.Other,
docs: "https://docs.onyx.app/connectors/airtable",
},
gitbook: {
icon: GitbookIcon,
displayName: "GitBook",
category: SourceCategory.CodeRepository,
docs: "https://docs.onyx.app/connectors/gitbook",
},
// currently used for the Internet Search tool docs, which is why
// a globe is used
not_applicable: {

View File

@ -350,6 +350,7 @@ export enum ValidSources {
Fireflies = "fireflies",
Egnyte = "egnyte",
Airtable = "airtable",
Gitbook = "gitbook",
}
export const validAutoSyncSources = [