This commit is contained in:
Weves
2025-02-13 18:00:51 -08:00
committed by Chris Weaver
parent 7f4d1f27a0
commit c2f3302aa0

View File

@@ -1,4 +1,3 @@
import time
from datetime import datetime from datetime import datetime
from datetime import timezone from datetime import timezone
from typing import Any from typing import Any
@@ -46,18 +45,18 @@ def _extract_text_from_document(document: dict[str, Any]) -> str:
"""Extract text content from GitBook document structure by parsing the document nodes """Extract text content from GitBook document structure by parsing the document nodes
into markdown format.""" into markdown format."""
def parse_leaf(leaf): def parse_leaf(leaf: dict[str, Any]) -> str:
text = leaf.get("text", "") text = leaf.get("text", "")
leaf.get("marks", []) leaf.get("marks", [])
return text return text
def parse_text_node(node): def parse_text_node(node: dict[str, Any]) -> str:
text = "" text = ""
for leaf in node.get("leaves", []): for leaf in node.get("leaves", []):
text += parse_leaf(leaf) text += parse_leaf(leaf)
return text return text
def parse_block_node(node): def parse_block_node(node: dict[str, Any]) -> str:
block_type = node.get("type", "") block_type = node.get("type", "")
result = "" result = ""
@@ -232,7 +231,7 @@ class GitbookConnector(LoadConnector, PollConnector):
content = self.client.get(f"/spaces/{self.space_id}/content") content = self.client.get(f"/spaces/{self.space_id}/content")
pages = content.get("pages", []) pages = content.get("pages", [])
current_batch = [] current_batch: list[Document] = []
for page in pages: for page in pages:
updated_at = datetime.fromisoformat(page["updatedAt"]) updated_at = datetime.fromisoformat(page["updatedAt"])
@@ -250,7 +249,6 @@ class GitbookConnector(LoadConnector, PollConnector):
if len(current_batch) >= self.batch_size: if len(current_batch) >= self.batch_size:
yield current_batch yield current_batch
current_batch = [] current_batch = []
time.sleep(0.1) # Rate limiting
if current_batch: if current_batch:
yield current_batch yield current_batch