# Mirror of https://github.com/danswer-ai/danswer.git
# Synced 2025-05-13 05:10:13 +02:00
from datetime import datetime
|
|
from datetime import timezone
|
|
from typing import Any
|
|
from urllib.parse import urljoin
|
|
|
|
import requests
|
|
|
|
from onyx.configs.app_configs import INDEX_BATCH_SIZE
|
|
from onyx.configs.constants import DocumentSource
|
|
from onyx.connectors.interfaces import GenerateDocumentsOutput
|
|
from onyx.connectors.interfaces import LoadConnector
|
|
from onyx.connectors.interfaces import PollConnector
|
|
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
|
from onyx.connectors.models import ConnectorMissingCredentialError
|
|
from onyx.connectors.models import Document
|
|
from onyx.connectors.models import TextSection
|
|
from onyx.utils.logger import setup_logger
|
|
|
|
|
|
logger = setup_logger()

# Base URL for GitBook's v1 REST API. The trailing slash matters: urljoin
# appends endpoint paths under "/v1/" instead of replacing that path segment.
GITBOOK_API_BASE = "https://api.gitbook.com/v1/"
|
|
|
|
|
|
class GitbookApiClient:
|
|
def __init__(self, access_token: str) -> None:
|
|
self.access_token = access_token
|
|
|
|
def get(self, endpoint: str, params: dict[str, Any] | None = None) -> Any:
|
|
headers = {
|
|
"Authorization": f"Bearer {self.access_token}",
|
|
"Content-Type": "application/json",
|
|
}
|
|
|
|
url = urljoin(GITBOOK_API_BASE, endpoint.lstrip("/"))
|
|
response = requests.get(url, headers=headers, params=params)
|
|
response.raise_for_status()
|
|
return response.json()
|
|
|
|
def get_page_content(self, space_id: str, page_id: str) -> dict[str, Any]:
|
|
return self.get(f"/spaces/{space_id}/content/page/{page_id}")
|
|
|
|
|
|
def _extract_text_from_document(document: dict[str, Any]) -> str:
|
|
"""Extract text content from GitBook document structure by parsing the document nodes
|
|
into markdown format."""
|
|
|
|
def parse_leaf(leaf: dict[str, Any]) -> str:
|
|
text = leaf.get("text", "")
|
|
leaf.get("marks", [])
|
|
return text
|
|
|
|
def parse_text_node(node: dict[str, Any]) -> str:
|
|
text = ""
|
|
for leaf in node.get("leaves", []):
|
|
text += parse_leaf(leaf)
|
|
return text
|
|
|
|
def parse_block_node(node: dict[str, Any]) -> str:
|
|
block_type = node.get("type", "")
|
|
result = ""
|
|
|
|
if block_type == "heading-1":
|
|
text = "".join(parse_text_node(n) for n in node.get("nodes", []))
|
|
result = f"# {text}\n\n"
|
|
|
|
elif block_type == "heading-2":
|
|
text = "".join(parse_text_node(n) for n in node.get("nodes", []))
|
|
result = f"## {text}\n\n"
|
|
|
|
elif block_type == "heading-3":
|
|
text = "".join(parse_text_node(n) for n in node.get("nodes", []))
|
|
result = f"### {text}\n\n"
|
|
|
|
elif block_type == "heading-4":
|
|
text = "".join(parse_text_node(n) for n in node.get("nodes", []))
|
|
result = f"#### {text}\n\n"
|
|
|
|
elif block_type == "heading-5":
|
|
text = "".join(parse_text_node(n) for n in node.get("nodes", []))
|
|
result = f"##### {text}\n\n"
|
|
|
|
elif block_type == "heading-6":
|
|
text = "".join(parse_text_node(n) for n in node.get("nodes", []))
|
|
result = f"###### {text}\n\n"
|
|
|
|
elif block_type == "list-unordered":
|
|
for list_item in node.get("nodes", []):
|
|
paragraph = list_item.get("nodes", [])[0]
|
|
text = "".join(parse_text_node(n) for n in paragraph.get("nodes", []))
|
|
result += f"* {text}\n"
|
|
result += "\n"
|
|
|
|
elif block_type == "paragraph":
|
|
text = "".join(parse_text_node(n) for n in node.get("nodes", []))
|
|
result = f"{text}\n\n"
|
|
|
|
elif block_type == "list-tasks":
|
|
for task_item in node.get("nodes", []):
|
|
checked = task_item.get("data", {}).get("checked", False)
|
|
paragraph = task_item.get("nodes", [])[0]
|
|
text = "".join(parse_text_node(n) for n in paragraph.get("nodes", []))
|
|
checkbox = "[x]" if checked else "[ ]"
|
|
result += f"- {checkbox} {text}\n"
|
|
result += "\n"
|
|
|
|
elif block_type == "code":
|
|
for code_line in node.get("nodes", []):
|
|
if code_line.get("type") == "code-line":
|
|
text = "".join(
|
|
parse_text_node(n) for n in code_line.get("nodes", [])
|
|
)
|
|
result += f"{text}\n"
|
|
result += "\n"
|
|
|
|
elif block_type == "blockquote":
|
|
for quote_node in node.get("nodes", []):
|
|
if quote_node.get("type") == "paragraph":
|
|
text = "".join(
|
|
parse_text_node(n) for n in quote_node.get("nodes", [])
|
|
)
|
|
result += f"> {text}\n"
|
|
result += "\n"
|
|
|
|
elif block_type == "table":
|
|
records = node.get("data", {}).get("records", {})
|
|
definition = node.get("data", {}).get("definition", {})
|
|
view = node.get("data", {}).get("view", {})
|
|
|
|
columns = view.get("columns", [])
|
|
|
|
header_cells = []
|
|
for col_id in columns:
|
|
col_def = definition.get(col_id, {})
|
|
header_cells.append(col_def.get("title", ""))
|
|
|
|
result = "| " + " | ".join(header_cells) + " |\n"
|
|
result += "|" + "---|" * len(header_cells) + "\n"
|
|
|
|
sorted_records = sorted(
|
|
records.items(), key=lambda x: x[1].get("orderIndex", "")
|
|
)
|
|
|
|
for record_id, record_data in sorted_records:
|
|
values = record_data.get("values", {})
|
|
row_cells = []
|
|
for col_id in columns:
|
|
fragment_id = values.get(col_id, "")
|
|
fragment_text = ""
|
|
for fragment in node.get("fragments", []):
|
|
if fragment.get("fragment") == fragment_id:
|
|
for frag_node in fragment.get("nodes", []):
|
|
if frag_node.get("type") == "paragraph":
|
|
fragment_text = "".join(
|
|
parse_text_node(n)
|
|
for n in frag_node.get("nodes", [])
|
|
)
|
|
break
|
|
row_cells.append(fragment_text)
|
|
result += "| " + " | ".join(row_cells) + " |\n"
|
|
|
|
result += "\n"
|
|
return result
|
|
|
|
if not document or "document" not in document:
|
|
return ""
|
|
|
|
markdown = ""
|
|
nodes = document["document"].get("nodes", [])
|
|
|
|
for node in nodes:
|
|
markdown += parse_block_node(node)
|
|
|
|
return markdown
|
|
|
|
|
|
def _convert_page_to_document(
    client: GitbookApiClient, space_id: str, page: dict[str, Any]
) -> Document:
    """Build a Document from a page's metadata plus its fetched content.

    Args:
        client: Authenticated GitBook API client.
        space_id: The GitBook space containing the page.
        page: Page metadata dict; must contain "id" and "updatedAt".

    Returns:
        A Document with one TextSection holding the page's markdown text.
    """
    page_id = page["id"]
    page_content = client.get_page_content(space_id, page_id)

    section = TextSection(
        link=page.get("urls", {}).get("app", ""),
        text=_extract_text_from_document(page_content),
    )

    # NOTE(review): .replace(tzinfo=...) stamps UTC rather than converting —
    # assumes the API's "updatedAt" timestamp is already in UTC; confirm.
    last_updated = datetime.fromisoformat(page["updatedAt"]).replace(
        tzinfo=timezone.utc
    )

    return Document(
        id=f"gitbook-{space_id}-{page_id}",
        sections=[section],
        source=DocumentSource.GITBOOK,
        semantic_identifier=page.get("title", ""),
        doc_updated_at=last_updated,
        metadata={
            "path": page.get("path", ""),
            "type": page.get("type", ""),
            "kind": page.get("kind", ""),
        },
    )
|
|
|
|
|
|
class GitbookConnector(LoadConnector, PollConnector):
    """Connector that indexes the pages of a single GitBook space.

    Supports full loads (LoadConnector) and time-windowed polls
    (PollConnector) over the space's page tree.
    """

    def __init__(
        self,
        space_id: str,
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        self.space_id = space_id
        self.batch_size = batch_size
        # Both are populated by load_credentials().
        self.access_token: str | None = None
        self.client: GitbookApiClient | None = None

    def load_credentials(self, credentials: dict[str, Any]) -> None:
        """Store the GitBook API key and construct the API client.

        Raises:
            ConnectorMissingCredentialError: If "gitbook_api_key" is missing.
        """
        access_token = credentials.get("gitbook_api_key")
        if not access_token:
            raise ConnectorMissingCredentialError("GitBook access token")
        self.access_token = access_token
        self.client = GitbookApiClient(access_token)

    def _fetch_all_pages(
        self,
        start: datetime | None = None,
        end: datetime | None = None,
    ) -> GenerateDocumentsOutput:
        """Walk the space's page tree breadth-first, yielding Document batches.

        Args:
            start: If set, skip pages last updated before this time.
            end: If set, skip pages last updated after this time.

        Yields:
            Lists of Documents, each of at most self.batch_size.

        Raises:
            ConnectorMissingCredentialError: If load_credentials was not called.
            requests.RequestException: On API failures (logged, then re-raised).
        """
        if not self.client:
            raise ConnectorMissingCredentialError("GitBook")

        try:
            content = self.client.get(f"/spaces/{self.space_id}/content/pages")
            pages: list[dict[str, Any]] = content.get("pages", [])
            current_batch: list[Document] = []

            logger.info(f"Found {len(pages)} root pages.")
            logger.info(
                f"First 20 Page Ids: {[page.get('id', 'Unknown') for page in pages[:20]]}"
            )

            # Index cursor instead of pop(0): avoids O(n) list shifts while
            # still allowing children to be appended during traversal.
            idx = 0
            while idx < len(pages):
                page = pages[idx]
                idx += 1

                # Enqueue children BEFORE any filtering so that sub-pages of
                # a never-edited or time-filtered page are still visited;
                # skipping this earlier dropped entire subtrees.
                pages.extend(page.get("pages", []))

                updated_at_raw = page.get("updatedAt")
                if updated_at_raw is None:
                    # if updatedAt is not present, that means the page has never been edited
                    continue

                updated_at = datetime.fromisoformat(updated_at_raw)
                if updated_at.tzinfo is None:
                    # Naive timestamps would make the comparisons below raise
                    # against the tz-aware start/end from poll_source; assume
                    # UTC, consistent with _convert_page_to_document.
                    updated_at = updated_at.replace(tzinfo=timezone.utc)
                if start and updated_at < start:
                    continue
                if end and updated_at > end:
                    continue

                current_batch.append(
                    _convert_page_to_document(self.client, self.space_id, page)
                )

                if len(current_batch) >= self.batch_size:
                    yield current_batch
                    current_batch = []

            if current_batch:
                yield current_batch

        except requests.RequestException as e:
            logger.error(f"Error fetching GitBook content: {str(e)}")
            raise

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Full load: fetch every page in the space."""
        return self._fetch_all_pages()

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        """Incremental load: fetch pages updated within [start, end]."""
        start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
        end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
        return self._fetch_all_pages(start_datetime, end_datetime)
|
|
|
|
|
|
if __name__ == "__main__":
    import os

    # Smoke test: index one GitBook space using credentials taken from the
    # environment and print the first batch of documents.
    gitbook_connector = GitbookConnector(space_id=os.environ["GITBOOK_SPACE_ID"])
    gitbook_connector.load_credentials(
        {"gitbook_api_key": os.environ["GITBOOK_API_KEY"]}
    )
    batches = gitbook_connector.load_from_state()
    print(next(batches))
|