From 7f7621d7c0a3c482c3d3827d76a6f90d0a026233 Mon Sep 17 00:00:00 2001 From: Weves Date: Sun, 9 Mar 2025 14:08:00 -0700 Subject: [PATCH] Small gitbook tweaks --- backend/onyx/connectors/gitbook/connector.py | 7 +++- .../gitbook/test_gitbook_connector.py | 40 ++++++++++++++----- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/backend/onyx/connectors/gitbook/connector.py b/backend/onyx/connectors/gitbook/connector.py index fb76427e8..26f8e1aa3 100644 --- a/backend/onyx/connectors/gitbook/connector.py +++ b/backend/onyx/connectors/gitbook/connector.py @@ -228,10 +228,15 @@ class GitbookConnector(LoadConnector, PollConnector): raise ConnectorMissingCredentialError("GitBook") try: - content = self.client.get(f"/spaces/{self.space_id}/content") + content = self.client.get(f"/spaces/{self.space_id}/content/pages") pages: list[dict[str, Any]] = content.get("pages", []) current_batch: list[Document] = [] + logger.info(f"Found {len(pages)} root pages.") + logger.info( + f"First 20 Page Ids: {[page.get('id', 'Unknown') for page in pages[:20]]}" + ) + while pages: page = pages.pop(0) diff --git a/backend/tests/daily/connectors/gitbook/test_gitbook_connector.py b/backend/tests/daily/connectors/gitbook/test_gitbook_connector.py index 4d3b0cefb..7c998c4f1 100644 --- a/backend/tests/daily/connectors/gitbook/test_gitbook_connector.py +++ b/backend/tests/daily/connectors/gitbook/test_gitbook_connector.py @@ -20,29 +20,32 @@ def gitbook_connector() -> GitbookConnector: return connector +NUM_PAGES = 3 + + def test_gitbook_connector_basic(gitbook_connector: GitbookConnector) -> None: doc_batch_generator = gitbook_connector.load_from_state() # Get first batch of documents doc_batch = next(doc_batch_generator) - assert len(doc_batch) > 0 + assert len(doc_batch) == NUM_PAGES # Verify first document structure - doc = doc_batch[0] + main_doc = doc_batch[0] # Basic document properties - assert doc.id.startswith("gitbook-") - assert doc.semantic_identifier == "Acme Corp 
Internal Handbook" - assert doc.source == DocumentSource.GITBOOK + assert main_doc.id.startswith("gitbook-") + assert main_doc.semantic_identifier == "Acme Corp Internal Handbook" + assert main_doc.source == DocumentSource.GITBOOK # Metadata checks - assert "path" in doc.metadata - assert "type" in doc.metadata - assert "kind" in doc.metadata + assert "path" in main_doc.metadata + assert "type" in main_doc.metadata + assert "kind" in main_doc.metadata # Section checks - assert len(doc.sections) == 1 - section = doc.sections[0] + assert len(main_doc.sections) == 1 + section = main_doc.sections[0] # Content specific checks content = section.text @@ -74,8 +77,23 @@ def test_gitbook_connector_basic(gitbook_connector: GitbookConnector) -> None: assert section.link # Should have a URL + nested1 = doc_batch[1] + assert nested1.id.startswith("gitbook-") + assert nested1.semantic_identifier == "Nested1" + assert len(nested1.sections) == 1 + # extra newlines at the end, remove them to make test easier + assert nested1.sections[0].text.strip() == "nested1" + assert nested1.source == DocumentSource.GITBOOK + + nested2 = doc_batch[2] + assert nested2.id.startswith("gitbook-") + assert nested2.semantic_identifier == "Nested2" + assert len(nested2.sections) == 1 + assert nested2.sections[0].text.strip() == "nested2" + assert nested2.source == DocumentSource.GITBOOK + # Time-based polling test current_time = time.time() poll_docs = gitbook_connector.poll_source(0, current_time) poll_batch = next(poll_docs) - assert len(poll_batch) > 0 + assert len(poll_batch) == NUM_PAGES