Mirror of https://github.com/danswer-ai/danswer.git
support scrolling before scraping (#4040)
* support scrolling before scraping
* fix mypy
* install playwright deps

Co-authored-by: Richard Kuo <rkuo@rkuo.com>
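In short: when the new `scroll_before_scraping` flag is set, the connector repeatedly scrolls to the bottom of the page and waits for the network to go idle, stopping once the document height stops growing (or after 20 attempts), so lazy-loaded content is present before scraping. A minimal standalone sketch of that pattern, assuming Playwright's sync API; the URL here is a placeholder, not a value from the commit:

# Standalone sketch of the scroll-until-stable pattern this commit adds.
from playwright.sync_api import sync_playwright

MAX_SCROLL_ATTEMPTS = 20  # mirrors WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS below

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto("https://example.com")  # placeholder URL
    previous_height = page.evaluate("document.body.scrollHeight")
    for _ in range(MAX_SCROLL_ATTEMPTS):
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        # "networkidle" waits for network activity to settle, giving
        # lazy-loaded content a chance to arrive before we re-measure.
        page.wait_for_load_state("networkidle", timeout=30000)
        new_height = page.evaluate("document.body.scrollHeight")
        if new_height == previous_height:
            break  # page height stabilized; nothing more is loading
        previous_height = new_height
    html = page.content()
    browser.close()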
@@ -74,6 +74,8 @@ jobs:
           python -m pip install --upgrade pip
           pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
           pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
+          playwright install chromium
+          playwright install-deps chromium
 
       - name: Run Tests
         shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
@@ -37,6 +37,8 @@ from shared_configs.configs import MULTI_TENANT
 
 logger = setup_logger()
 
+WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20
+
 
 class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
     # Given a base site, index everything under that path
@@ -225,10 +227,13 @@ class WebConnector(LoadConnector):
         web_connector_type: str = WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value,
         mintlify_cleanup: bool = True,  # Mostly ok to apply to other websites as well
         batch_size: int = INDEX_BATCH_SIZE,
+        scroll_before_scraping: bool = False,
         **kwargs: Any,
     ) -> None:
         self.mintlify_cleanup = mintlify_cleanup
         self.batch_size = batch_size
         self.recursive = False
+        self.scroll_before_scraping = scroll_before_scraping
+
         if web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value:
             self.recursive = True
@@ -344,6 +349,18 @@ class WebConnector(LoadConnector):
                     continue
                 visited_links.add(current_url)
 
+                if self.scroll_before_scraping:
+                    scroll_attempts = 0
+                    previous_height = page.evaluate("document.body.scrollHeight")
+                    while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
+                        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+                        page.wait_for_load_state("networkidle", timeout=30000)
+                        new_height = page.evaluate("document.body.scrollHeight")
+                        if new_height == previous_height:
+                            break  # Stop scrolling when no more content is loaded
+                        previous_height = new_height
+                        scroll_attempts += 1
+
                 content = page.content()
                 soup = BeautifulSoup(content, "html.parser")
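Caller-side, the new behavior is opt-in via the constructor. A hedged usage sketch mirroring the fixture in the test file added below (the URL is a placeholder; `load_from_state()` and `Document.sections` are the existing API as used in those tests):

# Hypothetical usage of the new flag; mirrors the test fixture below.
from onyx.connectors.web.connector import WEB_CONNECTOR_VALID_SETTINGS
from onyx.connectors.web.connector import WebConnector

connector = WebConnector(
    base_url="https://example.com",  # placeholder URL
    web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
    scroll_before_scraping=True,
)
for batch in connector.load_from_state():
    for doc in batch:
        # First section's text, the field the tests assert on.
        print(doc.sections[0].text[:80])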
backend/tests/daily/connectors/web/test_web_connector.py (new file)
@@ -0,0 +1,44 @@
+import pytest
+
+from onyx.connectors.models import Document
+from onyx.connectors.web.connector import WEB_CONNECTOR_VALID_SETTINGS
+from onyx.connectors.web.connector import WebConnector
+
+
+# NOTE(rkuo): we will probably need to adjust this test to point at our own test site
+# to avoid depending on a third party site
+@pytest.fixture
+def web_connector(request: pytest.FixtureRequest) -> WebConnector:
+    scroll_before_scraping = request.param
+    connector = WebConnector(
+        base_url="https://developer.onewelcome.com",
+        web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
+        scroll_before_scraping=scroll_before_scraping,
+    )
+    return connector
+
+
+@pytest.mark.parametrize("web_connector", [True], indirect=True)
+def test_web_connector_scroll(web_connector: WebConnector) -> None:
+    all_docs: list[Document] = []
+    document_batches = web_connector.load_from_state()
+    for doc_batch in document_batches:
+        for doc in doc_batch:
+            all_docs.append(doc)
+
+    assert len(all_docs) == 1
+    doc = all_docs[0]
+    assert "Onegini Identity Cloud" in doc.sections[0].text
+
+
+@pytest.mark.parametrize("web_connector", [False], indirect=True)
+def test_web_connector_no_scroll(web_connector: WebConnector) -> None:
+    all_docs: list[Document] = []
+    document_batches = web_connector.load_from_state()
+    for doc_batch in document_batches:
+        for doc in doc_batch:
+            all_docs.append(doc)
+
+    assert len(all_docs) == 1
+    doc = all_docs[0]
+    assert "Onegini Identity Cloud" not in doc.sections[0].text
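To exercise these tests locally, an invocation along these lines should work from the repo root (a sketch via pytest's Python API; CI may wire the daily tests differently):

# Hypothetical local runner for the new tests.
import pytest

exit_code = pytest.main(
    ["backend/tests/daily/connectors/web/test_web_connector.py", "-v"]
)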
@@ -152,7 +152,17 @@ export const connectorConfigs: Record<
         ],
       },
     ],
-    advanced_values: [],
+    advanced_values: [
+      {
+        type: "checkbox",
+        query: "Scroll before scraping:",
+        label: "Scroll before scraping",
+        description:
+          "Enable if the website requires scrolling for the desired content to load",
+        name: "scroll_before_scraping",
+        optional: true,
+      },
+    ],
     overrideDefaultFreq: 60 * 60 * 24,
   },
   github: {