support scrolling before scraping (#4040)

* support scrolling before scraping

* fix mypy

* install playwright deps

---------

Co-authored-by: Richard Kuo <rkuo@rkuo.com>
Author: rkuo-danswer
Committed: 2025-02-19 09:54:58 -08:00 (via GitHub)
Parent: 11f6b44625
Commit: c9f618798e
4 changed files with 75 additions and 2 deletions

File 1/4: GitHub Actions workflow that runs the daily connector tests

@@ -74,7 +74,9 @@ jobs:
           python -m pip install --upgrade pip
           pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
           pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
+          playwright install chromium
+          playwright install-deps chromium
       - name: Run Tests
         shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
         run: py.test -o junit_family=xunit2 -xv --ff backend/tests/daily/connectors
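
The two new lines install Chromium plus its required system libraries so Playwright can launch a real browser in CI. As a local sanity check (not part of the commit), a minimal sketch like the following confirms the install works; the URL is a placeholder:

    # Quick check that Playwright's Chromium is usable after
    # `playwright install chromium` and `playwright install-deps chromium`.
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto("https://example.com")  # placeholder URL
        print(page.title())
        browser.close()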

File 2/4: backend/onyx/connectors/web/connector.py

@@ -37,6 +37,8 @@ from shared_configs.configs import MULTI_TENANT
 logger = setup_logger()
 
+WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20
+
 class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
     # Given a base site, index everything under that path
@@ -225,10 +227,13 @@ class WebConnector(LoadConnector):
         web_connector_type: str = WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value,
         mintlify_cleanup: bool = True,  # Mostly ok to apply to other websites as well
         batch_size: int = INDEX_BATCH_SIZE,
+        scroll_before_scraping: bool = False,
+        **kwargs: Any,
     ) -> None:
         self.mintlify_cleanup = mintlify_cleanup
         self.batch_size = batch_size
         self.recursive = False
+        self.scroll_before_scraping = scroll_before_scraping
         if web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value:
             self.recursive = True
@@ -344,6 +349,18 @@ class WebConnector(LoadConnector):
                 continue
             visited_links.add(current_url)
 
+            if self.scroll_before_scraping:
+                scroll_attempts = 0
+                previous_height = page.evaluate("document.body.scrollHeight")
+                while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
+                    page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+                    page.wait_for_load_state("networkidle", timeout=30000)
+                    new_height = page.evaluate("document.body.scrollHeight")
+                    if new_height == previous_height:
+                        break  # Stop scrolling when no more content is loaded
+                    previous_height = new_height
+                    scroll_attempts += 1
+
             content = page.content()
             soup = BeautifulSoup(content, "html.parser")
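
For readers skimming the diff, the new logic is easier to see outside the connector. Here is a minimal standalone sketch of the same technique, assuming Playwright's sync API (which the connector code above uses); the URL is a placeholder:

    from playwright.sync_api import sync_playwright

    WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20  # same cap as the connector

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto("https://example.com")  # placeholder URL

        # Scroll to the bottom repeatedly; stop once the document height
        # stops growing (no more lazy-loaded content) or the cap is hit.
        scroll_attempts = 0
        previous_height = page.evaluate("document.body.scrollHeight")
        while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            page.wait_for_load_state("networkidle", timeout=30000)
            new_height = page.evaluate("document.body.scrollHeight")
            if new_height == previous_height:
                break
            previous_height = new_height
            scroll_attempts += 1

        html = page.content()  # now includes the lazily loaded content
        browser.close()

The attempt cap matters: a page with true infinite scroll would otherwise keep growing forever, so the loop gives up after 20 scrolls even if the height is still increasing.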

File 3/4: new web connector test under backend/tests/daily/connectors

@@ -0,0 +1,44 @@
+import pytest
+
+from onyx.connectors.models import Document
+from onyx.connectors.web.connector import WEB_CONNECTOR_VALID_SETTINGS
+from onyx.connectors.web.connector import WebConnector
+
+
+# NOTE(rkuo): we will probably need to adjust this test to point at our own test site
+# to avoid depending on a third party site
+@pytest.fixture
+def web_connector(request: pytest.FixtureRequest) -> WebConnector:
+    scroll_before_scraping = request.param
+    connector = WebConnector(
+        base_url="https://developer.onewelcome.com",
+        web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
+        scroll_before_scraping=scroll_before_scraping,
+    )
+    return connector
+
+
+@pytest.mark.parametrize("web_connector", [True], indirect=True)
+def test_web_connector_scroll(web_connector: WebConnector) -> None:
+    all_docs: list[Document] = []
+    document_batches = web_connector.load_from_state()
+    for doc_batch in document_batches:
+        for doc in doc_batch:
+            all_docs.append(doc)
+
+    assert len(all_docs) == 1
+    doc = all_docs[0]
+    assert "Onegini Identity Cloud" in doc.sections[0].text
+
+
+@pytest.mark.parametrize("web_connector", [False], indirect=True)
+def test_web_connector_no_scroll(web_connector: WebConnector) -> None:
+    all_docs: list[Document] = []
+    document_batches = web_connector.load_from_state()
+    for doc_batch in document_batches:
+        for doc in doc_batch:
+            all_docs.append(doc)
+
+    assert len(all_docs) == 1
+    doc = all_docs[0]
+    assert "Onegini Identity Cloud" not in doc.sections[0].text

File 4/4: web UI connector configuration (connectorConfigs)

@@ -152,7 +152,17 @@ export const connectorConfigs: Record<
       ],
     },
   ],
-  advanced_values: [],
+  advanced_values: [
+    {
+      type: "checkbox",
+      query: "Scroll before scraping:",
+      label: "Scroll before scraping",
+      description:
+        "Enable if the website requires scrolling for the desired content to load",
+      name: "scroll_before_scraping",
+      optional: true,
+    },
+  ],
   overrideDefaultFreq: 60 * 60 * 24,
 },
 github: {
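
End to end, the new checkbox feeds the scroll_before_scraping kwarg added to WebConnector.__init__ above. A usage sketch mirroring the test fixture; the base URL is a placeholder:

    from onyx.connectors.web.connector import WEB_CONNECTOR_VALID_SETTINGS, WebConnector

    connector = WebConnector(
        base_url="https://example.com",  # placeholder URL
        web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
        scroll_before_scraping=True,  # the option this commit adds
    )

    # load_from_state() yields batches of Document objects, as in the tests above.
    for batch in connector.load_from_state():
        for doc in batch:
            print(doc.sections[0].text[:80])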