diff --git a/.github/workflows/pr-python-connector-tests.yml b/.github/workflows/pr-python-connector-tests.yml
index 38b0b4417554..bef30d7ca9d2 100644
--- a/.github/workflows/pr-python-connector-tests.yml
+++ b/.github/workflows/pr-python-connector-tests.yml
@@ -74,7 +74,9 @@ jobs:
           python -m pip install --upgrade pip
           pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
           pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
-          
+          playwright install chromium
+          playwright install-deps chromium
+
       - name: Run Tests
         shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
         run: py.test -o junit_family=xunit2 -xv --ff backend/tests/daily/connectors
diff --git a/backend/onyx/connectors/web/connector.py b/backend/onyx/connectors/web/connector.py
--- a/backend/onyx/connectors/web/connector.py
+++ b/backend/onyx/connectors/web/connector.py
@@ -37,6 +37,11 @@ from shared_configs.configs import MULTI_TENANT
 
 logger = setup_logger()
 
+# Upper bound on scroll-to-bottom passes per page when scroll_before_scraping is on
+WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20
+# How long to wait (in milliseconds) for the network to go idle after each scroll
+WEB_CONNECTOR_SCROLL_TIMEOUT_MS = 30000
+
 
 class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
     # Given a base site, index everything under that path
@@ -225,10 +230,13 @@ class WebConnector(LoadConnector):
         web_connector_type: str = WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value,
         mintlify_cleanup: bool = True,  # Mostly ok to apply to other websites as well
         batch_size: int = INDEX_BATCH_SIZE,
+        scroll_before_scraping: bool = False,
+        **kwargs: Any,
     ) -> None:
         self.mintlify_cleanup = mintlify_cleanup
         self.batch_size = batch_size
         self.recursive = False
+        self.scroll_before_scraping = scroll_before_scraping
 
         if web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value:
             self.recursive = True
@@ -344,6 +352,36 @@ class WebConnector(LoadConnector):
                         continue
                     visited_links.add(current_url)
 
+                if self.scroll_before_scraping:
+                    # Imported here because only the scroll path needs the
+                    # Playwright timeout exception type.
+                    from playwright.sync_api import (
+                        TimeoutError as PlaywrightTimeoutError,
+                    )
+
+                    # Scroll to the bottom repeatedly until the page height stops
+                    # growing or the attempt cap is reached.
+                    scroll_attempts = 0
+                    previous_height = page.evaluate("document.body.scrollHeight")
+                    while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
+                        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+                        try:
+                            page.wait_for_load_state(
+                                "networkidle", timeout=WEB_CONNECTOR_SCROLL_TIMEOUT_MS
+                            )
+                        except PlaywrightTimeoutError:
+                            # Pages holding long-lived connections may never go
+                            # idle; scrape what has loaded instead of failing.
+                            logger.warning(
+                                f"Scroll wait timed out for {current_url}; scraping loaded content"
+                            )
+                            break
+                        new_height = page.evaluate("document.body.scrollHeight")
+                        if new_height == previous_height:
+                            break  # Stop scrolling when no more content is loaded
+                        previous_height = new_height
+                        scroll_attempts += 1
+
                 content = page.content()
                 soup = BeautifulSoup(content, "html.parser")
 
diff --git a/backend/tests/daily/connectors/web/test_web_connector.py b/backend/tests/daily/connectors/web/test_web_connector.py
new file mode 100644
--- /dev/null
+++ b/backend/tests/daily/connectors/web/test_web_connector.py
@@ -0,0 +1,47 @@
+import pytest
+
+from onyx.connectors.models import Document
+from onyx.connectors.web.connector import WEB_CONNECTOR_VALID_SETTINGS
+from onyx.connectors.web.connector import WebConnector
+
+
+# NOTE(rkuo): we will probably need to adjust this test to point at our own test site
+# to avoid depending on a third party site
+@pytest.fixture
+def web_connector(request: pytest.FixtureRequest) -> WebConnector:
+    """Build a single-page WebConnector; `request.param` sets scroll_before_scraping."""
+    scroll_before_scraping = request.param
+    connector = WebConnector(
+        base_url="https://developer.onewelcome.com",
+        web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
+        scroll_before_scraping=scroll_before_scraping,
+    )
+    return connector
+
+
+def _load_all_documents(connector: WebConnector) -> list[Document]:
+    """Flatten every batch yielded by `load_from_state` into a single list."""
+    all_docs: list[Document] = []
+    for doc_batch in connector.load_from_state():
+        all_docs.extend(doc_batch)
+    return all_docs
+
+
+@pytest.mark.parametrize("web_connector", [True], indirect=True)
+def test_web_connector_scroll(web_connector: WebConnector) -> None:
+    """With scrolling enabled, the expected product text is in the scraped page."""
+    all_docs = _load_all_documents(web_connector)
+
+    assert len(all_docs) == 1
+    doc = all_docs[0]
+    assert "Onegini Identity Cloud" in doc.sections[0].text
+
+
+@pytest.mark.parametrize("web_connector", [False], indirect=True)
+def test_web_connector_no_scroll(web_connector: WebConnector) -> None:
+    """With scrolling disabled, the same text is missing from the scraped page."""
+    all_docs = _load_all_documents(web_connector)
+
+    assert len(all_docs) == 1
+    doc = all_docs[0]
+    assert "Onegini Identity Cloud" not in doc.sections[0].text
diff --git a/web/src/lib/connectors/connectors.tsx b/web/src/lib/connectors/connectors.tsx
index e38819a36d18..cd5b17fed68d 100644
--- a/web/src/lib/connectors/connectors.tsx
+++ b/web/src/lib/connectors/connectors.tsx
@@ -152,7 +152,17 @@ export const connectorConfigs: Record<
         ],
       },
     ],
-    advanced_values: [],
+    advanced_values: [
+      {
+        type: "checkbox",
+        query: "Scroll before scraping:",
+        label: "Scroll before scraping",
+        description:
+          "Enable if the website requires scrolling for the desired content to load",
+        name: "scroll_before_scraping",
+        optional: true,
+      },
+    ],
     overrideDefaultFreq: 60 * 60 * 24,
   },
   github: {