support scrolling before scraping (#4040)

* support scrolling before scraping

* fix mypy

* install playwright deps

---------

Co-authored-by: Richard Kuo <rkuo@rkuo.com>
Author: rkuo-danswer
Committed: 2025-02-19 09:54:58 -08:00 (via GitHub)
Parent: 11f6b44625
Commit: c9f618798e
4 changed files with 75 additions and 2 deletions

File 1/4: GitHub Actions workflow that runs the daily connector tests

@@ -74,7 +74,9 @@ jobs:
           python -m pip install --upgrade pip
           pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
           pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
+          playwright install chromium
+          playwright install-deps chromium
       - name: Run Tests
         shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
         run: py.test -o junit_family=xunit2 -xv --ff backend/tests/daily/connectors
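
The two new lines install Chromium plus its required system libraries so Playwright can launch a real browser in CI. As a local sanity check (not part of the commit), a minimal sketch like the following confirms the install works; the URL is a placeholder:

    # Quick check that Playwright's Chromium is usable after
    # `playwright install chromium` and `playwright install-deps chromium`.
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto("https://example.com")  # placeholder URL
        print(page.title())
        browser.close()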

File 2/4: backend/onyx/connectors/web/connector.py

@@ -37,6 +37,8 @@ from shared_configs.configs import MULTI_TENANT
 logger = setup_logger()
 
+WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20
+
 class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
     # Given a base site, index everything under that path
@@ -225,10 +227,13 @@ class WebConnector(LoadConnector):
         web_connector_type: str = WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value,
         mintlify_cleanup: bool = True,  # Mostly ok to apply to other websites as well
         batch_size: int = INDEX_BATCH_SIZE,
+        scroll_before_scraping: bool = False,
+        **kwargs: Any,
     ) -> None:
         self.mintlify_cleanup = mintlify_cleanup
         self.batch_size = batch_size
         self.recursive = False
+        self.scroll_before_scraping = scroll_before_scraping
         if web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value:
             self.recursive = True
@@ -344,6 +349,18 @@ class WebConnector(LoadConnector):
                 continue
             visited_links.add(current_url)
 
+            if self.scroll_before_scraping:
+                scroll_attempts = 0
+                previous_height = page.evaluate("document.body.scrollHeight")
+                while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
+                    page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+                    page.wait_for_load_state("networkidle", timeout=30000)
+                    new_height = page.evaluate("document.body.scrollHeight")
+                    if new_height == previous_height:
+                        break  # Stop scrolling when no more content is loaded
+                    previous_height = new_height
+                    scroll_attempts += 1
+
             content = page.content()
             soup = BeautifulSoup(content, "html.parser")
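
For readers skimming the diff, the new logic is easier to see outside the connector. Here is a minimal standalone sketch of the same technique, assuming Playwright's sync API (which the connector code above uses); the URL is a placeholder:

    from playwright.sync_api import sync_playwright

    WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20  # same cap as the connector

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto("https://example.com")  # placeholder URL

        # Scroll to the bottom repeatedly; stop once the document height
        # stops growing (no more lazy-loaded content) or the cap is hit.
        scroll_attempts = 0
        previous_height = page.evaluate("document.body.scrollHeight")
        while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            page.wait_for_load_state("networkidle", timeout=30000)
            new_height = page.evaluate("document.body.scrollHeight")
            if new_height == previous_height:
                break
            previous_height = new_height
            scroll_attempts += 1

        html = page.content()  # now includes the lazily loaded content
        browser.close()

The attempt cap matters: a page with true infinite scroll would otherwise keep growing forever, so the loop gives up after 20 scrolls even if the height is still increasing.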

File 3/4: new web connector test under backend/tests/daily/connectors

@@ -0,0 +1,44 @@
+import pytest
+
+from onyx.connectors.models import Document
+from onyx.connectors.web.connector import WEB_CONNECTOR_VALID_SETTINGS
+from onyx.connectors.web.connector import WebConnector
+
+
+# NOTE(rkuo): we will probably need to adjust this test to point at our own test site
+# to avoid depending on a third party site
+@pytest.fixture
+def web_connector(request: pytest.FixtureRequest) -> WebConnector:
+    scroll_before_scraping = request.param
+    connector = WebConnector(
+        base_url="https://developer.onewelcome.com",
+        web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
+        scroll_before_scraping=scroll_before_scraping,
+    )
+    return connector
+
+
+@pytest.mark.parametrize("web_connector", [True], indirect=True)
+def test_web_connector_scroll(web_connector: WebConnector) -> None:
+    all_docs: list[Document] = []
+    document_batches = web_connector.load_from_state()
+    for doc_batch in document_batches:
+        for doc in doc_batch:
+            all_docs.append(doc)
+
+    assert len(all_docs) == 1
+    doc = all_docs[0]
+    assert "Onegini Identity Cloud" in doc.sections[0].text
+
+
+@pytest.mark.parametrize("web_connector", [False], indirect=True)
+def test_web_connector_no_scroll(web_connector: WebConnector) -> None:
+    all_docs: list[Document] = []
+    document_batches = web_connector.load_from_state()
+    for doc_batch in document_batches:
+        for doc in doc_batch:
+            all_docs.append(doc)
+
+    assert len(all_docs) == 1
+    doc = all_docs[0]
+    assert "Onegini Identity Cloud" not in doc.sections[0].text

File 4/4: web UI connector configuration (connectorConfigs)

@@ -152,7 +152,17 @@ export const connectorConfigs: Record<
       ],
     },
   ],
-  advanced_values: [],
+  advanced_values: [
+    {
+      type: "checkbox",
+      query: "Scroll before scraping:",
+      label: "Scroll before scraping",
+      description:
+        "Enable if the website requires scrolling for the desired content to load",
+      name: "scroll_before_scraping",
+      optional: true,
+    },
+  ],
   overrideDefaultFreq: 60 * 60 * 24,
 },
 github: {
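
End to end, the new checkbox feeds the scroll_before_scraping kwarg added to WebConnector.__init__ above. A usage sketch mirroring the test fixture; the base URL is a placeholder:

    from onyx.connectors.web.connector import WEB_CONNECTOR_VALID_SETTINGS, WebConnector

    connector = WebConnector(
        base_url="https://example.com",  # placeholder URL
        web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
        scroll_before_scraping=True,  # the option this commit adds
    )

    # load_from_state() yields batches of Document objects, as in the tests above.
    for batch in connector.load_from_state():
        for doc in batch:
            print(doc.sections[0].text[:80])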