Mirror of https://github.com/danswer-ai/danswer.git
support scrolling before scraping (#4040)
* support scrolling before scraping
* fix mypy
* install playwright deps

Co-authored-by: Richard Kuo <rkuo@rkuo.com>
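In short: when the new `scroll_before_scraping` flag is set, the connector repeatedly scrolls to the bottom of the page and waits for the network to go idle, stopping once the document height stops growing (or after 20 attempts), so lazy-loaded content is present before scraping. A minimal standalone sketch of that pattern, assuming Playwright's sync API; the URL here is a placeholder, not a value from the commit:

# Standalone sketch of the scroll-until-stable pattern this commit adds.
from playwright.sync_api import sync_playwright

MAX_SCROLL_ATTEMPTS = 20  # mirrors WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS below

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto("https://example.com")  # placeholder URL
    previous_height = page.evaluate("document.body.scrollHeight")
    for _ in range(MAX_SCROLL_ATTEMPTS):
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        # "networkidle" waits for network activity to settle, giving
        # lazy-loaded content a chance to arrive before we re-measure.
        page.wait_for_load_state("networkidle", timeout=30000)
        new_height = page.evaluate("document.body.scrollHeight")
        if new_height == previous_height:
            break  # page height stabilized; nothing more is loading
        previous_height = new_height
    html = page.content()
    browser.close()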
@@ -74,6 +74,8 @@ jobs:
           python -m pip install --upgrade pip
           pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
           pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
+          playwright install chromium
+          playwright install-deps chromium
 
       - name: Run Tests
         shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
@@ -37,6 +37,8 @@ from shared_configs.configs import MULTI_TENANT
 
 logger = setup_logger()
 
+WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20
+
 
 class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
     # Given a base site, index everything under that path
@@ -225,10 +227,13 @@ class WebConnector(LoadConnector):
         web_connector_type: str = WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value,
         mintlify_cleanup: bool = True,  # Mostly ok to apply to other websites as well
         batch_size: int = INDEX_BATCH_SIZE,
+        scroll_before_scraping: bool = False,
         **kwargs: Any,
     ) -> None:
         self.mintlify_cleanup = mintlify_cleanup
         self.batch_size = batch_size
         self.recursive = False
+        self.scroll_before_scraping = scroll_before_scraping
+
         if web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value:
             self.recursive = True
@@ -344,6 +349,18 @@ class WebConnector(LoadConnector):
                     continue
                 visited_links.add(current_url)
 
+                if self.scroll_before_scraping:
+                    scroll_attempts = 0
+                    previous_height = page.evaluate("document.body.scrollHeight")
+                    while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
+                        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+                        page.wait_for_load_state("networkidle", timeout=30000)
+                        new_height = page.evaluate("document.body.scrollHeight")
+                        if new_height == previous_height:
+                            break  # Stop scrolling when no more content is loaded
+                        previous_height = new_height
+                        scroll_attempts += 1
+
                 content = page.content()
                 soup = BeautifulSoup(content, "html.parser")
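Caller-side, the new behavior is opt-in via the constructor. A hedged usage sketch mirroring the fixture in the test file added below (the URL is a placeholder; `load_from_state()` and `Document.sections` are the existing API as used in those tests):

# Hypothetical usage of the new flag; mirrors the test fixture below.
from onyx.connectors.web.connector import WEB_CONNECTOR_VALID_SETTINGS
from onyx.connectors.web.connector import WebConnector

connector = WebConnector(
    base_url="https://example.com",  # placeholder URL
    web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
    scroll_before_scraping=True,
)
for batch in connector.load_from_state():
    for doc in batch:
        # First section's text, the field the tests assert on.
        print(doc.sections[0].text[:80])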
backend/tests/daily/connectors/web/test_web_connector.py (new file)
@@ -0,0 +1,44 @@
+import pytest
+
+from onyx.connectors.models import Document
+from onyx.connectors.web.connector import WEB_CONNECTOR_VALID_SETTINGS
+from onyx.connectors.web.connector import WebConnector
+
+
+# NOTE(rkuo): we will probably need to adjust this test to point at our own test site
+# to avoid depending on a third party site
+@pytest.fixture
+def web_connector(request: pytest.FixtureRequest) -> WebConnector:
+    scroll_before_scraping = request.param
+    connector = WebConnector(
+        base_url="https://developer.onewelcome.com",
+        web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
+        scroll_before_scraping=scroll_before_scraping,
+    )
+    return connector
+
+
+@pytest.mark.parametrize("web_connector", [True], indirect=True)
+def test_web_connector_scroll(web_connector: WebConnector) -> None:
+    all_docs: list[Document] = []
+    document_batches = web_connector.load_from_state()
+    for doc_batch in document_batches:
+        for doc in doc_batch:
+            all_docs.append(doc)
+
+    assert len(all_docs) == 1
+    doc = all_docs[0]
+    assert "Onegini Identity Cloud" in doc.sections[0].text
+
+
+@pytest.mark.parametrize("web_connector", [False], indirect=True)
+def test_web_connector_no_scroll(web_connector: WebConnector) -> None:
+    all_docs: list[Document] = []
+    document_batches = web_connector.load_from_state()
+    for doc_batch in document_batches:
+        for doc in doc_batch:
+            all_docs.append(doc)
+
+    assert len(all_docs) == 1
+    doc = all_docs[0]
+    assert "Onegini Identity Cloud" not in doc.sections[0].text
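To exercise these tests locally, an invocation along these lines should work from the repo root (a sketch via pytest's Python API; CI may wire the daily tests differently):

# Hypothetical local runner for the new tests.
import pytest

exit_code = pytest.main(
    ["backend/tests/daily/connectors/web/test_web_connector.py", "-v"]
)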
@@ -152,7 +152,17 @@ export const connectorConfigs: Record<
         ],
       },
     ],
-    advanced_values: [],
+    advanced_values: [
+      {
+        type: "checkbox",
+        query: "Scroll before scraping:",
+        label: "Scroll before scraping",
+        description:
+          "Enable if the website requires scrolling for the desired content to load",
+        name: "scroll_before_scraping",
+        optional: true,
+      },
+    ],
     overrideDefaultFreq: 60 * 60 * 24,
   },
   github: {