mirror of https://github.com/danswer-ai/danswer.git
synced 2025-09-27 20:38:32 +02:00

support scrolling before scraping (#4040)

* support scrolling before scraping
* fix mypy
* install playwright deps

Co-authored-by: Richard Kuo <rkuo@rkuo.com>
CI workflow: install the Chromium browser and its system dependencies before the connector tests run.

@@ -74,7 +74,9 @@ jobs:
           python -m pip install --upgrade pip
           pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
           pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
+          playwright install chromium
+          playwright install-deps chromium
 
       - name: Run Tests
         shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
         run: py.test -o junit_family=xunit2 -xv --ff backend/tests/daily/connectors
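The two new CI steps work as a pair: `playwright install chromium` downloads the browser binary, and `playwright install-deps chromium` installs the system libraries headless Chromium needs on a bare runner. A minimal smoke check, not part of the commit (the URL is a placeholder), that would confirm the installed browser actually launches:

```python
# Hedged sketch: verify the Playwright-managed Chromium can start and render.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)  # fails here if install-deps was skipped
    page = browser.new_page()
    page.goto("https://example.com")  # placeholder URL
    assert page.title()  # a non-empty title means a page actually rendered
    browser.close()
```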
Web connector (`onyx/connectors/web/connector.py`): a new module-level cap on scroll attempts.

@@ -37,6 +37,8 @@ from shared_configs.configs import MULTI_TENANT
 
 logger = setup_logger()
 
+WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20
+
 
 class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
     # Given a base site, index everything under that path
The constructor gains the `scroll_before_scraping` flag, plus `**kwargs` so unrecognized settings are tolerated:

@@ -225,10 +227,13 @@ class WebConnector(LoadConnector):
         web_connector_type: str = WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value,
         mintlify_cleanup: bool = True,  # Mostly ok to apply to other websites as well
         batch_size: int = INDEX_BATCH_SIZE,
+        scroll_before_scraping: bool = False,
+        **kwargs: Any,
     ) -> None:
         self.mintlify_cleanup = mintlify_cleanup
         self.batch_size = batch_size
         self.recursive = False
+        self.scroll_before_scraping = scroll_before_scraping
 
         if web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value:
             self.recursive = True
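Connector settings arrive as keyword arguments, so the new `**kwargs: Any` lets the constructor ignore keys it does not recognize while `scroll_before_scraping` defaults to off. A hypothetical construction sketch (the URL and the extra key are invented for illustration):

```python
from onyx.connectors.web.connector import WebConnector

config = {
    "base_url": "https://docs.example.com",  # placeholder URL
    "scroll_before_scraping": True,
    "some_future_option": 123,  # unknown key, absorbed by **kwargs instead of raising TypeError
}
connector = WebConnector(**config)
assert connector.scroll_before_scraping is True
```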
In the crawl loop, when the flag is set, the page is scrolled until its height stabilizes before the HTML is captured:

@@ -344,6 +349,18 @@ class WebConnector(LoadConnector):
                     continue
                 visited_links.add(current_url)
 
+                if self.scroll_before_scraping:
+                    scroll_attempts = 0
+                    previous_height = page.evaluate("document.body.scrollHeight")
+                    while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
+                        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+                        page.wait_for_load_state("networkidle", timeout=30000)
+                        new_height = page.evaluate("document.body.scrollHeight")
+                        if new_height == previous_height:
+                            break  # Stop scrolling when no more content is loaded
+                        previous_height = new_height
+                        scroll_attempts += 1
+
                 content = page.content()
                 soup = BeautifulSoup(content, "html.parser")
 
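The loop is a standard infinite-scroll drain: jump to the bottom of the page, wait for the network to go quiet, and stop once `document.body.scrollHeight` stops growing or the attempt cap is reached. A self-contained sketch of the same pattern outside the connector (the function name and URL handling are assumptions; the constants mirror the commit):

```python
from playwright.sync_api import sync_playwright

MAX_SCROLL_ATTEMPTS = 20  # mirrors WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS


def fully_scrolled_html(url: str) -> str:
    """Return the page HTML after scrolling until the page stops growing."""
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto(url)
        previous_height = page.evaluate("document.body.scrollHeight")
        for _ in range(MAX_SCROLL_ATTEMPTS):
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            # let lazy loaders fetch; same 30-second budget as the commit
            page.wait_for_load_state("networkidle", timeout=30000)
            new_height = page.evaluate("document.body.scrollHeight")
            if new_height == previous_height:
                break  # the page stopped growing, so nothing more will load
            previous_height = new_height
        html = page.content()
        browser.close()
        return html
```

Worth noting: `wait_for_load_state("networkidle")` raises a TimeoutError on pages that never go quiet (long polling, analytics beacons), so the attempt cap is not the only way the scroll phase can end.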
backend/tests/daily/connectors/web/test_web_connector.py (new file, 44 lines):

@@ -0,0 +1,44 @@
+import pytest
+
+from onyx.connectors.models import Document
+from onyx.connectors.web.connector import WEB_CONNECTOR_VALID_SETTINGS
+from onyx.connectors.web.connector import WebConnector
+
+
+# NOTE(rkuo): we will probably need to adjust this test to point at our own test site
+# to avoid depending on a third party site
+@pytest.fixture
+def web_connector(request: pytest.FixtureRequest) -> WebConnector:
+    scroll_before_scraping = request.param
+    connector = WebConnector(
+        base_url="https://developer.onewelcome.com",
+        web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
+        scroll_before_scraping=scroll_before_scraping,
+    )
+    return connector
+
+
+@pytest.mark.parametrize("web_connector", [True], indirect=True)
+def test_web_connector_scroll(web_connector: WebConnector) -> None:
+    all_docs: list[Document] = []
+    document_batches = web_connector.load_from_state()
+    for doc_batch in document_batches:
+        for doc in doc_batch:
+            all_docs.append(doc)
+
+    assert len(all_docs) == 1
+    doc = all_docs[0]
+    assert "Onegini Identity Cloud" in doc.sections[0].text
+
+
+@pytest.mark.parametrize("web_connector", [False], indirect=True)
+def test_web_connector_no_scroll(web_connector: WebConnector) -> None:
+    all_docs: list[Document] = []
+    document_batches = web_connector.load_from_state()
+    for doc_batch in document_batches:
+        for doc in doc_batch:
+            all_docs.append(doc)
+
+    assert len(all_docs) == 1
+    doc = all_docs[0]
+    assert "Onegini Identity Cloud" not in doc.sections[0].text
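Both tests flatten the yielded batches with the same nested loop; a small helper (hypothetical, not in the commit) would express the contract more directly:

```python
from onyx.connectors.models import Document
from onyx.connectors.web.connector import WebConnector


def load_all_docs(connector: WebConnector) -> list[Document]:
    # load_from_state() yields batches of Documents; flatten them into one list
    return [doc for batch in connector.load_from_state() for doc in batch]
```

The paired assertions encode the feature's observable behavior: "Onegini Identity Cloud" appears in the scraped text only when scrolling is enabled, which is exactly why the NOTE flags the third-party page as a fragile fixture.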
Frontend connector config (`connectorConfigs`): the advanced checkbox that toggles the behavior from the UI.

@@ -152,7 +152,17 @@ export const connectorConfigs: Record<
       ],
     },
   ],
-  advanced_values: [],
+  advanced_values: [
+    {
+      type: "checkbox",
+      query: "Scroll before scraping:",
+      label: "Scroll before scraping",
+      description:
+        "Enable if the website requires scrolling for the desired content to load",
+      name: "scroll_before_scraping",
+      optional: true,
+    },
+  ],
   overrideDefaultFreq: 60 * 60 * 24,
 },
 github: {
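Because the checkbox is marked `optional`, a stored config may omit the `scroll_before_scraping` key entirely; the Python-side default of `False` covers that case. A hedged sketch (placeholder URL):

```python
from onyx.connectors.web.connector import WebConnector

# No "scroll_before_scraping" key at all, as when the checkbox is never touched.
connector = WebConnector(base_url="https://docs.example.com")  # placeholder URL
assert connector.scroll_before_scraping is False
```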