diff --git a/backend/onyx/connectors/web/connector.py b/backend/onyx/connectors/web/connector.py
index ff19d9de3f3..ec850ebef5a 100644
--- a/backend/onyx/connectors/web/connector.py
+++ b/backend/onyx/connectors/web/connector.py
@@ -50,6 +50,17 @@ class ScrapeSessionContext:
     def __init__(self, base_url: str, to_visit: list[str]):
         self.base_url = base_url
         self.to_visit = to_visit
+        self.visited_links: set[str] = set()
+        self.content_hashes: set[int] = set()
+
+        self.doc_batch: list[Document] = []
+
+        self.at_least_one_doc: bool = False
+        self.last_error: str | None = None
+        self.needs_retry: bool = False
+
+        self.playwright: Playwright | None = None
+        self.playwright_context: BrowserContext | None = None
 
     def initialize(self) -> None:
         self.stop()
@@ -64,21 +75,6 @@ class ScrapeSessionContext:
             self.playwright.stop()
             self.playwright = None
 
-    base_url: str
-    to_visit: list[str]
-    playwright: Playwright | None = None
-    playwright_context: BrowserContext | None = None
-
-    visited_links: set[str] = set()
-    content_hashes: set[int] = set()
-
-    doc_batch: list[Document] = []
-
-    at_least_one_doc: bool = False
-    last_error: str | None = None
-
-    needs_retry: bool = False
-
 
 class ScrapeResult:
     doc: Document | None = None
@@ -177,9 +173,6 @@ def check_internet_connection(url: str) -> None:
     session = requests.Session()
     session.headers.update(DEFAULT_HEADERS)
 
-    # Add a random delay to mimic human behavior
-    time.sleep(random.uniform(0.1, 0.5))
-
     response = session.get(url, timeout=5, allow_redirects=True)
     response.raise_for_status()
 
@@ -445,7 +438,6 @@ class WebConnector(LoadConnector):
         mintlify_cleanup: bool = True,  # Mostly ok to apply to other websites as well
         batch_size: int = INDEX_BATCH_SIZE,
         scroll_before_scraping: bool = False,
-        add_randomness: bool = True,
         **kwargs: Any,
     ) -> None:
         self.mintlify_cleanup = mintlify_cleanup
@@ -453,7 +445,6 @@ class WebConnector(LoadConnector):
         self.recursive = False
         self.scroll_before_scraping = scroll_before_scraping
         self.web_connector_type = web_connector_type
-        self.add_randomness = add_randomness
         if web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value:
             self.recursive = True
             self.to_visit_list = [_ensure_valid_url(base_url)]
@@ -538,10 +529,6 @@ class WebConnector(LoadConnector):
         page = session_ctx.playwright_context.new_page()
 
         try:
-            if self.add_randomness:
-                # Add random mouse movements and scrolling to mimic human behavior
-                page.mouse.move(random.randint(100, 700), random.randint(100, 500))
-
             # Can't use wait_until="networkidle" because it interferes with the scrolling behavior
             page_response = page.goto(
                 initial_url,
@@ -549,9 +536,6 @@ class WebConnector(LoadConnector):
                 wait_until="domcontentloaded",  # Wait for DOM to be ready
             )
 
-            # Add a small random delay to mimic human behavior
-            time.sleep(random.uniform(0.5, 2.0))
-
             last_modified = (
                 page_response.header_value("Last-Modified") if page_response else None
             )
@@ -575,7 +559,10 @@ class WebConnector(LoadConnector):
                 previous_height = page.evaluate("document.body.scrollHeight")
                 while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
                     page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+                    # wait for the content to load if we scrolled
                     page.wait_for_load_state("networkidle", timeout=30000)
+                    time.sleep(0.5)  # let javascript run
+
                     new_height = page.evaluate("document.body.scrollHeight")
                     if new_height == previous_height:
                         break  # Stop scrolling when no more content is loaded
diff --git a/backend/tests/daily/connectors/web/test_web_connector.py b/backend/tests/daily/connectors/web/test_web_connector.py
index e7288ca18d8..7c987013470 100644
--- a/backend/tests/daily/connectors/web/test_web_connector.py
+++ b/backend/tests/daily/connectors/web/test_web_connector.py
@@ -12,21 +12,20 @@ EXPECTED_QUOTE = (
 # NOTE(rkuo): we will probably need to adjust this test to point at our own test site
 # to avoid depending on a third party site
 @pytest.fixture
-def web_connector(request: pytest.FixtureRequest) -> WebConnector:
+def quotes_to_scroll_web_connector(request: pytest.FixtureRequest) -> WebConnector:
     scroll_before_scraping = request.param
     connector = WebConnector(
         base_url="https://quotes.toscrape.com/scroll",
         web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
         scroll_before_scraping=scroll_before_scraping,
-        add_randomness=False,
     )
     return connector
 
 
-@pytest.mark.parametrize("web_connector", [True], indirect=True)
-def test_web_connector_scroll(web_connector: WebConnector) -> None:
+@pytest.mark.parametrize("quotes_to_scroll_web_connector", [True], indirect=True)
+def test_web_connector_scroll(quotes_to_scroll_web_connector: WebConnector) -> None:
     all_docs: list[Document] = []
-    document_batches = web_connector.load_from_state()
+    document_batches = quotes_to_scroll_web_connector.load_from_state()
     for doc_batch in document_batches:
         for doc in doc_batch:
             all_docs.append(doc)
@@ -37,10 +36,10 @@ def test_web_connector_scroll(web_connector: WebConnector) -> None:
         assert EXPECTED_QUOTE in doc.sections[0].text
 
 
-@pytest.mark.parametrize("web_connector", [False], indirect=True)
-def test_web_connector_no_scroll(web_connector: WebConnector) -> None:
+@pytest.mark.parametrize("quotes_to_scroll_web_connector", [False], indirect=True)
+def test_web_connector_no_scroll(quotes_to_scroll_web_connector: WebConnector) -> None:
     all_docs: list[Document] = []
-    document_batches = web_connector.load_from_state()
+    document_batches = quotes_to_scroll_web_connector.load_from_state()
     for doc_batch in document_batches:
         for doc in doc_batch:
             all_docs.append(doc)
@@ -49,3 +48,20 @@ def test_web_connector_no_scroll(web_connector: WebConnector) -> None:
     doc = all_docs[0]
     assert doc.sections[0].text is not None
     assert EXPECTED_QUOTE not in doc.sections[0].text
+
+
+MERCURY_EXPECTED_QUOTE = "How can we help?"
+
+
+def test_web_connector_bot_protection() -> None:
+    connector = WebConnector(
+        base_url="https://support.mercury.com/hc",
+        web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
+    )
+    document_batches = list(connector.load_from_state())
+    assert len(document_batches) == 1
+    doc_batch = document_batches[0]
+    assert len(doc_batch) == 1
+    doc = doc_batch[0]
+    assert doc.sections[0].text is not None
+    assert MERCURY_EXPECTED_QUOTE in doc.sections[0].text
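
For context, a minimal standalone sketch of the scroll-and-settle pattern the connector hunk above converges on: scroll to the bottom, wait for network idle, then sleep briefly so client-side JavaScript can render before re-measuring the page height. The target URL and the attempt cap are stand-ins here, not the connector's actual configuration.

# Sketch only, not the connector itself; URL and attempt cap are assumptions.
import time

from playwright.sync_api import sync_playwright

WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20  # stand-in for the connector's constant

with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()
    # Same rationale as the connector: "networkidle" at goto time would
    # interfere with scrolling, so wait only for the DOM here.
    page.goto("https://quotes.toscrape.com/scroll", wait_until="domcontentloaded")

    scroll_attempts = 0
    previous_height = page.evaluate("document.body.scrollHeight")
    while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        # Wait for lazy-loaded content, then give client-side JS a beat to render.
        page.wait_for_load_state("networkidle", timeout=30000)
        time.sleep(0.5)

        new_height = page.evaluate("document.body.scrollHeight")
        if new_height == previous_height:
            break  # no more content loaded; the page has settled
        previous_height = new_height
        scroll_attempts += 1

    print(page.content()[:200])  # the fully scrolled DOM is now available
    browser.close()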