From 39fd6919adf8fed51699d3008b4db6b939cf51e2 Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Mon, 3 Mar 2025 09:00:05 -0800 Subject: [PATCH] Fix web scrolling --- backend/onyx/connectors/web/connector.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/backend/onyx/connectors/web/connector.py b/backend/onyx/connectors/web/connector.py index cd620211062d..2f7f656cea17 100644 --- a/backend/onyx/connectors/web/connector.py +++ b/backend/onyx/connectors/web/connector.py @@ -16,7 +16,6 @@ from oauthlib.oauth2 import BackendApplicationClient from playwright.sync_api import BrowserContext from playwright.sync_api import Playwright from playwright.sync_api import sync_playwright -from playwright.sync_api import TimeoutError as PlaywrightTimeoutError from requests_oauthlib import OAuth2Session # type:ignore from urllib3.exceptions import MaxRetryError @@ -354,19 +353,13 @@ class WebConnector(LoadConnector): continue page = context.new_page() - # wait_until="networkidle" is used to wait for the page to load completely which is necessary - # for the javascript heavy websites - try: - page_response = page.goto( - initial_url, - wait_until="networkidle", - timeout=30000, # 30 seconds - ) - except PlaywrightTimeoutError: - logger.warning( - f"NetworkIdle timeout for {initial_url}, falling back to default load" - ) - page_response = page.goto(initial_url) + + # Can't use wait_until="networkidle" because it interferes with the scrolling behavior + page_response = page.goto( + initial_url, + timeout=30000, # 30 seconds + ) + last_modified = ( page_response.header_value("Last-Modified") if page_response