From 7f0653d1739c259c929d557d8ed169c91da88f34 Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Mon, 3 Mar 2025 08:18:44 -0800
Subject: [PATCH] Handling of #! sites (#4169)

---
 backend/onyx/connectors/web/connector.py | 28 +++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/backend/onyx/connectors/web/connector.py b/backend/onyx/connectors/web/connector.py
index 494a30fe86..cd62021106 100644
--- a/backend/onyx/connectors/web/connector.py
+++ b/backend/onyx/connectors/web/connector.py
@@ -16,6 +16,7 @@ from oauthlib.oauth2 import BackendApplicationClient
 from playwright.sync_api import BrowserContext
 from playwright.sync_api import Playwright
 from playwright.sync_api import sync_playwright
+from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
 from requests_oauthlib import OAuth2Session  # type:ignore
 from urllib3.exceptions import MaxRetryError
 
@@ -293,6 +294,7 @@ class WebConnector(LoadConnector):
         and converts them into documents"""
         visited_links: set[str] = set()
         to_visit: list[str] = self.to_visit_list
+        content_hashes = set()
 
         if not to_visit:
             raise ValueError("No URLs to visit")
@@ -352,11 +354,19 @@
                     continue
 
                 page = context.new_page()
-                """wait_until="networkidle" is used to wait for the page to load completely which is necessary
-                for the javascript heavy websites"""
-                page_response = page.goto(
-                    initial_url, wait_until="networkidle", timeout=60000
-                )
+                # wait_until="networkidle" is used to wait for the page to load completely,
+                # which is necessary for JavaScript-heavy websites
+                try:
+                    page_response = page.goto(
+                        initial_url,
+                        wait_until="networkidle",
+                        timeout=30000,  # 30 seconds
+                    )
+                except PlaywrightTimeoutError:
+                    logger.warning(
+                        f"NetworkIdle timeout for {initial_url}, falling back to default load"
+                    )
+                    page_response = page.goto(initial_url)
                 last_modified = (
                     page_response.header_value("Last-Modified")
                     if page_response
@@ -424,6 +434,14 @@
                     else:
                         parsed_html.cleaned_text += "\n" + document_text
 
+                # Sometimes pages with #! will serve duplicate content
+                # There are also other ways this can happen
+                hashed_text = hash(parsed_html.cleaned_text)
+                if hashed_text in content_hashes:
+                    logger.info(f"Skipping duplicate content for {initial_url}")
+                    continue
+                content_hashes.add(hashed_text)
+
                 doc_batch.append(
                     Document(
                         id=initial_url,
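
Note (illustration, not part of the patch): the diff combines two behaviors, a "networkidle" page load with a timeout fallback and hash-based skipping of duplicate page content. The minimal sketch below shows both in isolation; fetch_page_text, crawl, and PAGE_LOAD_TIMEOUT_MS are hypothetical names introduced only for this example and are not Onyx APIs.

# Illustrative sketch only; fetch_page_text, crawl, and PAGE_LOAD_TIMEOUT_MS are
# hypothetical names for this example and are not part of the Onyx codebase.
from playwright.sync_api import Page
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from playwright.sync_api import sync_playwright

PAGE_LOAD_TIMEOUT_MS = 30_000  # 30 seconds, matching the patch


def fetch_page_text(page: Page, url: str) -> str:
    # Prefer wait_until="networkidle" so JavaScript-heavy pages finish rendering,
    # but fall back to the default "load" wait if the page never goes network-idle
    # (e.g. sites that poll or stream), mirroring the try/except in the patch.
    try:
        page.goto(url, wait_until="networkidle", timeout=PAGE_LOAD_TIMEOUT_MS)
    except PlaywrightTimeoutError:
        page.goto(url)
    return page.inner_text("body")


def crawl(urls: list[str]) -> list[tuple[str, str]]:
    # Skip pages whose extracted text was already seen. Hashing the content
    # (not the URL) is what collapses distinct #! URLs that render the same page.
    seen_hashes: set[int] = set()
    results: list[tuple[str, str]] = []
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch()
        page = browser.new_page()
        for url in urls:
            text = fetch_page_text(page, url)
            text_hash = hash(text)
            if text_hash in seen_hashes:
                continue  # duplicate content, skip it
            seen_hashes.add(text_hash)
            results.append((url, text))
        browser.close()
    return results

Hashing the cleaned text rather than the URL is the key design choice here: several #! URLs that render identical content collapse to a single document instead of producing duplicates.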