Handling of #! sites (#4169)

Yuhong Sun
2025-03-03 08:18:44 -08:00
committed by GitHub
parent e9905a398b
commit 7f0653d173


@@ -16,6 +16,7 @@ from oauthlib.oauth2 import BackendApplicationClient
 from playwright.sync_api import BrowserContext
 from playwright.sync_api import Playwright
 from playwright.sync_api import sync_playwright
+from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
 from requests_oauthlib import OAuth2Session  # type:ignore
 from urllib3.exceptions import MaxRetryError
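A brief note on the aliased import: Playwright's sync API exposes its own `TimeoutError`, which would shadow the builtin name if imported bare, so the commit binds it as `PlaywrightTimeoutError` to keep the later `except` clause unambiguous. A minimal sketch of the distinction (the asserts are illustrative, not from the commit):

```python
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError

# Playwright's timeout error is a plain Exception subclass,
# not the builtin TimeoutError (an OSError subclass).
assert issubclass(PlaywrightTimeoutError, Exception)
assert PlaywrightTimeoutError is not TimeoutError
```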
@@ -293,6 +294,7 @@ class WebConnector(LoadConnector):
         and converts them into documents"""
         visited_links: set[str] = set()
         to_visit: list[str] = self.to_visit_list
+        content_hashes = set()

         if not to_visit:
             raise ValueError("No URLs to visit")
@@ -352,11 +354,19 @@
                 continue
             page = context.new_page()
-            """wait_until="networkidle" is used to wait for the page to load completely which is necessary
-            for the javascript heavy websites"""
-            page_response = page.goto(
-                initial_url, wait_until="networkidle", timeout=60000
-            )
+            # wait_until="networkidle" is used to wait for the page to load completely which is necessary
+            # for the javascript heavy websites
+            try:
+                page_response = page.goto(
+                    initial_url,
+                    wait_until="networkidle",
+                    timeout=30000,  # 30 seconds
+                )
+            except PlaywrightTimeoutError:
+                logger.warning(
+                    f"NetworkIdle timeout for {initial_url}, falling back to default load"
+                )
+                page_response = page.goto(initial_url)

             last_modified = (
                 page_response.header_value("Last-Modified")
                 if page_response
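The new navigation logic first waits for `networkidle` (no network connections for at least 500 ms, which JS-heavy sites need before their rendered content is in the DOM) and, on timeout, retries with Playwright's default `load` wait so pages that never go idle (long-polling, analytics beacons) are still fetched rather than dropped. A minimal standalone sketch of the same pattern, assuming Playwright is installed; the URL and the final `print` are placeholders, not from the commit:

```python
from playwright.sync_api import sync_playwright
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError

url = "https://example.com"  # placeholder

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    try:
        # Wait until the network has been idle for 500 ms; JS-heavy
        # pages often finish rendering only after this point.
        response = page.goto(url, wait_until="networkidle", timeout=30_000)
    except PlaywrightTimeoutError:
        # Fall back to the default "load" wait so a page that never
        # goes network-idle is still fetched.
        response = page.goto(url)
    print(response.status if response else "no response")
    browser.close()
```

Halving the timeout from 60 s to 30 s also bounds the worst case: a stubborn page now costs one 30-second wait plus one default-wait navigation, instead of a full minute with nothing to show for it.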
@@ -424,6 +434,14 @@
             else:
                 parsed_html.cleaned_text += "\n" + document_text

+            # Sometimes pages with #! will serve duplicate content
+            # There are also other ways this can happen
+            hashed_text = hash(parsed_html.cleaned_text)
+            if hashed_text in content_hashes:
+                logger.info(f"Skipping duplicate content for {initial_url}")
+                continue
+            content_hashes.add(hashed_text)
+
             doc_batch.append(
                 Document(
                     id=initial_url,
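The duplicate check exists because `#!` (hashbang) URLs are fragment-based: the part after `#` is never sent to the server, so distinct-looking links can render identical content. A standalone sketch of the dedup idea, with illustrative names (`seen_before` is not from the commit); note that Python's built-in `hash()` is salted per process for strings, so the hashes are comparable within a single crawl run but must not be persisted across runs:

```python
content_hashes: set[int] = set()

def seen_before(cleaned_text: str) -> bool:
    """Return True if this exact page text was already indexed this run."""
    hashed_text = hash(cleaned_text)
    if hashed_text in content_hashes:
        return True
    content_hashes.add(hashed_text)
    return False

# e.g. https://site/#!/a and https://site/#!/b can serve the same HTML
assert seen_before("same rendered body") is False
assert seen_before("same rendered body") is True
```

Storing hashes rather than the cleaned text itself keeps memory proportional to the number of pages crawled, at the cost of a negligible collision risk.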