Handling of #! sites (#4169)

Author: Yuhong Sun
Date: 2025-03-03 08:18:44 -08:00
Committed by: GitHub
Parent: e9905a398b
Commit: 7f0653d173


@@ -16,6 +16,7 @@ from oauthlib.oauth2 import BackendApplicationClient
 from playwright.sync_api import BrowserContext
 from playwright.sync_api import Playwright
 from playwright.sync_api import sync_playwright
+from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
 from requests_oauthlib import OAuth2Session  # type:ignore
 from urllib3.exceptions import MaxRetryError
@@ -293,6 +294,7 @@ class WebConnector(LoadConnector):
         and converts them into documents"""
         visited_links: set[str] = set()
         to_visit: list[str] = self.to_visit_list
+        content_hashes: set[int] = set()

         if not to_visit:
             raise ValueError("No URLs to visit")
@@ -352,11 +354,19 @@
                 continue

             page = context.new_page()
-            """wait_until="networkidle" is used to wait for the page to load completely which is necessary
-            for the javascript heavy websites"""
-            page_response = page.goto(
-                initial_url, wait_until="networkidle", timeout=60000
-            )
+            # wait_until="networkidle" waits for the page to finish loading,
+            # which is necessary for JavaScript-heavy websites
+            try:
+                page_response = page.goto(
+                    initial_url,
+                    wait_until="networkidle",
+                    timeout=30000,  # 30 seconds
+                )
+            except PlaywrightTimeoutError:
+                logger.warning(
+                    f"NetworkIdle timeout for {initial_url}, falling back to default load"
+                )
+                page_response = page.goto(initial_url)

             last_modified = (
                 page_response.header_value("Last-Modified")
                 if page_response
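
Taken on its own, the change above is a try/fallback around Playwright's navigation wait. A minimal sketch of the same pattern outside the connector (goto_with_fallback and the example URL are illustrative, not part of the commit):

from playwright.sync_api import Page, Response
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from playwright.sync_api import sync_playwright


def goto_with_fallback(page: Page, url: str) -> Response | None:
    """Wait for network idle; fall back to the default "load" wait.

    Pages with long polling or persistent connections may never go
    network-idle, so the fallback keeps the crawl moving with whatever
    content has loaded.
    """
    try:
        # "networkidle" resolves once no network requests have been in
        # flight for 500 ms, a proxy for "JavaScript finished rendering"
        return page.goto(url, wait_until="networkidle", timeout=30000)
    except PlaywrightTimeoutError:
        # page.goto's default wait_until is "load"
        return page.goto(url)


with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    response = goto_with_fallback(page, "https://example.com")
    print(response.status if response else "no response")
    browser.close()

The import alias matters here: playwright.sync_api.TimeoutError would otherwise shadow the built-in TimeoutError, and catching the built-in would not catch Playwright's, since the two classes are unrelated.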
@@ -424,6 +434,14 @@
             else:
                 parsed_html.cleaned_text += "\n" + document_text

+            # Sometimes pages with #! will serve duplicate content: the
+            # fragment is never sent to the server, so distinct #! URLs can
+            # return the same page. Duplicates can arise in other ways too.
+            hashed_text = hash(parsed_html.cleaned_text)
+            if hashed_text in content_hashes:
+                logger.info(f"Skipping duplicate content for {initial_url}")
+                continue
+            content_hashes.add(hashed_text)
+
             doc_batch.append(
                 Document(
                     id=initial_url,
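
The dedup itself is just a hash-set membership test over the cleaned page text. A sketch of that logic factored into a helper (is_new_content is a hypothetical name, not something the commit introduces):

def is_new_content(cleaned_text: str, content_hashes: set[int]) -> bool:
    """Return True if this page text has not been seen in this crawl.

    Python's str hash is salted per process (PYTHONHASHSEED), so these
    values are only comparable within a single run, which is all the
    per-crawl set needs.
    """
    hashed_text = hash(cleaned_text)
    if hashed_text in content_hashes:
        return False
    content_hashes.add(hashed_text)
    return True


seen: set[int] = set()
assert is_new_content("same shell page", seen)
assert not is_new_content("same shell page", seen)

If duplicates ever needed to be detected across runs, a stable digest such as hashlib.sha256(cleaned_text.encode()).hexdigest() would be the usual substitute for the process-salted built-in hash.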