From 7f0653d1739c259c929d557d8ed169c91da88f34 Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Mon, 3 Mar 2025 08:18:44 -0800
Subject: [PATCH] Handling of #! sites (#4169)

---
 backend/onyx/connectors/web/connector.py | 28 +++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/backend/onyx/connectors/web/connector.py b/backend/onyx/connectors/web/connector.py
index 494a30fe86..cd62021106 100644
--- a/backend/onyx/connectors/web/connector.py
+++ b/backend/onyx/connectors/web/connector.py
@@ -16,6 +16,7 @@ from oauthlib.oauth2 import BackendApplicationClient
 from playwright.sync_api import BrowserContext
 from playwright.sync_api import Playwright
 from playwright.sync_api import sync_playwright
+from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
 from requests_oauthlib import OAuth2Session  # type:ignore
 from urllib3.exceptions import MaxRetryError
 
@@ -293,6 +294,7 @@ class WebConnector(LoadConnector):
         and converts them into documents"""
         visited_links: set[str] = set()
         to_visit: list[str] = self.to_visit_list
+        content_hashes = set()
 
         if not to_visit:
             raise ValueError("No URLs to visit")
@@ -352,11 +354,19 @@
                     continue
 
                 page = context.new_page()
-                """wait_until="networkidle" is used to wait for the page to load completely which is necessary
-                for the javascript heavy websites"""
-                page_response = page.goto(
-                    initial_url, wait_until="networkidle", timeout=60000
-                )
+                # wait_until="networkidle" is used to wait for the page to load completely,
+                # which is necessary for JavaScript-heavy websites
+                try:
+                    page_response = page.goto(
+                        initial_url,
+                        wait_until="networkidle",
+                        timeout=30000,  # 30 seconds
+                    )
+                except PlaywrightTimeoutError:
+                    logger.warning(
+                        f"NetworkIdle timeout for {initial_url}, falling back to default load"
+                    )
+                    page_response = page.goto(initial_url)
                 last_modified = (
                     page_response.header_value("Last-Modified")
                     if page_response
@@ -424,6 +434,14 @@
                     else:
                         parsed_html.cleaned_text += "\n" + document_text
 
+                # Sometimes pages with #! will serve duplicate content
+                # There are also other ways this can happen
+                hashed_text = hash(parsed_html.cleaned_text)
+                if hashed_text in content_hashes:
+                    logger.info(f"Skipping duplicate content for {initial_url}")
+                    continue
+                content_hashes.add(hashed_text)
+
                 doc_batch.append(
                     Document(
                         id=initial_url,
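
Note (illustration, not part of the patch): the diff combines two behaviors, a "networkidle" page load with a timeout fallback and hash-based skipping of duplicate page content. The minimal sketch below shows both in isolation; fetch_page_text, crawl, and PAGE_LOAD_TIMEOUT_MS are hypothetical names introduced only for this example and are not Onyx APIs.

# Illustrative sketch only; fetch_page_text, crawl, and PAGE_LOAD_TIMEOUT_MS are
# hypothetical names for this example and are not part of the Onyx codebase.
from playwright.sync_api import Page
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from playwright.sync_api import sync_playwright

PAGE_LOAD_TIMEOUT_MS = 30_000  # 30 seconds, matching the patch


def fetch_page_text(page: Page, url: str) -> str:
    # Prefer wait_until="networkidle" so JavaScript-heavy pages finish rendering,
    # but fall back to the default "load" wait if the page never goes network-idle
    # (e.g. sites that poll or stream), mirroring the try/except in the patch.
    try:
        page.goto(url, wait_until="networkidle", timeout=PAGE_LOAD_TIMEOUT_MS)
    except PlaywrightTimeoutError:
        page.goto(url)
    return page.inner_text("body")


def crawl(urls: list[str]) -> list[tuple[str, str]]:
    # Skip pages whose extracted text was already seen. Hashing the content
    # (not the URL) is what collapses distinct #! URLs that render the same page.
    seen_hashes: set[int] = set()
    results: list[tuple[str, str]] = []
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch()
        page = browser.new_page()
        for url in urls:
            text = fetch_page_text(page, url)
            text_hash = hash(text)
            if text_hash in seen_hashes:
                continue  # duplicate content, skip it
            seen_hashes.add(text_hash)
            results.append((url, text))
        browser.close()
    return results

Hashing the cleaned text rather than the URL is the key design choice here: several #! URLs that render identical content collapse to a single document instead of producing duplicates.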