From e9905a398bf83b2810af96621d4f02f19aa20e2e Mon Sep 17 00:00:00 2001
From: SubashMohan <76524044+Subash-Mohan@users.noreply.github.com>
Date: Mon, 3 Mar 2025 08:59:10 +0530
Subject: [PATCH] Enhance iframe content extraction and add thresholds for
 JavaScript disabled scenarios (#4167)

---
 backend/onyx/connectors/web/connector.py | 33 ++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/backend/onyx/connectors/web/connector.py b/backend/onyx/connectors/web/connector.py
index 922563374..494a30fe8 100644
--- a/backend/onyx/connectors/web/connector.py
+++ b/backend/onyx/connectors/web/connector.py
@@ -42,6 +42,10 @@ from shared_configs.configs import MULTI_TENANT
 logger = setup_logger()
 
 WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20
+# Cleaned page text shorter than this many characters is replaced by iframe text; otherwise iframe text is appended
+IFRAME_TEXT_LENGTH_THRESHOLD = 700
+# Message indicating JavaScript is disabled, which often appears when scraping fails
+JAVASCRIPT_DISABLED_MESSAGE = "You have JavaScript disabled in your browser"
 
 
 class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
@@ -138,7 +142,8 @@ def get_internal_links(
         # Account for malformed backslashes in URLs
         href = href.replace("\\", "/")
 
-        if should_ignore_pound and "#" in href:
+        # "#!" indicates the page is using a hashbang URL, which is a client-side routing technique
+        if should_ignore_pound and "#" in href and "#!" not in href:
             href = href.split("#")[0]
 
         if not is_valid_url(href):
@@ -347,7 +352,11 @@ class WebConnector(LoadConnector):
                     continue
 
                 page = context.new_page()
-                page_response = page.goto(initial_url)
+                """wait_until="networkidle" is used to wait for the page to load completely, which is
+                necessary for JavaScript-heavy websites"""
+                page_response = page.goto(
+                    initial_url, wait_until="networkidle", timeout=60000
+                )
                 last_modified = (
                     page_response.header_value("Last-Modified")
                     if page_response
@@ -395,6 +404,26 @@ class WebConnector(LoadConnector):
 
                 parsed_html = web_html_cleanup(soup, self.mintlify_cleanup)
 
+                """For websites containing iframes that need to be scraped,
+                the code below can extract text from within these iframes.
+                """
+                logger.info(f"Length of cleaned text {len(parsed_html.cleaned_text)}")
+                if JAVASCRIPT_DISABLED_MESSAGE in parsed_html.cleaned_text:
+                    iframe_count = page.frame_locator("iframe").locator("html").count()
+                    if iframe_count > 0:
+                        iframe_texts = (
+                            page.frame_locator("iframe")
+                            .locator("html")
+                            .all_inner_texts()
+                        )
+                        document_text = "\n".join(iframe_texts)
+                        """ If the cleaned page text is shorter than IFRAME_TEXT_LENGTH_THRESHOLD,
+                        replace it with the iframe text; otherwise append the iframe text to it """
+                        if len(parsed_html.cleaned_text) < IFRAME_TEXT_LENGTH_THRESHOLD:
+                            parsed_html.cleaned_text = document_text
+                        else:
+                            parsed_html.cleaned_text += "\n" + document_text
+
                 doc_batch.append(
                     Document(
                         id=initial_url,