refactor a mega function for readability and make sure to increment retry_count on exception so that we don't infinitely loop (#4542)

* refactor a mega function for readability and make sure to increment retry_count on exception so that we don't infinitely loop (see the sketch below)

* improve session and page level context handling

* don't use pydantic for the session context

* we don't need retry success

* move playwright handling into the session context

* need to break on ok

* return doc from scrape

* fix comment

---------

Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>
Authored by rkuo-danswer on 2025-04-16 23:43:30 -07:00, committed by GitHub
parent 6df1c6c72f
commit 04ebde7838
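
The first bullet is the behavioral fix: the old except block restarted Playwright and continued without ever incrementing retry_count, so a URL that kept raising could loop forever. The refactor bumps the counter in a finally block and breaks out of the loop as soon as an attempt succeeds. Below is a minimal, self-contained sketch of that retry shape; attempt_fetch is a hypothetical stand-in for _do_scrape(), not code from this commit.

import random
import time

MAX_RETRIES = 3


def attempt_fetch(url: str) -> str:
    """Hypothetical stand-in for WebConnector._do_scrape(); may raise on transient errors."""
    return f"<html>fetched {url}</html>"


def fetch_with_retries(url: str) -> str:
    """Illustrative retry loop: retry_count advances on success and on exception alike."""
    last_error: str | None = None
    retry_count = 0

    while retry_count < MAX_RETRIES:
        if retry_count > 0:
            # Exponential backoff with jitter, capped at 10 seconds (same shape as the connector).
            delay = min(2**retry_count + random.uniform(0, 1), 10)
            time.sleep(delay)

        try:
            doc = attempt_fetch(url)
        except Exception as exc:
            last_error = f"Failed to fetch '{url}': {exc}"
            continue  # the finally below still runs first, so retry_count advances
        finally:
            retry_count += 1  # incremented on every attempt -> the loop cannot spin forever

        return doc  # "break on ok": a successful attempt ends the retry loop immediately

    raise RuntimeError(last_error or f"Failed to fetch '{url}'")


if __name__ == "__main__":
    print(fetch_with_retries("https://example.com"))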

@@ -43,6 +43,48 @@ from shared_configs.configs import MULTI_TENANT
 logger = setup_logger()
 
+
+class ScrapeSessionContext:
+    """Session level context for scraping"""
+
+    def __init__(self, base_url: str, to_visit: list[str]):
+        self.base_url = base_url
+        self.to_visit = to_visit
+
+    def initialize(self) -> None:
+        self.stop()
+        self.playwright, self.playwright_context = start_playwright()
+
+    def stop(self) -> None:
+        if self.playwright_context:
+            self.playwright_context.close()
+            self.playwright_context = None
+
+        if self.playwright:
+            self.playwright.stop()
+            self.playwright = None
+
+    base_url: str
+    to_visit: list[str]
+
+    playwright: Playwright | None = None
+    playwright_context: BrowserContext | None = None
+
+    visited_links: set[str] = set()
+    content_hashes: set[int] = set()
+
+    doc_batch: list[Document] = []
+
+    at_least_one_doc: bool = False
+    last_error: str | None = None
+
+    needs_retry: bool = False
+
+
+class ScrapeResult:
+    doc: Document | None = None
+    retry: bool = False
+
+
 WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20
 # Threshold for determining when to replace vs append iframe content
 IFRAME_TEXT_LENGTH_THRESHOLD = 700
@@ -394,6 +436,8 @@ def _handle_cookies(context: BrowserContext, url: str) -> None:
 
 class WebConnector(LoadConnector):
+    MAX_RETRIES = 3
+
     def __init__(
         self,
         base_url: str,  # Can't change this without disrupting existing users
@@ -444,65 +488,24 @@ class WebConnector(LoadConnector):
         logger.warning("Unexpected credentials provided for Web Connector")
         return None
 
-    def load_from_state(self) -> GenerateDocumentsOutput:
-        """Traverses through all pages found on the website
-        and converts them into documents"""
-        visited_links: set[str] = set()
-        to_visit: list[str] = self.to_visit_list
-        content_hashes = set()
-
-        if not to_visit:
-            raise ValueError("No URLs to visit")
-
-        base_url = to_visit[0]  # For the recursive case
-        doc_batch: list[Document] = []
-
-        # make sure we can connect to the base url
-        check_internet_connection(base_url)
-
-        # Needed to report error
-        at_least_one_doc = False
-        last_error = None
-
-        playwright, context = start_playwright()
-        restart_playwright = False
-
-        while to_visit:
-            initial_url = to_visit.pop()
-            if initial_url in visited_links:
-                continue
-            visited_links.add(initial_url)
-
-            try:
-                protected_url_check(initial_url)
-            except Exception as e:
-                last_error = f"Invalid URL {initial_url} due to {e}"
-                logger.warning(last_error)
-                continue
-
-            index = len(visited_links)
-            logger.info(f"{index}: Visiting {initial_url}")
-
-            # Add retry mechanism with exponential backoff
-            max_retries = 3
-            retry_count = 0
-            retry_success = False
-
-            while retry_count < max_retries and not retry_success:
-                try:
-                    if retry_count > 0:
-                        # Add a random delay between retries (exponential backoff)
-                        delay = min(2**retry_count + random.uniform(0, 1), 10)
-                        logger.info(
-                            f"Retry {retry_count}/{max_retries} for {initial_url} after {delay:.2f}s delay"
-                        )
-                        time.sleep(delay)
-
-                    if restart_playwright:
-                        playwright, context = start_playwright()
-                        restart_playwright = False
+    def _do_scrape(
+        self,
+        index: int,
+        initial_url: str,
+        session_ctx: ScrapeSessionContext,
+    ) -> ScrapeResult:
+        """Returns a ScrapeResult object with a doc and retry flag."""
+
+        if session_ctx.playwright is None:
+            raise RuntimeError("scrape_context.playwright is None")
+
+        if session_ctx.playwright_context is None:
+            raise RuntimeError("scrape_context.playwright_context is None")
+
+        result = ScrapeResult()
 
         # Handle cookies for the URL
-        _handle_cookies(context, initial_url)
+        _handle_cookies(session_ctx.playwright_context, initial_url)
 
         # First do a HEAD request to check content type without downloading the entire content
         head_response = requests.head(
@@ -518,34 +521,26 @@ class WebConnector(LoadConnector):
         )
         last_modified = response.headers.get("Last-Modified")
 
-        doc_batch.append(
-            Document(
+        result.doc = Document(
             id=initial_url,
-            sections=[
-                TextSection(link=initial_url, text=page_text)
-            ],
+            sections=[TextSection(link=initial_url, text=page_text)],
             source=DocumentSource.WEB,
             semantic_identifier=initial_url.split("/")[-1],
             metadata=metadata,
             doc_updated_at=(
-                _get_datetime_from_last_modified_header(
-                    last_modified
-                )
+                _get_datetime_from_last_modified_header(last_modified)
                 if last_modified
                 else None
             ),
         )
-        )
 
-        retry_success = True
-        continue
+        return result
 
-        page = context.new_page()
+        page = session_ctx.playwright_context.new_page()
+        try:
             if self.add_randomness:
                 # Add random mouse movements and scrolling to mimic human behavior
-                page.mouse.move(
-                    random.randint(100, 700), random.randint(100, 500)
-                )
+                page.mouse.move(random.randint(100, 700), random.randint(100, 500))
 
             # Can't use wait_until="networkidle" because it interferes with the scrolling behavior
             page_response = page.goto(
@@ -557,44 +552,29 @@ class WebConnector(LoadConnector):
             # Add a small random delay to mimic human behavior
             time.sleep(random.uniform(0.5, 2.0))
 
-            # Check if we got a 403 error
-            if page_response and page_response.status == 403:
-                logger.warning(
-                    f"Received 403 Forbidden for {initial_url}, retrying..."
-                )
-                page.close()
-                retry_count += 1
-                continue
-
             last_modified = (
-                page_response.header_value("Last-Modified")
-                if page_response
-                else None
+                page_response.header_value("Last-Modified") if page_response else None
             )
             final_url = page.url
             if final_url != initial_url:
                 protected_url_check(final_url)
                 initial_url = final_url
-                if initial_url in visited_links:
+                if initial_url in session_ctx.visited_links:
                     logger.info(
                         f"{index}: {initial_url} redirected to {final_url} - already indexed"
                     )
                     page.close()
-                    retry_success = True
-                    continue
+                    return result
 
                 logger.info(f"{index}: {initial_url} redirected to {final_url}")
-                visited_links.add(initial_url)
+                session_ctx.visited_links.add(initial_url)
 
             # If we got here, the request was successful
-            retry_success = True
 
             if self.scroll_before_scraping:
                 scroll_attempts = 0
                 previous_height = page.evaluate("document.body.scrollHeight")
                 while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
-                    page.evaluate(
-                        "window.scrollTo(0, document.body.scrollHeight)"
-                    )
+                    page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                     page.wait_for_load_state("networkidle", timeout=30000)
                     new_height = page.evaluate("document.body.scrollHeight")
                     if new_height == previous_height:
@@ -606,16 +586,20 @@ class WebConnector(LoadConnector):
             soup = BeautifulSoup(content, "html.parser")
 
             if self.recursive:
-                internal_links = get_internal_links(base_url, initial_url, soup)
+                internal_links = get_internal_links(
+                    session_ctx.base_url, initial_url, soup
+                )
                 for link in internal_links:
-                    if link not in visited_links:
-                        to_visit.append(link)
+                    if link not in session_ctx.visited_links:
+                        session_ctx.to_visit.append(link)
 
             if page_response and str(page_response.status)[0] in ("4", "5"):
-                last_error = f"Skipped indexing {initial_url} due to HTTP {page_response.status} response"
-                logger.info(last_error)
-                continue
+                session_ctx.last_error = f"Skipped indexing {initial_url} due to HTTP {page_response.status} response"
+                logger.info(session_ctx.last_error)
+                result.retry = True
+                return result
 
+            # after this point, we don't need the caller to retry
             parsed_html = web_html_cleanup(soup, self.mintlify_cleanup)
 
             """For websites containing iframes that need to be scraped,
@@ -625,22 +609,15 @@ class WebConnector(LoadConnector):
                 f"{index}: Length of cleaned text {len(parsed_html.cleaned_text)}"
             )
 
             if JAVASCRIPT_DISABLED_MESSAGE in parsed_html.cleaned_text:
-                iframe_count = (
-                    page.frame_locator("iframe").locator("html").count()
-                )
+                iframe_count = page.frame_locator("iframe").locator("html").count()
                 if iframe_count > 0:
                     iframe_texts = (
-                        page.frame_locator("iframe")
-                        .locator("html")
-                        .all_inner_texts()
+                        page.frame_locator("iframe").locator("html").all_inner_texts()
                     )
                     document_text = "\n".join(iframe_texts)
                     """ 700 is the threshold value for the length of the text extracted
                     from the iframe based on the issue faced """
-                    if (
-                        len(parsed_html.cleaned_text)
-                        < IFRAME_TEXT_LENGTH_THRESHOLD
-                    ):
+                    if len(parsed_html.cleaned_text) < IFRAME_TEXT_LENGTH_THRESHOLD:
                         parsed_html.cleaned_text = document_text
                     else:
                         parsed_html.cleaned_text += "\n" + document_text
@@ -648,21 +625,17 @@ class WebConnector(LoadConnector):
             # Sometimes pages with #! will serve duplicate content
             # There are also just other ways this can happen
             hashed_text = hash((parsed_html.title, parsed_html.cleaned_text))
-            if hashed_text in content_hashes:
+            if hashed_text in session_ctx.content_hashes:
                 logger.info(
                     f"{index}: Skipping duplicate title + content for {initial_url}"
                 )
-                continue
+                return result
 
-            content_hashes.add(hashed_text)
+            session_ctx.content_hashes.add(hashed_text)
 
-            doc_batch.append(
-                Document(
+            result.doc = Document(
                 id=initial_url,
-                sections=[
-                    TextSection(
-                        link=initial_url, text=parsed_html.cleaned_text
-                    )
-                ],
+                sections=[TextSection(link=initial_url, text=parsed_html.cleaned_text)],
                 source=DocumentSource.WEB,
                 semantic_identifier=parsed_html.title or initial_url,
                 metadata={},
@@ -672,33 +645,87 @@ class WebConnector(LoadConnector):
                     else None
                 ),
             )
-            )
+        finally:
             page.close()
+
+        return result
+
+    def load_from_state(self) -> GenerateDocumentsOutput:
+        """Traverses through all pages found on the website
+        and converts them into documents"""
+        if not self.to_visit_list:
+            raise ValueError("No URLs to visit")
+
+        base_url = self.to_visit_list[0]  # For the recursive case
+        check_internet_connection(base_url)  # make sure we can connect to the base url
+
+        session_ctx = ScrapeSessionContext(base_url, self.to_visit_list)
+        session_ctx.initialize()
+
+        while session_ctx.to_visit:
+            initial_url = session_ctx.to_visit.pop()
+            if initial_url in session_ctx.visited_links:
+                continue
+            session_ctx.visited_links.add(initial_url)
+
+            try:
+                protected_url_check(initial_url)
             except Exception as e:
-                last_error = f"Failed to fetch '{initial_url}': {e}"
-                logger.exception(last_error)
-                playwright.stop()
-                restart_playwright = True
+                session_ctx.last_error = f"Invalid URL {initial_url} due to {e}"
+                logger.warning(session_ctx.last_error)
                 continue
 
-            if len(doc_batch) >= self.batch_size:
-                playwright.stop()
-                restart_playwright = True
-
-                at_least_one_doc = True
-                yield doc_batch
-                doc_batch = []
-
-        if doc_batch:
-            playwright.stop()
-
-            at_least_one_doc = True
-            yield doc_batch
-
-        if not at_least_one_doc:
-            if last_error:
-                raise RuntimeError(last_error)
+            index = len(session_ctx.visited_links)
+            logger.info(f"{index}: Visiting {initial_url}")
+
+            # Add retry mechanism with exponential backoff
+            retry_count = 0
+
+            while retry_count < self.MAX_RETRIES:
+                if retry_count > 0:
+                    # Add a random delay between retries (exponential backoff)
+                    delay = min(2**retry_count + random.uniform(0, 1), 10)
+                    logger.info(
+                        f"Retry {retry_count}/{self.MAX_RETRIES} for {initial_url} after {delay:.2f}s delay"
+                    )
+                    time.sleep(delay)
+
+                try:
+                    result = self._do_scrape(index, initial_url, session_ctx)
+                    if result.retry:
+                        continue
+
+                    if result.doc:
+                        session_ctx.doc_batch.append(result.doc)
+                except Exception as e:
+                    session_ctx.last_error = f"Failed to fetch '{initial_url}': {e}"
+                    logger.exception(session_ctx.last_error)
+                    session_ctx.initialize()
+                    continue
+                finally:
+                    retry_count += 1
+
+                break  # success / don't retry
+
+            if len(session_ctx.doc_batch) >= self.batch_size:
+                session_ctx.initialize()
+                session_ctx.at_least_one_doc = True
+                yield session_ctx.doc_batch
+                session_ctx.doc_batch = []
+
+        if session_ctx.doc_batch:
+            session_ctx.stop()
+            session_ctx.at_least_one_doc = True
+            yield session_ctx.doc_batch
+
+        if not session_ctx.at_least_one_doc:
+            if session_ctx.last_error:
+                raise RuntimeError(session_ctx.last_error)
             raise RuntimeError("No valid pages found.")
 
+        session_ctx.stop()
+
     def validate_connector_settings(self) -> None:
         # Make sure we have at least one valid URL to check
         if not self.to_visit_list:
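
For context, a hypothetical usage sketch of the refactored connector; the import path and the assumption that the remaining constructor arguments keep their defaults are not part of this diff. load_from_state() is a generator, so a document batch is yielded and Playwright is restarted whenever doc_batch reaches batch_size.

# Hypothetical usage sketch; assumes the onyx package is installed and importable.
from onyx.connectors.web.connector import WebConnector

connector = WebConnector(base_url="https://docs.example.com")

for batch in connector.load_from_state():
    # Each batch is a list[Document] accumulated in ScrapeSessionContext.doc_batch.
    for doc in batch:
        print(doc.id, doc.semantic_identifier)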