From 71839e723f47029ec085355e27958a3404cad148 Mon Sep 17 00:00:00 2001
From: Chris Weaver <25087905+Weves@users.noreply.github.com>
Date: Tue, 8 Apr 2025 12:31:30 -0700
Subject: [PATCH] Add stuff to better avoid bot-detection in web connector
 (#4479)

* Add stuff to better avoid bot-detection in web connector

* Switch to exception log
---
 backend/onyx/connectors/web/connector.py | 406 +++++++++++++++--------
 1 file changed, 275 insertions(+), 131 deletions(-)

diff --git a/backend/onyx/connectors/web/connector.py b/backend/onyx/connectors/web/connector.py
index 5d1f8baea252..0bd8e2119dae 100644
--- a/backend/onyx/connectors/web/connector.py
+++ b/backend/onyx/connectors/web/connector.py
@@ -1,6 +1,8 @@
 import io
 import ipaddress
+import random
 import socket
+import time
 from datetime import datetime
 from datetime import timezone
 from enum import Enum
@@ -129,11 +131,29 @@ def protected_url_check(url: str) -> None:
 
 def check_internet_connection(url: str) -> None:
     try:
-        response = requests.get(url, timeout=3, headers=DEFAULT_HEADERS)
+        # Use a more realistic browser-like request
+        session = requests.Session()
+        session.headers.update(DEFAULT_HEADERS)
+
+        # Add a random delay to mimic human behavior
+        time.sleep(random.uniform(0.1, 0.5))
+
+        response = session.get(url, timeout=5, allow_redirects=True)
         response.raise_for_status()
     except requests.exceptions.HTTPError as e:
         # Extract status code from the response, defaulting to -1 if response is None
         status_code = e.response.status_code if e.response is not None else -1
+
+        # For 403 errors, we do have an internet connection, but the request is
+        # blocked by the server; this is usually due to bot detection. Future
+        # calls (via Playwright) will usually get around this.
+        if status_code == 403:
+            logger.warning(
+                f"Received 403 Forbidden for {url}, will retry with browser automation"
+            )
+            return
+
         error_msg = {
             400: "Bad Request",
             401: "Unauthorized",
@@ -198,7 +218,15 @@ def is_pdf_content(response: requests.Response) -> bool:
 
 def start_playwright() -> Tuple[Playwright, BrowserContext]:
     playwright = sync_playwright().start()
-    browser = playwright.chromium.launch(headless=True)
+    # Launch browser with more realistic settings
+    browser = playwright.chromium.launch(
+        headless=True,
+        args=[
+            "--disable-blink-features=AutomationControlled",
+            "--disable-features=IsolateOrigins,site-per-process",
+            "--disable-site-isolation-trials",
+        ],
+    )
 
     # Create a context with realistic browser properties
     context = browser.new_context(
@@ -210,6 +238,9 @@ def start_playwright() -> Tuple[Playwright, BrowserContext]:
         has_touch=False,
         java_script_enabled=True,
         color_scheme="light",
+        # Add more realistic browser properties
+        bypass_csp=True,
+        ignore_https_errors=True,
     )
 
     # Set additional headers to mimic a real browser
@@ -221,9 +252,29 @@ def start_playwright() -> Tuple[Playwright, BrowserContext]:
             "Sec-Fetch-Mode": DEFAULT_HEADERS["Sec-Fetch-Mode"],
             "Sec-Fetch-Site": DEFAULT_HEADERS["Sec-Fetch-Site"],
             "Sec-Fetch-User": DEFAULT_HEADERS["Sec-Fetch-User"],
+            "Sec-CH-UA": DEFAULT_HEADERS["Sec-CH-UA"],
+            "Sec-CH-UA-Mobile": DEFAULT_HEADERS["Sec-CH-UA-Mobile"],
+            "Sec-CH-UA-Platform": DEFAULT_HEADERS["Sec-CH-UA-Platform"],
+            "Cache-Control": "max-age=0",
+            "DNT": "1",
         }
     )
 
+    # Add a script to modify navigator properties to avoid detection
+    context.add_init_script(
+        """
+        Object.defineProperty(navigator, 'webdriver', {
+            get: () => undefined
+        });
+        Object.defineProperty(navigator, 'plugins', {
+            get: () => [1, 2, 3, 4, 5]
+        });
+        Object.defineProperty(navigator, 'languages', {
+            get: () => ['en-US', 'en']
+        });
+        """
+    )
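+    # The overrides above only mask the most obvious automation signals
+    # (navigator.webdriver is the first property most bot-detection scripts
+    # probe); heavier fingerprinting, e.g. canvas or WebGL checks, can still
+    # identify headless Chromium.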
+
 
     if (
         WEB_CONNECTOR_OAUTH_CLIENT_ID
         and WEB_CONNECTOR_OAUTH_CLIENT_SECRET
@@ -301,6 +352,47 @@ def _get_datetime_from_last_modified_header(last_modified: str) -> datetime | No
     return None
 
 
+def _handle_cookies(context: BrowserContext, url: str) -> None:
+    """Handle cookies for the given URL to help get past bot detection"""
+    try:
+        # Parse the URL to get the domain
+        parsed_url = urlparse(url)
+        domain = parsed_url.netloc
+
+        # Add some common cookies that might help get past bot detection
+        cookies: list[dict[str, str]] = [
+            {
+                "name": "cookieconsent",
+                "value": "accepted",
+                "domain": domain,
+                "path": "/",
+            },
+            {
+                "name": "consent",
+                "value": "true",
+                "domain": domain,
+                "path": "/",
+            },
+            {
+                "name": "session",
+                "value": "random_session_id",
+                "domain": domain,
+                "path": "/",
+            },
+        ]
+
+        # Add cookies to the context
+        for cookie in cookies:
+            try:
+                context.add_cookies([cookie])  # type: ignore
+            except Exception as e:
+                logger.debug(f"Failed to add cookie {cookie['name']} for {domain}: {e}")
+    except Exception:
+        logger.exception(
+            f"Unexpected error while handling cookies for Web Connector with URL {url}"
+        )
+
+
 class WebConnector(LoadConnector):
     def __init__(
         self,
@@ -364,6 +456,9 @@ class WebConnector(LoadConnector):
         base_url = to_visit[0]  # For the recursive case
         doc_batch: list[Document] = []
 
+        # make sure we can connect to the base url
+        check_internet_connection(base_url)
+
         # Needed to report error
         at_least_one_doc = False
         last_error = None
@@ -386,33 +481,185 @@ class WebConnector(LoadConnector):
             index = len(visited_links)
             logger.info(f"{index}: Visiting {initial_url}")
 
-            try:
-                check_internet_connection(initial_url)
-                if restart_playwright:
-                    playwright, context = start_playwright()
-                    restart_playwright = False
+            # Add retry mechanism with exponential backoff
+            max_retries = 3
+            retry_count = 0
+            retry_success = False
 
-                # First do a HEAD request to check content type without downloading the entire content
-                head_response = requests.head(
-                    initial_url, headers=DEFAULT_HEADERS, allow_redirects=True
-                )
-                is_pdf = is_pdf_content(head_response)
+            while retry_count < max_retries and not retry_success:
+                try:
+                    if retry_count > 0:
+                        # Add a random delay between retries (exponential backoff)
+                        delay = min(2**retry_count + random.uniform(0, 1), 10)
+                        logger.info(
+                            f"Retry {retry_count}/{max_retries} for {initial_url} after {delay:.2f}s delay"
+                        )
+                        time.sleep(delay)
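+                        # The delay above comes out to roughly 2-3s before the
+                        # second attempt and 4-5s before the third with
+                        # max_retries = 3 (2**retry_count plus up to 1s of
+                        # jitter, capped at 10s).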
 
-                if is_pdf or initial_url.lower().endswith(".pdf"):
-                    # PDF files are not checked for links
-                    response = requests.get(initial_url, headers=DEFAULT_HEADERS)
-                    page_text, metadata, images = read_pdf_file(
-                        file=io.BytesIO(response.content)
+                    if restart_playwright:
+                        playwright, context = start_playwright()
+                        restart_playwright = False
+
+                    # Handle cookies for the URL
+                    _handle_cookies(context, initial_url)
+
+                    # First do a HEAD request to check content type without downloading the entire content
+                    head_response = requests.head(
+                        initial_url, headers=DEFAULT_HEADERS, allow_redirects=True
                     )
-                    last_modified = response.headers.get("Last-Modified")
+                    is_pdf = is_pdf_content(head_response)
+
+                    if is_pdf or initial_url.lower().endswith(".pdf"):
+                        # PDF files are not checked for links
+                        response = requests.get(initial_url, headers=DEFAULT_HEADERS)
+                        page_text, metadata, images = read_pdf_file(
+                            file=io.BytesIO(response.content)
+                        )
+                        last_modified = response.headers.get("Last-Modified")
+
+                        doc_batch.append(
+                            Document(
+                                id=initial_url,
+                                sections=[
+                                    TextSection(link=initial_url, text=page_text)
+                                ],
+                                source=DocumentSource.WEB,
+                                semantic_identifier=initial_url.split("/")[-1],
+                                metadata=metadata,
+                                doc_updated_at=_get_datetime_from_last_modified_header(
+                                    last_modified
+                                )
+                                if last_modified
+                                else None,
+                            )
+                        )
+                        retry_success = True
+                        continue
+
+                    page = context.new_page()
+
+                    # Add a random mouse movement to mimic human behavior
+                    page.mouse.move(random.randint(100, 700), random.randint(100, 500))
+
+                    # Can't use wait_until="networkidle" because it interferes with the scrolling behavior
+                    page_response = page.goto(
+                        initial_url,
+                        timeout=30000,  # 30 seconds
+                        wait_until="domcontentloaded",  # Wait for DOM to be ready
+                    )
+
+                    # Add a small random delay to mimic human behavior
+                    time.sleep(random.uniform(0.5, 2.0))
+
+                    # Check if we got a 403 error
+                    if page_response and page_response.status == 403:
+                        logger.warning(
+                            f"Received 403 Forbidden for {initial_url}, retrying..."
+                        )
+                        page.close()
+                        retry_count += 1
+                        continue
+
+                    last_modified = (
+                        page_response.header_value("Last-Modified")
+                        if page_response
+                        else None
+                    )
+                    final_url = page.url
+                    if final_url != initial_url:
+                        protected_url_check(final_url)
+                        initial_url = final_url
+                        if initial_url in visited_links:
+                            logger.info(
+                                f"{index}: {initial_url} redirected to {final_url} - already indexed"
+                            )
+                            page.close()
+                            retry_success = True
+                            continue
+                        logger.info(f"{index}: {initial_url} redirected to {final_url}")
+                        visited_links.add(initial_url)
+
+                    # If we got here, the request was successful
+                    retry_success = True
+
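+                    # Repeatedly scrolling to the bottom forces lazy-loaded /
+                    # infinite-scroll content to render before scraping; the
+                    # loop below stops once the page height stops growing.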
+                    if self.scroll_before_scraping:
+                        scroll_attempts = 0
+                        previous_height = page.evaluate("document.body.scrollHeight")
+                        while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
+                            page.evaluate(
+                                "window.scrollTo(0, document.body.scrollHeight)"
+                            )
+                            page.wait_for_load_state("networkidle", timeout=30000)
+                            new_height = page.evaluate("document.body.scrollHeight")
+                            if new_height == previous_height:
+                                break  # Stop scrolling when no more content is loaded
+                            previous_height = new_height
+                            scroll_attempts += 1
+
+                    content = page.content()
+                    soup = BeautifulSoup(content, "html.parser")
+
+                    if self.recursive:
+                        internal_links = get_internal_links(base_url, initial_url, soup)
+                        for link in internal_links:
+                            if link not in visited_links:
+                                to_visit.append(link)
+
+                    if page_response and str(page_response.status)[0] in ("4", "5"):
+                        last_error = f"Skipped indexing {initial_url} due to HTTP {page_response.status} response"
+                        logger.info(last_error)
+                        continue
+
+                    parsed_html = web_html_cleanup(soup, self.mintlify_cleanup)
+
+                    """For websites containing iframes that need to be scraped,
+                    the code below can extract text from within these iframes.
+                    """
+                    logger.debug(
+                        f"{index}: Length of cleaned text {len(parsed_html.cleaned_text)}"
+                    )
+                    if JAVASCRIPT_DISABLED_MESSAGE in parsed_html.cleaned_text:
+                        iframe_count = (
+                            page.frame_locator("iframe").locator("html").count()
+                        )
+                        if iframe_count > 0:
+                            iframe_texts = (
+                                page.frame_locator("iframe")
+                                .locator("html")
+                                .all_inner_texts()
+                            )
+                            document_text = "\n".join(iframe_texts)
+                            """700 is the threshold length for text extracted
+                            from an iframe, based on the issue originally observed."""
+                            if (
+                                len(parsed_html.cleaned_text)
+                                < IFRAME_TEXT_LENGTH_THRESHOLD
+                            ):
+                                parsed_html.cleaned_text = document_text
+                            else:
+                                parsed_html.cleaned_text += "\n" + document_text
+
+                    # Sometimes pages with #! will serve duplicate content
+                    # There are also just other ways this can happen
+                    hashed_text = hash((parsed_html.title, parsed_html.cleaned_text))
+                    if hashed_text in content_hashes:
+                        logger.info(
+                            f"{index}: Skipping duplicate title + content for {initial_url}"
+                        )
+                        continue
+                    content_hashes.add(hashed_text)
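+                    # Note: Python's hash() is randomly seeded per process, so
+                    # these hashes are only comparable within a single run,
+                    # which is all this cross-page dedup needs.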
 
                     doc_batch.append(
                         Document(
                             id=initial_url,
-                            sections=[TextSection(link=initial_url, text=page_text)],
+                            sections=[
+                                TextSection(
+                                    link=initial_url, text=parsed_html.cleaned_text
+                                )
+                            ],
                             source=DocumentSource.WEB,
-                            semantic_identifier=initial_url.split("/")[-1],
-                            metadata=metadata,
+                            semantic_identifier=parsed_html.title or initial_url,
+                            metadata={},
                             doc_updated_at=_get_datetime_from_last_modified_header(
                                 last_modified
                             )
                             if last_modified
                             else None,
                         )
                     )
+
+                    page.close()
+                except Exception as e:
+                    last_error = f"Failed to fetch '{initial_url}': {e}"
+                    logger.exception(last_error)
+                    playwright.stop()
+                    restart_playwright = True
                     continue
 
-                page = context.new_page()
-
-                # Can't use wait_until="networkidle" because it interferes with the scrolling behavior
-                page_response = page.goto(
-                    initial_url,
-                    timeout=30000,  # 30 seconds
-                )
-
-                last_modified = (
-                    page_response.header_value("Last-Modified")
-                    if page_response
-                    else None
-                )
-                final_url = page.url
-                if final_url != initial_url:
-                    protected_url_check(final_url)
-                    initial_url = final_url
-                    if initial_url in visited_links:
-                        logger.info(
-                            f"{index}: {initial_url} redirected to {final_url} - already indexed"
-                        )
-                        continue
-                    logger.info(f"{index}: {initial_url} redirected to {final_url}")
-                visited_links.add(initial_url)
-
-                if self.scroll_before_scraping:
-                    scroll_attempts = 0
-                    previous_height = page.evaluate("document.body.scrollHeight")
-                    while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
-                        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
-                        page.wait_for_load_state("networkidle", timeout=30000)
-                        new_height = page.evaluate("document.body.scrollHeight")
-                        if new_height == previous_height:
-                            break  # Stop scrolling when no more content is loaded
-                        previous_height = new_height
-                        scroll_attempts += 1
-
-                content = page.content()
-                soup = BeautifulSoup(content, "html.parser")
-
-                if self.recursive:
-                    internal_links = get_internal_links(base_url, initial_url, soup)
-                    for link in internal_links:
-                        if link not in visited_links:
-                            to_visit.append(link)
-
-                if page_response and str(page_response.status)[0] in ("4", "5"):
-                    last_error = f"Skipped indexing {initial_url} due to HTTP {page_response.status} response"
-                    logger.info(last_error)
-                    continue
-
-                parsed_html = web_html_cleanup(soup, self.mintlify_cleanup)
-
-                """For websites containing iframes that need to be scraped,
-                the code below can extract text from within these iframes.
-                """
-                logger.debug(
-                    f"{index}: Length of cleaned text {len(parsed_html.cleaned_text)}"
-                )
-                if JAVASCRIPT_DISABLED_MESSAGE in parsed_html.cleaned_text:
-                    iframe_count = page.frame_locator("iframe").locator("html").count()
-                    if iframe_count > 0:
-                        iframe_texts = (
-                            page.frame_locator("iframe")
-                            .locator("html")
-                            .all_inner_texts()
-                        )
-                        document_text = "\n".join(iframe_texts)
-                        """ 700 is the threshold value for the length of the text extracted
-                        from the iframe based on the issue faced """
-                        if len(parsed_html.cleaned_text) < IFRAME_TEXT_LENGTH_THRESHOLD:
-                            parsed_html.cleaned_text = document_text
-                        else:
-                            parsed_html.cleaned_text += "\n" + document_text
-
-                # Sometimes pages with #! will serve duplicate content
-                # There are also just other ways this can happen
-                hashed_text = hash((parsed_html.title, parsed_html.cleaned_text))
-                if hashed_text in content_hashes:
-                    logger.info(
-                        f"{index}: Skipping duplicate title + content for {initial_url}"
-                    )
-                    continue
-                content_hashes.add(hashed_text)
-
-                doc_batch.append(
-                    Document(
-                        id=initial_url,
-                        sections=[
-                            TextSection(link=initial_url, text=parsed_html.cleaned_text)
-                        ],
-                        source=DocumentSource.WEB,
-                        semantic_identifier=parsed_html.title or initial_url,
-                        metadata={},
-                        doc_updated_at=_get_datetime_from_last_modified_header(
-                            last_modified
-                        )
-                        if last_modified
-                        else None,
-                    )
-                )
-
-                page.close()
-            except Exception as e:
-                last_error = f"Failed to fetch '{initial_url}': {e}"
-                logger.exception(last_error)
-                playwright.stop()
-                restart_playwright = True
-                continue
-
             if len(doc_batch) >= self.batch_size:
                 playwright.stop()
                 restart_playwright = True