refactor a mega function for readability and make sure to increment retry_count on exception so that we don't infinitely loop (#4542)

* refactor a mega function for readability and make sure to increment retry_count on exception so that we don't infinitely loop (see the sketch below)

* improve session and page level context handling

* don't use pydantic for the session context

* we don't need retry success

* move playwright handling into the session context

* need to break on ok

* return doc from scrape

* fix comment

---------

Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>
Authored by rkuo-danswer on 2025-04-16 23:43:30 -07:00, committed by GitHub
parent 6df1c6c72f
commit 04ebde7838
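
The first bullet is the behavioral fix: the old except block restarted Playwright and continued without ever incrementing retry_count, so a URL that kept raising could loop forever. The refactor bumps the counter in a finally block and breaks out of the loop as soon as an attempt succeeds. Below is a minimal, self-contained sketch of that retry shape; attempt_fetch is a hypothetical stand-in for _do_scrape(), not code from this commit.

import random
import time

MAX_RETRIES = 3


def attempt_fetch(url: str) -> str:
    """Hypothetical stand-in for WebConnector._do_scrape(); may raise on transient errors."""
    return f"<html>fetched {url}</html>"


def fetch_with_retries(url: str) -> str:
    """Illustrative retry loop: retry_count advances on success and on exception alike."""
    last_error: str | None = None
    retry_count = 0

    while retry_count < MAX_RETRIES:
        if retry_count > 0:
            # Exponential backoff with jitter, capped at 10 seconds (same shape as the connector).
            delay = min(2**retry_count + random.uniform(0, 1), 10)
            time.sleep(delay)

        try:
            doc = attempt_fetch(url)
        except Exception as exc:
            last_error = f"Failed to fetch '{url}': {exc}"
            continue  # the finally below still runs first, so retry_count advances
        finally:
            retry_count += 1  # incremented on every attempt -> the loop cannot spin forever

        return doc  # "break on ok": a successful attempt ends the retry loop immediately

    raise RuntimeError(last_error or f"Failed to fetch '{url}'")


if __name__ == "__main__":
    print(fetch_with_retries("https://example.com"))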

@@ -43,6 +43,48 @@ from shared_configs.configs import MULTI_TENANT
 logger = setup_logger()
 
+
+class ScrapeSessionContext:
+    """Session level context for scraping"""
+
+    def __init__(self, base_url: str, to_visit: list[str]):
+        self.base_url = base_url
+        self.to_visit = to_visit
+
+    def initialize(self) -> None:
+        self.stop()
+        self.playwright, self.playwright_context = start_playwright()
+
+    def stop(self) -> None:
+        if self.playwright_context:
+            self.playwright_context.close()
+            self.playwright_context = None
+
+        if self.playwright:
+            self.playwright.stop()
+            self.playwright = None
+
+    base_url: str
+    to_visit: list[str]
+
+    playwright: Playwright | None = None
+    playwright_context: BrowserContext | None = None
+
+    visited_links: set[str] = set()
+    content_hashes: set[int] = set()
+
+    doc_batch: list[Document] = []
+
+    at_least_one_doc: bool = False
+    last_error: str | None = None
+
+    needs_retry: bool = False
+
+
+class ScrapeResult:
+    doc: Document | None = None
+    retry: bool = False
+
+
 WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20
 # Threshold for determining when to replace vs append iframe content
 IFRAME_TEXT_LENGTH_THRESHOLD = 700
@@ -394,6 +436,8 @@ def _handle_cookies(context: BrowserContext, url: str) -> None:
 
 class WebConnector(LoadConnector):
+    MAX_RETRIES = 3
+
     def __init__(
         self,
         base_url: str,  # Can't change this without disrupting existing users
@@ -444,65 +488,24 @@ class WebConnector(LoadConnector):
         logger.warning("Unexpected credentials provided for Web Connector")
         return None
 
-    def load_from_state(self) -> GenerateDocumentsOutput:
-        """Traverses through all pages found on the website
-        and converts them into documents"""
-        visited_links: set[str] = set()
-        to_visit: list[str] = self.to_visit_list
-        content_hashes = set()
-
-        if not to_visit:
-            raise ValueError("No URLs to visit")
-
-        base_url = to_visit[0]  # For the recursive case
-        doc_batch: list[Document] = []
-
-        # make sure we can connect to the base url
-        check_internet_connection(base_url)
-
-        # Needed to report error
-        at_least_one_doc = False
-        last_error = None
-
-        playwright, context = start_playwright()
-        restart_playwright = False
-
-        while to_visit:
-            initial_url = to_visit.pop()
-            if initial_url in visited_links:
-                continue
-            visited_links.add(initial_url)
-
-            try:
-                protected_url_check(initial_url)
-            except Exception as e:
-                last_error = f"Invalid URL {initial_url} due to {e}"
-                logger.warning(last_error)
-                continue
-
-            index = len(visited_links)
-            logger.info(f"{index}: Visiting {initial_url}")
-
-            # Add retry mechanism with exponential backoff
-            max_retries = 3
-            retry_count = 0
-            retry_success = False
-
-            while retry_count < max_retries and not retry_success:
-                try:
-                    if retry_count > 0:
-                        # Add a random delay between retries (exponential backoff)
-                        delay = min(2**retry_count + random.uniform(0, 1), 10)
-                        logger.info(
-                            f"Retry {retry_count}/{max_retries} for {initial_url} after {delay:.2f}s delay"
-                        )
-                        time.sleep(delay)
-
-                    if restart_playwright:
-                        playwright, context = start_playwright()
-                        restart_playwright = False
+    def _do_scrape(
+        self,
+        index: int,
+        initial_url: str,
+        session_ctx: ScrapeSessionContext,
+    ) -> ScrapeResult:
+        """Returns a ScrapeResult object with a doc and retry flag."""
+
+        if session_ctx.playwright is None:
+            raise RuntimeError("scrape_context.playwright is None")
+
+        if session_ctx.playwright_context is None:
+            raise RuntimeError("scrape_context.playwright_context is None")
+
+        result = ScrapeResult()
 
         # Handle cookies for the URL
-        _handle_cookies(context, initial_url)
+        _handle_cookies(session_ctx.playwright_context, initial_url)
 
         # First do a HEAD request to check content type without downloading the entire content
         head_response = requests.head(
@@ -518,34 +521,26 @@ class WebConnector(LoadConnector):
         )
         last_modified = response.headers.get("Last-Modified")
 
-        doc_batch.append(
-            Document(
+        result.doc = Document(
             id=initial_url,
-            sections=[
-                TextSection(link=initial_url, text=page_text)
-            ],
+            sections=[TextSection(link=initial_url, text=page_text)],
             source=DocumentSource.WEB,
             semantic_identifier=initial_url.split("/")[-1],
             metadata=metadata,
             doc_updated_at=(
-                _get_datetime_from_last_modified_header(
-                    last_modified
-                )
+                _get_datetime_from_last_modified_header(last_modified)
                 if last_modified
                 else None
             ),
         )
-        )
 
-        retry_success = True
-        continue
+        return result
 
-        page = context.new_page()
+        page = session_ctx.playwright_context.new_page()
+        try:
             if self.add_randomness:
                 # Add random mouse movements and scrolling to mimic human behavior
-                page.mouse.move(
-                    random.randint(100, 700), random.randint(100, 500)
-                )
+                page.mouse.move(random.randint(100, 700), random.randint(100, 500))
 
             # Can't use wait_until="networkidle" because it interferes with the scrolling behavior
             page_response = page.goto(
@@ -557,44 +552,29 @@ class WebConnector(LoadConnector):
             # Add a small random delay to mimic human behavior
             time.sleep(random.uniform(0.5, 2.0))
 
-            # Check if we got a 403 error
-            if page_response and page_response.status == 403:
-                logger.warning(
-                    f"Received 403 Forbidden for {initial_url}, retrying..."
-                )
-                page.close()
-                retry_count += 1
-                continue
-
             last_modified = (
-                page_response.header_value("Last-Modified")
-                if page_response
-                else None
+                page_response.header_value("Last-Modified") if page_response else None
             )
             final_url = page.url
             if final_url != initial_url:
                 protected_url_check(final_url)
                 initial_url = final_url
-                if initial_url in visited_links:
+                if initial_url in session_ctx.visited_links:
                     logger.info(
                         f"{index}: {initial_url} redirected to {final_url} - already indexed"
                     )
                     page.close()
-                    retry_success = True
-                    continue
+                    return result
 
                 logger.info(f"{index}: {initial_url} redirected to {final_url}")
-                visited_links.add(initial_url)
+                session_ctx.visited_links.add(initial_url)
 
             # If we got here, the request was successful
-            retry_success = True
 
             if self.scroll_before_scraping:
                 scroll_attempts = 0
                 previous_height = page.evaluate("document.body.scrollHeight")
                 while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
-                    page.evaluate(
-                        "window.scrollTo(0, document.body.scrollHeight)"
-                    )
+                    page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                     page.wait_for_load_state("networkidle", timeout=30000)
                     new_height = page.evaluate("document.body.scrollHeight")
                     if new_height == previous_height:
@@ -606,16 +586,20 @@ class WebConnector(LoadConnector):
             soup = BeautifulSoup(content, "html.parser")
 
             if self.recursive:
-                internal_links = get_internal_links(base_url, initial_url, soup)
+                internal_links = get_internal_links(
+                    session_ctx.base_url, initial_url, soup
+                )
                 for link in internal_links:
-                    if link not in visited_links:
-                        to_visit.append(link)
+                    if link not in session_ctx.visited_links:
+                        session_ctx.to_visit.append(link)
 
             if page_response and str(page_response.status)[0] in ("4", "5"):
-                last_error = f"Skipped indexing {initial_url} due to HTTP {page_response.status} response"
-                logger.info(last_error)
-                continue
+                session_ctx.last_error = f"Skipped indexing {initial_url} due to HTTP {page_response.status} response"
+                logger.info(session_ctx.last_error)
+                result.retry = True
+                return result
 
+            # after this point, we don't need the caller to retry
             parsed_html = web_html_cleanup(soup, self.mintlify_cleanup)
 
             """For websites containing iframes that need to be scraped,
@@ -625,22 +609,15 @@ class WebConnector(LoadConnector):
                 f"{index}: Length of cleaned text {len(parsed_html.cleaned_text)}"
             )
 
             if JAVASCRIPT_DISABLED_MESSAGE in parsed_html.cleaned_text:
-                iframe_count = (
-                    page.frame_locator("iframe").locator("html").count()
-                )
+                iframe_count = page.frame_locator("iframe").locator("html").count()
                 if iframe_count > 0:
                     iframe_texts = (
-                        page.frame_locator("iframe")
-                        .locator("html")
-                        .all_inner_texts()
+                        page.frame_locator("iframe").locator("html").all_inner_texts()
                     )
                     document_text = "\n".join(iframe_texts)
                     """ 700 is the threshold value for the length of the text extracted
                     from the iframe based on the issue faced """
-                    if (
-                        len(parsed_html.cleaned_text)
-                        < IFRAME_TEXT_LENGTH_THRESHOLD
-                    ):
+                    if len(parsed_html.cleaned_text) < IFRAME_TEXT_LENGTH_THRESHOLD:
                         parsed_html.cleaned_text = document_text
                     else:
                         parsed_html.cleaned_text += "\n" + document_text
@@ -648,21 +625,17 @@ class WebConnector(LoadConnector):
             # Sometimes pages with #! will serve duplicate content
             # There are also just other ways this can happen
             hashed_text = hash((parsed_html.title, parsed_html.cleaned_text))
-            if hashed_text in content_hashes:
+            if hashed_text in session_ctx.content_hashes:
                 logger.info(
                     f"{index}: Skipping duplicate title + content for {initial_url}"
                 )
-                continue
+                return result
 
-            content_hashes.add(hashed_text)
+            session_ctx.content_hashes.add(hashed_text)
 
-            doc_batch.append(
-                Document(
+            result.doc = Document(
                 id=initial_url,
-                sections=[
-                    TextSection(
-                        link=initial_url, text=parsed_html.cleaned_text
-                    )
-                ],
+                sections=[TextSection(link=initial_url, text=parsed_html.cleaned_text)],
                 source=DocumentSource.WEB,
                 semantic_identifier=parsed_html.title or initial_url,
                 metadata={},
@@ -672,33 +645,87 @@ class WebConnector(LoadConnector):
                     else None
                 ),
             )
-            )
+        finally:
             page.close()
+
+        return result
+
+    def load_from_state(self) -> GenerateDocumentsOutput:
+        """Traverses through all pages found on the website
+        and converts them into documents"""
+        if not self.to_visit_list:
+            raise ValueError("No URLs to visit")
+
+        base_url = self.to_visit_list[0]  # For the recursive case
+        check_internet_connection(base_url)  # make sure we can connect to the base url
+
+        session_ctx = ScrapeSessionContext(base_url, self.to_visit_list)
+        session_ctx.initialize()
+
+        while session_ctx.to_visit:
+            initial_url = session_ctx.to_visit.pop()
+            if initial_url in session_ctx.visited_links:
+                continue
+            session_ctx.visited_links.add(initial_url)
+
+            try:
+                protected_url_check(initial_url)
             except Exception as e:
-                last_error = f"Failed to fetch '{initial_url}': {e}"
-                logger.exception(last_error)
-                playwright.stop()
-                restart_playwright = True
+                session_ctx.last_error = f"Invalid URL {initial_url} due to {e}"
+                logger.warning(session_ctx.last_error)
                 continue
 
-            if len(doc_batch) >= self.batch_size:
-                playwright.stop()
-                restart_playwright = True
-
-                at_least_one_doc = True
-                yield doc_batch
-                doc_batch = []
-
-        if doc_batch:
-            playwright.stop()
-
-            at_least_one_doc = True
-            yield doc_batch
-
-        if not at_least_one_doc:
-            if last_error:
-                raise RuntimeError(last_error)
+            index = len(session_ctx.visited_links)
+            logger.info(f"{index}: Visiting {initial_url}")
+
+            # Add retry mechanism with exponential backoff
+            retry_count = 0
+
+            while retry_count < self.MAX_RETRIES:
+                if retry_count > 0:
+                    # Add a random delay between retries (exponential backoff)
+                    delay = min(2**retry_count + random.uniform(0, 1), 10)
+                    logger.info(
+                        f"Retry {retry_count}/{self.MAX_RETRIES} for {initial_url} after {delay:.2f}s delay"
+                    )
+                    time.sleep(delay)
+
+                try:
+                    result = self._do_scrape(index, initial_url, session_ctx)
+                    if result.retry:
+                        continue
+
+                    if result.doc:
+                        session_ctx.doc_batch.append(result.doc)
+                except Exception as e:
+                    session_ctx.last_error = f"Failed to fetch '{initial_url}': {e}"
+                    logger.exception(session_ctx.last_error)
+                    session_ctx.initialize()
+                    continue
+                finally:
+                    retry_count += 1
+
+                break  # success / don't retry
+
+            if len(session_ctx.doc_batch) >= self.batch_size:
+                session_ctx.initialize()
+                session_ctx.at_least_one_doc = True
+                yield session_ctx.doc_batch
+                session_ctx.doc_batch = []
+
+        if session_ctx.doc_batch:
+            session_ctx.stop()
+            session_ctx.at_least_one_doc = True
+            yield session_ctx.doc_batch
+
+        if not session_ctx.at_least_one_doc:
+            if session_ctx.last_error:
+                raise RuntimeError(session_ctx.last_error)
             raise RuntimeError("No valid pages found.")
 
+        session_ctx.stop()
+
     def validate_connector_settings(self) -> None:
         # Make sure we have at least one valid URL to check
         if not self.to_visit_list:
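
For context, a hypothetical usage sketch of the refactored connector; the import path and the assumption that the remaining constructor arguments keep their defaults are not part of this diff. load_from_state() is a generator, so a document batch is yielded and Playwright is restarted whenever doc_batch reaches batch_size.

# Hypothetical usage sketch; assumes the onyx package is installed and importable.
from onyx.connectors.web.connector import WebConnector

connector = WebConnector(base_url="https://docs.example.com")

for batch in connector.load_from_state():
    # Each batch is a list[Document] accumulated in ScrapeSessionContext.doc_batch.
    for doc in batch:
        print(doc.id, doc.semantic_identifier)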