refactor a mega function for readability and make sure to increment r… (#4542)

* refactor a mega function for readability and make sure to increment retry_count on exception so that we don't infinitely loop
* improve session and page level context handling
* don't use pydantic for the session context
* we don't need retry success
* move playwright handling into the session context
* need to break on ok
* return doc from scrape
* fix comment

Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>

This commit is contained in:
parent 6df1c6c72f
commit 04ebde7838
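The loop-control fix the message describes is that the retry counter is now advanced in a `finally` block and the loop breaks as soon as a page is handled, so an exception path can never skip the increment and spin forever. A minimal, self-contained sketch of that pattern (the function name and the `fetch` callable are illustrative, not part of the connector):

import random
import time
from typing import Callable

MAX_RETRIES = 3


def fetch_with_retries(url: str, fetch: Callable[[str], str]) -> str | None:
    """Retry fetch(url) with capped exponential backoff.

    retry_count is advanced in `finally`, so the exception path can never
    skip the increment and loop forever; the success path breaks out.
    """
    retry_count = 0
    result: str | None = None
    while retry_count < MAX_RETRIES:
        if retry_count > 0:
            # Jittered, capped backoff, mirroring the delay formula in the diff.
            delay = min(2**retry_count + random.uniform(0, 1), 10)
            time.sleep(delay)
        try:
            result = fetch(url)
        except Exception:
            continue  # try again; the finally below has already advanced retry_count
        finally:
            retry_count += 1
        break  # success / don't retry
    return result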
@@ -43,6 +43,48 @@ from shared_configs.configs import MULTI_TENANT
 logger = setup_logger()
 
+
+class ScrapeSessionContext:
+    """Session level context for scraping"""
+
+    def __init__(self, base_url: str, to_visit: list[str]):
+        self.base_url = base_url
+        self.to_visit = to_visit
+
+    def initialize(self) -> None:
+        self.stop()
+        self.playwright, self.playwright_context = start_playwright()
+
+    def stop(self) -> None:
+        if self.playwright_context:
+            self.playwright_context.close()
+            self.playwright_context = None
+
+        if self.playwright:
+            self.playwright.stop()
+            self.playwright = None
+
+    base_url: str
+    to_visit: list[str]
+    playwright: Playwright | None = None
+    playwright_context: BrowserContext | None = None
+
+    visited_links: set[str] = set()
+    content_hashes: set[int] = set()
+
+    doc_batch: list[Document] = []
+
+    at_least_one_doc: bool = False
+    last_error: str | None = None
+
+    needs_retry: bool = False
+
+
+class ScrapeResult:
+    doc: Document | None = None
+    retry: bool = False
+
+
 WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20
 
 # Threshold for determining when to replace vs append iframe content
 IFRAME_TEXT_LENGTH_THRESHOLD = 700
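The `ScrapeSessionContext` added above owns the Playwright handles for the whole crawl; because `initialize()` calls `stop()` first, the same method also serves as a restart. A self-contained analogue of that lifecycle, with a plain object standing in for the Playwright handles (illustrative only, not the connector's code):

class SessionContext:
    """Minimal analogue of ScrapeSessionContext's start/stop/restart behavior."""

    resource: object | None = None  # stand-in for the Playwright / BrowserContext pair

    def initialize(self) -> None:
        # Tearing down first makes initialize() safe to call as a restart.
        self.stop()
        self.resource = object()  # stand-in for start_playwright()

    def stop(self) -> None:
        if self.resource is not None:
            # stand-in for playwright_context.close() / playwright.stop()
            self.resource = None


ctx = SessionContext()
ctx.initialize()  # fresh session
ctx.initialize()  # restart: the old resource is released before a new one is created
ctx.stop()        # final teardown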
@@ -394,6 +436,8 @@ def _handle_cookies(context: BrowserContext, url: str) -> None:
 
 
 class WebConnector(LoadConnector):
+    MAX_RETRIES = 3
+
     def __init__(
         self,
         base_url: str,  # Can't change this without disrupting existing users
@@ -444,65 +488,24 @@ class WebConnector(LoadConnector):
             logger.warning("Unexpected credentials provided for Web Connector")
         return None
 
-    def load_from_state(self) -> GenerateDocumentsOutput:
-        """Traverses through all pages found on the website
-        and converts them into documents"""
-        visited_links: set[str] = set()
-        to_visit: list[str] = self.to_visit_list
-        content_hashes = set()
-
-        if not to_visit:
-            raise ValueError("No URLs to visit")
-
-        base_url = to_visit[0]  # For the recursive case
-        doc_batch: list[Document] = []
-
-        # make sure we can connect to the base url
-        check_internet_connection(base_url)
-
-        # Needed to report error
-        at_least_one_doc = False
-        last_error = None
-
-        playwright, context = start_playwright()
-        restart_playwright = False
-        while to_visit:
-            initial_url = to_visit.pop()
-            if initial_url in visited_links:
-                continue
-            visited_links.add(initial_url)
-
-            try:
-                protected_url_check(initial_url)
-            except Exception as e:
-                last_error = f"Invalid URL {initial_url} due to {e}"
-                logger.warning(last_error)
-                continue
-
-            index = len(visited_links)
-            logger.info(f"{index}: Visiting {initial_url}")
-
-            # Add retry mechanism with exponential backoff
-            max_retries = 3
-            retry_count = 0
-            retry_success = False
-
-            while retry_count < max_retries and not retry_success:
-                try:
-                    if retry_count > 0:
-                        # Add a random delay between retries (exponential backoff)
-                        delay = min(2**retry_count + random.uniform(0, 1), 10)
-                        logger.info(
-                            f"Retry {retry_count}/{max_retries} for {initial_url} after {delay:.2f}s delay"
-                        )
-                        time.sleep(delay)
-
-                    if restart_playwright:
-                        playwright, context = start_playwright()
-                        restart_playwright = False
-
-                    # Handle cookies for the URL
-                    _handle_cookies(context, initial_url)
-
-                    # First do a HEAD request to check content type without downloading the entire content
-                    head_response = requests.head(
+    def _do_scrape(
+        self,
+        index: int,
+        initial_url: str,
+        session_ctx: ScrapeSessionContext,
+    ) -> ScrapeResult:
+        """Returns a ScrapeResult object with a doc and retry flag."""
+
+        if session_ctx.playwright is None:
+            raise RuntimeError("scrape_context.playwright is None")
+
+        if session_ctx.playwright_context is None:
+            raise RuntimeError("scrape_context.playwright_context is None")
+
+        result = ScrapeResult()
+
+        # Handle cookies for the URL
+        _handle_cookies(session_ctx.playwright_context, initial_url)
+
+        # First do a HEAD request to check content type without downloading the entire content
+        head_response = requests.head(
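`_do_scrape` replaces the old `retry_success`/`continue` bookkeeping with a small result object: `doc` carries the scraped document, `retry` asks the caller to try the URL again. A minimal sketch of how a caller might consume such a result (the names and the `scrape_once` callable are illustrative, not the connector's API):

from dataclasses import dataclass
from typing import Callable


@dataclass
class Result:
    doc: str | None = None  # stand-in for the Document payload
    retry: bool = False     # True means the caller should retry the same URL


def handle_url(url: str, scrape_once: Callable[[str], Result], batch: list[str]) -> bool:
    """Return True when the URL is finished, False when the caller should retry."""
    result = scrape_once(url)
    if result.retry:
        return False  # e.g. a 4xx/5xx response: let the retry loop run again
    if result.doc is not None:
        batch.append(result.doc)
    return True  # done; duplicates and redirects simply come back with no doc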
@@ -518,34 +521,26 @@
             )
             last_modified = response.headers.get("Last-Modified")
 
-                    doc_batch.append(
-                        Document(
-                            id=initial_url,
-                            sections=[
-                                TextSection(link=initial_url, text=page_text)
-                            ],
-                            source=DocumentSource.WEB,
-                            semantic_identifier=initial_url.split("/")[-1],
-                            metadata=metadata,
-                            doc_updated_at=(
-                                _get_datetime_from_last_modified_header(
-                                    last_modified
-                                )
-                                if last_modified
-                                else None
-                            ),
-                        )
-                    )
-                    retry_success = True
-                    continue
-
-                page = context.new_page()
+            result.doc = Document(
+                id=initial_url,
+                sections=[TextSection(link=initial_url, text=page_text)],
+                source=DocumentSource.WEB,
+                semantic_identifier=initial_url.split("/")[-1],
+                metadata=metadata,
+                doc_updated_at=(
+                    _get_datetime_from_last_modified_header(last_modified)
+                    if last_modified
+                    else None
+                ),
+            )
+
+            return result
+
+        page = session_ctx.playwright_context.new_page()
+        try:
             if self.add_randomness:
                 # Add random mouse movements and scrolling to mimic human behavior
-                    page.mouse.move(
-                        random.randint(100, 700), random.randint(100, 500)
-                    )
+                page.mouse.move(random.randint(100, 700), random.randint(100, 500))
 
             # Can't use wait_until="networkidle" because it interferes with the scrolling behavior
             page_response = page.goto(
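`doc_updated_at` above is derived from the HTTP `Last-Modified` header via the connector's `_get_datetime_from_last_modified_header` helper, whose body is not part of this diff. As a rough, assumption-labeled illustration of that kind of parsing using only the standard library (not the helper's actual implementation):

from datetime import datetime, timezone
from email.utils import parsedate_to_datetime


def parse_last_modified(value: str | None) -> datetime | None:
    """Best-effort parse of an RFC 2822 Last-Modified header into an aware UTC datetime."""
    if not value:
        return None
    try:
        return parsedate_to_datetime(value).astimezone(timezone.utc)
    except (TypeError, ValueError):
        return None


print(parse_last_modified("Wed, 21 Oct 2015 07:28:00 GMT"))  # 2015-10-21 07:28:00+00:00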
@@ -557,44 +552,29 @@
             # Add a small random delay to mimic human behavior
             time.sleep(random.uniform(0.5, 2.0))
 
-                    # Check if we got a 403 error
-                    if page_response and page_response.status == 403:
-                        logger.warning(
-                            f"Received 403 Forbidden for {initial_url}, retrying..."
-                        )
-                        page.close()
-                        retry_count += 1
-                        continue
-
-                    last_modified = (
-                        page_response.header_value("Last-Modified")
-                        if page_response
-                        else None
-                    )
+            last_modified = (
+                page_response.header_value("Last-Modified") if page_response else None
+            )
             final_url = page.url
             if final_url != initial_url:
                 protected_url_check(final_url)
                 initial_url = final_url
-                        if initial_url in visited_links:
+                if initial_url in session_ctx.visited_links:
                     logger.info(
                         f"{index}: {initial_url} redirected to {final_url} - already indexed"
                     )
                     page.close()
-                            retry_success = True
-                            continue
+                    return result
                 logger.info(f"{index}: {initial_url} redirected to {final_url}")
-                    visited_links.add(initial_url)
+            session_ctx.visited_links.add(initial_url)
 
             # If we got here, the request was successful
-                    retry_success = True
 
             if self.scroll_before_scraping:
                 scroll_attempts = 0
                 previous_height = page.evaluate("document.body.scrollHeight")
                 while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
-                        page.evaluate(
-                            "window.scrollTo(0, document.body.scrollHeight)"
-                        )
+                    page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                     page.wait_for_load_state("networkidle", timeout=30000)
                     new_height = page.evaluate("document.body.scrollHeight")
                     if new_height == previous_height:
@@ -606,16 +586,20 @@ class WebConnector(LoadConnector):
             soup = BeautifulSoup(content, "html.parser")
 
             if self.recursive:
-                        internal_links = get_internal_links(base_url, initial_url, soup)
+                internal_links = get_internal_links(
+                    session_ctx.base_url, initial_url, soup
+                )
                 for link in internal_links:
-                            if link not in visited_links:
-                                to_visit.append(link)
+                    if link not in session_ctx.visited_links:
+                        session_ctx.to_visit.append(link)
 
             if page_response and str(page_response.status)[0] in ("4", "5"):
-                        last_error = f"Skipped indexing {initial_url} due to HTTP {page_response.status} response"
-                        logger.info(last_error)
-                        continue
+                session_ctx.last_error = f"Skipped indexing {initial_url} due to HTTP {page_response.status} response"
+                logger.info(session_ctx.last_error)
+                result.retry = True
+                return result
+
+            # after this point, we don't need the caller to retry
             parsed_html = web_html_cleanup(soup, self.mintlify_cleanup)
 
             """For websites containing iframes that need to be scraped,
@@ -625,22 +609,15 @@
                 f"{index}: Length of cleaned text {len(parsed_html.cleaned_text)}"
             )
             if JAVASCRIPT_DISABLED_MESSAGE in parsed_html.cleaned_text:
-                        iframe_count = (
-                            page.frame_locator("iframe").locator("html").count()
-                        )
+                iframe_count = page.frame_locator("iframe").locator("html").count()
                 if iframe_count > 0:
                     iframe_texts = (
-                                page.frame_locator("iframe")
-                                .locator("html")
-                                .all_inner_texts()
+                        page.frame_locator("iframe").locator("html").all_inner_texts()
                     )
                     document_text = "\n".join(iframe_texts)
                     """ 700 is the threshold value for the length of the text extracted
                     from the iframe based on the issue faced """
-                            if (
-                                len(parsed_html.cleaned_text)
-                                < IFRAME_TEXT_LENGTH_THRESHOLD
-                            ):
+                    if len(parsed_html.cleaned_text) < IFRAME_TEXT_LENGTH_THRESHOLD:
                         parsed_html.cleaned_text = document_text
                     else:
                         parsed_html.cleaned_text += "\n" + document_text
@@ -648,21 +625,17 @@
             # Sometimes pages with #! will serve duplicate content
             # There are also just other ways this can happen
             hashed_text = hash((parsed_html.title, parsed_html.cleaned_text))
-                    if hashed_text in content_hashes:
+            if hashed_text in session_ctx.content_hashes:
                 logger.info(
                     f"{index}: Skipping duplicate title + content for {initial_url}"
                 )
-                        continue
-                    content_hashes.add(hashed_text)
-
-                    doc_batch.append(
-                        Document(
-                            id=initial_url,
-                            sections=[
-                                TextSection(
-                                    link=initial_url, text=parsed_html.cleaned_text
-                                )
-                            ],
+                return result
+
+            session_ctx.content_hashes.add(hashed_text)
+
+            result.doc = Document(
+                id=initial_url,
+                sections=[TextSection(link=initial_url, text=parsed_html.cleaned_text)],
                 source=DocumentSource.WEB,
                 semantic_identifier=parsed_html.title or initial_url,
                 metadata={},
@@ -672,33 +645,87 @@
                 else None
             ),
         )
-                    )
-
-                    page.close()
-
-                except Exception as e:
-                    last_error = f"Failed to fetch '{initial_url}': {e}"
-                    logger.exception(last_error)
-                    playwright.stop()
-                    restart_playwright = True
-                    continue
-
-            if len(doc_batch) >= self.batch_size:
-                playwright.stop()
-                restart_playwright = True
-                at_least_one_doc = True
-                yield doc_batch
-                doc_batch = []
-
-        if doc_batch:
-            playwright.stop()
-            at_least_one_doc = True
-            yield doc_batch
-
-        if not at_least_one_doc:
-            if last_error:
-                raise RuntimeError(last_error)
+        finally:
+            page.close()
+
+        return result
+
+    def load_from_state(self) -> GenerateDocumentsOutput:
+        """Traverses through all pages found on the website
+        and converts them into documents"""
+
+        if not self.to_visit_list:
+            raise ValueError("No URLs to visit")
+
+        base_url = self.to_visit_list[0]  # For the recursive case
+        check_internet_connection(base_url)  # make sure we can connect to the base url
+
+        session_ctx = ScrapeSessionContext(base_url, self.to_visit_list)
+        session_ctx.initialize()
+
+        while session_ctx.to_visit:
+            initial_url = session_ctx.to_visit.pop()
+            if initial_url in session_ctx.visited_links:
+                continue
+            session_ctx.visited_links.add(initial_url)
+
+            try:
+                protected_url_check(initial_url)
+            except Exception as e:
+                session_ctx.last_error = f"Invalid URL {initial_url} due to {e}"
+                logger.warning(session_ctx.last_error)
+                continue
+
+            index = len(session_ctx.visited_links)
+            logger.info(f"{index}: Visiting {initial_url}")
+
+            # Add retry mechanism with exponential backoff
+            retry_count = 0
+
+            while retry_count < self.MAX_RETRIES:
+                if retry_count > 0:
+                    # Add a random delay between retries (exponential backoff)
+                    delay = min(2**retry_count + random.uniform(0, 1), 10)
+                    logger.info(
+                        f"Retry {retry_count}/{self.MAX_RETRIES} for {initial_url} after {delay:.2f}s delay"
+                    )
+                    time.sleep(delay)
+
+                try:
+                    result = self._do_scrape(index, initial_url, session_ctx)
+                    if result.retry:
+                        continue
+
+                    if result.doc:
+                        session_ctx.doc_batch.append(result.doc)
+                except Exception as e:
+                    session_ctx.last_error = f"Failed to fetch '{initial_url}': {e}"
+                    logger.exception(session_ctx.last_error)
+                    session_ctx.initialize()
+                    continue
+                finally:
+                    retry_count += 1
+
+                break  # success / don't retry
+
+            if len(session_ctx.doc_batch) >= self.batch_size:
+                session_ctx.initialize()
+                session_ctx.at_least_one_doc = True
+                yield session_ctx.doc_batch
+                session_ctx.doc_batch = []
+
+        if session_ctx.doc_batch:
+            session_ctx.stop()
+            session_ctx.at_least_one_doc = True
+            yield session_ctx.doc_batch
+
+        if not session_ctx.at_least_one_doc:
+            if session_ctx.last_error:
+                raise RuntimeError(session_ctx.last_error)
             raise RuntimeError("No valid pages found.")
+
+        session_ctx.stop()
 
     def validate_connector_settings(self) -> None:
         # Make sure we have at least one valid URL to check
         if not self.to_visit_list:
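In the new `load_from_state` above, documents are yielded in batches: once `doc_batch` reaches `batch_size` the batch is yielded and the Playwright session is reinitialized, and any remainder is flushed after the crawl loop. A stripped-down generator showing just that batching shape (names are illustrative):

from typing import Iterator


def batched(items: list[str], batch_size: int) -> Iterator[list[str]]:
    """Yield full batches as they fill up, then flush whatever is left at the end."""
    batch: list[str] = []
    for item in items:
        batch.append(item)
        if len(batch) >= batch_size:
            yield batch
            batch = []  # the connector also reinitializes Playwright at this point
    if batch:
        yield batch  # final partial batch, mirroring the post-loop flush


print(list(batched(["a", "b", "c", "d", "e"], 2)))  # [['a', 'b'], ['c', 'd'], ['e']]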