mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-08-29 15:15:03 +02:00
Fix flakey web test (#4551)
* Fix flakey web test
* Increase wait time
* Another attempt to fix
* Simplify + add new test
* Fix web tests
This commit is contained in:
@@ -50,6 +50,17 @@ class ScrapeSessionContext:
|
||||
def __init__(self, base_url: str, to_visit: list[str]):
|
||||
self.base_url = base_url
|
||||
self.to_visit = to_visit
|
||||
self.visited_links: set[str] = set()
|
||||
self.content_hashes: set[int] = set()
|
||||
|
||||
self.doc_batch: list[Document] = []
|
||||
|
||||
self.at_least_one_doc: bool = False
|
||||
self.last_error: str | None = None
|
||||
self.needs_retry: bool = False
|
||||
|
||||
self.playwright: Playwright | None = None
|
||||
self.playwright_context: BrowserContext | None = None
|
||||
|
||||
def initialize(self) -> None:
|
||||
self.stop()
|
||||
@@ -64,21 +75,6 @@ class ScrapeSessionContext:
|
||||
self.playwright.stop()
|
||||
self.playwright = None
|
||||
|
||||
base_url: str
|
||||
to_visit: list[str]
|
||||
playwright: Playwright | None = None
|
||||
playwright_context: BrowserContext | None = None
|
||||
|
||||
visited_links: set[str] = set()
|
||||
content_hashes: set[int] = set()
|
||||
|
||||
doc_batch: list[Document] = []
|
||||
|
||||
at_least_one_doc: bool = False
|
||||
last_error: str | None = None
|
||||
|
||||
needs_retry: bool = False
|
||||
|
||||
|
||||
class ScrapeResult:
|
||||
doc: Document | None = None
|
||||
@@ -177,9 +173,6 @@ def check_internet_connection(url: str) -> None:
|
||||
session = requests.Session()
|
||||
session.headers.update(DEFAULT_HEADERS)
|
||||
|
||||
# Add a random delay to mimic human behavior
|
||||
time.sleep(random.uniform(0.1, 0.5))
|
||||
|
||||
response = session.get(url, timeout=5, allow_redirects=True)
|
||||
|
||||
response.raise_for_status()
|
||||
@@ -445,7 +438,6 @@ class WebConnector(LoadConnector):
|
||||
mintlify_cleanup: bool = True, # Mostly ok to apply to other websites as well
|
||||
batch_size: int = INDEX_BATCH_SIZE,
|
||||
scroll_before_scraping: bool = False,
|
||||
add_randomness: bool = True,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
self.mintlify_cleanup = mintlify_cleanup
|
||||
@@ -453,7 +445,6 @@ class WebConnector(LoadConnector):
|
||||
self.recursive = False
|
||||
self.scroll_before_scraping = scroll_before_scraping
|
||||
self.web_connector_type = web_connector_type
|
||||
self.add_randomness = add_randomness
|
||||
if web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value:
|
||||
self.recursive = True
|
||||
self.to_visit_list = [_ensure_valid_url(base_url)]
|
||||
@@ -538,10 +529,6 @@ class WebConnector(LoadConnector):
|
||||
|
||||
page = session_ctx.playwright_context.new_page()
|
||||
try:
|
||||
if self.add_randomness:
|
||||
# Add random mouse movements and scrolling to mimic human behavior
|
||||
page.mouse.move(random.randint(100, 700), random.randint(100, 500))
|
||||
|
||||
# Can't use wait_until="networkidle" because it interferes with the scrolling behavior
|
||||
page_response = page.goto(
|
||||
initial_url,
|
||||
@@ -549,9 +536,6 @@ class WebConnector(LoadConnector):
|
||||
wait_until="domcontentloaded", # Wait for DOM to be ready
|
||||
)
|
||||
|
||||
# Add a small random delay to mimic human behavior
|
||||
time.sleep(random.uniform(0.5, 2.0))
|
||||
|
||||
last_modified = (
|
||||
page_response.header_value("Last-Modified") if page_response else None
|
||||
)
|
||||
@@ -575,7 +559,10 @@ class WebConnector(LoadConnector):
|
||||
previous_height = page.evaluate("document.body.scrollHeight")
|
||||
while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
|
||||
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||
# wait for the content to load if we scrolled
|
||||
page.wait_for_load_state("networkidle", timeout=30000)
|
||||
time.sleep(0.5) # let javascript run
|
||||
|
||||
new_height = page.evaluate("document.body.scrollHeight")
|
||||
if new_height == previous_height:
|
||||
break # Stop scrolling when no more content is loaded
|
||||
|
@@ -12,21 +12,20 @@ EXPECTED_QUOTE = (
|
||||
# NOTE(rkuo): we will probably need to adjust this test to point at our own test site
|
||||
# to avoid depending on a third party site
|
||||
@pytest.fixture
|
||||
def web_connector(request: pytest.FixtureRequest) -> WebConnector:
|
||||
def quotes_to_scroll_web_connector(request: pytest.FixtureRequest) -> WebConnector:
|
||||
scroll_before_scraping = request.param
|
||||
connector = WebConnector(
|
||||
base_url="https://quotes.toscrape.com/scroll",
|
||||
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
|
||||
scroll_before_scraping=scroll_before_scraping,
|
||||
add_randomness=False,
|
||||
)
|
||||
return connector
|
||||
|
||||
|
||||
@pytest.mark.parametrize("web_connector", [True], indirect=True)
|
||||
def test_web_connector_scroll(web_connector: WebConnector) -> None:
|
||||
@pytest.mark.parametrize("quotes_to_scroll_web_connector", [True], indirect=True)
|
||||
def test_web_connector_scroll(quotes_to_scroll_web_connector: WebConnector) -> None:
|
||||
all_docs: list[Document] = []
|
||||
document_batches = web_connector.load_from_state()
|
||||
document_batches = quotes_to_scroll_web_connector.load_from_state()
|
||||
for doc_batch in document_batches:
|
||||
for doc in doc_batch:
|
||||
all_docs.append(doc)
|
||||
@@ -37,10 +36,10 @@ def test_web_connector_scroll(web_connector: WebConnector) -> None:
|
||||
assert EXPECTED_QUOTE in doc.sections[0].text
|
||||
|
||||
|
||||
@pytest.mark.parametrize("web_connector", [False], indirect=True)
|
||||
def test_web_connector_no_scroll(web_connector: WebConnector) -> None:
|
||||
@pytest.mark.parametrize("quotes_to_scroll_web_connector", [False], indirect=True)
|
||||
def test_web_connector_no_scroll(quotes_to_scroll_web_connector: WebConnector) -> None:
|
||||
all_docs: list[Document] = []
|
||||
document_batches = web_connector.load_from_state()
|
||||
document_batches = quotes_to_scroll_web_connector.load_from_state()
|
||||
for doc_batch in document_batches:
|
||||
for doc in doc_batch:
|
||||
all_docs.append(doc)
|
||||
@@ -49,3 +48,20 @@ def test_web_connector_no_scroll(web_connector: WebConnector) -> None:
|
||||
doc = all_docs[0]
|
||||
assert doc.sections[0].text is not None
|
||||
assert EXPECTED_QUOTE not in doc.sections[0].text
|
||||
|
||||
|
||||
MERCURY_EXPECTED_QUOTE = "How can we help?"
|
||||
|
||||
|
||||
def test_web_connector_bot_protection() -> None:
|
||||
connector = WebConnector(
|
||||
base_url="https://support.mercury.com/hc",
|
||||
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
|
||||
)
|
||||
document_batches = list(connector.load_from_state())
|
||||
assert len(document_batches) == 1
|
||||
doc_batch = document_batches[0]
|
||||
assert len(doc_batch) == 1
|
||||
doc = doc_batch[0]
|
||||
assert doc.sections[0].text is not None
|
||||
assert MERCURY_EXPECTED_QUOTE in doc.sections[0].text
|
||||
|
Reference in New Issue
Block a user