fix web connector tests to handle new deduping (#4175)

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
rkuo-danswer 2025-03-03 12:54:20 -08:00 committed by GitHub
parent a52d0d29be
commit 9bb8cdfff1
2 changed files with 26 additions and 14 deletions

@@ -320,7 +320,8 @@ class WebConnector(LoadConnector):
                 logger.warning(last_error)
                 continue

-            logger.info(f"{len(visited_links)}: Visiting {initial_url}")
+            index = len(visited_links)
+            logger.info(f"{index}: Visiting {initial_url}")

             try:
                 check_internet_connection(initial_url)
@@ -371,12 +372,10 @@ class WebConnector(LoadConnector):
                 initial_url = final_url
                 if initial_url in visited_links:
                     logger.info(
-                        f"{len(visited_links)}: {initial_url} redirected to {final_url} - already indexed"
+                        f"{index}: {initial_url} redirected to {final_url} - already indexed"
                     )
                     continue
-                logger.info(
-                    f"{len(visited_links)}: {initial_url} redirected to {final_url}"
-                )
+                logger.info(f"{index}: {initial_url} redirected to {final_url}")

             visited_links.add(initial_url)

             if self.scroll_before_scraping:
@@ -410,7 +409,9 @@ class WebConnector(LoadConnector):
             """For websites containing iframes that need to be scraped,
             the code below can extract text from within these iframes.
             """
-            logger.info(f"Length of cleaned text {len(parsed_html.cleaned_text)}")
+            logger.debug(
+                f"{index}: Length of cleaned text {len(parsed_html.cleaned_text)}"
+            )
             if JAVASCRIPT_DISABLED_MESSAGE in parsed_html.cleaned_text:
                 iframe_count = page.frame_locator("iframe").locator("html").count()
                 if iframe_count > 0:
@@ -427,11 +428,13 @@ class WebConnector(LoadConnector):
                     else:
                         parsed_html.cleaned_text += "\n" + document_text

-            # Sometimes pages with #! will server duplicate content
+            # Sometimes pages with #! will serve duplicate content
             # There are also just other ways this can happen
-            hashed_text = hash(parsed_html.cleaned_text)
+            hashed_text = hash((parsed_html.title, parsed_html.cleaned_text))
             if hashed_text in content_hashes:
-                logger.info(f"Skipping duplicate content for {initial_url}")
+                logger.info(
+                    f"{index}: Skipping duplicate title + content for {initial_url}"
+                )
                 continue

             content_hashes.add(hashed_text)
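
The dedup change above keys a seen-set on a hash of the page title plus the cleaned text instead of the text alone. A minimal standalone sketch of that idea follows; ParsedPage, dedupe_pages, and the sample pages are illustrative stand-ins, not connector APIs.

# Minimal sketch of title + content deduping, independent of the connector.
# ParsedPage and the sample data below are illustrative, not Onyx APIs.
from dataclasses import dataclass


@dataclass
class ParsedPage:
    url: str
    title: str
    cleaned_text: str


def dedupe_pages(pages: list[ParsedPage]) -> list[ParsedPage]:
    content_hashes: set[int] = set()
    unique_pages: list[ParsedPage] = []
    for index, page in enumerate(pages):
        # hash title and text together so pages sharing body text but
        # differing in title are not collapsed into one document
        hashed_text = hash((page.title, page.cleaned_text))
        if hashed_text in content_hashes:
            print(f"{index}: Skipping duplicate title + content for {page.url}")
            continue
        content_hashes.add(hashed_text)
        unique_pages.append(page)
    return unique_pages


# localhost:8889/ and localhost:8889/index.html serving identical content
# collapse to a single indexed page, matching the test expectations below
pages = [
    ParsedPage("http://localhost:8889/", "Home", "welcome"),
    ParsedPage("http://localhost:8889/index.html", "Home", "welcome"),
    ParsedPage("http://localhost:8889/about.html", "About", "about us"),
]
assert len(dedupe_pages(pages)) == 2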

@@ -142,8 +142,12 @@ def test_web_pruning(reset: None, vespa_client: vespa_fixture) -> None:
     selected_cc_pair = CCPairManager.get_indexing_status_by_id(
         cc_pair_1.id, user_performing_action=admin_user
     )
     assert selected_cc_pair is not None, "cc_pair not found after indexing!"
-    assert selected_cc_pair.docs_indexed == 15
+    # used to be 15, but now
+    # localhost:8889/ and localhost:8889/index.html are deduped
+    assert selected_cc_pair.docs_indexed == 14

     logger.info("Removing about.html.")
     os.remove(os.path.join(website_tgt, "about.html"))
@@ -160,24 +164,29 @@ def test_web_pruning(reset: None, vespa_client: vespa_fixture) -> None:
         cc_pair_1.id, user_performing_action=admin_user
     )
     assert selected_cc_pair is not None, "cc_pair not found after pruning!"
-    assert selected_cc_pair.docs_indexed == 13
+    assert selected_cc_pair.docs_indexed == 12

     # check vespa
+    root_id = f"http://{hostname}:{port}/"
     index_id = f"http://{hostname}:{port}/index.html"
     about_id = f"http://{hostname}:{port}/about.html"
     courses_id = f"http://{hostname}:{port}/courses.html"

-    doc_ids = [index_id, about_id, courses_id]
+    doc_ids = [root_id, index_id, about_id, courses_id]
     retrieved_docs_dict = vespa_client.get_documents_by_id(doc_ids)["documents"]
     retrieved_docs = {
         doc["fields"]["document_id"]: doc["fields"]
         for doc in retrieved_docs_dict
     }
-    # verify index.html exists in Vespa
-    retrieved_doc = retrieved_docs.get(index_id)
+    # verify root exists in Vespa
+    retrieved_doc = retrieved_docs.get(root_id)
     assert retrieved_doc

+    # verify index.html does not exist in Vespa since it is a duplicate of root
+    retrieved_doc = retrieved_docs.get(index_id)
+    assert not retrieved_doc
+
     # verify about and courses do not exist
     retrieved_doc = retrieved_docs.get(about_id)
     assert not retrieved_doc
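
The lookup-then-assert pattern above repeats once per document id. A small helper along the same lines could compress those expectations; the sketch below reuses only the get_documents_by_id shape shown in the test, and fetch_docs_by_id and assert_vespa_state are hypothetical names, not existing test utilities.

# Hypothetical helpers built on the vespa_client.get_documents_by_id usage
# shown above; they are not part of the existing test utilities.
from typing import Any


def fetch_docs_by_id(vespa_client: Any, doc_ids: list[str]) -> dict[str, dict]:
    # mirror the lookup pattern from the test: fetch, then key by document_id
    retrieved = vespa_client.get_documents_by_id(doc_ids)["documents"]
    return {doc["fields"]["document_id"]: doc["fields"] for doc in retrieved}


def assert_vespa_state(
    vespa_client: Any, present: list[str], absent: list[str]
) -> None:
    docs = fetch_docs_by_id(vespa_client, present + absent)
    for doc_id in present:
        assert docs.get(doc_id), f"expected {doc_id} to be indexed"
    for doc_id in absent:
        assert not docs.get(doc_id), f"expected {doc_id} to be absent"


# usage mirroring the assertions above: the root page stays indexed while its
# index.html duplicate and the pruned pages are gone
# assert_vespa_state(
#     vespa_client, present=[root_id], absent=[index_id, about_id, courses_id]
# )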