fix web connector tests to handle new deduping (#4175)

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
rkuo-danswer 2025-03-03 12:54:20 -08:00 committed by GitHub
parent a52d0d29be
commit 9bb8cdfff1
2 changed files with 26 additions and 14 deletions


@@ -320,7 +320,8 @@ class WebConnector(LoadConnector):
                 logger.warning(last_error)
                 continue

-            logger.info(f"{len(visited_links)}: Visiting {initial_url}")
+            index = len(visited_links)
+            logger.info(f"{index}: Visiting {initial_url}")

             try:
                 check_internet_connection(initial_url)
@@ -371,12 +372,10 @@ class WebConnector(LoadConnector):
                     initial_url = final_url
                     if initial_url in visited_links:
                         logger.info(
-                            f"{len(visited_links)}: {initial_url} redirected to {final_url} - already indexed"
+                            f"{index}: {initial_url} redirected to {final_url} - already indexed"
                         )
                         continue

-                    logger.info(
-                        f"{len(visited_links)}: {initial_url} redirected to {final_url}"
-                    )
+                    logger.info(f"{index}: {initial_url} redirected to {final_url}")
                     visited_links.add(initial_url)

             if self.scroll_before_scraping:
@@ -410,7 +409,9 @@ class WebConnector(LoadConnector):
                 """For websites containing iframes that need to be scraped,
                 the code below can extract text from within these iframes.
                 """
-                logger.info(f"Length of cleaned text {len(parsed_html.cleaned_text)}")
+                logger.debug(
+                    f"{index}: Length of cleaned text {len(parsed_html.cleaned_text)}"
+                )
                 if JAVASCRIPT_DISABLED_MESSAGE in parsed_html.cleaned_text:
                     iframe_count = page.frame_locator("iframe").locator("html").count()
                     if iframe_count > 0:
@@ -427,11 +428,13 @@ class WebConnector(LoadConnector):
                 else:
                     parsed_html.cleaned_text += "\n" + document_text

-            # Sometimes pages with #! will server duplicate content
+            # Sometimes pages with #! will serve duplicate content
             # There are also just other ways this can happen
-            hashed_text = hash(parsed_html.cleaned_text)
+            hashed_text = hash((parsed_html.title, parsed_html.cleaned_text))
             if hashed_text in content_hashes:
-                logger.info(f"Skipping duplicate content for {initial_url}")
+                logger.info(
+                    f"{index}: Skipping duplicate title + content for {initial_url}"
+                )
                 continue

             content_hashes.add(hashed_text)
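The deduping this test fix accommodates keys each scraped page on a hash of its (title, cleaned text) pair and skips any page whose hash has already been seen. Below is a minimal standalone sketch of that idea; the Page dataclass, dedupe_pages helper, and sample data are illustrative stand-ins, not code from the connector:

from dataclasses import dataclass


@dataclass
class Page:
    # hypothetical stand-in for the connector's parsed_html result
    url: str
    title: str
    cleaned_text: str


def dedupe_pages(pages: list[Page]) -> list[Page]:
    """Keep only the first page seen for each (title, cleaned_text) pair."""
    content_hashes: set[int] = set()
    unique_pages: list[Page] = []
    for index, page in enumerate(pages):
        hashed_text = hash((page.title, page.cleaned_text))
        if hashed_text in content_hashes:
            print(f"{index}: Skipping duplicate title + content for {page.url}")
            continue
        content_hashes.add(hashed_text)
        unique_pages.append(page)
    return unique_pages


if __name__ == "__main__":
    # the root URL and /index.html serve identical content,
    # so only the first of the pair survives deduping
    pages = [
        Page("http://localhost:8889/", "Home", "Welcome to the demo site."),
        Page("http://localhost:8889/index.html", "Home", "Welcome to the demo site."),
        Page("http://localhost:8889/courses.html", "Courses", "Course catalog."),
    ]
    assert len(dedupe_pages(pages)) == 2

This is the same reason the integration test below now expects one fewer indexed document: the fixture site's root page and index.html collapse to a single document.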


@@ -142,8 +142,12 @@ def test_web_pruning(reset: None, vespa_client: vespa_fixture) -> None:
     selected_cc_pair = CCPairManager.get_indexing_status_by_id(
         cc_pair_1.id, user_performing_action=admin_user
     )
+
     assert selected_cc_pair is not None, "cc_pair not found after indexing!"
-    assert selected_cc_pair.docs_indexed == 15
+
+    # used to be 15, but now
+    # localhost:8889/ and localhost:8889/index.html are deduped
+    assert selected_cc_pair.docs_indexed == 14

     logger.info("Removing about.html.")
     os.remove(os.path.join(website_tgt, "about.html"))
@@ -160,24 +164,29 @@ def test_web_pruning(reset: None, vespa_client: vespa_fixture) -> None:
         cc_pair_1.id, user_performing_action=admin_user
     )
     assert selected_cc_pair is not None, "cc_pair not found after pruning!"
-    assert selected_cc_pair.docs_indexed == 13
+    assert selected_cc_pair.docs_indexed == 12

     # check vespa
+    root_id = f"http://{hostname}:{port}/"
     index_id = f"http://{hostname}:{port}/index.html"
     about_id = f"http://{hostname}:{port}/about.html"
     courses_id = f"http://{hostname}:{port}/courses.html"

-    doc_ids = [index_id, about_id, courses_id]
+    doc_ids = [root_id, index_id, about_id, courses_id]
     retrieved_docs_dict = vespa_client.get_documents_by_id(doc_ids)["documents"]
     retrieved_docs = {
         doc["fields"]["document_id"]: doc["fields"]
         for doc in retrieved_docs_dict
     }

-    # verify index.html exists in Vespa
-    retrieved_doc = retrieved_docs.get(index_id)
+    # verify root exists in Vespa
+    retrieved_doc = retrieved_docs.get(root_id)
     assert retrieved_doc

+    # verify index.html does not exist in Vespa since it is a duplicate of root
+    retrieved_doc = retrieved_docs.get(index_id)
+    assert not retrieved_doc
+
     # verify about and courses do not exist
     retrieved_doc = retrieved_docs.get(about_id)
     assert not retrieved_doc