fix web connector tests to handle new deduping (#4175)

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>

parent  a52d0d29be
commit  9bb8cdfff1
@@ -320,7 +320,8 @@ class WebConnector(LoadConnector):
                 logger.warning(last_error)
                 continue
 
-            logger.info(f"{len(visited_links)}: Visiting {initial_url}")
+            index = len(visited_links)
+            logger.info(f"{index}: Visiting {initial_url}")
 
             try:
                 check_internet_connection(initial_url)
@@ -371,12 +372,10 @@ class WebConnector(LoadConnector):
                     initial_url = final_url
                     if initial_url in visited_links:
                         logger.info(
-                            f"{len(visited_links)}: {initial_url} redirected to {final_url} - already indexed"
+                            f"{index}: {initial_url} redirected to {final_url} - already indexed"
                         )
                         continue
-                    logger.info(
-                        f"{len(visited_links)}: {initial_url} redirected to {final_url}"
-                    )
+                    logger.info(f"{index}: {initial_url} redirected to {final_url}")
                     visited_links.add(initial_url)
 
                 if self.scroll_before_scraping:
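
The two hunks above replace live reads of len(visited_links) with an index snapshot taken once at the top of the loop iteration. A minimal sketch of why the snapshot matters (not the connector's actual loop; the visit helper is hypothetical):

# Once visited_links.add() runs, len(visited_links) has already grown, so
# later log lines for the same URL would report a different ordinal than
# the "Visiting" line did. The snapshot keeps them consistent.
visited_links: set[str] = set()

def visit(url: str) -> None:
    index = len(visited_links)  # snapshot once per URL
    print(f"{index}: Visiting {url}")
    visited_links.add(url)
    # later messages reuse the snapshot instead of len(visited_links),
    # which is now one larger
    print(f"{index}: Finished {url}")

visit("http://example.com/")   # 0: Visiting ... / 0: Finished ...
visit("http://example.com/a")  # 1: Visiting ... / 1: Finished ...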
@@ -410,7 +409,9 @@ class WebConnector(LoadConnector):
                 """For websites containing iframes that need to be scraped,
                 the code below can extract text from within these iframes.
                 """
-                logger.info(f"Length of cleaned text {len(parsed_html.cleaned_text)}")
+                logger.debug(
+                    f"{index}: Length of cleaned text {len(parsed_html.cleaned_text)}"
+                )
                 if JAVASCRIPT_DISABLED_MESSAGE in parsed_html.cleaned_text:
                     iframe_count = page.frame_locator("iframe").locator("html").count()
                     if iframe_count > 0:
@@ -427,11 +428,13 @@ class WebConnector(LoadConnector):
                     else:
                         parsed_html.cleaned_text += "\n" + document_text
 
-                # Sometimes pages with #! will server duplicate content
+                # Sometimes pages with #! will serve duplicate content
                 # There are also just other ways this can happen
-                hashed_text = hash(parsed_html.cleaned_text)
+                hashed_text = hash((parsed_html.title, parsed_html.cleaned_text))
                 if hashed_text in content_hashes:
-                    logger.info(f"Skipping duplicate content for {initial_url}")
+                    logger.info(
+                        f"{index}: Skipping duplicate title + content for {initial_url}"
+                    )
                     continue
                 content_hashes.add(hashed_text)
 
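
This hunk widens the dedup key from the page body alone to a (title, cleaned_text) tuple, so two URLs are collapsed only when both the title and the cleaned text match. A self-contained sketch of the rule with made-up page data (the real connector reads these fields off parsed_html):

content_hashes: set[int] = set()
deduped: list[tuple[str, str]] = []

pages = [
    ("Home", "Welcome to the site."),   # served at /
    ("Home", "Welcome to the site."),   # identical content served at /index.html
    ("About", "Who we are."),
]

for title, cleaned_text in pages:
    hashed_text = hash((title, cleaned_text))  # title AND body must both match
    if hashed_text in content_hashes:
        continue  # duplicate of an already-indexed page, skip it
    content_hashes.add(hashed_text)
    deduped.append((title, cleaned_text))

assert len(deduped) == 2  # the /index.html copy of / is dropped

Note that Python's built-in hash() is salted per process for strings, so these hashes are only comparable within a single run; that suits content_hashes, which lives only for the duration of one crawl.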
@@ -142,8 +142,12 @@ def test_web_pruning(reset: None, vespa_client: vespa_fixture) -> None:
     selected_cc_pair = CCPairManager.get_indexing_status_by_id(
         cc_pair_1.id, user_performing_action=admin_user
     )
 
     assert selected_cc_pair is not None, "cc_pair not found after indexing!"
-    assert selected_cc_pair.docs_indexed == 15
+
+    # used to be 15, but now
+    # localhost:8889/ and localhost:8889/index.html are deduped
+    assert selected_cc_pair.docs_indexed == 14
+
     logger.info("Removing about.html.")
     os.remove(os.path.join(website_tgt, "about.html"))
@@ -160,24 +164,29 @@ def test_web_pruning(reset: None, vespa_client: vespa_fixture) -> None:
         cc_pair_1.id, user_performing_action=admin_user
     )
     assert selected_cc_pair is not None, "cc_pair not found after pruning!"
-    assert selected_cc_pair.docs_indexed == 13
+    assert selected_cc_pair.docs_indexed == 12
 
     # check vespa
+    root_id = f"http://{hostname}:{port}/"
     index_id = f"http://{hostname}:{port}/index.html"
     about_id = f"http://{hostname}:{port}/about.html"
     courses_id = f"http://{hostname}:{port}/courses.html"
 
-    doc_ids = [index_id, about_id, courses_id]
+    doc_ids = [root_id, index_id, about_id, courses_id]
     retrieved_docs_dict = vespa_client.get_documents_by_id(doc_ids)["documents"]
     retrieved_docs = {
        doc["fields"]["document_id"]: doc["fields"]
        for doc in retrieved_docs_dict
     }
 
-    # verify index.html exists in Vespa
-    retrieved_doc = retrieved_docs.get(index_id)
+    # verify root exists in Vespa
+    retrieved_doc = retrieved_docs.get(root_id)
     assert retrieved_doc
 
+    # verify index.html does not exist in Vespa since it is a duplicate of root
+    retrieved_doc = retrieved_docs.get(index_id)
+    assert not retrieved_doc
+
     # verify about and courses do not exist
     retrieved_doc = retrieved_docs.get(about_id)
     assert not retrieved_doc
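
The updated assertions key every Vespa lookup by document_id and then check presence or absence. A sketch of that pattern pulled into a reusable helper; the response shape is taken from the test above, and the helper name is made up:

def fetch_docs_by_id(vespa_client, doc_ids: list[str]) -> dict[str, dict]:
    # response shape as used in the test: {"documents": [{"fields": {...}}, ...]}
    documents = vespa_client.get_documents_by_id(doc_ids)["documents"]
    return {doc["fields"]["document_id"]: doc["fields"] for doc in documents}

# Presence/absence checks then read as plain membership tests:
#   retrieved = fetch_docs_by_id(vespa_client, [root_id, index_id])
#   assert root_id in retrieved        # root survives deduping
#   assert index_id not in retrieved   # index.html was collapsed into root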