Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-05-30 09:40:35 +02:00)

fix web connector tests to handle new deduping (#4175)

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>

parent a52d0d29be
commit 9bb8cdfff1
@@ -320,7 +320,8 @@ class WebConnector(LoadConnector):
                 logger.warning(last_error)
                 continue

-            logger.info(f"{len(visited_links)}: Visiting {initial_url}")
+            index = len(visited_links)
+            logger.info(f"{index}: Visiting {initial_url}")

             try:
                 check_internet_connection(initial_url)
@@ -371,12 +372,10 @@ class WebConnector(LoadConnector):
                     initial_url = final_url
                     if initial_url in visited_links:
                         logger.info(
-                            f"{len(visited_links)}: {initial_url} redirected to {final_url} - already indexed"
+                            f"{index}: {initial_url} redirected to {final_url} - already indexed"
                         )
                         continue
-                    logger.info(
-                        f"{len(visited_links)}: {initial_url} redirected to {final_url}"
-                    )
+                    logger.info(f"{index}: {initial_url} redirected to {final_url}")
                     visited_links.add(initial_url)

                 if self.scroll_before_scraping:
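Why the new index variable: visited_links grows mid-iteration once a redirect target is recorded, so repeated len(visited_links) calls could stamp different ordinals onto log lines for the same page visit. Capturing the count once keeps them consistent. A toy illustration, separate from the connector code and with made-up URLs:

    visited_links: set[str] = set()

    initial_url = "http://site.example/"
    final_url = "http://site.example/home"  # pretend the page redirected

    index = len(visited_links)  # captured once: 0
    print(f"{index}: Visiting {initial_url}")

    visited_links.add(final_url)  # the set grows mid-iteration

    # len(visited_links) is now 1, but the captured index still reads 0,
    # so every log line for this visit carries the same ordinal
    print(f"{index}: {initial_url} redirected to {final_url}")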
@@ -410,7 +409,9 @@ class WebConnector(LoadConnector):
                 """For websites containing iframes that need to be scraped,
                 the code below can extract text from within these iframes.
                 """
-                logger.info(f"Length of cleaned text {len(parsed_html.cleaned_text)}")
+                logger.debug(
+                    f"{index}: Length of cleaned text {len(parsed_html.cleaned_text)}"
+                )
                 if JAVASCRIPT_DISABLED_MESSAGE in parsed_html.cleaned_text:
                     iframe_count = page.frame_locator("iframe").locator("html").count()
                     if iframe_count > 0:
@@ -427,11 +428,13 @@ class WebConnector(LoadConnector):
                     else:
                         parsed_html.cleaned_text += "\n" + document_text

-                # Sometimes pages with #! will server duplicate content
+                # Sometimes pages with #! will serve duplicate content
                 # There are also just other ways this can happen
-                hashed_text = hash(parsed_html.cleaned_text)
+                hashed_text = hash((parsed_html.title, parsed_html.cleaned_text))
                 if hashed_text in content_hashes:
-                    logger.info(f"Skipping duplicate content for {initial_url}")
+                    logger.info(
+                        f"{index}: Skipping duplicate title + content for {initial_url}"
+                    )
                     continue
                 content_hashes.add(hashed_text)
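The substantive change is the dedup key: the connector now hashes the (title, cleaned_text) pair instead of the cleaned text alone, so a page is skipped only when both match an already-indexed page, and two URLs serving the same rendered page collapse into one document. A minimal standalone sketch of the idea (the ParsedPage stand-in and URLs are made up for illustration):

    from dataclasses import dataclass


    @dataclass
    class ParsedPage:
        # stand-in for the connector's parsed-HTML result
        title: str
        cleaned_text: str


    pages = {
        "http://localhost:8889/": ParsedPage("Home", "Welcome!"),
        "http://localhost:8889/index.html": ParsedPage("Home", "Welcome!"),  # same page as /
        "http://localhost:8889/about.html": ParsedPage("About", "About us."),
    }

    content_hashes: set[int] = set()
    indexed: list[str] = []
    for url, page in pages.items():
        # hash title and text together, as the commit does
        hashed_text = hash((page.title, page.cleaned_text))
        if hashed_text in content_hashes:
            print(f"Skipping duplicate title + content for {url}")
            continue
        content_hashes.add(hashed_text)
        indexed.append(url)

    # index.html was dropped as a duplicate of /
    assert indexed == ["http://localhost:8889/", "http://localhost:8889/about.html"]

This is why the expected document count in the test below drops from 15 to 14: the fixture site serves identical content at / and /index.html.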
@@ -142,8 +142,12 @@ def test_web_pruning(reset: None, vespa_client: vespa_fixture) -> None:
         selected_cc_pair = CCPairManager.get_indexing_status_by_id(
             cc_pair_1.id, user_performing_action=admin_user
         )

         assert selected_cc_pair is not None, "cc_pair not found after indexing!"
-        assert selected_cc_pair.docs_indexed == 15
+
+        # used to be 15, but now
+        # localhost:8889/ and localhost:8889/index.html are deduped
+        assert selected_cc_pair.docs_indexed == 14

         logger.info("Removing about.html.")
         os.remove(os.path.join(website_tgt, "about.html"))
@@ -160,24 +164,29 @@ def test_web_pruning(reset: None, vespa_client: vespa_fixture) -> None:
             cc_pair_1.id, user_performing_action=admin_user
         )
         assert selected_cc_pair is not None, "cc_pair not found after pruning!"
-        assert selected_cc_pair.docs_indexed == 13
+        assert selected_cc_pair.docs_indexed == 12

         # check vespa
+        root_id = f"http://{hostname}:{port}/"
         index_id = f"http://{hostname}:{port}/index.html"
         about_id = f"http://{hostname}:{port}/about.html"
         courses_id = f"http://{hostname}:{port}/courses.html"

-        doc_ids = [index_id, about_id, courses_id]
+        doc_ids = [root_id, index_id, about_id, courses_id]
         retrieved_docs_dict = vespa_client.get_documents_by_id(doc_ids)["documents"]
         retrieved_docs = {
             doc["fields"]["document_id"]: doc["fields"]
             for doc in retrieved_docs_dict
         }

-        # verify index.html exists in Vespa
-        retrieved_doc = retrieved_docs.get(index_id)
+        # verify root exists in Vespa
+        retrieved_doc = retrieved_docs.get(root_id)
         assert retrieved_doc

+        # verify index.html does not exist in Vespa since it is a duplicate of root
+        retrieved_doc = retrieved_docs.get(index_id)
+        assert not retrieved_doc
+
         # verify about and courses do not exist
         retrieved_doc = retrieved_docs.get(about_id)
         assert not retrieved_doc
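For reference, the updated assertions can be exercised against a stubbed response; the document list below is made up, but the ["documents"] / ["fields"]["document_id"] shape is the one the test consumes:

    # made-up stand-ins for the test's hostname/port and the Vespa response
    hostname, port = "localhost", 8889
    response = {
        "documents": [
            {"fields": {"document_id": f"http://{hostname}:{port}/"}},
        ]
    }

    retrieved_docs = {
        doc["fields"]["document_id"]: doc["fields"] for doc in response["documents"]
    }

    # root survives dedup; index.html never got its own document,
    # and about.html / courses.html were pruned
    assert retrieved_docs.get(f"http://{hostname}:{port}/")
    assert not retrieved_docs.get(f"http://{hostname}:{port}/index.html")
    assert not retrieved_docs.get(f"http://{hostname}:{port}/about.html")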