diff --git a/backend/onyx/connectors/web/connector.py b/backend/onyx/connectors/web/connector.py
index 2f7f656ce..9380791db 100644
--- a/backend/onyx/connectors/web/connector.py
+++ b/backend/onyx/connectors/web/connector.py
@@ -320,7 +320,8 @@ class WebConnector(LoadConnector):
                 logger.warning(last_error)
                 continue
 
-            logger.info(f"{len(visited_links)}: Visiting {initial_url}")
+            index = len(visited_links)
+            logger.info(f"{index}: Visiting {initial_url}")
 
             try:
                 check_internet_connection(initial_url)
@@ -371,12 +372,10 @@ class WebConnector(LoadConnector):
                     initial_url = final_url
                     if initial_url in visited_links:
                         logger.info(
-                            f"{len(visited_links)}: {initial_url} redirected to {final_url} - already indexed"
+                            f"{index}: {initial_url} redirected to {final_url} - already indexed"
                         )
                         continue
-                    logger.info(
-                        f"{len(visited_links)}: {initial_url} redirected to {final_url}"
-                    )
+                    logger.info(f"{index}: {initial_url} redirected to {final_url}")
                     visited_links.add(initial_url)
 
                 if self.scroll_before_scraping:
@@ -410,7 +409,9 @@ class WebConnector(LoadConnector):
                 """For websites containing iframes that need to be scraped,
                 the code below can extract text from within these iframes.
                 """
-                logger.info(f"Length of cleaned text {len(parsed_html.cleaned_text)}")
+                logger.debug(
+                    f"{index}: Length of cleaned text {len(parsed_html.cleaned_text)}"
+                )
                 if JAVASCRIPT_DISABLED_MESSAGE in parsed_html.cleaned_text:
                     iframe_count = page.frame_locator("iframe").locator("html").count()
                     if iframe_count > 0:
@@ -427,11 +428,13 @@ class WebConnector(LoadConnector):
                         else:
                             parsed_html.cleaned_text += "\n" + document_text
 
-            # Sometimes pages with #! will server duplicate content
+            # Sometimes pages with #! will serve duplicate content
             # There are also just other ways this can happen
-            hashed_text = hash(parsed_html.cleaned_text)
+            hashed_text = hash((parsed_html.title, parsed_html.cleaned_text))
             if hashed_text in content_hashes:
-                logger.info(f"Skipping duplicate content for {initial_url}")
+                logger.info(
+                    f"{index}: Skipping duplicate title + content for {initial_url}"
+                )
                 continue
             content_hashes.add(hashed_text)
 
diff --git a/backend/tests/integration/tests/pruning/test_pruning.py b/backend/tests/integration/tests/pruning/test_pruning.py
index 96153db94..997c84cad 100644
--- a/backend/tests/integration/tests/pruning/test_pruning.py
+++ b/backend/tests/integration/tests/pruning/test_pruning.py
@@ -142,8 +142,12 @@ def test_web_pruning(reset: None, vespa_client: vespa_fixture) -> None:
         selected_cc_pair = CCPairManager.get_indexing_status_by_id(
             cc_pair_1.id, user_performing_action=admin_user
         )
+
         assert selected_cc_pair is not None, "cc_pair not found after indexing!"
-        assert selected_cc_pair.docs_indexed == 15
+
+        # used to be 15, but now
+        # localhost:8889/ and localhost:8889/index.html are deduped
+        assert selected_cc_pair.docs_indexed == 14
 
         logger.info("Removing about.html.")
         os.remove(os.path.join(website_tgt, "about.html"))
@@ -160,24 +164,29 @@ def test_web_pruning(reset: None, vespa_client: vespa_fixture) -> None:
             cc_pair_1.id, user_performing_action=admin_user
         )
         assert selected_cc_pair is not None, "cc_pair not found after pruning!"
-        assert selected_cc_pair.docs_indexed == 13
+        assert selected_cc_pair.docs_indexed == 12
 
         # check vespa
+        root_id = f"http://{hostname}:{port}/"
         index_id = f"http://{hostname}:{port}/index.html"
         about_id = f"http://{hostname}:{port}/about.html"
         courses_id = f"http://{hostname}:{port}/courses.html"
 
-        doc_ids = [index_id, about_id, courses_id]
+        doc_ids = [root_id, index_id, about_id, courses_id]
         retrieved_docs_dict = vespa_client.get_documents_by_id(doc_ids)["documents"]
 
         retrieved_docs = {
            doc["fields"]["document_id"]: doc["fields"]
            for doc in retrieved_docs_dict
        }
-        # verify index.html exists in Vespa
-        retrieved_doc = retrieved_docs.get(index_id)
+        # verify root exists in Vespa
+        retrieved_doc = retrieved_docs.get(root_id)
         assert retrieved_doc
 
+        # verify index.html does not exist in Vespa since it is a duplicate of root
+        retrieved_doc = retrieved_docs.get(index_id)
+        assert not retrieved_doc
+
         # verify about and courses do not exist
         retrieved_doc = retrieved_docs.get(about_id)
         assert not retrieved_doc
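Note: for reviewers who want the dedup behavior in isolation, here is a minimal, self-contained sketch of the technique this diff adopts: skip a crawled page when the hash of its (title, cleaned text) pair has already been seen. `CrawledPage` and `dedupe_pages` are hypothetical names used only for illustration; the real logic lives inline in `WebConnector.load_from_state`.

# Illustrative sketch only -- `CrawledPage` and `dedupe_pages` are
# hypothetical names; the actual logic is inline in WebConnector.load_from_state.
from dataclasses import dataclass


@dataclass
class CrawledPage:
    url: str
    title: str | None
    cleaned_text: str


def dedupe_pages(pages: list[CrawledPage]) -> list[CrawledPage]:
    """Keep the first page seen for each (title, cleaned_text) pair.

    Hashing the tuple rather than cleaned_text alone means two pages with
    identical bodies but different titles are still treated as distinct.
    """
    content_hashes: set[int] = set()
    unique_pages: list[CrawledPage] = []
    for page in pages:
        hashed = hash((page.title, page.cleaned_text))
        if hashed in content_hashes:
            # e.g. http://host/ and http://host/index.html serving the
            # same document -> only the first occurrence is kept
            continue
        content_hashes.add(hashed)
        unique_pages.append(page)
    return unique_pages

This is what the updated test expectations above reflect: with the root URL and /index.html now hashing identically, the crawl yields 14 documents instead of 15, and 12 instead of 13 after pruning.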