fix web connector tests to handle new deduping (#4175)

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
rkuo-danswer 2025-03-03 12:54:20 -08:00 committed by GitHub
parent a52d0d29be
commit 9bb8cdfff1
2 changed files with 26 additions and 14 deletions


@@ -320,7 +320,8 @@ class WebConnector(LoadConnector):
                 logger.warning(last_error)
                 continue

-            logger.info(f"{len(visited_links)}: Visiting {initial_url}")
+            index = len(visited_links)
+            logger.info(f"{index}: Visiting {initial_url}")

             try:
                 check_internet_connection(initial_url)
@@ -371,12 +372,10 @@ class WebConnector(LoadConnector):
                     initial_url = final_url
                     if initial_url in visited_links:
                         logger.info(
-                            f"{len(visited_links)}: {initial_url} redirected to {final_url} - already indexed"
+                            f"{index}: {initial_url} redirected to {final_url} - already indexed"
                         )
                         continue

-                    logger.info(
-                        f"{len(visited_links)}: {initial_url} redirected to {final_url}"
-                    )
+                    logger.info(f"{index}: {initial_url} redirected to {final_url}")
                     visited_links.add(initial_url)

             if self.scroll_before_scraping:
@@ -410,7 +409,9 @@ class WebConnector(LoadConnector):
                 """For websites containing iframes that need to be scraped,
                 the code below can extract text from within these iframes.
                 """
-                logger.info(f"Length of cleaned text {len(parsed_html.cleaned_text)}")
+                logger.debug(
+                    f"{index}: Length of cleaned text {len(parsed_html.cleaned_text)}"
+                )
                 if JAVASCRIPT_DISABLED_MESSAGE in parsed_html.cleaned_text:
                     iframe_count = page.frame_locator("iframe").locator("html").count()
                     if iframe_count > 0:
@@ -427,11 +428,13 @@ class WebConnector(LoadConnector):
                 else:
                     parsed_html.cleaned_text += "\n" + document_text

-            # Sometimes pages with #! will server duplicate content
+            # Sometimes pages with #! will serve duplicate content
             # There are also just other ways this can happen
-            hashed_text = hash(parsed_html.cleaned_text)
+            hashed_text = hash((parsed_html.title, parsed_html.cleaned_text))
             if hashed_text in content_hashes:
-                logger.info(f"Skipping duplicate content for {initial_url}")
+                logger.info(
+                    f"{index}: Skipping duplicate title + content for {initial_url}"
+                )
                 continue

             content_hashes.add(hashed_text)
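The deduping this test fix accommodates keys each scraped page on a hash of its (title, cleaned text) pair and skips any page whose hash has already been seen. Below is a minimal standalone sketch of that idea; the Page dataclass, dedupe_pages helper, and sample data are illustrative stand-ins, not code from the connector:

from dataclasses import dataclass


@dataclass
class Page:
    # hypothetical stand-in for the connector's parsed_html result
    url: str
    title: str
    cleaned_text: str


def dedupe_pages(pages: list[Page]) -> list[Page]:
    """Keep only the first page seen for each (title, cleaned_text) pair."""
    content_hashes: set[int] = set()
    unique_pages: list[Page] = []
    for index, page in enumerate(pages):
        hashed_text = hash((page.title, page.cleaned_text))
        if hashed_text in content_hashes:
            print(f"{index}: Skipping duplicate title + content for {page.url}")
            continue
        content_hashes.add(hashed_text)
        unique_pages.append(page)
    return unique_pages


if __name__ == "__main__":
    # the root URL and /index.html serve identical content,
    # so only the first of the pair survives deduping
    pages = [
        Page("http://localhost:8889/", "Home", "Welcome to the demo site."),
        Page("http://localhost:8889/index.html", "Home", "Welcome to the demo site."),
        Page("http://localhost:8889/courses.html", "Courses", "Course catalog."),
    ]
    assert len(dedupe_pages(pages)) == 2

This is the same reason the integration test below now expects one fewer indexed document: the fixture site's root page and index.html collapse to a single document.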


@@ -142,8 +142,12 @@ def test_web_pruning(reset: None, vespa_client: vespa_fixture) -> None:
     selected_cc_pair = CCPairManager.get_indexing_status_by_id(
         cc_pair_1.id, user_performing_action=admin_user
     )
+
     assert selected_cc_pair is not None, "cc_pair not found after indexing!"
-    assert selected_cc_pair.docs_indexed == 15
+
+    # used to be 15, but now
+    # localhost:8889/ and localhost:8889/index.html are deduped
+    assert selected_cc_pair.docs_indexed == 14

     logger.info("Removing about.html.")
     os.remove(os.path.join(website_tgt, "about.html"))
@@ -160,24 +164,29 @@ def test_web_pruning(reset: None, vespa_client: vespa_fixture) -> None:
         cc_pair_1.id, user_performing_action=admin_user
     )
     assert selected_cc_pair is not None, "cc_pair not found after pruning!"
-    assert selected_cc_pair.docs_indexed == 13
+    assert selected_cc_pair.docs_indexed == 12

     # check vespa
+    root_id = f"http://{hostname}:{port}/"
     index_id = f"http://{hostname}:{port}/index.html"
     about_id = f"http://{hostname}:{port}/about.html"
     courses_id = f"http://{hostname}:{port}/courses.html"

-    doc_ids = [index_id, about_id, courses_id]
+    doc_ids = [root_id, index_id, about_id, courses_id]
     retrieved_docs_dict = vespa_client.get_documents_by_id(doc_ids)["documents"]
     retrieved_docs = {
         doc["fields"]["document_id"]: doc["fields"]
         for doc in retrieved_docs_dict
     }

-    # verify index.html exists in Vespa
-    retrieved_doc = retrieved_docs.get(index_id)
+    # verify root exists in Vespa
+    retrieved_doc = retrieved_docs.get(root_id)
     assert retrieved_doc

+    # verify index.html does not exist in Vespa since it is a duplicate of root
+    retrieved_doc = retrieved_docs.get(index_id)
+    assert not retrieved_doc
+
     # verify about and courses do not exist
     retrieved_doc = retrieved_docs.get(about_id)
     assert not retrieved_doc