DAN-139 web connector recursion (#126)

Now handled by checking final page after redirects
2025-09-19 03:58:30 +02:00 · 2023-07-02 19:01:13 -07:00
parent 24e61a646d
commit 381b3719c9
1 changed files with 16 additions and 8 deletions
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@@ -30,8 +30,8 @@ def is_valid_url(url: str) -> bool:
 def get_internal_links(
    base_url: str, url: str, soup: BeautifulSoup, should_ignore_pound: bool = True
-) -> list[str]:
+) -> set[str]:
-    internal_links = []
+    internal_links = set()
    for link in cast(list[dict[str, Any]], soup.find_all("a")):
        href = cast(str | None, link.get("href"))
        if not href:
@@ -47,7 +47,7 @@ def get_internal_links(
            href = urljoin(url, href)
        if urlparse(href).netloc == urlparse(url).netloc and base_url in href:
-            internal_links.append(href)
+            internal_links.add(href)
    return internal_links
@@ -74,11 +74,6 @@ class WebConnector(LoadConnector):
        to_visit: list[str] = [self.base_url]
        doc_batch: list[Document] = []
        # Edge case handling user provides HTML without terminating slash (prevents duplicate)
        # Most sites either redirect no slash to slash or serve same content
        if self.base_url[-1] != "/":
            visited_links.add(self.base_url + "/")
        restart_playwright = True
        while to_visit:
            current_url = to_visit.pop()
@@ -86,6 +81,8 @@ class WebConnector(LoadConnector):
                continue
            visited_links.add(current_url)
            logger.info(f"Indexing {current_url}")
            try:
                if restart_playwright:
                    playwright = sync_playwright().start()
@@ -114,6 +111,15 @@ class WebConnector(LoadConnector):
                page = context.new_page()
                page.goto(current_url)
                final_page = page.url
                if final_page != current_url:
                    logger.info(f"Redirected to {final_page}")
                    current_url = final_page
                    if current_url in visited_links:
                        logger.info(f"Redirected page already indexed")
                        continue
                    visited_links.add(current_url)
                content = page.content()
                soup = BeautifulSoup(content, "html.parser")
@@ -161,6 +167,8 @@ class WebConnector(LoadConnector):
                page.close()
            except Exception as e:
                logger.error(f"Failed to fetch '{current_url}': {e}")
                playwright.stop()
                restart_playwright = True
                continue
            if len(doc_batch) >= self.batch_size: