mirror of https://github.com/danswer-ai/danswer.git (synced 2025-04-17 08:12:28 +02:00)
commit fe31324337
parent a3b2941747

replace usp
@@ -1,39 +1,78 @@
-from datetime import datetime
-from urllib import robotparser
+import re
+import xml.etree.ElementTree as ET
+from typing import List
+from typing import Set
+from urllib.parse import urljoin
 
-from usp.tree import sitemap_tree_for_homepage  # type: ignore
+import requests
 
 from danswer.utils.logger import setup_logger
 
 logger = setup_logger()
 
 
-def test_url(rp: robotparser.RobotFileParser | None, url: str) -> bool:
-    if not rp:
-        return True
-    else:
-        return rp.can_fetch("*", url)
-
-
-def init_robots_txt(site: str) -> robotparser.RobotFileParser:
-    ts = datetime.now().timestamp()
-    robots_url = f"{site}/robots.txt?ts={ts}"
-    rp = robotparser.RobotFileParser()
-    rp.set_url(robots_url)
-    rp.read()
-    return rp
-
-
-def list_pages_for_site(site: str) -> list[str]:
-    rp: robotparser.RobotFileParser | None = None
+def _get_sitemap_locations_from_robots(base_url: str) -> Set[str]:
+    """Extract sitemap URLs from robots.txt"""
+    sitemap_urls = set()
     try:
-        rp = init_robots_txt(site)
-    except Exception:
-        logger.warning("Failed to load robots.txt")
+        robots_url = urljoin(base_url, "/robots.txt")
+        resp = requests.get(robots_url, timeout=10)
+        if resp.status_code == 200:
+            for line in resp.text.splitlines():
+                if line.lower().startswith("sitemap:"):
+                    sitemap_url = line.split(":", 1)[1].strip()
+                    sitemap_urls.add(sitemap_url)
+    except Exception as e:
+        logger.warning(f"Error fetching robots.txt: {e}")
+    return sitemap_urls
 
-    tree = sitemap_tree_for_homepage(site)
 
-    pages = [page.url for page in tree.all_pages() if test_url(rp, page.url)]
-    pages = list(dict.fromkeys(pages))
+def _extract_urls_from_sitemap(sitemap_url: str) -> Set[str]:
+    """Extract URLs from a sitemap XML file"""
+    urls = set()
+    try:
+        resp = requests.get(sitemap_url, timeout=10)
+        if resp.status_code != 200:
+            return urls
 
-    return pages
+        root = ET.fromstring(resp.content)
+
+        # Handle both regular sitemaps and sitemap indexes
+        # Remove namespace for easier parsing
+        namespace = re.match(r"\{.*\}", root.tag)
+        ns = namespace.group(0) if namespace else ""
+
+        if root.tag == f"{ns}sitemapindex":
+            # This is a sitemap index
+            for sitemap in root.findall(f".//{ns}loc"):
+                sub_urls = _extract_urls_from_sitemap(sitemap.text)
+                urls.update(sub_urls)
+        else:
+            # This is a regular sitemap
+            for url in root.findall(f".//{ns}loc"):
+                if url.text:
+                    urls.add(url.text)
+
+    except Exception as e:
+        logger.warning(f"Error processing sitemap {sitemap_url}: {e}")
+
+    return urls
+
+
+def list_pages_for_site(site: str) -> List[str]:
+    """Get list of pages from a site's sitemaps"""
+    site = site.rstrip("/")
+    all_urls = set()
+
+    # Try both common sitemap locations
+    sitemap_paths = ["/sitemap.xml", "/sitemap_index.xml"]
+    for path in sitemap_paths:
+        sitemap_url = urljoin(site, path)
+        all_urls.update(_extract_urls_from_sitemap(sitemap_url))
+
+    # Check robots.txt for additional sitemaps
+    sitemap_locations = _get_sitemap_locations_from_robots(site)
+    for sitemap_url in sitemap_locations:
+        all_urls.update(_extract_urls_from_sitemap(sitemap_url))
+
+    return list(all_urls)
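Side note on the XML handling added above, as a minimal self-contained sketch that is not part of the commit: ElementTree reports a namespaced root tag as "{uri}tag", so _extract_urls_from_sitemap captures that "{...}" prefix once and reuses it both to find loc elements and to tell a sitemap index apart from a regular urlset. The sample XML payloads and the helper name classify_and_extract below are invented for illustration; the parsing logic mirrors the diff and runs offline.

import re
import xml.etree.ElementTree as ET

# Invented sample documents, for illustration only.
URLSET_XML = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/docs/</loc></url>
  <url><loc>https://example.com/blog/</loc></url>
</urlset>"""

INDEX_XML = b"""<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap><loc>https://example.com/sitemap-pages.xml</loc></sitemap>
</sitemapindex>"""


def classify_and_extract(content: bytes) -> tuple[str, list[str]]:
    # Same namespace trick as the diff: root.tag looks like "{uri}urlset",
    # so capture the "{uri}" prefix and prepend it when searching for <loc>.
    root = ET.fromstring(content)
    namespace = re.match(r"\{.*\}", root.tag)
    ns = namespace.group(0) if namespace else ""
    kind = "sitemapindex" if root.tag == f"{ns}sitemapindex" else "urlset"
    locs = [loc.text for loc in root.findall(f".//{ns}loc") if loc.text]
    return kind, locs


print(classify_and_extract(URLSET_XML))
# ('urlset', ['https://example.com/docs/', 'https://example.com/blog/'])
print(classify_and_extract(INDEX_XML))
# ('sitemapindex', ['https://example.com/sitemap-pages.xml'])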
@@ -78,7 +78,6 @@ asana==5.0.8
 zenpy==2.0.41
 dropbox==11.36.2
 boto3-stubs[s3]==1.34.133
-ultimate_sitemap_parser==0.5
 stripe==10.12.0
 urllib3==2.2.3
 mistune==0.8.4
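With ultimate_sitemap_parser dropped from the requirements, robots.txt is no longer used to filter individual pages via robotparser.can_fetch; it is only scanned for "Sitemap:" directives. A small self-contained sketch of that line parsing, with an invented robots.txt body for illustration:

# Invented robots.txt body, for illustration only.
ROBOTS_TXT = """\
User-agent: *
Disallow: /admin/
Sitemap: https://example.com/sitemap.xml
sitemap: https://example.com/sitemap-news.xml
"""

sitemap_urls = set()
for line in ROBOTS_TXT.splitlines():
    # Case-insensitive match, mirroring _get_sitemap_locations_from_robots.
    if line.lower().startswith("sitemap:"):
        # Split on the first ':' only, since the URL itself contains colons.
        sitemap_urls.add(line.split(":", 1)[1].strip())

print(sorted(sitemap_urls))
# ['https://example.com/sitemap-news.xml', 'https://example.com/sitemap.xml']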