Try to find the sitemap for a given site (#1538)
@@ -29,6 +29,7 @@ from danswer.connectors.models import Section
 from danswer.file_processing.extract_file_text import pdf_to_text
 from danswer.file_processing.html_utils import web_html_cleanup
 from danswer.utils.logger import setup_logger
+from danswer.utils.sitemap import list_pages_for_site

 logger = setup_logger()

@@ -145,16 +146,21 @@ def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
     response.raise_for_status()

     soup = BeautifulSoup(response.content, "html.parser")
-    result = [
+    urls = [
         _ensure_absolute_url(sitemap_url, loc_tag.text)
         for loc_tag in soup.find_all("loc")
     ]
-    if not result:
+
+    if len(urls) == 0 and len(soup.find_all("urlset")) == 0:
+        # the given url doesn't look like a sitemap, let's try to find one
+        urls = list_pages_for_site(sitemap_url)
+
+    if len(urls) == 0:
         raise ValueError(
             f"No URLs found in sitemap {sitemap_url}. Try using the 'single' or 'recursive' scraping options instead."
         )

-    return result
+    return urls


 def _ensure_absolute_url(source_url: str, maybe_relative_url: str) -> str:
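In effect, extract_urls_from_sitemap now accepts either a sitemap URL or a plain site root. A minimal usage sketch of the two call patterns, with the import path and example URLs as illustrative assumptions rather than part of the diff:

    # Hedged sketch: module path and URLs are assumptions for illustration.
    from danswer.connectors.web.connector import extract_urls_from_sitemap

    # A real sitemap: <loc> entries are parsed and returned directly.
    sitemap_urls = extract_urls_from_sitemap("https://example.com/sitemap.xml")

    # A bare site root: no <loc> or <urlset> is found, so the function falls
    # back to list_pages_for_site() to discover a sitemap for the site.
    homepage_urls = extract_urls_from_sitemap("https://example.com")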
backend/danswer/utils/sitemap.py (new file)
from urllib import robotparser
from usp.tree import sitemap_tree_for_homepage
from datetime import datetime

from danswer.utils.logger import setup_logger

logger = setup_logger()


def test_url(rp, url):
    # If robots.txt could not be loaded, allow every URL.
    if not rp:
        return True
    return rp.can_fetch("*", url)


def init_robots_txt(site):
    # The timestamp query parameter acts as a cache buster for robots.txt.
    ts = datetime.now().timestamp()
    robots_url = f"{site}/robots.txt?ts={ts}"
    rp = robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp


def list_pages_for_site(site):
    rp = None
    try:
        rp = init_robots_txt(site)
    except Exception:
        logger.warning("Failed to load robots.txt")

    tree = sitemap_tree_for_homepage(site)

    # Keep only URLs allowed by robots.txt, then de-duplicate preserving order.
    pages = [page.url for page in tree.all_pages() if test_url(rp, page.url)]
    pages = list(dict.fromkeys(pages))

    return pages
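For reference, a minimal way to exercise the new helper on its own; the target URL is a placeholder:

    from danswer.utils.sitemap import list_pages_for_site

    # Discovers the site's sitemap and returns de-duplicated page URLs,
    # honoring robots.txt when it can be fetched.
    pages = list_pages_for_site("https://example.com")
    print(f"found {len(pages)} pages")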
@@ -72,4 +72,5 @@ zulip==0.8.2
 hubspot-api-client==8.1.0
 zenpy==2.0.41
 dropbox==11.36.2
 boto3-stubs[s3]==1.34.133
+ultimate_sitemap_parser==0.5
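The added backend dependency is ultimate_sitemap_parser (imported as usp), which performs the actual sitemap discovery used by sitemap.py above. A small sketch of the underlying call, with a placeholder homepage:

    from usp.tree import sitemap_tree_for_homepage

    # Looks for sitemaps referenced in robots.txt and at common locations,
    # then parses the whole sitemap tree.
    tree = sitemap_tree_for_homepage("https://example.com")
    for page in tree.all_pages():
        print(page.url)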
@@ -98,7 +98,7 @@ export default function Web() {
           name: "Sitemap",
           value: "sitemap",
           description:
-            "Assumes the URL to Index points to a Sitemap. Will try and index all pages that are a mentioned in the sitemap.",
+            "Enter the sitemap url or the root of the site which we can scan for a sitemap",
         },
       ]}
     />