Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-10-10 13:15:18 +02:00)
Try to find the sitemap for a given site (#1538)
@@ -29,6 +29,7 @@ from danswer.connectors.models import Section
 from danswer.file_processing.extract_file_text import pdf_to_text
 from danswer.file_processing.html_utils import web_html_cleanup
 from danswer.utils.logger import setup_logger
+from danswer.utils.sitemap import list_pages_for_site
 
 logger = setup_logger()
 
@@ -145,16 +146,21 @@ def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
     response.raise_for_status()
 
     soup = BeautifulSoup(response.content, "html.parser")
-    result = [
+    urls = [
         _ensure_absolute_url(sitemap_url, loc_tag.text)
         for loc_tag in soup.find_all("loc")
     ]
-    if not result:
+
+    if len(urls) == 0 and len(soup.find_all("urlset")) == 0:
+        # the given url doesn't look like a sitemap, let's try to find one
+        urls = list_pages_for_site(sitemap_url)
+
+    if len(urls) == 0:
         raise ValueError(
             f"No URLs found in sitemap {sitemap_url}. Try using the 'single' or 'recursive' scraping options instead."
         )
 
-    return result
+    return urls
 
 
 def _ensure_absolute_url(source_url: str, maybe_relative_url: str) -> str:
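With this change, extract_urls_from_sitemap first parses the fetched document for <loc> tags; only when none are found and the document also contains no <urlset> element does it fall back to list_pages_for_site to discover a sitemap on its own. A minimal usage sketch, assuming the function lives in the web connector module (the import path below is not shown in the diff):

```python
# Illustrative usage only -- the import path is an assumption, not shown in the diff.
from danswer.connectors.web.connector import extract_urls_from_sitemap

# A real sitemap: <loc> entries are parsed straight out of the XML.
urls = extract_urls_from_sitemap("https://example.com/sitemap.xml")

# A plain homepage: no <loc> or <urlset> is found, so the new fallback calls
# list_pages_for_site() to discover a sitemap for the site itself.
urls = extract_urls_from_sitemap("https://example.com/")

print(f"{len(urls)} URLs discovered")
```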
backend/danswer/utils/sitemap.py (new file, 35 lines)
@@ -0,0 +1,35 @@
+from urllib import robotparser
+from usp.tree import sitemap_tree_for_homepage
+from datetime import datetime
+from danswer.utils.logger import setup_logger
+
+logger = setup_logger()
+
+def test_url(rp, url):
+    if not rp:
+        return True
+    else:
+        return rp.can_fetch("*", url)
+
+def init_robots_txt(site):
+    ts = datetime.now().timestamp()
+    robots_url = f"{url}/robots.txt?ts={ts}"
+    rp = robotparser.RobotFileParser()
+    rp.set_url(robots_url)
+    rp.read()
+    return rp
+
+def list_pages_for_site(site):
+    rp = None
+    try:
+        rp = init_robots_txt(site)
+    except:
+        logger.warning("Failed to load robots.txt")
+
+    tree = sitemap_tree_for_homepage(site)
+
+    pages = [page.url for page in tree.all_pages() if test_url(rp, page)]
+    pages = list(dict.fromkeys(pages))
+
+    return(pages)
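Two details of the new helper are worth noting. First, init_robots_txt builds robots_url from a name url that is never defined (its parameter is site), so the call raises NameError, which the bare except in list_pages_for_site swallows, meaning robots.txt is effectively never consulted. Second, test_url(rp, page) passes the whole SitemapPage object where can_fetch expects a URL string. A corrected sketch under those assumptions, not the committed code:

```python
# Corrected sketch of the helper above (not the committed code). It assumes the
# intent is to build the robots.txt URL from the `site` argument and to pass
# page.url (a string) to can_fetch().
from datetime import datetime
from urllib import robotparser

from usp.tree import sitemap_tree_for_homepage

from danswer.utils.logger import setup_logger

logger = setup_logger()


def test_url(rp: robotparser.RobotFileParser | None, url: str) -> bool:
    # With no robots.txt available, allow everything.
    if rp is None:
        return True
    return rp.can_fetch("*", url)


def init_robots_txt(site: str) -> robotparser.RobotFileParser:
    # Cache-bust the robots.txt fetch with a timestamp query parameter.
    ts = datetime.now().timestamp()
    robots_url = f"{site}/robots.txt?ts={ts}"  # `site`, not the undefined `url`
    rp = robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp


def list_pages_for_site(site: str) -> list[str]:
    rp = None
    try:
        rp = init_robots_txt(site)
    except Exception:  # avoid a bare except so real bugs still surface in logs
        logger.warning("Failed to load robots.txt")

    tree = sitemap_tree_for_homepage(site)

    # Pass page.url (a string) to the robots.txt check, then de-duplicate
    # while preserving order.
    pages = [page.url for page in tree.all_pages() if test_url(rp, page.url)]
    return list(dict.fromkeys(pages))
```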
@@ -73,3 +73,4 @@ hubspot-api-client==8.1.0
 zenpy==2.0.41
 dropbox==11.36.2
 boto3-stubs[s3]==1.34.133
+ultimate_sitemap_parser==0.5
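ultimate_sitemap_parser is the package that provides the usp module imported by backend/danswer/utils/sitemap.py. A minimal sketch of the API surface used there (example.com is a placeholder):

```python
# Minimal sketch of the ultimate_sitemap_parser API used by the new helper.
from usp.tree import sitemap_tree_for_homepage

tree = sitemap_tree_for_homepage("https://example.com/")  # placeholder URL
for page in tree.all_pages():
    print(page.url)
```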
@@ -98,7 +98,7 @@ export default function Web() {
             name: "Sitemap",
             value: "sitemap",
             description:
-              "Assumes the URL to Index points to a Sitemap. Will try and index all pages that are a mentioned in the sitemap.",
+              "Enter the sitemap url or the root of the site which we can scan for a sitemap",
           },
         ]}
       />