# danswer/backend/onyx/utils/sitemap.py

import re
import xml.etree.ElementTree as ET
from typing import Set
from urllib.parse import urljoin

import requests

from onyx.utils.logger import setup_logger

logger = setup_logger()


def _get_sitemap_locations_from_robots(base_url: str) -> Set[str]:
    """Extract sitemap URLs from robots.txt.

    Fetches ``/robots.txt`` resolved against ``base_url`` and collects the
    value of every ``Sitemap:`` directive. The directive name is matched
    case-insensitively; indentation and trailing ``# comments`` are ignored,
    and directives with an empty value are skipped.

    Args:
        base_url: URL of the site whose robots.txt should be inspected.

    Returns:
        The set of sitemap URLs declared in robots.txt. The set is empty when
        the file cannot be fetched, does not answer with HTTP 200, or declares
        no sitemaps. Errors are logged as warnings and never raised.
    """
    sitemap_urls: Set[str] = set()
    try:
        robots_url = urljoin(base_url, "/robots.txt")
        resp = requests.get(robots_url, timeout=10)
        if resp.status_code != 200:
            return sitemap_urls

        for raw_line in resp.text.splitlines():
            # robots.txt allows "#" comments and leading whitespace; a URL
            # fragment is never sent to the server, so cutting at "#" cannot
            # change which resource gets fetched.
            line = raw_line.split("#", 1)[0].strip()
            directive, separator, value = line.partition(":")
            if not separator or directive.strip().lower() != "sitemap":
                continue

            sitemap_url = value.strip()
            # A bare "Sitemap:" line carries no URL; adding "" would only
            # trigger a failing request later on.
            if sitemap_url:
                sitemap_urls.add(sitemap_url)
    except Exception as e:
        # Best effort: a missing or unreachable robots.txt is not fatal.
        logger.warning(f"Error fetching robots.txt: {e}")
    return sitemap_urls


def _extract_urls_from_sitemap(sitemap_url: str) -> Set[str]:
    """Extract page URLs from a sitemap XML file.

    Handles both regular sitemaps and sitemap indexes. When a document's root
    element is ``sitemapindex``, every ``<loc>`` it contains is treated as
    another sitemap and processed in turn; otherwise every ``<loc>`` is
    collected as a page URL.

    Each sitemap URL is fetched at most once per call, so sitemap indexes that
    reference themselves (or each other) cannot cause unbounded fetching.

    Args:
        sitemap_url: URL of the sitemap or sitemap index to start from.

    Returns:
        The set of page URLs found, with surrounding whitespace removed.
        Sitemaps that fail to download, do not answer with HTTP 200, or fail
        to parse contribute nothing; failures are logged as warnings and
        never raised.
    """
    urls: Set[str] = set()
    visited: Set[str] = set()
    # Explicit worklist instead of recursion: deeply nested or cyclic sitemap
    # indexes cannot exhaust the interpreter's recursion limit.
    pending = [sitemap_url]

    while pending:
        current_url = pending.pop()
        if current_url in visited:
            continue
        visited.add(current_url)

        try:
            resp = requests.get(current_url, timeout=10)
            if resp.status_code != 200:
                continue

            # NOTE(review): this parses XML from a remote, untrusted server.
            # The standard library documents xml.etree.ElementTree as not
            # secure against maliciously constructed data; consider a
            # hardened parser if that is a concern here.
            root = ET.fromstring(resp.content)

            # ElementTree prefixes tags with "{namespace-uri}"; reuse the
            # root's prefix so lookups work with or without a namespace.
            namespace = re.match(r"\{.*\}", root.tag)
            ns = namespace.group(0) if namespace else ""

            # <loc> values are frequently pretty-printed with surrounding
            # whitespace or newlines, so strip before using them.
            locations = [
                loc.text.strip()
                for loc in root.findall(f".//{ns}loc")
                if loc.text and loc.text.strip()
            ]

            if root.tag == f"{ns}sitemapindex":
                # This is a sitemap index: its entries are further sitemaps
                pending.extend(locations)
            else:
                # This is a regular sitemap: its entries are page URLs
                urls.update(locations)

        except Exception as e:
            # Best effort: one broken sitemap must not abort the others.
            logger.warning(f"Error processing sitemap {current_url}: {e}")

    return urls


def list_pages_for_site(site: str) -> list[str]:
    """Get list of pages from a site's sitemaps.

    Looks at the two conventional sitemap locations (``/sitemap.xml`` and
    ``/sitemap_index.xml``) and then at any additional sitemaps declared in
    the site's robots.txt. Because the conventional paths start with ``/``,
    ``urljoin`` resolves them against the root of ``site``'s host, not against
    any path component of ``site``.

    Args:
        site: URL of the site; trailing slashes are ignored.

    Returns:
        The de-duplicated page URLs found across all sitemaps, in no
        particular order. Empty if no sitemap could be read.
    """
    site = site.rstrip("/")
    all_urls: set[str] = set()

    # Try both common sitemap locations
    default_sitemaps = [
        urljoin(site, path) for path in ("/sitemap.xml", "/sitemap_index.xml")
    ]
    for sitemap_url in default_sitemaps:
        all_urls.update(_extract_urls_from_sitemap(sitemap_url))

    # Check robots.txt for additional sitemaps. robots.txt very often lists
    # the default location as well; skip those to avoid downloading and
    # parsing the same sitemap twice.
    already_processed = set(default_sitemaps)
    for sitemap_url in _get_sitemap_locations_from_robots(site):
        if sitemap_url in already_processed:
            continue
        already_processed.add(sitemap_url)
        all_urls.update(_extract_urls_from_sitemap(sitemap_url))

    return list(all_urls)