import re
import xml.etree.ElementTree as ET
from typing import Set
from urllib.parse import urljoin

import requests

from onyx.utils.logger import setup_logger

logger = setup_logger()


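# Illustrative note (not part of the original module; the domain is a placeholder):
# robots.txt files advertise sitemaps with lines such as
#   Sitemap: https://example.com/sitemap.xml
# The helper below collects every URL following a "Sitemap:" prefix, splitting on
# the first ":" only, so the "https://" scheme inside the URL stays intact.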
def _get_sitemap_locations_from_robots(base_url: str) -> Set[str]:
    """Extract sitemap URLs from robots.txt"""
    sitemap_urls: set = set()
    try:
        robots_url = urljoin(base_url, "/robots.txt")
        resp = requests.get(robots_url, timeout=10)
        if resp.status_code == 200:
            for line in resp.text.splitlines():
                if line.lower().startswith("sitemap:"):
                    sitemap_url = line.split(":", 1)[1].strip()
                    sitemap_urls.add(sitemap_url)
    except Exception as e:
        logger.warning(f"Error fetching robots.txt: {e}")
    return sitemap_urls


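# Illustrative sketch of the two sitemap layouts handled below (placeholder URLs,
# not from the original file). A sitemap index nests child sitemaps:
#
#   <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
#     <sitemap><loc>https://example.com/sitemap-pages.xml</loc></sitemap>
#   </sitemapindex>
#
# while a regular sitemap lists page URLs directly:
#
#   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
#     <url><loc>https://example.com/page-1</loc></url>
#   </urlset>
#
# _extract_urls_from_sitemap recurses into the former and collects <loc> text
# from the latter.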
def _extract_urls_from_sitemap(sitemap_url: str) -> Set[str]:
    """Extract URLs from a sitemap XML file"""
    urls: set[str] = set()
    try:
        resp = requests.get(sitemap_url, timeout=10)
        if resp.status_code != 200:
            return urls

        root = ET.fromstring(resp.content)

        # Handle both regular sitemaps and sitemap indexes
        # Capture the XML namespace (if any) so tag comparisons and lookups match
        namespace = re.match(r"\{.*\}", root.tag)
        ns = namespace.group(0) if namespace else ""

        if root.tag == f"{ns}sitemapindex":
            # This is a sitemap index
            for sitemap in root.findall(f".//{ns}loc"):
                if sitemap.text:
                    sub_urls = _extract_urls_from_sitemap(sitemap.text)
                    urls.update(sub_urls)
        else:
            # This is a regular sitemap
            for url in root.findall(f".//{ns}loc"):
                if url.text:
                    urls.add(url.text)

    except Exception as e:
        logger.warning(f"Error processing sitemap {sitemap_url}: {e}")

    return urls


def list_pages_for_site(site: str) -> list[str]:
    """Get list of pages from a site's sitemaps"""
    site = site.rstrip("/")
    all_urls = set()

    # Try both common sitemap locations
    sitemap_paths = ["/sitemap.xml", "/sitemap_index.xml"]
    for path in sitemap_paths:
        sitemap_url = urljoin(site, path)
        all_urls.update(_extract_urls_from_sitemap(sitemap_url))

    # Check robots.txt for additional sitemaps
    sitemap_locations = _get_sitemap_locations_from_robots(site)
    for sitemap_url in sitemap_locations:
        all_urls.update(_extract_urls_from_sitemap(sitemap_url))

    return list(all_urls)
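

# Minimal usage sketch (not part of the original module; "https://example.com" is a
# placeholder domain). Checks the common sitemap locations plus any sitemaps listed
# in robots.txt, then prints every page URL that was discovered.
if __name__ == "__main__":
    pages = list_pages_for_site("https://example.com")
    for page in pages:
        print(page)
    logger.info(f"Discovered {len(pages)} pages")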