mirror of https://github.com/danswer-ai/danswer.git (synced 2025-04-17 08:12:28 +02:00)
commit fe31324337
parent a3b2941747

replace usp
@@ -1,39 +1,78 @@
-from datetime import datetime
-from urllib import robotparser
+import re
+import xml.etree.ElementTree as ET
+from typing import List
+from typing import Set
+from urllib.parse import urljoin
 
-from usp.tree import sitemap_tree_for_homepage  # type: ignore
+import requests
 
 from danswer.utils.logger import setup_logger
 
 logger = setup_logger()
 
 
-def test_url(rp: robotparser.RobotFileParser | None, url: str) -> bool:
-    if not rp:
-        return True
-    else:
-        return rp.can_fetch("*", url)
-
-
-def init_robots_txt(site: str) -> robotparser.RobotFileParser:
-    ts = datetime.now().timestamp()
-    robots_url = f"{site}/robots.txt?ts={ts}"
-    rp = robotparser.RobotFileParser()
-    rp.set_url(robots_url)
-    rp.read()
-    return rp
-
-
-def list_pages_for_site(site: str) -> list[str]:
-    rp: robotparser.RobotFileParser | None = None
+def _get_sitemap_locations_from_robots(base_url: str) -> Set[str]:
+    """Extract sitemap URLs from robots.txt"""
+    sitemap_urls = set()
     try:
-        rp = init_robots_txt(site)
-    except Exception:
-        logger.warning("Failed to load robots.txt")
+        robots_url = urljoin(base_url, "/robots.txt")
+        resp = requests.get(robots_url, timeout=10)
+        if resp.status_code == 200:
+            for line in resp.text.splitlines():
+                if line.lower().startswith("sitemap:"):
+                    sitemap_url = line.split(":", 1)[1].strip()
+                    sitemap_urls.add(sitemap_url)
+    except Exception as e:
+        logger.warning(f"Error fetching robots.txt: {e}")
+    return sitemap_urls
 
-    tree = sitemap_tree_for_homepage(site)
 
-    pages = [page.url for page in tree.all_pages() if test_url(rp, page.url)]
-    pages = list(dict.fromkeys(pages))
+def _extract_urls_from_sitemap(sitemap_url: str) -> Set[str]:
+    """Extract URLs from a sitemap XML file"""
+    urls = set()
+    try:
+        resp = requests.get(sitemap_url, timeout=10)
+        if resp.status_code != 200:
+            return urls
 
-    return pages
+        root = ET.fromstring(resp.content)
+
+        # Handle both regular sitemaps and sitemap indexes
+        # Remove namespace for easier parsing
+        namespace = re.match(r"\{.*\}", root.tag)
+        ns = namespace.group(0) if namespace else ""
+
+        if root.tag == f"{ns}sitemapindex":
+            # This is a sitemap index
+            for sitemap in root.findall(f".//{ns}loc"):
+                sub_urls = _extract_urls_from_sitemap(sitemap.text)
+                urls.update(sub_urls)
+        else:
+            # This is a regular sitemap
+            for url in root.findall(f".//{ns}loc"):
+                if url.text:
+                    urls.add(url.text)
+
+    except Exception as e:
+        logger.warning(f"Error processing sitemap {sitemap_url}: {e}")
+
+    return urls
+
+
+def list_pages_for_site(site: str) -> List[str]:
+    """Get list of pages from a site's sitemaps"""
+    site = site.rstrip("/")
+    all_urls = set()
+
+    # Try both common sitemap locations
+    sitemap_paths = ["/sitemap.xml", "/sitemap_index.xml"]
+    for path in sitemap_paths:
+        sitemap_url = urljoin(site, path)
+        all_urls.update(_extract_urls_from_sitemap(sitemap_url))
+
+    # Check robots.txt for additional sitemaps
+    sitemap_locations = _get_sitemap_locations_from_robots(site)
+    for sitemap_url in sitemap_locations:
+        all_urls.update(_extract_urls_from_sitemap(sitemap_url))
+
+    return list(all_urls)
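Side note on the XML handling added above, as a minimal self-contained sketch that is not part of the commit: ElementTree reports a namespaced root tag as "{uri}tag", so _extract_urls_from_sitemap captures that "{...}" prefix once and reuses it both to find loc elements and to tell a sitemap index apart from a regular urlset. The sample XML payloads and the helper name classify_and_extract below are invented for illustration; the parsing logic mirrors the diff and runs offline.

import re
import xml.etree.ElementTree as ET

# Invented sample documents, for illustration only.
URLSET_XML = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/docs/</loc></url>
  <url><loc>https://example.com/blog/</loc></url>
</urlset>"""

INDEX_XML = b"""<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap><loc>https://example.com/sitemap-pages.xml</loc></sitemap>
</sitemapindex>"""


def classify_and_extract(content: bytes) -> tuple[str, list[str]]:
    # Same namespace trick as the diff: root.tag looks like "{uri}urlset",
    # so capture the "{uri}" prefix and prepend it when searching for <loc>.
    root = ET.fromstring(content)
    namespace = re.match(r"\{.*\}", root.tag)
    ns = namespace.group(0) if namespace else ""
    kind = "sitemapindex" if root.tag == f"{ns}sitemapindex" else "urlset"
    locs = [loc.text for loc in root.findall(f".//{ns}loc") if loc.text]
    return kind, locs


print(classify_and_extract(URLSET_XML))
# ('urlset', ['https://example.com/docs/', 'https://example.com/blog/'])
print(classify_and_extract(INDEX_XML))
# ('sitemapindex', ['https://example.com/sitemap-pages.xml'])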
@@ -78,7 +78,6 @@ asana==5.0.8
 zenpy==2.0.41
 dropbox==11.36.2
 boto3-stubs[s3]==1.34.133
-ultimate_sitemap_parser==0.5
 stripe==10.12.0
 urllib3==2.2.3
 mistune==0.8.4
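With ultimate_sitemap_parser dropped from the requirements, robots.txt is no longer used to filter individual pages via robotparser.can_fetch; it is only scanned for "Sitemap:" directives. A small self-contained sketch of that line parsing, with an invented robots.txt body for illustration:

# Invented robots.txt body, for illustration only.
ROBOTS_TXT = """\
User-agent: *
Disallow: /admin/
Sitemap: https://example.com/sitemap.xml
sitemap: https://example.com/sitemap-news.xml
"""

sitemap_urls = set()
for line in ROBOTS_TXT.splitlines():
    # Case-insensitive match, mirroring _get_sitemap_locations_from_robots.
    if line.lower().startswith("sitemap:"):
        # Split on the first ':' only, since the URL itself contains colons.
        sitemap_urls.add(line.split(":", 1)[1].strip())

print(sorted(sitemap_urls))
# ['https://example.com/sitemap-news.xml', 'https://example.com/sitemap.xml']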