Fix Sitemap Robo (#1826)

Yuhong Sun
2024-07-14 13:29:26 -07:00
committed by GitHub
parent 1b311d092e
commit 56b175f597


@@ -1,35 +1,39 @@
-from urllib import robotparser
-from usp.tree import sitemap_tree_for_homepage
+from datetime import datetime
+from urllib import robotparser
+from usp.tree import sitemap_tree_for_homepage  # type: ignore
 from danswer.utils.logger import setup_logger
 
 logger = setup_logger()
 
 
-def test_url(rp, url):
+def test_url(rp: robotparser.RobotFileParser | None, url: str) -> bool:
     if not rp:
         return True
     else:
         return rp.can_fetch("*", url)
 
 
-def init_robots_txt(site):
+def init_robots_txt(site: str) -> robotparser.RobotFileParser:
     ts = datetime.now().timestamp()
-    robots_url = f"{url}/robots.txt?ts={ts}"
+    robots_url = f"{site}/robots.txt?ts={ts}"
     rp = robotparser.RobotFileParser()
     rp.set_url(robots_url)
     rp.read()
     return rp
 
 
-def list_pages_for_site(site):
-    rp = None
-    try:
-        rp = init_robots_txt(site)
-    except:
-        logger.warning("Failed to load robots.txt")
-    tree = sitemap_tree_for_homepage(site)
-    pages = [page.url for page in tree.all_pages() if test_url(rp, page)]
-    pages = list(dict.fromkeys(pages))
-    return(pages)
+def list_pages_for_site(site: str) -> list[str]:
+    rp: robotparser.RobotFileParser | None = None
+    try:
+        rp = init_robots_txt(site)
+    except Exception:
+        logger.warning("Failed to load robots.txt")
+
+    tree = sitemap_tree_for_homepage(site)
+    pages = [page.url for page in tree.all_pages() if test_url(rp, page.url)]
+    pages = list(dict.fromkeys(pages))
+
+    return pages
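
For context: the old init_robots_txt interpolated an undefined name (url, while the parameter was site), and the diff also adds the missing datetime import, so the function raised on every call; the bare except in list_pages_for_site then swallowed the error and robots.txt was never actually enforced. The listing code also passed the usp page object, rather than page.url, to test_url. A minimal, self-contained sketch of the corrected flow, using only the standard library (the python.org URL is illustrative, not part of the commit):

from datetime import datetime
from urllib import robotparser


def init_robots_txt(site: str) -> robotparser.RobotFileParser:
    # Append a timestamp query string so a stale cached robots.txt is
    # bypassed, as the patched code does.
    ts = datetime.now().timestamp()
    rp = robotparser.RobotFileParser()
    rp.set_url(f"{site}/robots.txt?ts={ts}")
    rp.read()
    return rp


def test_url(rp: robotparser.RobotFileParser | None, url: str) -> bool:
    # Fail open: if robots.txt could not be loaded, allow the URL.
    return True if rp is None else rp.can_fetch("*", url)


if __name__ == "__main__":
    rp = None
    try:
        rp = init_robots_txt("https://www.python.org")
    except Exception:
        print("Failed to load robots.txt")
    # Pass the URL string, not a page object.
    print(test_url(rp, "https://www.python.org/about/"))

The list(dict.fromkeys(pages)) step, unchanged by the commit, is the standard order-preserving de-duplication idiom: dict keys are unique and, since Python 3.7, insertion-ordered, so list(dict.fromkeys(["a", "b", "a"])) returns ["a", "b"].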