Mirror of https://github.com/danswer-ai/danswer.git, synced 2025-10-02 17:38:04 +02:00
Fix Sitemap Robo (#1826)
@@ -1,35 +1,39 @@
-from urllib import robotparser
-from usp.tree import sitemap_tree_for_homepage
 from datetime import datetime
+from urllib import robotparser
+
+from usp.tree import sitemap_tree_for_homepage  # type: ignore
 
 from danswer.utils.logger import setup_logger
 
 logger = setup_logger()
 
-def test_url(rp, url):
+
+def test_url(rp: robotparser.RobotFileParser | None, url: str) -> bool:
     if not rp:
         return True
     else:
         return rp.can_fetch("*", url)
 
-def init_robots_txt(site):
+
+def init_robots_txt(site: str) -> robotparser.RobotFileParser:
     ts = datetime.now().timestamp()
-    robots_url = f"{url}/robots.txt?ts={ts}"
+    robots_url = f"{site}/robots.txt?ts={ts}"
     rp = robotparser.RobotFileParser()
     rp.set_url(robots_url)
     rp.read()
     return rp
 
-def list_pages_for_site(site):
-    rp = None
+
+def list_pages_for_site(site: str) -> list[str]:
+    rp: robotparser.RobotFileParser | None = None
     try:
         rp = init_robots_txt(site)
-    except:
+    except Exception:
         logger.warning("Failed to load robots.txt")
 
     tree = sitemap_tree_for_homepage(site)
 
-    pages = [page.url for page in tree.all_pages() if test_url(rp, page)]
+    pages = [page.url for page in tree.all_pages() if test_url(rp, page.url)]
     pages = list(dict.fromkeys(pages))
 
-    return(pages)
+    return pages
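For context, a minimal standalone sketch of the robots.txt gating logic this commit repairs. It uses only the Python standard library; the example.com URL is a placeholder for illustration, not anything from the Danswer codebase. The pre-fix code passed the sitemap page object itself to test_url, while RobotFileParser.can_fetch expects a URL string, which is why the fixed list comprehension unwraps page.url.

from urllib import robotparser

site = "https://example.com"  # placeholder site, assumed for illustration

# Build the parser roughly the way init_robots_txt does (minus the
# cache-busting ?ts= query string the commit's helper appends).
rp = robotparser.RobotFileParser()
rp.set_url(f"{site}/robots.txt")
rp.read()

# can_fetch() takes a user-agent string and a URL string; the fix ensures a
# string (page.url) is passed here rather than the sitemap page object.
print(rp.can_fetch("*", f"{site}/some/page"))

If robots.txt cannot be fetched at all, list_pages_for_site leaves rp as None and test_url then allows every URL, matching the fail-open behavior visible in the diff.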