Try to find the sitemap for a given site (#1538)

This commit is contained in:
Zoltan Szabo
2024-07-14 23:24:10 +03:00
committed by GitHub
parent 6ee1292757
commit 1b311d092e
4 changed files with 47 additions and 5 deletions

View File

@@ -29,6 +29,7 @@ from danswer.connectors.models import Section
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.file_processing.html_utils import web_html_cleanup
from danswer.utils.logger import setup_logger
from danswer.utils.sitemap import list_pages_for_site
logger = setup_logger()
@@ -145,16 +146,21 @@ def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
result = [
urls = [
_ensure_absolute_url(sitemap_url, loc_tag.text)
for loc_tag in soup.find_all("loc")
]
if not result:
if len(urls) == 0 and len(soup.find_all("urlset")) == 0:
# the given url doesn't look like a sitemap, let's try to find one
urls = list_pages_for_site(sitemap_url)
if len(urls) == 0:
raise ValueError(
f"No URLs found in sitemap {sitemap_url}. Try using the 'single' or 'recursive' scraping options instead."
)
return result
return urls
def _ensure_absolute_url(source_url: str, maybe_relative_url: str) -> str:

View File

@@ -0,0 +1,35 @@
from urllib import robotparser
from usp.tree import sitemap_tree_for_homepage
from datetime import datetime
from danswer.utils.logger import setup_logger
logger = setup_logger()
def test_url(rp, url):
if not rp:
return True
else:
return rp.can_fetch("*", url)
def init_robots_txt(site):
    """Fetch and parse robots.txt for *site*, returning a RobotFileParser.

    A timestamp query parameter is appended to bust intermediate caches so
    the current rules are always read. Raises on network/parse failure;
    the caller is expected to handle that.
    """
    ts = datetime.now().timestamp()
    # BUG FIX: the original referenced an undefined name `url` (NameError);
    # the function parameter is `site`.
    robots_url = f"{site}/robots.txt?ts={ts}"
    rp = robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()  # performs the network fetch
    return rp
def list_pages_for_site(site):
    """Discover page URLs for *site* via its sitemap tree.

    robots.txt rules are applied when they can be loaded; a failure to
    load robots.txt is logged and treated as "allow everything" so the
    crawl stays best-effort. Returns the URLs deduplicated with their
    first-seen order preserved.
    """
    rp = None
    try:
        rp = init_robots_txt(site)
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit). Missing robots.txt is non-fatal.
        logger.warning("Failed to load robots.txt")

    tree = sitemap_tree_for_homepage(site)
    # BUG FIX: the robots check must receive the URL string, not the
    # sitemap page object the original passed (`test_url(rp, page)`).
    pages = [page.url for page in tree.all_pages() if test_url(rp, page.url)]
    # Deduplicate while preserving first-seen order.
    return list(dict.fromkeys(pages))

View File

@@ -72,4 +72,5 @@ zulip==0.8.2
hubspot-api-client==8.1.0
zenpy==2.0.41
dropbox==11.36.2
boto3-stubs[s3]==1.34.133
boto3-stubs[s3]==1.34.133
ultimate_sitemap_parser==0.5

View File

@@ -98,7 +98,7 @@ export default function Web() {
name: "Sitemap",
value: "sitemap",
description:
"Assumes the URL to Index points to a Sitemap. Will try and index all pages that are a mentioned in the sitemap.",
"Enter the sitemap URL, or the root URL of the site, which we will scan for a sitemap",
},
]}
/>