Try to find the sitemap for a given site (#1538)
@@ -29,6 +29,7 @@ from danswer.connectors.models import Section
 from danswer.file_processing.extract_file_text import pdf_to_text
 from danswer.file_processing.html_utils import web_html_cleanup
 from danswer.utils.logger import setup_logger
+from danswer.utils.sitemap import list_pages_for_site

 logger = setup_logger()

@@ -145,16 +146,21 @@ def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
     response.raise_for_status()

     soup = BeautifulSoup(response.content, "html.parser")
-    result = [
+    urls = [
         _ensure_absolute_url(sitemap_url, loc_tag.text)
         for loc_tag in soup.find_all("loc")
     ]
-    if not result:
+
+    if len(urls) == 0 and len(soup.find_all("urlset")) == 0:
+        # the given url doesn't look like a sitemap, let's try to find one
+        urls = list_pages_for_site(sitemap_url)
+
+    if len(urls) == 0:
         raise ValueError(
             f"No URLs found in sitemap {sitemap_url}. Try using the 'single' or 'recursive' scraping options instead."
         )

-    return result
+    return urls


 def _ensure_absolute_url(source_url: str, maybe_relative_url: str) -> str:
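In effect, extract_urls_from_sitemap now accepts either a sitemap URL or a plain site root. A minimal usage sketch of the two call patterns, with the import path and example URLs as illustrative assumptions rather than part of the diff:

    # Hedged sketch: module path and URLs are assumptions for illustration.
    from danswer.connectors.web.connector import extract_urls_from_sitemap

    # A real sitemap: <loc> entries are parsed and returned directly.
    sitemap_urls = extract_urls_from_sitemap("https://example.com/sitemap.xml")

    # A bare site root: no <loc> or <urlset> is found, so the function falls
    # back to list_pages_for_site() to discover a sitemap for the site.
    homepage_urls = extract_urls_from_sitemap("https://example.com")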
backend/danswer/utils/sitemap.py (new file)
from urllib import robotparser
from usp.tree import sitemap_tree_for_homepage
from datetime import datetime

from danswer.utils.logger import setup_logger

logger = setup_logger()


def test_url(rp, url):
    # If robots.txt could not be loaded, allow every URL.
    if not rp:
        return True
    return rp.can_fetch("*", url)


def init_robots_txt(site):
    # The timestamp query parameter acts as a cache buster for robots.txt.
    ts = datetime.now().timestamp()
    robots_url = f"{site}/robots.txt?ts={ts}"
    rp = robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp


def list_pages_for_site(site):
    rp = None
    try:
        rp = init_robots_txt(site)
    except Exception:
        logger.warning("Failed to load robots.txt")

    tree = sitemap_tree_for_homepage(site)

    # Keep only URLs allowed by robots.txt, then de-duplicate preserving order.
    pages = [page.url for page in tree.all_pages() if test_url(rp, page.url)]
    pages = list(dict.fromkeys(pages))

    return pages
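For reference, a minimal way to exercise the new helper on its own; the target URL is a placeholder:

    from danswer.utils.sitemap import list_pages_for_site

    # Discovers the site's sitemap and returns de-duplicated page URLs,
    # honoring robots.txt when it can be fetched.
    pages = list_pages_for_site("https://example.com")
    print(f"found {len(pages)} pages")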
@@ -72,4 +72,5 @@ zulip==0.8.2
 hubspot-api-client==8.1.0
 zenpy==2.0.41
 dropbox==11.36.2
 boto3-stubs[s3]==1.34.133
+ultimate_sitemap_parser==0.5
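The added backend dependency is ultimate_sitemap_parser (imported as usp), which performs the actual sitemap discovery used by sitemap.py above. A small sketch of the underlying call, with a placeholder homepage:

    from usp.tree import sitemap_tree_for_homepage

    # Looks for sitemaps referenced in robots.txt and at common locations,
    # then parses the whole sitemap tree.
    tree = sitemap_tree_for_homepage("https://example.com")
    for page in tree.all_pages():
        print(page.url)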
@@ -98,7 +98,7 @@ export default function Web() {
           name: "Sitemap",
           value: "sitemap",
           description:
-            "Assumes the URL to Index points to a Sitemap. Will try and index all pages that are a mentioned in the sitemap.",
+            "Enter the sitemap url or the root of the site which we can scan for a sitemap",
         },
       ]}
     />