Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-10-10 13:15:18 +02:00)
Try to find the sitemap for a given site (#1538)
@@ -29,6 +29,7 @@ from danswer.connectors.models import Section
 from danswer.file_processing.extract_file_text import pdf_to_text
 from danswer.file_processing.html_utils import web_html_cleanup
 from danswer.utils.logger import setup_logger
+from danswer.utils.sitemap import list_pages_for_site
 
 logger = setup_logger()
 
@@ -145,16 +146,21 @@ def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
     response.raise_for_status()
 
     soup = BeautifulSoup(response.content, "html.parser")
-    result = [
+    urls = [
         _ensure_absolute_url(sitemap_url, loc_tag.text)
         for loc_tag in soup.find_all("loc")
     ]
-    if not result:
+
+    if len(urls) == 0 and len(soup.find_all("urlset")) == 0:
+        # the given url doesn't look like a sitemap, let's try to find one
+        urls = list_pages_for_site(sitemap_url)
+
+    if len(urls) == 0:
         raise ValueError(
             f"No URLs found in sitemap {sitemap_url}. Try using the 'single' or 'recursive' scraping options instead."
         )
 
-    return result
+    return urls
 
 
 def _ensure_absolute_url(source_url: str, maybe_relative_url: str) -> str:
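With this change, extract_urls_from_sitemap first parses the fetched document for <loc> tags; only when none are found and the document also contains no <urlset> element does it fall back to list_pages_for_site to discover a sitemap on its own. A minimal usage sketch, assuming the function lives in the web connector module (the import path below is not shown in the diff):

```python
# Illustrative usage only -- the import path is an assumption, not shown in the diff.
from danswer.connectors.web.connector import extract_urls_from_sitemap

# A real sitemap: <loc> entries are parsed straight out of the XML.
urls = extract_urls_from_sitemap("https://example.com/sitemap.xml")

# A plain homepage: no <loc> or <urlset> is found, so the new fallback calls
# list_pages_for_site() to discover a sitemap for the site itself.
urls = extract_urls_from_sitemap("https://example.com/")

print(f"{len(urls)} URLs discovered")
```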
backend/danswer/utils/sitemap.py (new file, 35 lines)
@@ -0,0 +1,35 @@
+from urllib import robotparser
+from usp.tree import sitemap_tree_for_homepage
+from datetime import datetime
+from danswer.utils.logger import setup_logger
+
+logger = setup_logger()
+
+def test_url(rp, url):
+    if not rp:
+        return True
+    else:
+        return rp.can_fetch("*", url)
+
+def init_robots_txt(site):
+    ts = datetime.now().timestamp()
+    robots_url = f"{url}/robots.txt?ts={ts}"
+    rp = robotparser.RobotFileParser()
+    rp.set_url(robots_url)
+    rp.read()
+    return rp
+
+def list_pages_for_site(site):
+    rp = None
+    try:
+        rp = init_robots_txt(site)
+    except:
+        logger.warning("Failed to load robots.txt")
+
+    tree = sitemap_tree_for_homepage(site)
+
+    pages = [page.url for page in tree.all_pages() if test_url(rp, page)]
+    pages = list(dict.fromkeys(pages))
+
+    return(pages)
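Two details of the new helper are worth noting. First, init_robots_txt builds robots_url from a name url that is never defined (its parameter is site), so the call raises NameError, which the bare except in list_pages_for_site swallows, meaning robots.txt is effectively never consulted. Second, test_url(rp, page) passes the whole SitemapPage object where can_fetch expects a URL string. A corrected sketch under those assumptions, not the committed code:

```python
# Corrected sketch of the helper above (not the committed code). It assumes the
# intent is to build the robots.txt URL from the `site` argument and to pass
# page.url (a string) to can_fetch().
from datetime import datetime
from urllib import robotparser

from usp.tree import sitemap_tree_for_homepage

from danswer.utils.logger import setup_logger

logger = setup_logger()


def test_url(rp: robotparser.RobotFileParser | None, url: str) -> bool:
    # With no robots.txt available, allow everything.
    if rp is None:
        return True
    return rp.can_fetch("*", url)


def init_robots_txt(site: str) -> robotparser.RobotFileParser:
    # Cache-bust the robots.txt fetch with a timestamp query parameter.
    ts = datetime.now().timestamp()
    robots_url = f"{site}/robots.txt?ts={ts}"  # `site`, not the undefined `url`
    rp = robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp


def list_pages_for_site(site: str) -> list[str]:
    rp = None
    try:
        rp = init_robots_txt(site)
    except Exception:  # avoid a bare except so real bugs still surface in logs
        logger.warning("Failed to load robots.txt")

    tree = sitemap_tree_for_homepage(site)

    # Pass page.url (a string) to the robots.txt check, then de-duplicate
    # while preserving order.
    pages = [page.url for page in tree.all_pages() if test_url(rp, page.url)]
    return list(dict.fromkeys(pages))
```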
@@ -73,3 +73,4 @@ hubspot-api-client==8.1.0
 zenpy==2.0.41
 dropbox==11.36.2
 boto3-stubs[s3]==1.34.133
+ultimate_sitemap_parser==0.5
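ultimate_sitemap_parser is the package that provides the usp module imported by backend/danswer/utils/sitemap.py. A minimal sketch of the API surface used there (example.com is a placeholder):

```python
# Minimal sketch of the ultimate_sitemap_parser API used by the new helper.
from usp.tree import sitemap_tree_for_homepage

tree = sitemap_tree_for_homepage("https://example.com/")  # placeholder URL
for page in tree.all_pages():
    print(page.url)
```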
@@ -98,7 +98,7 @@ export default function Web() {
             name: "Sitemap",
             value: "sitemap",
             description:
-              "Assumes the URL to Index points to a Sitemap. Will try and index all pages that are a mentioned in the sitemap.",
+              "Enter the sitemap url or the root of the site which we can scan for a sitemap",
           },
         ]}
       />