diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py
index 140162a7b04b..6c66dc785606 100644
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@@ -29,6 +29,7 @@ from danswer.connectors.models import Section
 from danswer.file_processing.extract_file_text import pdf_to_text
 from danswer.file_processing.html_utils import web_html_cleanup
 from danswer.utils.logger import setup_logger
+from danswer.utils.sitemap import list_pages_for_site
 
 logger = setup_logger()
 
@@ -145,16 +146,21 @@ def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
     response.raise_for_status()
 
     soup = BeautifulSoup(response.content, "html.parser")
-    result = [
+    urls = [
         _ensure_absolute_url(sitemap_url, loc_tag.text)
         for loc_tag in soup.find_all("loc")
     ]
-    if not result:
+
+    if len(urls) == 0 and len(soup.find_all("urlset")) == 0:
+        # The given URL doesn't look like a sitemap; try to discover one for the site
+        urls = list_pages_for_site(sitemap_url)
+
+    if len(urls) == 0:
         raise ValueError(
             f"No URLs found in sitemap {sitemap_url}. Try using the 'single' or 'recursive' scraping options instead."
         )
 
-    return result
+    return urls
 
 
 def _ensure_absolute_url(source_url: str, maybe_relative_url: str) -> str:
diff --git a/backend/danswer/utils/sitemap.py b/backend/danswer/utils/sitemap.py
new file mode 100644
index 000000000000..3b518688cdee
--- /dev/null
+++ b/backend/danswer/utils/sitemap.py
@@ -0,0 +1,41 @@
+from datetime import datetime
+from urllib import robotparser
+
+from usp.tree import sitemap_tree_for_homepage
+
+from danswer.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def test_url(rp: robotparser.RobotFileParser | None, url: str) -> bool:
+    # If robots.txt could not be loaded, assume the URL is allowed
+    if not rp:
+        return True
+    return rp.can_fetch("*", url)
+
+
+def init_robots_txt(site: str) -> robotparser.RobotFileParser:
+    # Add a timestamp query param to avoid a cached robots.txt
+    ts = datetime.now().timestamp()
+    robots_url = f"{site}/robots.txt?ts={ts}"
+    rp = robotparser.RobotFileParser()
+    rp.set_url(robots_url)
+    rp.read()
+    return rp
+
+
+def list_pages_for_site(site: str) -> list[str]:
+    rp = None
+    try:
+        rp = init_robots_txt(site)
+    except Exception:
+        logger.warning("Failed to load robots.txt")
+
+    tree = sitemap_tree_for_homepage(site)
+
+    # Keep only pages allowed by robots.txt, de-duplicated while preserving order
+    pages = [page.url for page in tree.all_pages() if test_url(rp, page.url)]
+    pages = list(dict.fromkeys(pages))
+
+    return pages
diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt
index 76f4387aeb9a..fc36432dfd0f 100644
--- a/backend/requirements/default.txt
+++ b/backend/requirements/default.txt
@@ -72,4 +72,5 @@ zulip==0.8.2
 hubspot-api-client==8.1.0
 zenpy==2.0.41
 dropbox==11.36.2
-boto3-stubs[s3]==1.34.133
\ No newline at end of file
+boto3-stubs[s3]==1.34.133
+ultimate_sitemap_parser==0.5
diff --git a/web/src/app/admin/connectors/web/page.tsx b/web/src/app/admin/connectors/web/page.tsx
index 410d187920e7..06e2a000e347 100644
--- a/web/src/app/admin/connectors/web/page.tsx
+++ b/web/src/app/admin/connectors/web/page.tsx
@@ -98,7 +98,7 @@ export default function Web() {
             name: "Sitemap",
             value: "sitemap",
             description:
-              "Assumes the URL to Index points to a Sitemap. Will try and index all pages that are a mentioned in the sitemap.",
+              "Enter a sitemap URL, or the root URL of the site, and we will try to find a sitemap and index the pages it lists.",
           },
         ]}
       />
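---

For reference, a minimal usage sketch of the new helper introduced by this patch (not part of the patch itself; the example URL is hypothetical). It shows how sitemap mode can now start from a site's root URL and fall back to sitemap discovery via ultimate_sitemap_parser:

    from danswer.utils.sitemap import list_pages_for_site

    # Discovers sitemaps for the site (robots.txt references, /sitemap.xml, etc.)
    # via ultimate_sitemap_parser and returns de-duplicated page URLs,
    # filtered against robots.txt when it can be fetched.
    pages = list_pages_for_site("https://example.com")
    for url in pages:
        print(url)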