From e8cfbc1dd8b32a19f360fabf9652503a9365d0b0 Mon Sep 17 00:00:00 2001 From: hagen-danswer Date: Tue, 11 Jun 2024 21:26:44 -0400 Subject: [PATCH] added a check for empty URL list in web connector (#1573) * added a check for empty URL list in web connector * added raise condition for improper sitemap designation --- backend/danswer/connectors/web/connector.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py index 1a0c7e39d..4e83284cf 100644 --- a/backend/danswer/connectors/web/connector.py +++ b/backend/danswer/connectors/web/connector.py @@ -145,10 +145,16 @@ def extract_urls_from_sitemap(sitemap_url: str) -> list[str]: response.raise_for_status() soup = BeautifulSoup(response.content, "html.parser") - return [ + result = [ _ensure_absolute_url(sitemap_url, loc_tag.text) for loc_tag in soup.find_all("loc") ] + if not result: + raise ValueError( + f"No URLs found in sitemap {sitemap_url}. Try using the 'single' or 'recursive' scraping options instead." + ) + + return result def _ensure_absolute_url(source_url: str, maybe_relative_url: str) -> str: @@ -214,6 +220,10 @@ class WebConnector(LoadConnector): and converts them into documents""" visited_links: set[str] = set() to_visit: list[str] = self.to_visit_list + + if not to_visit: + raise ValueError("No URLs to visit") + base_url = to_visit[0] # For the recursive case doc_batch: list[Document] = []