added a check for empty URL list in web connector (#1573)

* added a check for empty URL list in web connector

* added raise condition for improper sitemap designation
This commit is contained in:
hagen-danswer 2024-06-11 21:26:44 -04:00 committed by GitHub
parent 486b0ecb31
commit e8cfbc1dd8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -145,10 +145,16 @@ def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
return [
result = [
_ensure_absolute_url(sitemap_url, loc_tag.text)
for loc_tag in soup.find_all("loc")
]
if not result:
raise ValueError(
f"No URLs found in sitemap {sitemap_url}. Try using the 'single' or 'recursive' scraping options instead."
)
return result
def _ensure_absolute_url(source_url: str, maybe_relative_url: str) -> str:
@@ -214,6 +220,10 @@ class WebConnector(LoadConnector):
and converts them into documents"""
visited_links: set[str] = set()
to_visit: list[str] = self.to_visit_list
if not to_visit:
raise ValueError("No URLs to visit")
base_url = to_visit[0] # For the recursive case
doc_batch: list[Document] = []