mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-03-17 21:32:36 +01:00
added a check for empty URL list in web connector (#1573)
* added a check for empty URL list in web connector
* added raise condition for improper sitemap designation
This commit is contained in:
parent
486b0ecb31
commit
e8cfbc1dd8
@ -145,10 +145,16 @@ def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
|
||||
response.raise_for_status()
|
||||
|
||||
soup = BeautifulSoup(response.content, "html.parser")
|
||||
return [
|
||||
result = [
|
||||
_ensure_absolute_url(sitemap_url, loc_tag.text)
|
||||
for loc_tag in soup.find_all("loc")
|
||||
]
|
||||
if not result:
|
||||
raise ValueError(
|
||||
f"No URLs found in sitemap {sitemap_url}. Try using the 'single' or 'recursive' scraping options instead."
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _ensure_absolute_url(source_url: str, maybe_relative_url: str) -> str:
|
||||
@ -214,6 +220,10 @@ class WebConnector(LoadConnector):
|
||||
and converts them into documents"""
|
||||
visited_links: set[str] = set()
|
||||
to_visit: list[str] = self.to_visit_list
|
||||
|
||||
if not to_visit:
|
||||
raise ValueError("No URLs to visit")
|
||||
|
||||
base_url = to_visit[0] # For the recursive case
|
||||
doc_batch: list[Document] = []
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user