mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-10-10 21:26:01 +02:00
added a check for empty URL list in web connector (#1573)
* added a check for empty URL list in web connector
* added raise condition for improper sitemap designation
This commit is contained in:
@@ -145,10 +145,16 @@ def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
|
|||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
||||||
soup = BeautifulSoup(response.content, "html.parser")
|
soup = BeautifulSoup(response.content, "html.parser")
|
||||||
return [
|
result = [
|
||||||
_ensure_absolute_url(sitemap_url, loc_tag.text)
|
_ensure_absolute_url(sitemap_url, loc_tag.text)
|
||||||
for loc_tag in soup.find_all("loc")
|
for loc_tag in soup.find_all("loc")
|
||||||
]
|
]
|
||||||
|
if not result:
|
||||||
|
raise ValueError(
|
||||||
|
f"No URLs found in sitemap {sitemap_url}. Try using the 'single' or 'recursive' scraping options instead."
|
||||||
|
)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
def _ensure_absolute_url(source_url: str, maybe_relative_url: str) -> str:
|
def _ensure_absolute_url(source_url: str, maybe_relative_url: str) -> str:
|
||||||
@@ -214,6 +220,10 @@ class WebConnector(LoadConnector):
|
|||||||
and converts them into documents"""
|
and converts them into documents"""
|
||||||
visited_links: set[str] = set()
|
visited_links: set[str] = set()
|
||||||
to_visit: list[str] = self.to_visit_list
|
to_visit: list[str] = self.to_visit_list
|
||||||
|
|
||||||
|
if not to_visit:
|
||||||
|
raise ValueError("No URLs to visit")
|
||||||
|
|
||||||
base_url = to_visit[0] # For the recursive case
|
base_url = to_visit[0] # For the recursive case
|
||||||
doc_batch: list[Document] = []
|
doc_batch: list[Document] = []
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user