connector: ensure absolute URL integrity (#1196)

This commit is contained in:
teocns 2024-03-10 03:04:05 -06:00 committed by GitHub
parent f292ede85a
commit 19c7ebdc26
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -100,10 +100,12 @@ def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
urls = [loc_tag.text for loc_tag in soup.find_all("loc")]
return urls
return [_ensure_absolute_url(sitemap_url, loc_tag.text) for loc_tag in soup.find_all("loc")]
def _ensure_absolute_url(source_url:str, maybe_relative_url: str) -> str:
if not urlparse(maybe_relative_url).netloc:
return urljoin(source_url, maybe_relative_url)
return maybe_relative_url
def _ensure_valid_url(url: str) -> str:
if "://" not in url: