fix for url parsing google site

commit d6a84ab413
parent 68160d49dd
Author: Sid Ravinutala
Committer: Chris Weaver
Date: 2023-11-20 19:52:19 +00:00


@@ -1,6 +1,5 @@
 import os
 import re
-import urllib.parse
 from typing import Any
 from typing import cast
@@ -21,47 +20,31 @@ from danswer.utils.logger import setup_logger
 logger = setup_logger()
 
 
-def process_link(element: BeautifulSoup | Tag) -> str | None:
-    href = cast(str | None, element.get("href"))
-    if not href:
-        return None
-
-    # cleanup href
-    href = urllib.parse.unquote(href)
-    href = href.lower()
-    if href.endswith(".html"):
-        href = href[:-5]
-    href = href.replace("_", "")
-    href = re.sub(
-        r"([\s-]+)", "-", href
-    )  # replace all whitespace/'-' groups with a single '-'
-    return href
+def a_tag_text_to_path(atag: Tag) -> str:
+    page_path = atag.text.strip().lower()
+    page_path = re.sub(r"[^a-zA-Z0-9\s]", "", page_path)
+    page_path = "-".join(page_path.split())
+    return page_path
 
 
 def find_google_sites_page_path_from_navbar(
-    element: BeautifulSoup | Tag, path: str, is_initial: bool
+    element: BeautifulSoup | Tag, path: str, depth: int
 ) -> str | None:
-    ul = cast(Tag | None, element.find("ul"))
-    if ul:
-        if not is_initial:
-            a = cast(Tag, element.find("a"))
-            href = process_link(a)
-            new_path = f"{path}/{href}"
-            if href and a.get("aria-selected") == "true":
-                return new_path
-        else:
-            new_path = ""
-        for li in ul.find_all("li", recursive=False):
-            found_link = find_google_sites_page_path_from_navbar(li, new_path, False)
-            if found_link:
-                return found_link
-    else:
-        a = cast(Tag, element.find("a"))
-        if a:
-            href = process_link(a)
-            if href and a.get("aria-selected") == "true":
-                return path + "/" + href
+    lis = cast(
+        list[Tag],
+        element.find_all("li", attrs={"data-nav-level": f"{depth}"}),
+    )
+    for li in lis:
+        a = cast(Tag, li.find("a"))
+        if a.get("aria-selected") == "true":
+            return f"{path}/{a_tag_text_to_path(a)}"
+        elif a.get("aria-expanded") == "true":
+            sub_path = find_google_sites_page_path_from_navbar(
+                element, f"{path}/{a_tag_text_to_path(a)}", depth + 1
+            )
+            if sub_path:
+                return sub_path
 
     return None
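
The new helper derives the page slug from the anchor's visible text rather than its href (the old process_link unquoted and normalized the href, which per the commit title was breaking URL parsing for Google Sites). Below is a minimal sketch, not part of the commit, assuming a_tag_text_to_path from this diff is in scope along with its imports; the page titles are invented:

    from bs4 import BeautifulSoup

    # Invented titles; a_tag_text_to_path is the helper added above.
    for title in ["My Page", "Q&A / Help", "Café 2024"]:
        atag = BeautifulSoup(f"<a>{title}</a>", "html.parser").a
        print(a_tag_text_to_path(atag))
    # my-page
    # qa-help
    # caf-2024

Note the character filter is ASCII-only, so accented letters are dropped rather than transliterated.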
@@ -85,6 +68,7 @@ class GoogleSitesConnector(LoadConnector):
         # load the HTML files
         files = load_files_from_zip(self.zip_path)
 
+        count = 0
         for file_info, file_io in files:
             # skip non-published files
             if "/PUBLISHED/" not in file_info.filename:
@@ -100,13 +84,15 @@ class GoogleSitesConnector(LoadConnector):
             # get the link out of the navbar
             header = cast(Tag, soup.find("header"))
             nav = cast(Tag, header.find("nav"))
-            path = find_google_sites_page_path_from_navbar(nav, "", True)
+            path = find_google_sites_page_path_from_navbar(nav, "", 1)
             if not path:
+                count += 1
                 logger.error(
                     f"Could not find path for '{file_info.filename}'. "
-                    + "This page will not have a working link."
+                    + "This page will not have a working link.\n\n"
+                    + f"# of broken links so far - {count}"
                 )
-
+            logger.info(f"Path to page: {path}")
             # cleanup the hidden `Skip to main content` and `Skip to navigation` that
             # appears at the top of every page
             for div in soup.find_all("div", attrs={"data-is-touch-wrapper": "true"}):
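
For reference, a sketch of the traversal as this call site uses it, again not from the commit. The navbar HTML is invented to mirror the flat structure the new code expects: each <li> carries its nesting depth in data-nav-level, and the recursion re-scans the same element one level deeper instead of descending into the <li>. It assumes find_google_sites_page_path_from_navbar from this diff is importable:

    from bs4 import BeautifulSoup

    # Invented export snippet: nav entries as sibling <li> elements
    # tagged with their depth via data-nav-level.
    html = """
    <header><nav>
      <li data-nav-level="1"><a aria-expanded="true">Getting Started</a></li>
      <li data-nav-level="2"><a aria-selected="true">FAQ &amp; Tips!</a></li>
    </nav></header>
    """
    soup = BeautifulSoup(html, "html.parser")
    nav = soup.find("nav")
    print(find_google_sites_page_path_from_navbar(nav, "", 1))
    # /getting-started/faq-tips

Here aria-expanded marks an open parent section and aria-selected the current page, so the selected leaf's full path is assembled from the slugs of its expanded ancestors.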