commit d6a84ab413
parent 68160d49dd
committed by: Chris Weaver
repo: https://github.com/danswer-ai/danswer.git

    fix for url parsing google site
@@ -1,6 +1,5 @@
 import os
 import re
-import urllib.parse
 from typing import Any
 from typing import cast
 
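The dropped import falls out of the rewrite below: the old process_link() helper (removed in the next hunk) had to percent-decode each href before slugifying it, whereas the new code builds paths from the link text and never touches hrefs. A two-line illustration of what urllib.parse.unquote was doing for the old code (the sample href is hypothetical):

import urllib.parse

# Percent-escapes in exported hrefs had to be decoded before slugifying.
print(urllib.parse.unquote("/My%20Team%20Page.html"))  # -> "/My Team Page.html"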
@@ -21,47 +20,31 @@ from danswer.utils.logger import setup_logger
 logger = setup_logger()
 
 
-def process_link(element: BeautifulSoup | Tag) -> str | None:
-    href = cast(str | None, element.get("href"))
-    if not href:
-        return None
-
-    # cleanup href
-    href = urllib.parse.unquote(href)
-    href = href.lower()
-    if href.endswith(".html"):
-        href = href[:-5]
-    href = href.replace("_", "")
-    href = re.sub(
-        r"([\s-]+)", "-", href
-    )  # replace all whitespace/'-' groups with a single '-'
-
-    return href
+def a_tag_text_to_path(atag: Tag) -> str:
+    page_path = atag.text.strip().lower()
+    page_path = re.sub(r"[^a-zA-Z0-9\s]", "", page_path)
+    page_path = "-".join(page_path.split())
+
+    return page_path
 
 
 def find_google_sites_page_path_from_navbar(
-    element: BeautifulSoup | Tag, path: str, is_initial: bool
+    element: BeautifulSoup | Tag, path: str, depth: int
 ) -> str | None:
-    ul = cast(Tag | None, element.find("ul"))
-    if ul:
-        if not is_initial:
-            a = cast(Tag, element.find("a"))
-            href = process_link(a)
-            new_path = f"{path}/{href}"
-            if href and a.get("aria-selected") == "true":
-                return new_path
-        else:
-            new_path = ""
-        for li in ul.find_all("li", recursive=False):
-            found_link = find_google_sites_page_path_from_navbar(li, new_path, False)
-            if found_link:
-                return found_link
-    else:
-        a = cast(Tag, element.find("a"))
-        if a:
-            href = process_link(a)
-            if href and a.get("aria-selected") == "true":
-                return path + "/" + href
+    lis = cast(
+        list[Tag],
+        element.find_all("li", attrs={"data-nav-level": f"{depth}"}),
+    )
+    for li in lis:
+        a = cast(Tag, li.find("a"))
+        if a.get("aria-selected") == "true":
+            return f"{path}/{a_tag_text_to_path(a)}"
+        elif a.get("aria-expanded") == "true":
+            sub_path = find_google_sites_page_path_from_navbar(
+                element, f"{path}/{a_tag_text_to_path(a)}", depth + 1
+            )
+            if sub_path:
+                return sub_path
 
     return None
 
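Below is a minimal, self-contained sketch of the new traversal, with the two helpers copied verbatim from the hunk above. The sample navbar HTML is an assumption modeled on the data-nav-level / aria-expanded / aria-selected attributes the code queries, not a verbatim Google Sites export:

import re
from typing import cast

from bs4 import BeautifulSoup, Tag


def a_tag_text_to_path(atag: Tag) -> str:
    # Slugify the link text: lowercase, drop punctuation, hyphenate spaces.
    page_path = atag.text.strip().lower()
    page_path = re.sub(r"[^a-zA-Z0-9\s]", "", page_path)
    page_path = "-".join(page_path.split())
    return page_path


def find_google_sites_page_path_from_navbar(
    element: BeautifulSoup | Tag, path: str, depth: int
) -> str | None:
    # Scan nav entries at the current depth; recurse one level deeper
    # under any expanded section until the selected entry is found.
    lis = cast(
        list[Tag],
        element.find_all("li", attrs={"data-nav-level": f"{depth}"}),
    )
    for li in lis:
        a = cast(Tag, li.find("a"))
        if a.get("aria-selected") == "true":
            return f"{path}/{a_tag_text_to_path(a)}"
        elif a.get("aria-expanded") == "true":
            sub_path = find_google_sites_page_path_from_navbar(
                element, f"{path}/{a_tag_text_to_path(a)}", depth + 1
            )
            if sub_path:
                return sub_path
    return None


# Hypothetical navbar: "FAQ & Tips!" is the selected page, nested under
# an expanded "Getting Started" section.
NAV_HTML = """
<nav><ul>
  <li data-nav-level="1"><a aria-expanded="true">Getting Started</a>
    <ul><li data-nav-level="2"><a aria-selected="true">FAQ &amp; Tips!</a></li></ul>
  </li>
  <li data-nav-level="1"><a>Contact</a></li>
</ul></nav>
"""

nav = BeautifulSoup(NAV_HTML, "html.parser")
print(find_google_sites_page_path_from_navbar(nav, "", 1))
# -> /getting-started/faq-tips

Two things worth noting about the design. The recursive call passes the whole element back in rather than the matched li, so the search narrows only by bumping depth; this relies on data-nav-level values being consistent across the navbar. And "faq-tips" shows the new text-derived slugs (punctuation stripped, whitespace collapsed to single hyphens), which is presumably closer to how the published site derives its URLs than the old href cleanup was — that mismatch is what this commit fixes.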
@@ -85,6 +68,7 @@ class GoogleSitesConnector(LoadConnector):
 
         # load the HTML files
         files = load_files_from_zip(self.zip_path)
+        count = 0
         for file_info, file_io in files:
             # skip non-published files
             if "/PUBLISHED/" not in file_info.filename:
@@ -100,13 +84,15 @@
             # get the link out of the navbar
             header = cast(Tag, soup.find("header"))
             nav = cast(Tag, header.find("nav"))
-            path = find_google_sites_page_path_from_navbar(nav, "", True)
+            path = find_google_sites_page_path_from_navbar(nav, "", 1)
             if not path:
+                count += 1
                 logger.error(
                     f"Could not find path for '{file_info.filename}'. "
-                    + "This page will not have a working link."
+                    + "This page will not have a working link.\n\n"
+                    + f"# of broken links so far - {count}"
                 )
-
+            logger.info(f"Path to page: {path}")
             # cleanup the hidden `Skip to main content` and `Skip to navigation` that
             # appears at the top of every page
             for div in soup.find_all("div", attrs={"data-is-touch-wrapper": "true"}):
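One behavioral nit: the new logger.info call sits outside the if not path: block, so broken pages log "Path to page: None" right after the error. For reference, here is what the new error message renders to once formatted (filename and count are hypothetical):

count = 3
filename = "site-export/PUBLISHED/orphan.html"
# Same f-string concatenation as in logger.error above.
print(
    f"Could not find path for '{filename}'. "
    + "This page will not have a working link.\n\n"
    + f"# of broken links so far - {count}"
)
# Could not find path for 'site-export/PUBLISHED/orphan.html'. This page will not have a working link.
#
# # of broken links so far - 3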