Graceful failure for pages without navbar links in the Google Sites connector

This commit is contained in:
Weves 2023-10-23 23:34:15 -07:00 committed by Chris Weaver
parent 3554e29b8d
commit 816ec5e3ca

View File

@ -15,6 +15,9 @@ from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document from danswer.connectors.models import Document
from danswer.connectors.models import Section from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger
logger = setup_logger()
def process_link(element: BeautifulSoup | Tag) -> str: def process_link(element: BeautifulSoup | Tag) -> str:
@ -93,7 +96,10 @@ class GoogleSitesConnector(LoadConnector):
nav = cast(Tag, header.find("nav")) nav = cast(Tag, header.find("nav"))
path = find_google_sites_page_path_from_navbar(nav, "", True) path = find_google_sites_page_path_from_navbar(nav, "", True)
if not path: if not path:
raise RuntimeError(f"Could not find path for {file_info.filename}") logger.error(
f"Could not find path for '{file_info.filename}'. "
+ "This page will not have a working link."
)
# cleanup the hidden `Skip to main content` and `Skip to navigation` that # cleanup the hidden `Skip to main content` and `Skip to navigation` that
# appears at the top of every page # appears at the top of every page
@ -113,7 +119,9 @@ class GoogleSitesConnector(LoadConnector):
semantic_identifier=title, semantic_identifier=title,
sections=[ sections=[
Section( Section(
link=self.base_url.rstrip("/") + "/" + path.lstrip("/"), link=(self.base_url.rstrip("/") + "/" + path.lstrip("/"))
if path
else "",
text=parsed_html.cleaned_text, text=parsed_html.cleaned_text,
) )
], ],