Add Google Sites connector (#532)

Author: Chris Weaver (committed by GitHub)
Date:   2023-10-08 19:20:38 -07:00
Parent: fb1fbbee5c
Commit: d95da554ea
17 changed files with 561 additions and 77 deletions


@@ -61,6 +61,7 @@ class DocumentSource(str, Enum):
     LINEAR = "linear"
     HUBSPOT = "hubspot"
     GONG = "gong"
+    GOOGLE_SITES = "google_sites"


 class DocumentIndexType(str, Enum):


@@ -0,0 +1,49 @@
import json
import os
import zipfile
from collections.abc import Generator
from pathlib import Path
from typing import Any
from typing import IO

_METADATA_FLAG = "#DANSWER_METADATA="


def is_macos_resource_fork_file(file_name: str) -> bool:
    return os.path.basename(file_name).startswith("._") and file_name.startswith(
        "__MACOSX"
    )


def load_files_from_zip(
    zip_location: str | Path,
    ignore_macos_resource_fork_files: bool = True,
    ignore_dirs: bool = True,
) -> Generator[tuple[zipfile.ZipInfo, IO[Any]], None, None]:
    with zipfile.ZipFile(zip_location, "r") as zip_file:
        for file_info in zip_file.infolist():
            with zip_file.open(file_info.filename, "r") as file:
                if ignore_dirs and file_info.is_dir():
                    continue

                if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
                    file_info.filename
                ):
                    continue
                yield file_info, file


def read_file(file_reader: IO[Any]) -> tuple[str, dict[str, Any]]:
    metadata = {}
    file_content_raw = ""
    for ind, line in enumerate(file_reader):
        if isinstance(line, bytes):
            line = line.decode("utf-8")
        line = str(line)

        if ind == 0 and line.startswith(_METADATA_FLAG):
            metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
        else:
            file_content_raw += line

    return file_content_raw, metadata
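
Note (not part of the diff): a minimal sketch of how these two helpers compose. The archive name and the metadata line below are invented for illustration.

from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
from danswer.connectors.cross_connector_utils.file_utils import read_file

# "export.zip" is a hypothetical archive path. A text entry whose first line is
# e.g. #DANSWER_METADATA={"link": "https://example.com"} yields that dict as
# metadata and the remaining lines as content.
for file_info, file_io in load_files_from_zip("export.zip"):
    content, metadata = read_file(file_io)
    print(file_info.filename, len(content), metadata)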


@@ -0,0 +1,57 @@
from copy import copy
from dataclasses import dataclass

from bs4 import BeautifulSoup

from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
from danswer.utils.text_processing import format_document_soup

MINTLIFY_UNWANTED = ["sticky", "hidden"]


@dataclass
class ParsedHTML:
    title: str | None
    cleaned_text: str


def standard_html_cleanup(
    page_content: str | BeautifulSoup,
    mintlify_cleanup_enabled: bool = True,
    additional_element_types_to_discard: list[str] | None = None,
) -> ParsedHTML:
    if isinstance(page_content, str):
        soup = BeautifulSoup(page_content, "html.parser")
    else:
        soup = page_content

    title_tag = soup.find("title")
    title = None
    if title_tag and title_tag.text:
        title = title_tag.text
        title_tag.extract()

    # Heuristics based cleaning of elements based on css classes
    unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES)
    if mintlify_cleanup_enabled:
        unwanted_classes.extend(MINTLIFY_UNWANTED)
    for undesired_element in unwanted_classes:
        [
            tag.extract()
            for tag in soup.find_all(
                class_=lambda x: x and undesired_element in x.split()
            )
        ]

    for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
        [tag.extract() for tag in soup.find_all(undesired_tag)]

    if additional_element_types_to_discard:
        for undesired_tag in additional_element_types_to_discard:
            [tag.extract() for tag in soup.find_all(undesired_tag)]

    # 200B is ZeroWidthSpace which we don't care for
    page_text = format_document_soup(soup).replace("\u200B", "")
    return ParsedHTML(title=title, cleaned_text=page_text)
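
Note (not part of the diff): a rough usage sketch of standard_html_cleanup on an invented page; the exact whitespace of cleaned_text depends on format_document_soup.

from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup

html = (
    "<html><head><title>Release Notes</title></head>"
    "<body><div class='sticky'>cookie banner</div>"
    "<p>Version 1.2 adds a new connector.</p></body></html>"
)
parsed = standard_html_cleanup(html)
print(parsed.title)         # "Release Notes" (the <title> tag is extracted)
print(parsed.cleaned_text)  # remaining body text; the 'sticky' div is dropped by the Mintlify heuristic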


@@ -9,6 +9,7 @@ from danswer.connectors.file.connector import LocalFileConnector
 from danswer.connectors.github.connector import GithubConnector
 from danswer.connectors.gong.connector import GongConnector
 from danswer.connectors.google_drive.connector import GoogleDriveConnector
+from danswer.connectors.google_site.connector import GoogleSitesConnector
 from danswer.connectors.guru.connector import GuruConnector
 from danswer.connectors.hubspot.connector import HubSpotConnector
 from danswer.connectors.interfaces import BaseConnector
@@ -54,6 +55,7 @@ def identify_connector_class(
         DocumentSource.LINEAR: LinearConnector,
         DocumentSource.HUBSPOT: HubSpotConnector,
         DocumentSource.GONG: GongConnector,
+        DocumentSource.GOOGLE_SITES: GoogleSitesConnector,
     }

     connector_by_source = connector_map.get(source, {})


@@ -1,6 +1,4 @@
-import json
 import os
-import zipfile
 from collections.abc import Generator
 from pathlib import Path
 from typing import Any
@@ -10,6 +8,8 @@ from PyPDF2 import PdfReader
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
+from danswer.connectors.cross_connector_utils.file_utils import read_file
 from danswer.connectors.file.utils import check_file_ext_is_valid
 from danswer.connectors.file.utils import get_file_ext
 from danswer.connectors.interfaces import GenerateDocumentsOutput
@@ -21,17 +21,6 @@ from danswer.utils.logger import setup_logger
 logger = setup_logger()

-_METADATA_FLAG = "#DANSWER_METADATA="
-
-
-def _get_files_from_zip(
-    zip_location: str | Path,
-) -> Generator[tuple[str, IO[Any]], None, None]:
-    with zipfile.ZipFile(zip_location, "r") as zip_file:
-        for file_name in zip_file.namelist():
-            with zip_file.open(file_name, "r") as file:
-                yield os.path.basename(file_name), file
-
-
 def _open_files_at_location(
     file_path: str | Path,
@@ -39,7 +28,8 @@ def _open_files_at_location(
     extension = get_file_ext(file_path)

     if extension == ".zip":
-        yield from _get_files_from_zip(file_path)
+        for file_info, file in load_files_from_zip(file_path, ignore_dirs=True):
+            yield file_info.filename, file
     elif extension == ".txt" or extension == ".pdf":
         mode = "r"
         if extension == ".pdf":
@@ -56,7 +46,7 @@ def _process_file(file_name: str, file: IO[Any]) -> list[Document]:
         logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
         return []

-    metadata = {}
+    metadata: dict[str, Any] = {}
     file_content_raw = ""
     if extension == ".pdf":
         pdf_reader = PdfReader(file)
@@ -65,15 +55,7 @@ def _process_file(file_name: str, file: IO[Any]) -> list[Document]:
             page.extract_text() for page in pdf_reader.pages
         )
     else:
-        for ind, line in enumerate(file):
-            if isinstance(line, bytes):
-                line = line.decode("utf-8")
-            line = str(line)
-
-            if ind == 0 and line.startswith(_METADATA_FLAG):
-                metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
-            else:
-                file_content_raw += line
+        file_content_raw, metadata = read_file(file)

     return [
         Document(


@@ -0,0 +1,139 @@
import os
import urllib.parse
from typing import Any
from typing import cast

from bs4 import BeautifulSoup
from bs4 import Tag

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
from danswer.connectors.cross_connector_utils.file_utils import read_file
from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document
from danswer.connectors.models import Section


def process_link(element: BeautifulSoup | Tag) -> str:
    href = cast(str | None, element.get("href"))
    if not href:
        raise RuntimeError(f"Invalid link - {element}")

    # cleanup href
    href = urllib.parse.unquote(href)
    href = href.rstrip(".html").lower()
    href = href.replace("_", "")
    href = href.replace(" ", "-")

    return href


def find_google_sites_page_path_from_navbar(
    element: BeautifulSoup | Tag, path: str, is_initial: bool
) -> str | None:
    ul = cast(Tag | None, element.find("ul"))
    if ul:
        if not is_initial:
            a = cast(Tag, element.find("a"))
            new_path = f"{path}/{process_link(a)}"
            if a.get("aria-selected") == "true":
                return new_path
        else:
            new_path = ""
        for li in ul.find_all("li", recursive=False):
            found_link = find_google_sites_page_path_from_navbar(li, new_path, False)
            if found_link:
                return found_link
    else:
        a = cast(Tag, element.find("a"))
        if a:
            href = process_link(a)
            if href and a.get("aria-selected") == "true":
                return path + "/" + href

    return None


class GoogleSitesConnector(LoadConnector):
    def __init__(
        self,
        zip_path: str,
        base_url: str,
        batch_size: int = INDEX_BATCH_SIZE,
    ):
        self.zip_path = zip_path
        self.base_url = base_url
        self.batch_size = batch_size

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        pass

    def load_from_state(self) -> GenerateDocumentsOutput:
        documents: list[Document] = []

        # load the HTML files
        files = load_files_from_zip(self.zip_path)
        for file_info, file_io in files:
            # skip non-published files
            if "/PUBLISHED/" not in file_info.filename:
                continue

            file_path, extension = os.path.splitext(file_info.filename)
            if extension != ".html":
                continue

            file_content, _ = read_file(file_io)
            soup = BeautifulSoup(file_content, "html.parser")

            # get the link out of the navbar
            header = cast(Tag, soup.find("header"))
            nav = cast(Tag, header.find("nav"))
            path = find_google_sites_page_path_from_navbar(nav, "", True)
            if not path:
                raise RuntimeError(f"Could not find path for {file_info.filename}")

            # cleanup the hidden `Skip to main content` and `Skip to navigation` that
            # appears at the top of every page
            for div in soup.find_all("div", attrs={"data-is-touch-wrapper": "true"}):
                div.extract()

            # get the body of the page
            parsed_html = standard_html_cleanup(
                soup, additional_element_types_to_discard=["header", "nav"]
            )

            title = parsed_html.title or file_path.split("/")[-1]
            documents.append(
                Document(
                    id=f"{DocumentSource.GOOGLE_SITES.value}:{path}",
                    source=DocumentSource.GOOGLE_SITES,
                    semantic_identifier=title,
                    sections=[
                        Section(
                            link=self.base_url.rstrip("/") + "/" + path.lstrip("/"),
                            text=parsed_html.cleaned_text,
                        )
                    ],
                    metadata={},
                )
            )
            if len(documents) >= self.batch_size:
                yield documents
                documents = []

        if documents:
            yield documents


if __name__ == "__main__":
    connector = GoogleSitesConnector(
        os.environ["GOOGLE_SITES_ZIP_PATH"],
        os.environ.get("GOOGLE_SITES_BASE_URL", ""),
    )
    for doc_batch in connector.load_from_state():
        for doc in doc_batch:
            print(doc)
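
Note (not part of the diff): a simplified, hypothetical navbar fragment to illustrate how the page path is derived; real Google Sites exports carry more markup, but the anchor with aria-selected="true" is what the helper keys off.

from bs4 import BeautifulSoup

from danswer.connectors.google_site.connector import find_google_sites_page_path_from_navbar

nav_html = """
<nav>
  <ul>
    <li><a href="Guides.html">Guides</a>
      <ul>
        <li><a href="Getting_Started.html" aria-selected="true">Getting Started</a></li>
      </ul>
    </li>
    <li><a href="FAQ.html">FAQ</a></li>
  </ul>
</nav>
"""
nav = BeautifulSoup(nav_html, "html.parser").find("nav")
print(find_google_sites_page_path_from_navbar(nav, "", True))
# -> "/guides/gettingstarted": each segment is unquoted, lowercased,
#    stripped of ".html", and has underscores/spaces removed by process_link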


@@ -1,5 +1,4 @@
 import io
-from copy import copy
 from datetime import datetime
 from enum import Enum
 from typing import Any
@@ -18,25 +17,20 @@ from PyPDF2 import PdfReader
 from requests_oauthlib import OAuth2Session  # type:ignore

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
-from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
-from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
-from danswer.utils.text_processing import format_document_soup

 logger = setup_logger()

-MINTLIFY_UNWANTED = ["sticky", "hidden"]
-

 class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
     # Given a base site, index everything under that path
     RECURSIVE = "recursive"
@@ -224,36 +218,16 @@ class WebConnector(LoadConnector):
                     if link not in visited_links:
                         to_visit.append(link)

-                title_tag = soup.find("title")
-                title = None
-                if title_tag and title_tag.text:
-                    title = title_tag.text
-                    title_tag.extract()
-
-                # Heuristics based cleaning of elements based on css classes
-                unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES)
-                if self.mintlify_cleanup:
-                    unwanted_classes.extend(MINTLIFY_UNWANTED)
-                for undesired_element in unwanted_classes:
-                    [
-                        tag.extract()
-                        for tag in soup.find_all(
-                            class_=lambda x: x and undesired_element in x.split()
-                        )
-                    ]
-
-                for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
-                    [tag.extract() for tag in soup.find_all(undesired_tag)]
-
-                # 200B is ZeroWidthSpace which we don't care for
-                page_text = format_document_soup(soup).replace("\u200B", "")
+                parsed_html = standard_html_cleanup(soup, self.mintlify_cleanup)

                 doc_batch.append(
                     Document(
                         id=current_url,
-                        sections=[Section(link=current_url, text=page_text)],
+                        sections=[
+                            Section(link=current_url, text=parsed_html.cleaned_text)
+                        ],
                         source=DocumentSource.WEB,
-                        semantic_identifier=title or current_url,
+                        semantic_identifier=parsed_html.title or current_url,
                         metadata={},
                     )
                 )