Mirror of https://github.com/danswer-ai/danswer.git
Add Google Sites connector (#532)
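This commit adds a Google Sites connector. It introduces a GOOGLE_SITES value on DocumentSource, shared cross-connector utilities for reading files out of zip archives (load_files_from_zip, read_file) and for cleaning HTML (standard_html_cleanup), and a new GoogleSitesConnector that builds Documents from the HTML pages in an exported Google Sites zip. The connector is registered in identify_connector_class, and the existing file and web connectors are refactored to use the new shared utilities.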
@@ -61,6 +61,7 @@ class DocumentSource(str, Enum):
     LINEAR = "linear"
     HUBSPOT = "hubspot"
     GONG = "gong"
+    GOOGLE_SITES = "google_sites"


 class DocumentIndexType(str, Enum):
backend/danswer/connectors/cross_connector_utils/file_utils.py (new file)
@@ -0,0 +1,49 @@
+import json
+import os
+import zipfile
+from collections.abc import Generator
+from pathlib import Path
+from typing import Any
+from typing import IO
+
+_METADATA_FLAG = "#DANSWER_METADATA="
+
+
+def is_macos_resource_fork_file(file_name: str) -> bool:
+    return os.path.basename(file_name).startswith("._") and file_name.startswith(
+        "__MACOSX"
+    )
+
+
+def load_files_from_zip(
+    zip_location: str | Path,
+    ignore_macos_resource_fork_files: bool = True,
+    ignore_dirs: bool = True,
+) -> Generator[tuple[zipfile.ZipInfo, IO[Any]], None, None]:
+    with zipfile.ZipFile(zip_location, "r") as zip_file:
+        for file_info in zip_file.infolist():
+            with zip_file.open(file_info.filename, "r") as file:
+                if ignore_dirs and file_info.is_dir():
+                    continue
+
+                if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
+                    file_info.filename
+                ):
+                    continue
+                yield file_info, file
+
+
+def read_file(file_reader: IO[Any]) -> tuple[str, dict[str, Any]]:
+    metadata = {}
+    file_content_raw = ""
+    for ind, line in enumerate(file_reader):
+        if isinstance(line, bytes):
+            line = line.decode("utf-8")
+        line = str(line)
+
+        if ind == 0 and line.startswith(_METADATA_FLAG):
+            metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
+        else:
+            file_content_raw += line
+
+    return file_content_raw, metadata
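A minimal usage sketch for these new zip/file helpers; the zip path and file contents below are hypothetical and not part of this commit:

# Hypothetical usage of the shared zip/file helpers; the path is made up.
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
from danswer.connectors.cross_connector_utils.file_utils import read_file

for file_info, file_io in load_files_from_zip("/tmp/example_docs.zip"):
    # read_file strips an optional first-line "#DANSWER_METADATA={...}" header
    # and returns it as a dict alongside the remaining file content.
    content, metadata = read_file(file_io)
    print(file_info.filename, metadata, len(content))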
backend/danswer/connectors/cross_connector_utils/html_utils.py (new file)
@@ -0,0 +1,57 @@
+from copy import copy
+from dataclasses import dataclass
+
+from bs4 import BeautifulSoup
+
+from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
+from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
+from danswer.utils.text_processing import format_document_soup
+
+MINTLIFY_UNWANTED = ["sticky", "hidden"]
+
+
+@dataclass
+class ParsedHTML:
+    title: str | None
+    cleaned_text: str
+
+
+def standard_html_cleanup(
+    page_content: str | BeautifulSoup,
+    mintlify_cleanup_enabled: bool = True,
+    additional_element_types_to_discard: list[str] | None = None,
+) -> ParsedHTML:
+    if isinstance(page_content, str):
+        soup = BeautifulSoup(page_content, "html.parser")
+    else:
+        soup = page_content
+
+    title_tag = soup.find("title")
+    title = None
+    if title_tag and title_tag.text:
+        title = title_tag.text
+        title_tag.extract()
+
+    # Heuristics based cleaning of elements based on css classes
+    unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES)
+    if mintlify_cleanup_enabled:
+        unwanted_classes.extend(MINTLIFY_UNWANTED)
+    for undesired_element in unwanted_classes:
+        [
+            tag.extract()
+            for tag in soup.find_all(
+                class_=lambda x: x and undesired_element in x.split()
+            )
+        ]
+
+    for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
+        [tag.extract() for tag in soup.find_all(undesired_tag)]
+
+    if additional_element_types_to_discard:
+        for undesired_tag in additional_element_types_to_discard:
+            [tag.extract() for tag in soup.find_all(undesired_tag)]
+
+    # 200B is ZeroWidthSpace which we don't care for
+    page_text = format_document_soup(soup).replace("\u200B", "")
+
+    return ParsedHTML(title=title, cleaned_text=page_text)
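A minimal sketch of calling standard_html_cleanup directly; the HTML string below is made up for illustration:

# Hypothetical input; standard_html_cleanup also accepts an existing BeautifulSoup.
from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup

html = (
    "<html><head><title>Team Page</title></head>"
    "<body><nav>site menu</nav><p>Hello world</p></body></html>"
)
parsed = standard_html_cleanup(html, additional_element_types_to_discard=["nav"])
print(parsed.title)         # "Team Page" (the title tag is extracted from the soup)
print(parsed.cleaned_text)  # body text with the nav element stripped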
@@ -9,6 +9,7 @@ from danswer.connectors.file.connector import LocalFileConnector
 from danswer.connectors.github.connector import GithubConnector
 from danswer.connectors.gong.connector import GongConnector
 from danswer.connectors.google_drive.connector import GoogleDriveConnector
+from danswer.connectors.google_site.connector import GoogleSitesConnector
 from danswer.connectors.guru.connector import GuruConnector
 from danswer.connectors.hubspot.connector import HubSpotConnector
 from danswer.connectors.interfaces import BaseConnector
@@ -54,6 +55,7 @@ def identify_connector_class(
         DocumentSource.LINEAR: LinearConnector,
         DocumentSource.HUBSPOT: HubSpotConnector,
         DocumentSource.GONG: GongConnector,
+        DocumentSource.GOOGLE_SITES: GoogleSitesConnector,
     }
     connector_by_source = connector_map.get(source, {})
@@ -1,6 +1,4 @@
-import json
 import os
-import zipfile
 from collections.abc import Generator
 from pathlib import Path
 from typing import Any
@@ -10,6 +8,8 @@ from PyPDF2 import PdfReader

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
+from danswer.connectors.cross_connector_utils.file_utils import read_file
 from danswer.connectors.file.utils import check_file_ext_is_valid
 from danswer.connectors.file.utils import get_file_ext
 from danswer.connectors.interfaces import GenerateDocumentsOutput
@@ -21,17 +21,6 @@ from danswer.utils.logger import setup_logger

 logger = setup_logger()

-_METADATA_FLAG = "#DANSWER_METADATA="
-
-
-def _get_files_from_zip(
-    zip_location: str | Path,
-) -> Generator[tuple[str, IO[Any]], None, None]:
-    with zipfile.ZipFile(zip_location, "r") as zip_file:
-        for file_name in zip_file.namelist():
-            with zip_file.open(file_name, "r") as file:
-                yield os.path.basename(file_name), file
-
-
 def _open_files_at_location(
     file_path: str | Path,
@@ -39,7 +28,8 @@ def _open_files_at_location(
     extension = get_file_ext(file_path)

     if extension == ".zip":
-        yield from _get_files_from_zip(file_path)
+        for file_info, file in load_files_from_zip(file_path, ignore_dirs=True):
+            yield file_info.filename, file
     elif extension == ".txt" or extension == ".pdf":
         mode = "r"
         if extension == ".pdf":
@@ -56,7 +46,7 @@ def _process_file(file_name: str, file: IO[Any]) -> list[Document]:
         logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
         return []

-    metadata = {}
+    metadata: dict[str, Any] = {}
     file_content_raw = ""
     if extension == ".pdf":
         pdf_reader = PdfReader(file)
@@ -65,15 +55,7 @@ def _process_file(file_name: str, file: IO[Any]) -> list[Document]:
             page.extract_text() for page in pdf_reader.pages
         )
     else:
-        for ind, line in enumerate(file):
-            if isinstance(line, bytes):
-                line = line.decode("utf-8")
-            line = str(line)
-
-            if ind == 0 and line.startswith(_METADATA_FLAG):
-                metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
-            else:
-                file_content_raw += line
+        file_content_raw, metadata = read_file(file)

     return [
         Document(
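For reference, the optional metadata header that the file connector now parses via read_file sits on the first line of an uploaded text file; the JSON keys below are illustrative only:

#DANSWER_METADATA={"link": "https://example.com/original-doc"}
The rest of the file is treated as document content.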
backend/danswer/connectors/google_site/connector.py (new file, 139 lines)
@@ -0,0 +1,139 @@
+import os
+import urllib.parse
+from typing import Any
+from typing import cast
+
+from bs4 import BeautifulSoup
+from bs4 import Tag
+
+from danswer.configs.app_configs import INDEX_BATCH_SIZE
+from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
+from danswer.connectors.cross_connector_utils.file_utils import read_file
+from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
+from danswer.connectors.interfaces import GenerateDocumentsOutput
+from danswer.connectors.interfaces import LoadConnector
+from danswer.connectors.models import Document
+from danswer.connectors.models import Section
+
+
+def process_link(element: BeautifulSoup | Tag) -> str:
+    href = cast(str | None, element.get("href"))
+    if not href:
+        raise RuntimeError(f"Invalid link - {element}")
+
+    # cleanup href
+    href = urllib.parse.unquote(href)
+    href = href.rstrip(".html").lower()
+    href = href.replace("_", "")
+    href = href.replace(" ", "-")
+
+    return href
+
+
+def find_google_sites_page_path_from_navbar(
+    element: BeautifulSoup | Tag, path: str, is_initial: bool
+) -> str | None:
+    ul = cast(Tag | None, element.find("ul"))
+    if ul:
+        if not is_initial:
+            a = cast(Tag, element.find("a"))
+            new_path = f"{path}/{process_link(a)}"
+            if a.get("aria-selected") == "true":
+                return new_path
+        else:
+            new_path = ""
+        for li in ul.find_all("li", recursive=False):
+            found_link = find_google_sites_page_path_from_navbar(li, new_path, False)
+            if found_link:
+                return found_link
+    else:
+        a = cast(Tag, element.find("a"))
+        if a:
+            href = process_link(a)
+            if href and a.get("aria-selected") == "true":
+                return path + "/" + href
+
+    return None
+
+
+class GoogleSitesConnector(LoadConnector):
+    def __init__(
+        self,
+        zip_path: str,
+        base_url: str,
+        batch_size: int = INDEX_BATCH_SIZE,
+    ):
+        self.zip_path = zip_path
+        self.base_url = base_url
+        self.batch_size = batch_size
+
+    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
+        pass
+
+    def load_from_state(self) -> GenerateDocumentsOutput:
+        documents: list[Document] = []
+
+        # load the HTML files
+        files = load_files_from_zip(self.zip_path)
+        for file_info, file_io in files:
+            # skip non-published files
+            if "/PUBLISHED/" not in file_info.filename:
+                continue
+
+            file_path, extension = os.path.splitext(file_info.filename)
+            if extension != ".html":
+                continue
+
+            file_content, _ = read_file(file_io)
+            soup = BeautifulSoup(file_content, "html.parser")
+
+            # get the link out of the navbar
+            header = cast(Tag, soup.find("header"))
+            nav = cast(Tag, header.find("nav"))
+            path = find_google_sites_page_path_from_navbar(nav, "", True)
+            if not path:
+                raise RuntimeError(f"Could not find path for {file_info.filename}")
+
+            # cleanup the hidden `Skip to main content` and `Skip to navigation` that
+            # appears at the top of every page
+            for div in soup.find_all("div", attrs={"data-is-touch-wrapper": "true"}):
+                div.extract()
+
+            # get the body of the page
+            parsed_html = standard_html_cleanup(
+                soup, additional_element_types_to_discard=["header", "nav"]
+            )
+
+            title = parsed_html.title or file_path.split("/")[-1]
+            documents.append(
+                Document(
+                    id=f"{DocumentSource.GOOGLE_SITES.value}:{path}",
+                    source=DocumentSource.GOOGLE_SITES,
+                    semantic_identifier=title,
+                    sections=[
+                        Section(
+                            link=self.base_url.rstrip("/") + "/" + path.lstrip("/"),
+                            text=parsed_html.cleaned_text,
+                        )
+                    ],
+                    metadata={},
+                )
+            )
+
+            if len(documents) >= self.batch_size:
+                yield documents
+                documents = []
+
+        if documents:
+            yield documents
+
+
+if __name__ == "__main__":
+    connector = GoogleSitesConnector(
+        os.environ["GOOGLE_SITES_ZIP_PATH"],
+        os.environ.get("GOOGLE_SITES_BASE_URL", ""),
+    )
+    for doc_batch in connector.load_from_state():
+        for doc in doc_batch:
+            print(doc)
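A rough end-to-end sketch of driving the new connector (the zip path and base URL are placeholders); it expects a Google Sites HTML export as a zip and only indexes files whose paths contain /PUBLISHED/ and end in .html:

# Hypothetical invocation; the path and URL are placeholders.
from danswer.connectors.google_site.connector import GoogleSitesConnector

connector = GoogleSitesConnector(
    zip_path="/tmp/google_site_export.zip",
    base_url="https://sites.google.com/view/my-site",
)
for doc_batch in connector.load_from_state():
    for doc in doc_batch:
        # Each Document's section link is base_url joined with the page path
        # recovered from the navbar of the exported HTML.
        print(doc.semantic_identifier, doc.sections[0].link)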
@@ -1,5 +1,4 @@
 import io
-from copy import copy
 from datetime import datetime
 from enum import Enum
 from typing import Any
@@ -18,25 +17,20 @@ from PyPDF2 import PdfReader
 from requests_oauthlib import OAuth2Session  # type:ignore

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
-from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
-from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
-from danswer.utils.text_processing import format_document_soup

 logger = setup_logger()


-MINTLIFY_UNWANTED = ["sticky", "hidden"]
-
-
 class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
     # Given a base site, index everything under that path
     RECURSIVE = "recursive"
@@ -224,36 +218,16 @@ class WebConnector(LoadConnector):
                 if link not in visited_links:
                     to_visit.append(link)

-            title_tag = soup.find("title")
-            title = None
-            if title_tag and title_tag.text:
-                title = title_tag.text
-                title_tag.extract()
-
-            # Heuristics based cleaning of elements based on css classes
-            unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES)
-            if self.mintlify_cleanup:
-                unwanted_classes.extend(MINTLIFY_UNWANTED)
-            for undesired_element in unwanted_classes:
-                [
-                    tag.extract()
-                    for tag in soup.find_all(
-                        class_=lambda x: x and undesired_element in x.split()
-                    )
-                ]
-
-            for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
-                [tag.extract() for tag in soup.find_all(undesired_tag)]
-
-            # 200B is ZeroWidthSpace which we don't care for
-            page_text = format_document_soup(soup).replace("\u200B", "")
+            parsed_html = standard_html_cleanup(soup, self.mintlify_cleanup)

             doc_batch.append(
                 Document(
                     id=current_url,
-                    sections=[Section(link=current_url, text=page_text)],
+                    sections=[
+                        Section(link=current_url, text=parsed_html.cleaned_text)
+                    ],
                     source=DocumentSource.WEB,
-                    semantic_identifier=title or current_url,
+                    semantic_identifier=parsed_html.title or current_url,
                     metadata={},
                 )
             )