Add Google Sites connector (#532)

Author: Chris Weaver (committed by GitHub)
Date:   2023-10-08 19:20:38 -07:00
Parent: fb1fbbee5c
Commit: d95da554ea
17 changed files with 561 additions and 77 deletions


@@ -61,6 +61,7 @@ class DocumentSource(str, Enum):
     LINEAR = "linear"
     HUBSPOT = "hubspot"
     GONG = "gong"
+    GOOGLE_SITES = "google_sites"


 class DocumentIndexType(str, Enum):


@@ -0,0 +1,49 @@
import json
import os
import zipfile
from collections.abc import Generator
from pathlib import Path
from typing import Any
from typing import IO

_METADATA_FLAG = "#DANSWER_METADATA="


def is_macos_resource_fork_file(file_name: str) -> bool:
    return os.path.basename(file_name).startswith("._") and file_name.startswith(
        "__MACOSX"
    )


def load_files_from_zip(
    zip_location: str | Path,
    ignore_macos_resource_fork_files: bool = True,
    ignore_dirs: bool = True,
) -> Generator[tuple[zipfile.ZipInfo, IO[Any]], None, None]:
    with zipfile.ZipFile(zip_location, "r") as zip_file:
        for file_info in zip_file.infolist():
            with zip_file.open(file_info.filename, "r") as file:
                if ignore_dirs and file_info.is_dir():
                    continue

                if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
                    file_info.filename
                ):
                    continue
                yield file_info, file


def read_file(file_reader: IO[Any]) -> tuple[str, dict[str, Any]]:
    metadata = {}
    file_content_raw = ""
    for ind, line in enumerate(file_reader):
        if isinstance(line, bytes):
            line = line.decode("utf-8")
        line = str(line)

        if ind == 0 and line.startswith(_METADATA_FLAG):
            metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
        else:
            file_content_raw += line

    return file_content_raw, metadata
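
Note (not part of the diff): a minimal sketch of how these two helpers compose. The archive name and the metadata line below are invented for illustration.

from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
from danswer.connectors.cross_connector_utils.file_utils import read_file

# "export.zip" is a hypothetical archive path. A text entry whose first line is
# e.g. #DANSWER_METADATA={"link": "https://example.com"} yields that dict as
# metadata and the remaining lines as content.
for file_info, file_io in load_files_from_zip("export.zip"):
    content, metadata = read_file(file_io)
    print(file_info.filename, len(content), metadata)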


@@ -0,0 +1,57 @@
from copy import copy
from dataclasses import dataclass

from bs4 import BeautifulSoup

from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
from danswer.utils.text_processing import format_document_soup

MINTLIFY_UNWANTED = ["sticky", "hidden"]


@dataclass
class ParsedHTML:
    title: str | None
    cleaned_text: str


def standard_html_cleanup(
    page_content: str | BeautifulSoup,
    mintlify_cleanup_enabled: bool = True,
    additional_element_types_to_discard: list[str] | None = None,
) -> ParsedHTML:
    if isinstance(page_content, str):
        soup = BeautifulSoup(page_content, "html.parser")
    else:
        soup = page_content

    title_tag = soup.find("title")
    title = None
    if title_tag and title_tag.text:
        title = title_tag.text
        title_tag.extract()

    # Heuristics based cleaning of elements based on css classes
    unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES)
    if mintlify_cleanup_enabled:
        unwanted_classes.extend(MINTLIFY_UNWANTED)
    for undesired_element in unwanted_classes:
        [
            tag.extract()
            for tag in soup.find_all(
                class_=lambda x: x and undesired_element in x.split()
            )
        ]

    for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
        [tag.extract() for tag in soup.find_all(undesired_tag)]

    if additional_element_types_to_discard:
        for undesired_tag in additional_element_types_to_discard:
            [tag.extract() for tag in soup.find_all(undesired_tag)]

    # 200B is ZeroWidthSpace which we don't care for
    page_text = format_document_soup(soup).replace("\u200B", "")
    return ParsedHTML(title=title, cleaned_text=page_text)
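
Note (not part of the diff): a rough usage sketch of standard_html_cleanup on an invented page; the exact whitespace of cleaned_text depends on format_document_soup.

from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup

html = (
    "<html><head><title>Release Notes</title></head>"
    "<body><div class='sticky'>cookie banner</div>"
    "<p>Version 1.2 adds a new connector.</p></body></html>"
)
parsed = standard_html_cleanup(html)
print(parsed.title)         # "Release Notes" (the <title> tag is extracted)
print(parsed.cleaned_text)  # remaining body text; the 'sticky' div is dropped by the Mintlify heuristic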


@@ -9,6 +9,7 @@ from danswer.connectors.file.connector import LocalFileConnector
 from danswer.connectors.github.connector import GithubConnector
 from danswer.connectors.gong.connector import GongConnector
 from danswer.connectors.google_drive.connector import GoogleDriveConnector
+from danswer.connectors.google_site.connector import GoogleSitesConnector
 from danswer.connectors.guru.connector import GuruConnector
 from danswer.connectors.hubspot.connector import HubSpotConnector
 from danswer.connectors.interfaces import BaseConnector
@@ -54,6 +55,7 @@ def identify_connector_class(
         DocumentSource.LINEAR: LinearConnector,
         DocumentSource.HUBSPOT: HubSpotConnector,
         DocumentSource.GONG: GongConnector,
+        DocumentSource.GOOGLE_SITES: GoogleSitesConnector,
     }

     connector_by_source = connector_map.get(source, {})


@@ -1,6 +1,4 @@
-import json
 import os
-import zipfile
 from collections.abc import Generator
 from pathlib import Path
 from typing import Any
@@ -10,6 +8,8 @@ from PyPDF2 import PdfReader
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
+from danswer.connectors.cross_connector_utils.file_utils import read_file
 from danswer.connectors.file.utils import check_file_ext_is_valid
 from danswer.connectors.file.utils import get_file_ext
 from danswer.connectors.interfaces import GenerateDocumentsOutput
@@ -21,17 +21,6 @@ from danswer.utils.logger import setup_logger
 logger = setup_logger()

-_METADATA_FLAG = "#DANSWER_METADATA="
-
-
-def _get_files_from_zip(
-    zip_location: str | Path,
-) -> Generator[tuple[str, IO[Any]], None, None]:
-    with zipfile.ZipFile(zip_location, "r") as zip_file:
-        for file_name in zip_file.namelist():
-            with zip_file.open(file_name, "r") as file:
-                yield os.path.basename(file_name), file
-
-
 def _open_files_at_location(
     file_path: str | Path,
@@ -39,7 +28,8 @@ def _open_files_at_location(
     extension = get_file_ext(file_path)

     if extension == ".zip":
-        yield from _get_files_from_zip(file_path)
+        for file_info, file in load_files_from_zip(file_path, ignore_dirs=True):
+            yield file_info.filename, file
     elif extension == ".txt" or extension == ".pdf":
         mode = "r"
         if extension == ".pdf":
@@ -56,7 +46,7 @@ def _process_file(file_name: str, file: IO[Any]) -> list[Document]:
         logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
         return []

-    metadata = {}
+    metadata: dict[str, Any] = {}
     file_content_raw = ""
     if extension == ".pdf":
         pdf_reader = PdfReader(file)
@@ -65,15 +55,7 @@ def _process_file(file_name: str, file: IO[Any]) -> list[Document]:
             page.extract_text() for page in pdf_reader.pages
         )
     else:
-        for ind, line in enumerate(file):
-            if isinstance(line, bytes):
-                line = line.decode("utf-8")
-            line = str(line)
-
-            if ind == 0 and line.startswith(_METADATA_FLAG):
-                metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
-            else:
-                file_content_raw += line
+        file_content_raw, metadata = read_file(file)

     return [
         Document(


@@ -0,0 +1,139 @@
import os
import urllib.parse
from typing import Any
from typing import cast

from bs4 import BeautifulSoup
from bs4 import Tag

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
from danswer.connectors.cross_connector_utils.file_utils import read_file
from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document
from danswer.connectors.models import Section


def process_link(element: BeautifulSoup | Tag) -> str:
    href = cast(str | None, element.get("href"))
    if not href:
        raise RuntimeError(f"Invalid link - {element}")

    # cleanup href
    href = urllib.parse.unquote(href)
    href = href.rstrip(".html").lower()
    href = href.replace("_", "")
    href = href.replace(" ", "-")

    return href


def find_google_sites_page_path_from_navbar(
    element: BeautifulSoup | Tag, path: str, is_initial: bool
) -> str | None:
    ul = cast(Tag | None, element.find("ul"))
    if ul:
        if not is_initial:
            a = cast(Tag, element.find("a"))
            new_path = f"{path}/{process_link(a)}"
            if a.get("aria-selected") == "true":
                return new_path
        else:
            new_path = ""
        for li in ul.find_all("li", recursive=False):
            found_link = find_google_sites_page_path_from_navbar(li, new_path, False)
            if found_link:
                return found_link
    else:
        a = cast(Tag, element.find("a"))
        if a:
            href = process_link(a)
            if href and a.get("aria-selected") == "true":
                return path + "/" + href

    return None


class GoogleSitesConnector(LoadConnector):
    def __init__(
        self,
        zip_path: str,
        base_url: str,
        batch_size: int = INDEX_BATCH_SIZE,
    ):
        self.zip_path = zip_path
        self.base_url = base_url
        self.batch_size = batch_size

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        pass

    def load_from_state(self) -> GenerateDocumentsOutput:
        documents: list[Document] = []

        # load the HTML files
        files = load_files_from_zip(self.zip_path)
        for file_info, file_io in files:
            # skip non-published files
            if "/PUBLISHED/" not in file_info.filename:
                continue

            file_path, extension = os.path.splitext(file_info.filename)
            if extension != ".html":
                continue

            file_content, _ = read_file(file_io)
            soup = BeautifulSoup(file_content, "html.parser")

            # get the link out of the navbar
            header = cast(Tag, soup.find("header"))
            nav = cast(Tag, header.find("nav"))
            path = find_google_sites_page_path_from_navbar(nav, "", True)
            if not path:
                raise RuntimeError(f"Could not find path for {file_info.filename}")

            # cleanup the hidden `Skip to main content` and `Skip to navigation` that
            # appears at the top of every page
            for div in soup.find_all("div", attrs={"data-is-touch-wrapper": "true"}):
                div.extract()

            # get the body of the page
            parsed_html = standard_html_cleanup(
                soup, additional_element_types_to_discard=["header", "nav"]
            )

            title = parsed_html.title or file_path.split("/")[-1]
            documents.append(
                Document(
                    id=f"{DocumentSource.GOOGLE_SITES.value}:{path}",
                    source=DocumentSource.GOOGLE_SITES,
                    semantic_identifier=title,
                    sections=[
                        Section(
                            link=self.base_url.rstrip("/") + "/" + path.lstrip("/"),
                            text=parsed_html.cleaned_text,
                        )
                    ],
                    metadata={},
                )
            )
            if len(documents) >= self.batch_size:
                yield documents
                documents = []

        if documents:
            yield documents


if __name__ == "__main__":
    connector = GoogleSitesConnector(
        os.environ["GOOGLE_SITES_ZIP_PATH"],
        os.environ.get("GOOGLE_SITES_BASE_URL", ""),
    )
    for doc_batch in connector.load_from_state():
        for doc in doc_batch:
            print(doc)
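
Note (not part of the diff): a simplified, hypothetical navbar fragment to illustrate how the page path is derived; real Google Sites exports carry more markup, but the anchor with aria-selected="true" is what the helper keys off.

from bs4 import BeautifulSoup

from danswer.connectors.google_site.connector import find_google_sites_page_path_from_navbar

nav_html = """
<nav>
  <ul>
    <li><a href="Guides.html">Guides</a>
      <ul>
        <li><a href="Getting_Started.html" aria-selected="true">Getting Started</a></li>
      </ul>
    </li>
    <li><a href="FAQ.html">FAQ</a></li>
  </ul>
</nav>
"""
nav = BeautifulSoup(nav_html, "html.parser").find("nav")
print(find_google_sites_page_path_from_navbar(nav, "", True))
# -> "/guides/gettingstarted": each segment is unquoted, lowercased,
#    stripped of ".html", and has underscores/spaces removed by process_link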


@@ -1,5 +1,4 @@
 import io
-from copy import copy
 from datetime import datetime
 from enum import Enum
 from typing import Any
@@ -18,25 +17,20 @@ from PyPDF2 import PdfReader
 from requests_oauthlib import OAuth2Session  # type:ignore

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
-from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
-from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
-from danswer.utils.text_processing import format_document_soup

 logger = setup_logger()

-MINTLIFY_UNWANTED = ["sticky", "hidden"]
-

 class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
     # Given a base site, index everything under that path
     RECURSIVE = "recursive"
@@ -224,36 +218,16 @@ class WebConnector(LoadConnector):
                     if link not in visited_links:
                         to_visit.append(link)

-                title_tag = soup.find("title")
-                title = None
-                if title_tag and title_tag.text:
-                    title = title_tag.text
-                    title_tag.extract()
-
-                # Heuristics based cleaning of elements based on css classes
-                unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES)
-                if self.mintlify_cleanup:
-                    unwanted_classes.extend(MINTLIFY_UNWANTED)
-                for undesired_element in unwanted_classes:
-                    [
-                        tag.extract()
-                        for tag in soup.find_all(
-                            class_=lambda x: x and undesired_element in x.split()
-                        )
-                    ]
-
-                for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
-                    [tag.extract() for tag in soup.find_all(undesired_tag)]
-
-                # 200B is ZeroWidthSpace which we don't care for
-                page_text = format_document_soup(soup).replace("\u200B", "")
+                parsed_html = standard_html_cleanup(soup, self.mintlify_cleanup)

                 doc_batch.append(
                     Document(
                         id=current_url,
-                        sections=[Section(link=current_url, text=page_text)],
+                        sections=[
+                            Section(link=current_url, text=parsed_html.cleaned_text)
+                        ],
                         source=DocumentSource.WEB,
-                        semantic_identifier=title or current_url,
+                        semantic_identifier=parsed_html.title or current_url,
                         metadata={},
                     )
                 )