Add Google Sites connector (#532)
@@ -61,6 +61,7 @@ class DocumentSource(str, Enum):
    LINEAR = "linear"
    HUBSPOT = "hubspot"
    GONG = "gong"
    GOOGLE_SITES = "google_sites"


class DocumentIndexType(str, Enum):
@@ -0,0 +1,49 @@
import json
import os
import zipfile
from collections.abc import Generator
from pathlib import Path
from typing import Any
from typing import IO

_METADATA_FLAG = "#DANSWER_METADATA="


def is_macos_resource_fork_file(file_name: str) -> bool:
    return os.path.basename(file_name).startswith("._") and file_name.startswith(
        "__MACOSX"
    )


def load_files_from_zip(
    zip_location: str | Path,
    ignore_macos_resource_fork_files: bool = True,
    ignore_dirs: bool = True,
) -> Generator[tuple[zipfile.ZipInfo, IO[Any]], None, None]:
    with zipfile.ZipFile(zip_location, "r") as zip_file:
        for file_info in zip_file.infolist():
            with zip_file.open(file_info.filename, "r") as file:
                if ignore_dirs and file_info.is_dir():
                    continue

                if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
                    file_info.filename
                ):
                    continue
                yield file_info, file


def read_file(file_reader: IO[Any]) -> tuple[str, dict[str, Any]]:
    metadata = {}
    file_content_raw = ""
    for ind, line in enumerate(file_reader):
        if isinstance(line, bytes):
            line = line.decode("utf-8")
        line = str(line)

        if ind == 0 and line.startswith(_METADATA_FLAG):
            metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
        else:
            file_content_raw += line

    return file_content_raw, metadata
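For orientation, a minimal usage sketch of the two helpers added above; the zip path is an invented example and not part of this commit:

# Hypothetical usage of load_files_from_zip / read_file; "site_export.zip" is a made-up path.
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
from danswer.connectors.cross_connector_utils.file_utils import read_file

for file_info, file_io in load_files_from_zip("site_export.zip"):
    # file_info is a zipfile.ZipInfo for the entry, file_io is the opened entry
    content, metadata = read_file(file_io)
    print(file_info.filename, len(content), metadata)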
@@ -0,0 +1,57 @@
from copy import copy
from dataclasses import dataclass

from bs4 import BeautifulSoup

from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
from danswer.utils.text_processing import format_document_soup

MINTLIFY_UNWANTED = ["sticky", "hidden"]


@dataclass
class ParsedHTML:
    title: str | None
    cleaned_text: str


def standard_html_cleanup(
    page_content: str | BeautifulSoup,
    mintlify_cleanup_enabled: bool = True,
    additional_element_types_to_discard: list[str] | None = None,
) -> ParsedHTML:
    if isinstance(page_content, str):
        soup = BeautifulSoup(page_content, "html.parser")
    else:
        soup = page_content

    title_tag = soup.find("title")
    title = None
    if title_tag and title_tag.text:
        title = title_tag.text
        title_tag.extract()

    # Heuristics based cleaning of elements based on css classes
    unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES)
    if mintlify_cleanup_enabled:
        unwanted_classes.extend(MINTLIFY_UNWANTED)
    for undesired_element in unwanted_classes:
        [
            tag.extract()
            for tag in soup.find_all(
                class_=lambda x: x and undesired_element in x.split()
            )
        ]

    for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
        [tag.extract() for tag in soup.find_all(undesired_tag)]

    if additional_element_types_to_discard:
        for undesired_tag in additional_element_types_to_discard:
            [tag.extract() for tag in soup.find_all(undesired_tag)]

    # 200B is ZeroWidthSpace which we don't care for
    page_text = format_document_soup(soup).replace("\u200B", "")

    return ParsedHTML(title=title, cleaned_text=page_text)
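A small illustrative call of standard_html_cleanup; the HTML snippet is invented and the exact cleaned text depends on format_document_soup:

# Illustrative only; the markup below is not from the commit.
from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup

html = "<html><head><title>Team Wiki</title></head><body><nav>menu</nav><p>Hello</p></body></html>"
parsed = standard_html_cleanup(html, additional_element_types_to_discard=["nav"])
print(parsed.title)         # "Team Wiki" -- the <title> tag is pulled out before cleaning
print(parsed.cleaned_text)  # remaining body text, with the discarded <nav> element removed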
@@ -9,6 +9,7 @@ from danswer.connectors.file.connector import LocalFileConnector
from danswer.connectors.github.connector import GithubConnector
from danswer.connectors.gong.connector import GongConnector
from danswer.connectors.google_drive.connector import GoogleDriveConnector
from danswer.connectors.google_site.connector import GoogleSitesConnector
from danswer.connectors.guru.connector import GuruConnector
from danswer.connectors.hubspot.connector import HubSpotConnector
from danswer.connectors.interfaces import BaseConnector
@@ -54,6 +55,7 @@ def identify_connector_class(
        DocumentSource.LINEAR: LinearConnector,
        DocumentSource.HUBSPOT: HubSpotConnector,
        DocumentSource.GONG: GongConnector,
        DocumentSource.GOOGLE_SITES: GoogleSitesConnector,
    }
    connector_by_source = connector_map.get(source, {})
@@ -1,6 +1,4 @@
import json
import os
import zipfile
from collections.abc import Generator
from pathlib import Path
from typing import Any
@@ -10,6 +8,8 @@ from PyPDF2 import PdfReader

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
from danswer.connectors.cross_connector_utils.file_utils import read_file
from danswer.connectors.file.utils import check_file_ext_is_valid
from danswer.connectors.file.utils import get_file_ext
from danswer.connectors.interfaces import GenerateDocumentsOutput
@@ -21,17 +21,6 @@ from danswer.utils.logger import setup_logger

logger = setup_logger()

_METADATA_FLAG = "#DANSWER_METADATA="


def _get_files_from_zip(
    zip_location: str | Path,
) -> Generator[tuple[str, IO[Any]], None, None]:
    with zipfile.ZipFile(zip_location, "r") as zip_file:
        for file_name in zip_file.namelist():
            with zip_file.open(file_name, "r") as file:
                yield os.path.basename(file_name), file


def _open_files_at_location(
    file_path: str | Path,
@@ -39,7 +28,8 @@ def _open_files_at_location(
    extension = get_file_ext(file_path)

    if extension == ".zip":
        yield from _get_files_from_zip(file_path)
        for file_info, file in load_files_from_zip(file_path, ignore_dirs=True):
            yield file_info.filename, file
    elif extension == ".txt" or extension == ".pdf":
        mode = "r"
        if extension == ".pdf":
@@ -56,7 +46,7 @@ def _process_file(file_name: str, file: IO[Any]) -> list[Document]:
        logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
        return []

    metadata = {}
    metadata: dict[str, Any] = {}
    file_content_raw = ""
    if extension == ".pdf":
        pdf_reader = PdfReader(file)
@@ -65,15 +55,7 @@ def _process_file(file_name: str, file: IO[Any]) -> list[Document]:
            page.extract_text() for page in pdf_reader.pages
        )
    else:
        for ind, line in enumerate(file):
            if isinstance(line, bytes):
                line = line.decode("utf-8")
            line = str(line)

            if ind == 0 and line.startswith(_METADATA_FLAG):
                metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
            else:
                file_content_raw += line
        file_content_raw, metadata = read_file(file)

    return [
        Document(
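For reference, the plain-text format the refactored branch above still supports (now parsed by read_file) is a file whose optional first line carries JSON metadata; the contents and key below are invented examples:

# An example .txt payload; the first line is parsed because it starts with
# "#DANSWER_METADATA=", everything after it becomes the document text.
example_txt = (
    '#DANSWER_METADATA={"tag": "example"}\n'
    "The actual document body starts here.\n"
)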
backend/danswer/connectors/google_site/connector.py (new file, 139 lines)
@@ -0,0 +1,139 @@
import os
import urllib.parse
from typing import Any
from typing import cast

from bs4 import BeautifulSoup
from bs4 import Tag

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
from danswer.connectors.cross_connector_utils.file_utils import read_file
from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document
from danswer.connectors.models import Section


def process_link(element: BeautifulSoup | Tag) -> str:
    href = cast(str | None, element.get("href"))
    if not href:
        raise RuntimeError(f"Invalid link - {element}")

    # cleanup href
    href = urllib.parse.unquote(href)
    href = href.rstrip(".html").lower()
    href = href.replace("_", "")
    href = href.replace(" ", "-")

    return href


def find_google_sites_page_path_from_navbar(
    element: BeautifulSoup | Tag, path: str, is_initial: bool
) -> str | None:
    ul = cast(Tag | None, element.find("ul"))
    if ul:
        if not is_initial:
            a = cast(Tag, element.find("a"))
            new_path = f"{path}/{process_link(a)}"
            if a.get("aria-selected") == "true":
                return new_path
        else:
            new_path = ""
        for li in ul.find_all("li", recursive=False):
            found_link = find_google_sites_page_path_from_navbar(li, new_path, False)
            if found_link:
                return found_link
    else:
        a = cast(Tag, element.find("a"))
        if a:
            href = process_link(a)
            if href and a.get("aria-selected") == "true":
                return path + "/" + href

    return None


class GoogleSitesConnector(LoadConnector):
    def __init__(
        self,
        zip_path: str,
        base_url: str,
        batch_size: int = INDEX_BATCH_SIZE,
    ):
        self.zip_path = zip_path
        self.base_url = base_url
        self.batch_size = batch_size

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        pass

    def load_from_state(self) -> GenerateDocumentsOutput:
        documents: list[Document] = []

        # load the HTML files
        files = load_files_from_zip(self.zip_path)
        for file_info, file_io in files:
            # skip non-published files
            if "/PUBLISHED/" not in file_info.filename:
                continue

            file_path, extension = os.path.splitext(file_info.filename)
            if extension != ".html":
                continue

            file_content, _ = read_file(file_io)
            soup = BeautifulSoup(file_content, "html.parser")

            # get the link out of the navbar
            header = cast(Tag, soup.find("header"))
            nav = cast(Tag, header.find("nav"))
            path = find_google_sites_page_path_from_navbar(nav, "", True)
            if not path:
                raise RuntimeError(f"Could not find path for {file_info.filename}")

            # cleanup the hidden `Skip to main content` and `Skip to navigation` that
            # appears at the top of every page
            for div in soup.find_all("div", attrs={"data-is-touch-wrapper": "true"}):
                div.extract()

            # get the body of the page
            parsed_html = standard_html_cleanup(
                soup, additional_element_types_to_discard=["header", "nav"]
            )

            title = parsed_html.title or file_path.split("/")[-1]
            documents.append(
                Document(
                    id=f"{DocumentSource.GOOGLE_SITES.value}:{path}",
                    source=DocumentSource.GOOGLE_SITES,
                    semantic_identifier=title,
                    sections=[
                        Section(
                            link=self.base_url.rstrip("/") + "/" + path.lstrip("/"),
                            text=parsed_html.cleaned_text,
                        )
                    ],
                    metadata={},
                )
            )

            if len(documents) >= self.batch_size:
                yield documents
                documents = []

        if documents:
            yield documents


if __name__ == "__main__":
    connector = GoogleSitesConnector(
        os.environ["GOOGLE_SITES_ZIP_PATH"],
        os.environ.get("GOOGLE_SITES_BASE_URL", ""),
    )
    for doc_batch in connector.load_from_state():
        for doc in doc_batch:
            print(doc)
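To make the navbar traversal in find_google_sites_page_path_from_navbar concrete, here is a toy check; the HTML fragment is invented and much simpler than a real Google Sites export:

from bs4 import BeautifulSoup

from danswer.connectors.google_site.connector import find_google_sites_page_path_from_navbar

# The currently open page is marked with aria-selected="true", as in exported pages.
navbar_html = """
<nav><ul>
  <li><a href="Home.html">Home</a></li>
  <li><a href="Docs.html">Docs</a>
    <ul><li><a href="Set up.html" aria-selected="true">Setup</a></li></ul>
  </li>
</ul></nav>
"""
nav = BeautifulSoup(navbar_html, "html.parser")
# Expected to print "/docs/set-up": process_link lowercases, trims the ".html"
# suffix, drops underscores, and turns spaces into dashes.
print(find_google_sites_page_path_from_navbar(nav, "", True))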
@@ -1,5 +1,4 @@
import io
from copy import copy
from datetime import datetime
from enum import Enum
from typing import Any
@@ -18,25 +17,20 @@ from PyPDF2 import PdfReader
from requests_oauthlib import OAuth2Session  # type:ignore

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger
from danswer.utils.text_processing import format_document_soup

logger = setup_logger()


MINTLIFY_UNWANTED = ["sticky", "hidden"]


class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
    # Given a base site, index everything under that path
    RECURSIVE = "recursive"
@@ -224,36 +218,16 @@ class WebConnector(LoadConnector):
                if link not in visited_links:
                    to_visit.append(link)

            title_tag = soup.find("title")
            title = None
            if title_tag and title_tag.text:
                title = title_tag.text
                title_tag.extract()

            # Heuristics based cleaning of elements based on css classes
            unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES)
            if self.mintlify_cleanup:
                unwanted_classes.extend(MINTLIFY_UNWANTED)
            for undesired_element in unwanted_classes:
                [
                    tag.extract()
                    for tag in soup.find_all(
                        class_=lambda x: x and undesired_element in x.split()
                    )
                ]

            for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
                [tag.extract() for tag in soup.find_all(undesired_tag)]

            # 200B is ZeroWidthSpace which we don't care for
            page_text = format_document_soup(soup).replace("\u200B", "")
            parsed_html = standard_html_cleanup(soup, self.mintlify_cleanup)

            doc_batch.append(
                Document(
                    id=current_url,
                    sections=[Section(link=current_url, text=page_text)],
                    sections=[
                        Section(link=current_url, text=parsed_html.cleaned_text)
                    ],
                    source=DocumentSource.WEB,
                    semantic_identifier=title or current_url,
                    semantic_identifier=parsed_html.title or current_url,
                    metadata={},
                )
            )
BIN  web/public/GoogleSites.png (new file, 5.4 KiB; binary file not shown)
@@ -8,7 +8,6 @@ import { fetcher } from "@/lib/fetcher";
import { HealthCheckBanner } from "@/components/health/healthcheck";
import { ConnectorIndexingStatus, FileConfig } from "@/lib/types";
import { linkCredential } from "@/lib/credential";
import { FileUpload } from "./FileUpload";
import { useState } from "react";
import { usePopup } from "@/components/admin/connectors/Popup";
import { createConnector, runConnector } from "@/lib/connector";
@@ -17,6 +16,7 @@ import { SingleUseConnectorsTable } from "@/components/admin/connectors/table/Si
import { LoadingAnimation } from "@/components/Loading";
import { Form, Formik } from "formik";
import { TextFormField } from "@/components/admin/connectors/Field";
import { FileUpload } from "@/components/admin/connectors/FileUpload";

const getNameFromPath = (path: string) => {
  const pathParts = path.split("/");
web/src/app/admin/connectors/google-sites/page.tsx (new file, 241 lines)
@@ -0,0 +1,241 @@
"use client";

import useSWR, { useSWRConfig } from "swr";
import * as Yup from "yup";

import { LoadingAnimation } from "@/components/Loading";
import { GoogleSitesIcon } from "@/components/icons/icons";
import { fetcher } from "@/lib/fetcher";
import { TextFormField } from "@/components/admin/connectors/Field";
import { HealthCheckBanner } from "@/components/health/healthcheck";
import { ConnectorIndexingStatus, GoogleSitesConfig } from "@/lib/types";
import { Form, Formik } from "formik";
import { useState } from "react";
import { usePopup } from "@/components/admin/connectors/Popup";
import { createConnector, runConnector } from "@/lib/connector";
import { linkCredential } from "@/lib/credential";
import { FileUpload } from "@/components/admin/connectors/FileUpload";
import { SingleUseConnectorsTable } from "@/components/admin/connectors/table/SingleUseConnectorsTable";
import { Spinner } from "@/components/Spinner";

export default function GoogleSites() {
  const { mutate } = useSWRConfig();
  const [selectedFiles, setSelectedFiles] = useState<File[]>([]);
  const [filesAreUploading, setFilesAreUploading] = useState<boolean>(false);
  const { popup, setPopup } = usePopup();

  const {
    data: connectorIndexingStatuses,
    isLoading: isConnectorIndexingStatusesLoading,
    error: isConnectorIndexingStatusesError,
  } = useSWR<ConnectorIndexingStatus<any, any>[]>(
    "/api/manage/admin/connector/indexing-status",
    fetcher
  );

  const googleSitesIndexingStatuses: ConnectorIndexingStatus<
    GoogleSitesConfig,
    {}
  >[] =
    connectorIndexingStatuses?.filter(
      (connectorIndexingStatus) =>
        connectorIndexingStatus.connector.source === "google_sites"
    ) ?? [];

  return (
    <>
      {popup}
      {filesAreUploading && <Spinner />}
      <div className="mx-auto container">
        <div className="mb-4">
          <HealthCheckBanner />
        </div>
        <div className="border-solid border-gray-600 border-b pb-2 mb-4 flex">
          <GoogleSitesIcon size={32} />
          <h1 className="text-3xl font-bold pl-2">Google Sites</h1>
        </div>
        <p className="text-sm mb-2">
          For an in-depth guide on how to setup this connector, check out{" "}
          <a
            href="https://docs.danswer.dev/connectors/google-sites"
            target="_blank"
            className="text-blue-500"
          >
            the documentation
          </a>
          .
        </p>

        <div className="mt-4">
          <h2 className="font-bold text-xl mb-2">Upload Files</h2>
          <div className="mx-auto w-full">
            <Formik
              initialValues={{
                base_url: "",
              }}
              validationSchema={Yup.object().shape({
                base_url: Yup.string().required("Base URL is required"),
              })}
              onSubmit={async (values, formikHelpers) => {
                const uploadCreateAndTriggerConnector = async () => {
                  const formData = new FormData();

                  selectedFiles.forEach((file) => {
                    formData.append("files", file);
                  });

                  const response = await fetch(
                    "/api/manage/admin/connector/file/upload",
                    { method: "POST", body: formData }
                  );
                  const responseJson = await response.json();
                  if (!response.ok) {
                    setPopup({
                      message: `Unable to upload files - ${responseJson.detail}`,
                      type: "error",
                    });
                    return;
                  }

                  const filePaths = responseJson.file_paths as string[];
                  const [connectorErrorMsg, connector] =
                    await createConnector<GoogleSitesConfig>({
                      name: `GoogleSitesConnector-${values.base_url}`,
                      source: "google_sites",
                      input_type: "load_state",
                      connector_specific_config: {
                        base_url: values.base_url,
                        zip_path: filePaths[0],
                      },
                      refresh_freq: null,
                      disabled: false,
                    });
                  if (connectorErrorMsg || !connector) {
                    setPopup({
                      message: `Unable to create connector - ${connectorErrorMsg}`,
                      type: "error",
                    });
                    return;
                  }

                  const credentialResponse = await linkCredential(
                    connector.id,
                    0,
                    values.base_url
                  );
                  if (!credentialResponse.ok) {
                    const credentialResponseJson =
                      await credentialResponse.json();
                    setPopup({
                      message: `Unable to link connector to credential - ${credentialResponseJson.detail}`,
                      type: "error",
                    });
                    return;
                  }

                  const runConnectorErrorMsg = await runConnector(
                    connector.id,
                    [0]
                  );
                  if (runConnectorErrorMsg) {
                    setPopup({
                      message: `Unable to run connector - ${runConnectorErrorMsg}`,
                      type: "error",
                    });
                    return;
                  }

                  mutate("/api/manage/admin/connector/indexing-status");
                  setSelectedFiles([]);
                  formikHelpers.resetForm();
                  setPopup({
                    type: "success",
                    message: "Successfully uploaded files!",
                  });
                };

                setFilesAreUploading(true);
                try {
                  await uploadCreateAndTriggerConnector();
                } catch (e) {
                  console.log("Failed to index filels: ", e);
                }
                setFilesAreUploading(false);
              }}
            >
              {({ values, isSubmitting }) => (
                <Form className="p-3 border border-gray-600 rounded">
                  <TextFormField
                    name="base_url"
                    label="Base URL:"
                    placeholder={`Base URL of your Google Site e.g. https://sites.google.com/view/your-site`}
                    subtext="This will be used to generate links for each page."
                    autoCompleteDisabled={true}
                  />

                  <p className="mb-1 font-medium">Files:</p>
                  <FileUpload
                    selectedFiles={selectedFiles}
                    setSelectedFiles={setSelectedFiles}
                    message="Upload a zip file containing the HTML of your Google Site"
                  />
                  <button
                    className={
                      "bg-slate-500 hover:bg-slate-700 text-white " +
                      "font-bold py-2 px-4 rounded focus:outline-none " +
                      "focus:shadow-outline w-full mx-auto mt-4"
                    }
                    type="submit"
                    disabled={
                      selectedFiles.length !== 1 ||
                      !values.base_url ||
                      isSubmitting
                    }
                  >
                    Upload!
                  </button>
                </Form>
              )}
            </Formik>
          </div>
        </div>

        <h2 className="font-bold mb-2 mt-6 ml-auto mr-auto">
          Existing Google Site Connectors
        </h2>
        {isConnectorIndexingStatusesLoading ? (
          <LoadingAnimation text="Loading" />
        ) : isConnectorIndexingStatusesError || !connectorIndexingStatuses ? (
          <div>Error loading indexing history</div>
        ) : googleSitesIndexingStatuses.length > 0 ? (
          <SingleUseConnectorsTable<GoogleSitesConfig, {}>
            connectorIndexingStatuses={googleSitesIndexingStatuses}
            specialColumns={[
              {
                header: "Base URL",
                key: "base_url",
                getValue: (ccPairStatus) => {
                  const connectorConfig =
                    ccPairStatus.connector.connector_specific_config;
                  return (
                    <a
                      className="text-blue-500"
                      href={connectorConfig.base_url}
                    >
                      {connectorConfig.base_url}
                    </a>
                  );
                },
              },
            ]}
            onUpdate={() =>
              mutate("/api/manage/admin/connector/indexing-status")
            }
          />
        ) : (
          <p className="text-sm">No indexed Google Sites found</p>
        )}
      </div>
    </>
  );
}
@@ -22,6 +22,7 @@ import {
  HubSpotIcon,
  BookmarkIcon,
  CPUIcon,
  GoogleSitesIcon,
} from "@/components/icons/icons";
import { getAuthDisabledSS, getCurrentUserSS } from "@/lib/userSS";
import { redirect } from "next/navigation";
@@ -173,6 +174,15 @@ export async function Layout({ children }: { children: React.ReactNode }) {
            ),
            link: "/admin/connectors/zulip",
          },
          {
            name: (
              <div className="flex">
                <GoogleSitesIcon size={16} />
                <div className="ml-1">Google Sites</div>
              </div>
            ),
            link: "/admin/connectors/google-sites",
          },
          {
            name: (
              <div className="flex">
@@ -1,16 +1,17 @@
// components/FileUpload.tsx
import { ChangeEvent, FC, useState } from "react";
import { FC, useState } from "react";
import React from "react";
import Dropzone from "react-dropzone";

interface FileUploadProps {
  selectedFiles: File[];
  setSelectedFiles: (files: File[]) => void;
  message?: string;
}

export const FileUpload: FC<FileUploadProps> = ({
  selectedFiles,
  setSelectedFiles,
  message,
}) => {
  const [dragActive, setDragActive] = useState(false);
@@ -35,7 +36,10 @@ export const FileUpload: FC<FileUploadProps> = ({
          }
        >
          <input {...getInputProps()} />
          <b>Drag and drop some files here, or click to select files</b>
          <b>
            {message ||
              "Drag and drop some files here, or click to select files"}
          </b>
        </div>
      </section>
    )}
@@ -1,10 +1,4 @@
import {
  Connector,
  ConnectorIndexingStatus,
  Credential,
  DeletionAttemptSnapshot,
  ValidStatuses,
} from "@/lib/types";
import { DeletionAttemptSnapshot, ValidStatuses } from "@/lib/types";
import { BasicTable } from "@/components/admin/connectors/BasicTable";
import { Popup } from "@/components/admin/connectors/Popup";
import { useState } from "react";
@@ -64,17 +58,19 @@ export function SingleUseConnectorsTable<
  const connectorIncludesCredential =
    getCredential !== undefined && onCredentialLink !== undefined;

  const columns = [
    {
  const columns = [];

  if (includeName) {
    columns.push({
      header: "Name",
      key: "name",
    },
    ...(specialColumns ?? []),
    {
      header: "Status",
      key: "status",
    },
  ];
    });
  }
  columns.push(...(specialColumns ?? []));
  columns.push({
    header: "Status",
    key: "status",
  });
  if (connectorIncludesCredential) {
    columns.push({
      header: "Credential",
@@ -43,6 +43,7 @@ import gongIcon from "../../../public/Gong.png";
import zulipIcon from "../../../public/Zulip.png";
import linearIcon from "../../../public/Linear.png";
import hubSpotIcon from "../../../public/HubSpot.png";
import googleSitesIcon from "../../../public/GoogleSites.png";

interface IconProps {
  size?: number;
@@ -450,3 +451,17 @@ export const HubSpotIcon = ({
    </div>
  );
};

export const GoogleSitesIcon = ({
  size = 16,
  className = defaultTailwindCSS,
}: IconProps) => {
  return (
    <div
      style={{ width: `${size}px`, height: `${size}px` }}
      className={`w-[${size}px] h-[${size}px] ` + className}
    >
      <Image src={googleSitesIcon} alt="Logo" width="96" height="96" />
    </div>
  );
};
@@ -29,6 +29,7 @@ const sources: Source[] = [
  { displayName: "Zulip", internalName: "zulip" },
  { displayName: "Linear", internalName: "linear" },
  { displayName: "HubSpot", internalName: "hubspot" },
  { displayName: "Google Sites", internalName: "google_sites" },
];

interface SourceSelectorProps {
@@ -16,6 +16,7 @@ import {
  SlackIcon,
  ZulipIcon,
  HubSpotIcon,
  GoogleSitesIcon,
} from "./icons/icons";

interface SourceMetadata {
@@ -122,6 +123,12 @@ export const getSourceMetadata = (sourceType: ValidSources): SourceMetadata => {
        displayName: "HubSpot",
        adminPageLink: "/admin/connectors/hubspot",
      };
    case "google_sites":
      return {
        icon: GoogleSitesIcon,
        displayName: "Google Sites",
        adminPageLink: "/admin/connectors/google-sites",
      };
    default:
      throw new Error("Invalid source type");
  }
@@ -23,7 +23,8 @@ export type ValidSources =
  | "zulip"
  | "linear"
  | "hubspot"
  | "file";
  | "file"
  | "google_sites";
export type ValidInputTypes = "load_state" | "poll" | "event";
export type ValidStatuses =
  | "success"
@@ -114,6 +115,11 @@ export interface NotionConfig {}

export interface HubSpotConfig {}

export interface GoogleSitesConfig {
  zip_path: string;
  base_url: string;
}

export interface IndexAttemptSnapshot {
  status: ValidStatuses | null;
  num_docs_indexed: number;
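For reference, the connector_specific_config payload that the admin page above submits matches this interface; a sketch with placeholder values:

# Placeholder values only; zip_path is taken from the file_paths returned by
# the /api/manage/admin/connector/file/upload endpoint used in the admin page.
google_sites_connector_specific_config = {
    "base_url": "https://sites.google.com/view/your-site",
    "zip_path": "<path returned by the file upload endpoint>",
}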