From d95da554ea2955a8610fac440938072ddcf0cf4e Mon Sep 17 00:00:00 2001
From: Chris Weaver <25087905+Weves@users.noreply.github.com>
Date: Sun, 8 Oct 2023 19:20:38 -0700
Subject: [PATCH] Add Google Sites connector (#532)

---
 backend/danswer/configs/constants.py          |   1 +
 .../cross_connector_utils/file_utils.py       |  49 ++++
 .../cross_connector_utils/html_utils.py       |  57 +++++
 backend/danswer/connectors/factory.py         |   2 +
 backend/danswer/connectors/file/connector.py  |  30 +--
 .../connectors/google_site/connector.py       | 139 ++++++++++
 backend/danswer/connectors/web/connector.py   |  38 +--
 web/public/GoogleSites.png                    | Bin 0 -> 5539 bytes
 web/src/app/admin/connectors/file/page.tsx    |   2 +-
 .../admin/connectors/google-sites/page.tsx    | 241 ++++++++++++++++++
 web/src/components/admin/Layout.tsx           |  10 +
 .../admin/connectors}/FileUpload.tsx          |  10 +-
 .../table/SingleUseConnectorsTable.tsx        |  28 +-
 web/src/components/icons/icons.tsx            |  15 ++
 web/src/components/search/Filters.tsx         |   1 +
 web/src/components/source.tsx                 |   7 +
 web/src/lib/types.ts                          |   8 +-
 17 files changed, 561 insertions(+), 77 deletions(-)
 create mode 100644 backend/danswer/connectors/cross_connector_utils/file_utils.py
 create mode 100644 backend/danswer/connectors/cross_connector_utils/html_utils.py
 create mode 100644 backend/danswer/connectors/google_site/connector.py
 create mode 100644 web/public/GoogleSites.png
 create mode 100644 web/src/app/admin/connectors/google-sites/page.tsx
 rename web/src/{app/admin/connectors/file => components/admin/connectors}/FileUpload.tsx (87%)

diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py
index e651a2e86d23..319170a8bc19 100644
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@@ -61,6 +61,7 @@ class DocumentSource(str, Enum):
     LINEAR = "linear"
     HUBSPOT = "hubspot"
     GONG = "gong"
+    GOOGLE_SITES = "google_sites"
 
 
 class DocumentIndexType(str, Enum):
diff --git a/backend/danswer/connectors/cross_connector_utils/file_utils.py b/backend/danswer/connectors/cross_connector_utils/file_utils.py
new file mode 100644
index 000000000000..89360f39de89
--- /dev/null
+++ b/backend/danswer/connectors/cross_connector_utils/file_utils.py
@@ -0,0 +1,49 @@
+import json
+import os
+import zipfile
+from collections.abc import Generator
+from pathlib import Path
+from typing import Any
+from typing import IO
+
+_METADATA_FLAG = "#DANSWER_METADATA="
+
+
+def is_macos_resource_fork_file(file_name: str) -> bool:
+    return os.path.basename(file_name).startswith("._") and file_name.startswith(
+        "__MACOSX"
+    )
+
+
+def load_files_from_zip(
+    zip_location: str | Path,
+    ignore_macos_resource_fork_files: bool = True,
+    ignore_dirs: bool = True,
+) -> Generator[tuple[zipfile.ZipInfo, IO[Any]], None, None]:
+    with zipfile.ZipFile(zip_location, "r") as zip_file:
+        for file_info in zip_file.infolist():
+            with zip_file.open(file_info.filename, "r") as file:
+                if ignore_dirs and file_info.is_dir():
+                    continue
+
+                if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
+                    file_info.filename
+                ):
+                    continue
+                yield file_info, file
+
+
+def read_file(file_reader: IO[Any]) -> tuple[str, dict[str, Any]]:
+    metadata = {}
+    file_content_raw = ""
+    for ind, line in enumerate(file_reader):
+        if isinstance(line, bytes):
+            line = line.decode("utf-8")
+        line = str(line)
+
+        if ind == 0 and line.startswith(_METADATA_FLAG):
+            metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
+        else:
+            file_content_raw += line
+
+    return file_content_raw, metadata
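A quick illustration of the contract of these helpers may help reviewers: load_files_from_zip yields (ZipInfo, file handle) pairs for non-directory members, and read_file parses an optional first line of the form #DANSWER_METADATA={...} into a metadata dict, returning the remaining text as content. A minimal sketch (the archive path and its contents are invented for illustration):

    import zipfile

    from danswer.connectors.cross_connector_utils.file_utils import (
        load_files_from_zip,
        read_file,
    )

    # Build a throwaway archive whose single member carries a metadata line.
    with zipfile.ZipFile("/tmp/example.zip", "w") as zf:
        zf.writestr(
            "notes.txt",
            '#DANSWER_METADATA={"link": "https://example.com/doc"}\nActual contents.\n',
        )

    for file_info, file_io in load_files_from_zip("/tmp/example.zip"):
        content, metadata = read_file(file_io)
        print(metadata)  # {'link': 'https://example.com/doc'}
        print(content)   # "Actual contents.\n" - the metadata line is stripped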
diff --git a/backend/danswer/connectors/cross_connector_utils/html_utils.py b/backend/danswer/connectors/cross_connector_utils/html_utils.py
new file mode 100644
index 000000000000..ef860fe1f596
--- /dev/null
+++ b/backend/danswer/connectors/cross_connector_utils/html_utils.py
@@ -0,0 +1,57 @@
+from copy import copy
+from dataclasses import dataclass
+
+from bs4 import BeautifulSoup
+
+from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
+from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
+from danswer.utils.text_processing import format_document_soup
+
+MINTLIFY_UNWANTED = ["sticky", "hidden"]
+
+
+@dataclass
+class ParsedHTML:
+    title: str | None
+    cleaned_text: str
+
+
+def standard_html_cleanup(
+    page_content: str | BeautifulSoup,
+    mintlify_cleanup_enabled: bool = True,
+    additional_element_types_to_discard: list[str] | None = None,
+) -> ParsedHTML:
+    if isinstance(page_content, str):
+        soup = BeautifulSoup(page_content, "html.parser")
+    else:
+        soup = page_content
+
+    title_tag = soup.find("title")
+    title = None
+    if title_tag and title_tag.text:
+        title = title_tag.text
+        title_tag.extract()
+
+    # Heuristics-based cleaning of elements based on CSS classes
+    unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES)
+    if mintlify_cleanup_enabled:
+        unwanted_classes.extend(MINTLIFY_UNWANTED)
+    for undesired_element in unwanted_classes:
+        [
+            tag.extract()
+            for tag in soup.find_all(
+                class_=lambda x: x and undesired_element in x.split()
+            )
+        ]
+
+    for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
+        [tag.extract() for tag in soup.find_all(undesired_tag)]
+
+    if additional_element_types_to_discard:
+        for undesired_tag in additional_element_types_to_discard:
+            [tag.extract() for tag in soup.find_all(undesired_tag)]
+
+    # 200B is ZeroWidthSpace which we don't care for
+    page_text = format_document_soup(soup).replace("\u200B", "")
+
+    return ParsedHTML(title=title, cleaned_text=page_text)
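standard_html_cleanup centralizes the title extraction and class/element stripping that previously lived inline in the web connector. A short sketch of how it behaves on raw HTML (the exact text layout depends on format_document_soup, so the expected outputs are approximate; with the default mintlify_cleanup_enabled=True the "hidden" class is stripped):

    from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup

    html = (
        "<html><head><title>My Page</title></head>"
        "<body><div class='hidden'>skip link</div><p>Hello world</p></body></html>"
    )
    parsed = standard_html_cleanup(html)
    print(parsed.title)         # My Page
    print(parsed.cleaned_text)  # roughly "Hello world"; the hidden div is gone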
diff --git a/backend/danswer/connectors/factory.py b/backend/danswer/connectors/factory.py
index 101c8d1b6d7a..2d3af2003b50 100644
--- a/backend/danswer/connectors/factory.py
+++ b/backend/danswer/connectors/factory.py
@@ -9,6 +9,7 @@ from danswer.connectors.file.connector import LocalFileConnector
 from danswer.connectors.github.connector import GithubConnector
 from danswer.connectors.gong.connector import GongConnector
 from danswer.connectors.google_drive.connector import GoogleDriveConnector
+from danswer.connectors.google_site.connector import GoogleSitesConnector
 from danswer.connectors.guru.connector import GuruConnector
 from danswer.connectors.hubspot.connector import HubSpotConnector
 from danswer.connectors.interfaces import BaseConnector
@@ -54,6 +55,7 @@ def identify_connector_class(
         DocumentSource.LINEAR: LinearConnector,
         DocumentSource.HUBSPOT: HubSpotConnector,
         DocumentSource.GONG: GongConnector,
+        DocumentSource.GOOGLE_SITES: GoogleSitesConnector,
     }
     connector_by_source = connector_map.get(source, {})
diff --git a/backend/danswer/connectors/file/connector.py b/backend/danswer/connectors/file/connector.py
index e31410cf74a2..d3097249f958 100644
--- a/backend/danswer/connectors/file/connector.py
+++ b/backend/danswer/connectors/file/connector.py
@@ -1,6 +1,4 @@
-import json
 import os
-import zipfile
 from collections.abc import Generator
 from pathlib import Path
 from typing import Any
@@ -10,6 +8,8 @@ from PyPDF2 import PdfReader
 
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
+from danswer.connectors.cross_connector_utils.file_utils import read_file
 from danswer.connectors.file.utils import check_file_ext_is_valid
 from danswer.connectors.file.utils import get_file_ext
 from danswer.connectors.interfaces import GenerateDocumentsOutput
@@ -21,17 +21,6 @@ from danswer.utils.logger import setup_logger
 
 logger = setup_logger()
 
-_METADATA_FLAG = "#DANSWER_METADATA="
-
-
-def _get_files_from_zip(
-    zip_location: str | Path,
-) -> Generator[tuple[str, IO[Any]], None, None]:
-    with zipfile.ZipFile(zip_location, "r") as zip_file:
-        for file_name in zip_file.namelist():
-            with zip_file.open(file_name, "r") as file:
-                yield os.path.basename(file_name), file
-
 
 def _open_files_at_location(
     file_path: str | Path,
@@ -39,7 +28,8 @@
     extension = get_file_ext(file_path)
 
     if extension == ".zip":
-        yield from _get_files_from_zip(file_path)
+        for file_info, file in load_files_from_zip(file_path, ignore_dirs=True):
+            yield file_info.filename, file
     elif extension == ".txt" or extension == ".pdf":
         mode = "r"
         if extension == ".pdf":
@@ -56,7 +46,7 @@ def _process_file(file_name: str, file: IO[Any]) -> list[Document]:
         logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
         return []
 
-    metadata = {}
+    metadata: dict[str, Any] = {}
     file_content_raw = ""
     if extension == ".pdf":
         pdf_reader = PdfReader(file)
@@ -65,15 +55,7 @@
             page.extract_text() for page in pdf_reader.pages
         )
     else:
-        for ind, line in enumerate(file):
-            if isinstance(line, bytes):
-                line = line.decode("utf-8")
-            line = str(line)
-
-            if ind == 0 and line.startswith(_METADATA_FLAG):
-                metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
-            else:
-                file_content_raw += line
+        file_content_raw, metadata = read_file(file)
 
     return [
         Document(
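One subtle behavioral change in this refactor: the old _get_files_from_zip yielded os.path.basename(file_name), while the zip branch above yields file_info.filename, so members nested inside directories now keep their archive-relative paths. A small sketch under an invented archive layout:

    import zipfile

    from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip

    with zipfile.ZipFile("/tmp/docs.zip", "w") as zf:
        zf.writestr("guides/setup.txt", "hello")

    # Mirrors the new zip branch of _open_files_at_location.
    for file_info, file in load_files_from_zip("/tmp/docs.zip", ignore_dirs=True):
        print(file_info.filename)  # "guides/setup.txt" (previously just "setup.txt")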
diff --git a/backend/danswer/connectors/google_site/connector.py b/backend/danswer/connectors/google_site/connector.py
new file mode 100644
index 000000000000..f064db98fd0a
--- /dev/null
+++ b/backend/danswer/connectors/google_site/connector.py
@@ -0,0 +1,139 @@
+import os
+import urllib.parse
+from typing import Any
+from typing import cast
+
+from bs4 import BeautifulSoup
+from bs4 import Tag
+
+from danswer.configs.app_configs import INDEX_BATCH_SIZE
+from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
+from danswer.connectors.cross_connector_utils.file_utils import read_file
+from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
+from danswer.connectors.interfaces import GenerateDocumentsOutput
+from danswer.connectors.interfaces import LoadConnector
+from danswer.connectors.models import Document
+from danswer.connectors.models import Section
+
+
+def process_link(element: BeautifulSoup | Tag) -> str:
+    href = cast(str | None, element.get("href"))
+    if not href:
+        raise RuntimeError(f"Invalid link - {element}")
+
+    # clean up href
+    href = urllib.parse.unquote(href)
+    href = href.removesuffix(".html").lower()
+    href = href.replace("_", "")
+    href = href.replace(" ", "-")
+
+    return href
+
+
+def find_google_sites_page_path_from_navbar(
+    element: BeautifulSoup | Tag, path: str, is_initial: bool
+) -> str | None:
+    ul = cast(Tag | None, element.find("ul"))
+    if ul:
+        if not is_initial:
+            a = cast(Tag, element.find("a"))
+            new_path = f"{path}/{process_link(a)}"
+            if a.get("aria-selected") == "true":
+                return new_path
+        else:
+            new_path = ""
+        for li in ul.find_all("li", recursive=False):
+            found_link = find_google_sites_page_path_from_navbar(li, new_path, False)
+            if found_link:
+                return found_link
+    else:
+        a = cast(Tag, element.find("a"))
+        if a:
+            href = process_link(a)
+            if href and a.get("aria-selected") == "true":
+                return path + "/" + href
+
+    return None
+
+
+class GoogleSitesConnector(LoadConnector):
+    def __init__(
+        self,
+        zip_path: str,
+        base_url: str,
+        batch_size: int = INDEX_BATCH_SIZE,
+    ):
+        self.zip_path = zip_path
+        self.base_url = base_url
+        self.batch_size = batch_size
+
+    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
+        pass
+
+    def load_from_state(self) -> GenerateDocumentsOutput:
+        documents: list[Document] = []
+
+        # load the HTML files
+        files = load_files_from_zip(self.zip_path)
+        for file_info, file_io in files:
+            # skip non-published files
+            if "/PUBLISHED/" not in file_info.filename:
+                continue
+
+            file_path, extension = os.path.splitext(file_info.filename)
+            if extension != ".html":
+                continue
+
+            file_content, _ = read_file(file_io)
+            soup = BeautifulSoup(file_content, "html.parser")
+
+            # get the link out of the navbar
+            header = cast(Tag, soup.find("header"))
+            nav = cast(Tag, header.find("nav"))
+            path = find_google_sites_page_path_from_navbar(nav, "", True)
+            if not path:
+                raise RuntimeError(f"Could not find path for {file_info.filename}")
+
+            # clean up the hidden `Skip to main content` and `Skip to navigation`
+            # links that appear at the top of every page
+            for div in soup.find_all("div", attrs={"data-is-touch-wrapper": "true"}):
+                div.extract()
+
+            # get the body of the page
+            parsed_html = standard_html_cleanup(
+                soup, additional_element_types_to_discard=["header", "nav"]
+            )
+
+            title = parsed_html.title or file_path.split("/")[-1]
+            documents.append(
+                Document(
+                    id=f"{DocumentSource.GOOGLE_SITES.value}:{path}",
+                    source=DocumentSource.GOOGLE_SITES,
+                    semantic_identifier=title,
+                    sections=[
+                        Section(
+                            link=self.base_url.rstrip("/") + "/" + path.lstrip("/"),
+                            text=parsed_html.cleaned_text,
+                        )
+                    ],
+                    metadata={},
+                )
+            )
+
+            if len(documents) >= self.batch_size:
+                yield documents
+                documents = []
+
+        if documents:
+            yield documents
+
+
+if __name__ == "__main__":
+    connector = GoogleSitesConnector(
+        os.environ["GOOGLE_SITES_ZIP_PATH"],
+        os.environ.get("GOOGLE_SITES_BASE_URL", ""),
+    )
+    for doc_batch in connector.load_from_state():
+        for doc in doc_batch:
+            print(doc)
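To make the navbar walk concrete: process_link normalizes an anchor's href (URL-unquote, drop a trailing ".html", lowercase, drop underscores, spaces to dashes - removesuffix rather than rstrip, since rstrip(".html") strips a character set and would mangle a page like Math.html into "ma"), and find_google_sites_page_path_from_navbar returns the accumulated path of the entry marked aria-selected="true". A toy example, assuming the package is importable; real exports nest more deeply:

    from bs4 import BeautifulSoup

    from danswer.connectors.google_site.connector import (
        find_google_sites_page_path_from_navbar,
    )

    nav_html = """
    <nav><ul>
      <li><a href="Home.html">Home</a></li>
      <li><a href="Team%20Docs.html" aria-selected="true">Team Docs</a></li>
    </ul></nav>
    """
    soup = BeautifulSoup(nav_html, "html.parser")
    print(find_google_sites_page_path_from_navbar(soup.find("nav"), "", True))
    # -> /team-docs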
diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py
index b208b5e97941..3d56f5c4374a 100644
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@@ -1,5 +1,4 @@
 import io
-from copy import copy
 from datetime import datetime
 from enum import Enum
 from typing import Any
@@ -18,25 +17,20 @@ from PyPDF2 import PdfReader
 from requests_oauthlib import OAuth2Session  # type:ignore
 
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
-from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
-from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
-from danswer.utils.text_processing import format_document_soup
 
 logger = setup_logger()
 
-MINTLIFY_UNWANTED = ["sticky", "hidden"]
-
-
 class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
     # Given a base site, index everything under that path
     RECURSIVE = "recursive"
@@ -224,36 +218,16 @@
                     if link not in visited_links:
                         to_visit.append(link)
 
-                title_tag = soup.find("title")
-                title = None
-                if title_tag and title_tag.text:
-                    title = title_tag.text
-                    title_tag.extract()
-
-                # Heuristics based cleaning of elements based on css classes
-                unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES)
-                if self.mintlify_cleanup:
-                    unwanted_classes.extend(MINTLIFY_UNWANTED)
-                for undesired_element in unwanted_classes:
-                    [
-                        tag.extract()
-                        for tag in soup.find_all(
-                            class_=lambda x: x and undesired_element in x.split()
-                        )
-                    ]
-
-                for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
-                    [tag.extract() for tag in soup.find_all(undesired_tag)]
-
-                # 200B is ZeroWidthSpace which we don't care for
-                page_text = format_document_soup(soup).replace("\u200B", "")
+                parsed_html = standard_html_cleanup(soup, self.mintlify_cleanup)
 
                 doc_batch.append(
                     Document(
                         id=current_url,
-                        sections=[Section(link=current_url, text=page_text)],
+                        sections=[
+                            Section(link=current_url, text=parsed_html.cleaned_text)
+                        ],
                         source=DocumentSource.WEB,
-                        semantic_identifier=title or current_url,
+                        semantic_identifier=parsed_html.title or current_url,
                         metadata={},
                     )
                 )
diff --git a/web/public/GoogleSites.png b/web/public/GoogleSites.png
new file mode 100644
index 0000000000000000000000000000000000000000..a01ab45deef92a45e2efedc9740a350e56c49036
[5539-byte binary PNG (Google Sites logo); literal binary patch data omitted]
diff --git a/web/src/app/admin/connectors/file/page.tsx b/web/src/app/admin/connectors/file/page.tsx
index 20752069fc28..ad8976d09b39 100644
--- a/web/src/app/admin/connectors/file/page.tsx
+++ b/web/src/app/admin/connectors/file/page.tsx
@@ -8,7 +8,6 @@ import { fetcher } from "@/lib/fetcher";
 import { HealthCheckBanner } from "@/components/health/healthcheck";
 import { ConnectorIndexingStatus, FileConfig } from "@/lib/types";
 import { linkCredential } from "@/lib/credential";
-import { FileUpload } from "./FileUpload";
 import { useState } from "react";
 import { usePopup } from "@/components/admin/connectors/Popup";
 import { createConnector, runConnector } from "@/lib/connector";
@@ -17,6 +16,7 @@
 import { SingleUseConnectorsTable } from "@/components/admin/connectors/table/SingleUseConnectorsTable";
 import { LoadingAnimation } from "@/components/Loading";
 import { Form, Formik } from "formik";
 import { TextFormField } from "@/components/admin/connectors/Field";
+import { FileUpload } from "@/components/admin/connectors/FileUpload";
 
 const getNameFromPath = (path: string) => {
   const pathParts = path.split("/");
diff --git a/web/src/app/admin/connectors/google-sites/page.tsx b/web/src/app/admin/connectors/google-sites/page.tsx
new file mode 100644
index 000000000000..61e25fcc9ac2
--- /dev/null
+++ b/web/src/app/admin/connectors/google-sites/page.tsx
@@ -0,0 +1,241 @@
+"use client";
+
+import useSWR, { useSWRConfig } from "swr";
+import * as Yup from "yup";
+
+import { LoadingAnimation } from "@/components/Loading";
+import { GoogleSitesIcon } from "@/components/icons/icons";
+import { fetcher } from "@/lib/fetcher";
+import { TextFormField } from "@/components/admin/connectors/Field";
+import { HealthCheckBanner } from "@/components/health/healthcheck";
+import { ConnectorIndexingStatus, GoogleSitesConfig } from "@/lib/types";
+import { Form, Formik } from "formik";
+import { useState } from "react";
+import { usePopup } from "@/components/admin/connectors/Popup";
+import { createConnector, runConnector } from "@/lib/connector";
+import { linkCredential } from "@/lib/credential";
+import { FileUpload } from "@/components/admin/connectors/FileUpload";
+import { SingleUseConnectorsTable } from "@/components/admin/connectors/table/SingleUseConnectorsTable";
+import { Spinner } from "@/components/Spinner";
+
+export default function GoogleSites() {
+  const { mutate } = useSWRConfig();
+  const [selectedFiles, setSelectedFiles] = useState<File[]>([]);
+  const [filesAreUploading, setFilesAreUploading] = useState(false);
+  const { popup, setPopup } = usePopup();
+
+  const {
+    data: connectorIndexingStatuses,
+    isLoading: isConnectorIndexingStatusesLoading,
+    error: isConnectorIndexingStatusesError,
+  } = useSWR<ConnectorIndexingStatus<any, any>[]>(
+    "/api/manage/admin/connector/indexing-status",
+    fetcher
+  );
+
+  const googleSitesIndexingStatuses: ConnectorIndexingStatus<
+    GoogleSitesConfig,
+    {}
+  >[] =
+    connectorIndexingStatuses?.filter(
+      (connectorIndexingStatus) =>
+        connectorIndexingStatus.connector.source === "google_sites"
+    ) ?? [];
+
+  return (
+    <>
+      {popup}
+      {filesAreUploading && <Spinner />}
+
+ +
+
+ +

Google Sites

+
+

+ For an in-depth guide on how to set up this connector, check out{" "} + + the documentation + + .

+ +
+

Upload Files

+
+          onSubmit={async (values, formikHelpers) => {
+            const uploadCreateAndTriggerConnector = async () => {
+              const formData = new FormData();
+
+              selectedFiles.forEach((file) => {
+                formData.append("files", file);
+              });
+
+              const response = await fetch(
+                "/api/manage/admin/connector/file/upload",
+                { method: "POST", body: formData }
+              );
+              const responseJson = await response.json();
+              if (!response.ok) {
+                setPopup({
+                  message: `Unable to upload files - ${responseJson.detail}`,
+                  type: "error",
+                });
+                return;
+              }
+
+              const filePaths = responseJson.file_paths as string[];
+              const [connectorErrorMsg, connector] =
+                await createConnector({
+                  name: `GoogleSitesConnector-${values.base_url}`,
+                  source: "google_sites",
+                  input_type: "load_state",
+                  connector_specific_config: {
+                    base_url: values.base_url,
+                    zip_path: filePaths[0],
+                  },
+                  refresh_freq: null,
+                  disabled: false,
+                });
+              if (connectorErrorMsg || !connector) {
+                setPopup({
+                  message: `Unable to create connector - ${connectorErrorMsg}`,
+                  type: "error",
+                });
+                return;
+              }
+
+              const credentialResponse = await linkCredential(
+                connector.id,
+                0,
+                values.base_url
+              );
+              if (!credentialResponse.ok) {
+                const credentialResponseJson =
+                  await credentialResponse.json();
+                setPopup({
+                  message: `Unable to link connector to credential - ${credentialResponseJson.detail}`,
+                  type: "error",
+                });
+                return;
+              }
+
+              const runConnectorErrorMsg = await runConnector(
+                connector.id,
+                [0]
+              );
+              if (runConnectorErrorMsg) {
+                setPopup({
+                  message: `Unable to run connector - ${runConnectorErrorMsg}`,
+                  type: "error",
+                });
+                return;
+              }
+
+              mutate("/api/manage/admin/connector/indexing-status");
+              setSelectedFiles([]);
+              formikHelpers.resetForm();
+              setPopup({
+                type: "success",
+                message: "Successfully uploaded files!",
+              });
+            };
+
+            setFilesAreUploading(true);
+            try {
+              await uploadCreateAndTriggerConnector();
+            } catch (e) {
+              console.log("Failed to index files: ", e);
+            }
+            setFilesAreUploading(false);
+          }}
+        >
+          {({ values, isSubmitting }) => (
+ + +

Files:

+ + + + )} +
+
+
+ +

+ Existing Google Site Connectors +

+ {isConnectorIndexingStatusesLoading ? ( + + ) : isConnectorIndexingStatusesError || !connectorIndexingStatuses ? ( +
Error loading indexing history
+ ) : googleSitesIndexingStatuses.length > 0 ? ( + + connectorIndexingStatuses={googleSitesIndexingStatuses} + specialColumns={[ + { + header: "Base URL", + key: "base_url", + getValue: (ccPairStatus) => { + const connectorConfig = + ccPairStatus.connector.connector_specific_config; + return ( + + {connectorConfig.base_url} + + ); + }, + }, + ]} + onUpdate={() => + mutate("/api/manage/admin/connector/indexing-status") + } + /> + ) : ( +

No indexed Google Sites found

+ )} +
+ + ); +} diff --git a/web/src/components/admin/Layout.tsx b/web/src/components/admin/Layout.tsx index d7add9dacc05..c5dfaeed29da 100644 --- a/web/src/components/admin/Layout.tsx +++ b/web/src/components/admin/Layout.tsx @@ -22,6 +22,7 @@ import { HubSpotIcon, BookmarkIcon, CPUIcon, + GoogleSitesIcon, } from "@/components/icons/icons"; import { getAuthDisabledSS, getCurrentUserSS } from "@/lib/userSS"; import { redirect } from "next/navigation"; @@ -173,6 +174,15 @@ export async function Layout({ children }: { children: React.ReactNode }) { ), link: "/admin/connectors/zulip", }, + { + name: ( +
+ +
Google Sites
+
+ ), + link: "/admin/connectors/google-sites", + }, { name: (
diff --git a/web/src/app/admin/connectors/file/FileUpload.tsx b/web/src/components/admin/connectors/FileUpload.tsx
similarity index 87%
rename from web/src/app/admin/connectors/file/FileUpload.tsx
rename to web/src/components/admin/connectors/FileUpload.tsx
index 1d1f9fbd75d4..9f6a2b0f42bd 100644
--- a/web/src/app/admin/connectors/file/FileUpload.tsx
+++ b/web/src/components/admin/connectors/FileUpload.tsx
@@ -1,16 +1,17 @@
-// components/FileUpload.tsx
-import { ChangeEvent, FC, useState } from "react";
+import { FC, useState } from "react";
 import React from "react";
 import Dropzone from "react-dropzone";
 
 interface FileUploadProps {
   selectedFiles: File[];
   setSelectedFiles: (files: File[]) => void;
+  message?: string;
 }
 
 export const FileUpload: FC<FileUploadProps> = ({
   selectedFiles,
   setSelectedFiles,
+  message,
 }) => {
   const [dragActive, setDragActive] = useState(false);
 
@@ -35,7 +36,10 @@ export const FileUpload: FC<FileUploadProps> = ({
         }
       >
-        Drag and drop some files here, or click to select files
+
+        {message ||
+          "Drag and drop some files here, or click to select files"}
+
       )}
diff --git a/web/src/components/admin/connectors/table/SingleUseConnectorsTable.tsx b/web/src/components/admin/connectors/table/SingleUseConnectorsTable.tsx
index 3e0d907b22a7..5a174d1d81ef 100644
--- a/web/src/components/admin/connectors/table/SingleUseConnectorsTable.tsx
+++ b/web/src/components/admin/connectors/table/SingleUseConnectorsTable.tsx
@@ -1,10 +1,4 @@
-import {
-  Connector,
-  ConnectorIndexingStatus,
-  Credential,
-  DeletionAttemptSnapshot,
-  ValidStatuses,
-} from "@/lib/types";
+import { DeletionAttemptSnapshot, ValidStatuses } from "@/lib/types";
 import { BasicTable } from "@/components/admin/connectors/BasicTable";
 import { Popup } from "@/components/admin/connectors/Popup";
 import { useState } from "react";
@@ -64,17 +58,19 @@
   const connectorIncludesCredential =
     getCredential !== undefined && onCredentialLink !== undefined;
 
-  const columns = [
-    {
+  const columns = [];
+
+  if (includeName) {
+    columns.push({
       header: "Name",
       key: "name",
-    },
-    ...(specialColumns ?? []),
-    {
-      header: "Status",
-      key: "status",
-    },
-  ];
+    });
+  }
+  columns.push(...(specialColumns ?? []));
+  columns.push({
+    header: "Status",
+    key: "status",
+  });
 
   if (connectorIncludesCredential) {
     columns.push({
       header: "Credential",
diff --git a/web/src/components/icons/icons.tsx b/web/src/components/icons/icons.tsx
index 50858934cfcf..f3be56a7321b 100644
--- a/web/src/components/icons/icons.tsx
+++ b/web/src/components/icons/icons.tsx
@@ -43,6 +43,7 @@ import gongIcon from "../../../public/Gong.png";
 import zulipIcon from "../../../public/Zulip.png";
 import linearIcon from "../../../public/Linear.png";
 import hubSpotIcon from "../../../public/HubSpot.png";
+import googleSitesIcon from "../../../public/GoogleSites.png";
 
 interface IconProps {
   size?: number;
@@ -450,3 +451,17 @@
   );
 };
+
+export const GoogleSitesIcon = ({
+  size = 16,
+  className = defaultTailwindCSS,
+}: IconProps) => {
+  return (
+    <Image src={googleSitesIcon} alt="Logo" />
+  );
+};
diff --git a/web/src/components/search/Filters.tsx b/web/src/components/search/Filters.tsx
index 411cf2efd491..ec5be9b17244 100644
--- a/web/src/components/search/Filters.tsx
+++ b/web/src/components/search/Filters.tsx
@@ -29,6 +29,7 @@ const sources: Source[] = [
   { displayName: "Zulip", internalName: "zulip" },
   { displayName: "Linear", internalName: "linear" },
   { displayName: "HubSpot", internalName: "hubspot" },
+  { displayName: "Google Sites", internalName: "google_sites" },
 ];
 
 interface SourceSelectorProps {
diff --git a/web/src/components/source.tsx b/web/src/components/source.tsx
index 07daccd87474..88744a28af60 100644
--- a/web/src/components/source.tsx
+++ b/web/src/components/source.tsx
@@ -16,6 +16,7 @@ import {
   SlackIcon,
   ZulipIcon,
   HubSpotIcon,
+  GoogleSitesIcon,
 } from "./icons/icons";
 
 interface SourceMetadata {
@@ -122,6 +123,12 @@ export const getSourceMetadata = (sourceType: ValidSources): SourceMetadata => {
         displayName: "HubSpot",
         adminPageLink: "/admin/connectors/hubspot",
       };
+    case "google_sites":
+      return {
+        icon: GoogleSitesIcon,
+        displayName: "Google Sites",
+        adminPageLink: "/admin/connectors/google-sites",
+      };
     default:
       throw new Error("Invalid source type");
   }
diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts
index 84c5e8074880..6264fa198aae 100644
--- a/web/src/lib/types.ts
+++ b/web/src/lib/types.ts
@@ -23,7 +23,8 @@ export type ValidSources =
   | "zulip"
   | "linear"
   | "hubspot"
-  | "file";
+  | "file"
+  | "google_sites";
 
 export type ValidInputTypes = "load_state" | "poll" | "event";
 export type ValidStatuses =
   | "success"
@@ -114,6 +115,11 @@ export interface NotionConfig {}
 
 export interface HubSpotConfig {}
 
+export interface GoogleSitesConfig {
+  zip_path: string;
+  base_url: string;
+}
+
 export interface IndexAttemptSnapshot {
   status: ValidStatuses | null;
   num_docs_indexed: number;