From d95da554ea2955a8610fac440938072ddcf0cf4e Mon Sep 17 00:00:00 2001
From: Chris Weaver <25087905+Weves@users.noreply.github.com>
Date: Sun, 8 Oct 2023 19:20:38 -0700
Subject: [PATCH] Add Google Sites connector (#532)

---
 backend/danswer/configs/constants.py          |   1 +
 .../cross_connector_utils/file_utils.py       |  49 ++++
 .../cross_connector_utils/html_utils.py       |  57 +++++
 backend/danswer/connectors/factory.py         |   2 +
 backend/danswer/connectors/file/connector.py  |  30 +--
 .../connectors/google_site/connector.py       | 139 ++++++++++
 backend/danswer/connectors/web/connector.py   |  38 +--
 web/public/GoogleSites.png                    | Bin 0 -> 5539 bytes
 web/src/app/admin/connectors/file/page.tsx    |   2 +-
 .../admin/connectors/google-sites/page.tsx    | 241 ++++++++++++++++++
 web/src/components/admin/Layout.tsx           |  10 +
 .../admin/connectors}/FileUpload.tsx          |  10 +-
 .../table/SingleUseConnectorsTable.tsx        |  28 +-
 web/src/components/icons/icons.tsx            |  15 ++
 web/src/components/search/Filters.tsx         |   1 +
 web/src/components/source.tsx                 |   7 +
 web/src/lib/types.ts                          |   8 +-
 17 files changed, 561 insertions(+), 77 deletions(-)
 create mode 100644 backend/danswer/connectors/cross_connector_utils/file_utils.py
 create mode 100644 backend/danswer/connectors/cross_connector_utils/html_utils.py
 create mode 100644 backend/danswer/connectors/google_site/connector.py
 create mode 100644 web/public/GoogleSites.png
 create mode 100644 web/src/app/admin/connectors/google-sites/page.tsx
 rename web/src/{app/admin/connectors/file => components/admin/connectors}/FileUpload.tsx (87%)

diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py
index e651a2e86d23..319170a8bc19 100644
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@@ -61,6 +61,7 @@ class DocumentSource(str, Enum):
     LINEAR = "linear"
     HUBSPOT = "hubspot"
     GONG = "gong"
+    GOOGLE_SITES = "google_sites"
 
 
 class DocumentIndexType(str, Enum):
diff --git a/backend/danswer/connectors/cross_connector_utils/file_utils.py b/backend/danswer/connectors/cross_connector_utils/file_utils.py
new file mode 100644
index 000000000000..89360f39de89
--- /dev/null
+++ b/backend/danswer/connectors/cross_connector_utils/file_utils.py
@@ -0,0 +1,49 @@
+import json
+import os
+import zipfile
+from collections.abc import Generator
+from pathlib import Path
+from typing import Any
+from typing import IO
+
+_METADATA_FLAG = "#DANSWER_METADATA="
+
+
+def is_macos_resource_fork_file(file_name: str) -> bool:
+    return os.path.basename(file_name).startswith("._") and file_name.startswith(
+        "__MACOSX"
+    )
+
+
+def load_files_from_zip(
+    zip_location: str | Path,
+    ignore_macos_resource_fork_files: bool = True,
+    ignore_dirs: bool = True,
+) -> Generator[tuple[zipfile.ZipInfo, IO[Any]], None, None]:
+    with zipfile.ZipFile(zip_location, "r") as zip_file:
+        for file_info in zip_file.infolist():
+            with zip_file.open(file_info.filename, "r") as file:
+                if ignore_dirs and file_info.is_dir():
+                    continue
+
+                if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
+                    file_info.filename
+                ):
+                    continue
+                yield file_info, file
+
+
+def read_file(file_reader: IO[Any]) -> tuple[str, dict[str, Any]]:
+    metadata = {}
+    file_content_raw = ""
+    for ind, line in enumerate(file_reader):
+        if isinstance(line, bytes):
+            line = line.decode("utf-8")
+        line = str(line)
+
+        if ind == 0 and line.startswith(_METADATA_FLAG):
+            metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
+        else:
+            file_content_raw += line
+
+    return file_content_raw, metadata
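A quick illustration of the contract of these helpers may help reviewers: load_files_from_zip yields (ZipInfo, file handle) pairs for non-directory members, and read_file parses an optional first line of the form #DANSWER_METADATA={...} into a metadata dict, returning the remaining text as content. A minimal sketch (the archive path and its contents are invented for illustration):

    import zipfile

    from danswer.connectors.cross_connector_utils.file_utils import (
        load_files_from_zip,
        read_file,
    )

    # Build a throwaway archive whose single member carries a metadata line.
    with zipfile.ZipFile("/tmp/example.zip", "w") as zf:
        zf.writestr(
            "notes.txt",
            '#DANSWER_METADATA={"link": "https://example.com/doc"}\nActual contents.\n',
        )

    for file_info, file_io in load_files_from_zip("/tmp/example.zip"):
        content, metadata = read_file(file_io)
        print(metadata)  # {'link': 'https://example.com/doc'}
        print(content)   # "Actual contents.\n" - the metadata line is stripped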
diff --git a/backend/danswer/connectors/cross_connector_utils/html_utils.py b/backend/danswer/connectors/cross_connector_utils/html_utils.py
new file mode 100644
index 000000000000..ef860fe1f596
--- /dev/null
+++ b/backend/danswer/connectors/cross_connector_utils/html_utils.py
@@ -0,0 +1,57 @@
+from copy import copy
+from dataclasses import dataclass
+
+from bs4 import BeautifulSoup
+
+from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
+from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
+from danswer.utils.text_processing import format_document_soup
+
+MINTLIFY_UNWANTED = ["sticky", "hidden"]
+
+
+@dataclass
+class ParsedHTML:
+    title: str | None
+    cleaned_text: str
+
+
+def standard_html_cleanup(
+    page_content: str | BeautifulSoup,
+    mintlify_cleanup_enabled: bool = True,
+    additional_element_types_to_discard: list[str] | None = None,
+) -> ParsedHTML:
+    if isinstance(page_content, str):
+        soup = BeautifulSoup(page_content, "html.parser")
+    else:
+        soup = page_content
+
+    title_tag = soup.find("title")
+    title = None
+    if title_tag and title_tag.text:
+        title = title_tag.text
+        title_tag.extract()
+
+    # Heuristics-based cleaning of elements based on CSS classes
+    unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES)
+    if mintlify_cleanup_enabled:
+        unwanted_classes.extend(MINTLIFY_UNWANTED)
+    for undesired_element in unwanted_classes:
+        [
+            tag.extract()
+            for tag in soup.find_all(
+                class_=lambda x: x and undesired_element in x.split()
+            )
+        ]
+
+    for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
+        [tag.extract() for tag in soup.find_all(undesired_tag)]
+
+    if additional_element_types_to_discard:
+        for undesired_tag in additional_element_types_to_discard:
+            [tag.extract() for tag in soup.find_all(undesired_tag)]
+
+    # 200B is ZeroWidthSpace which we don't care for
+    page_text = format_document_soup(soup).replace("\u200B", "")
+
+    return ParsedHTML(title=title, cleaned_text=page_text)
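standard_html_cleanup centralizes the title extraction and class/element stripping that previously lived inline in the web connector. A short sketch of how it behaves on raw HTML (the exact text layout depends on format_document_soup, so the expected outputs are approximate; with the default mintlify_cleanup_enabled=True the "hidden" class is stripped):

    from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup

    html = (
        "<html><head><title>My Page</title></head>"
        "<body><div class='hidden'>skip link</div><p>Hello world</p></body></html>"
    )
    parsed = standard_html_cleanup(html)
    print(parsed.title)         # My Page
    print(parsed.cleaned_text)  # roughly "Hello world"; the hidden div is gone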
diff --git a/backend/danswer/connectors/factory.py b/backend/danswer/connectors/factory.py
index 101c8d1b6d7a..2d3af2003b50 100644
--- a/backend/danswer/connectors/factory.py
+++ b/backend/danswer/connectors/factory.py
@@ -9,6 +9,7 @@ from danswer.connectors.file.connector import LocalFileConnector
 from danswer.connectors.github.connector import GithubConnector
 from danswer.connectors.gong.connector import GongConnector
 from danswer.connectors.google_drive.connector import GoogleDriveConnector
+from danswer.connectors.google_site.connector import GoogleSitesConnector
 from danswer.connectors.guru.connector import GuruConnector
 from danswer.connectors.hubspot.connector import HubSpotConnector
 from danswer.connectors.interfaces import BaseConnector
@@ -54,6 +55,7 @@ def identify_connector_class(
         DocumentSource.LINEAR: LinearConnector,
         DocumentSource.HUBSPOT: HubSpotConnector,
         DocumentSource.GONG: GongConnector,
+        DocumentSource.GOOGLE_SITES: GoogleSitesConnector,
     }
     connector_by_source = connector_map.get(source, {})
diff --git a/backend/danswer/connectors/file/connector.py b/backend/danswer/connectors/file/connector.py
index e31410cf74a2..d3097249f958 100644
--- a/backend/danswer/connectors/file/connector.py
+++ b/backend/danswer/connectors/file/connector.py
@@ -1,6 +1,4 @@
-import json
 import os
-import zipfile
 from collections.abc import Generator
 from pathlib import Path
 from typing import Any
@@ -10,6 +8,8 @@ from PyPDF2 import PdfReader
 
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
+from danswer.connectors.cross_connector_utils.file_utils import read_file
 from danswer.connectors.file.utils import check_file_ext_is_valid
 from danswer.connectors.file.utils import get_file_ext
 from danswer.connectors.interfaces import GenerateDocumentsOutput
@@ -21,17 +21,6 @@ from danswer.utils.logger import setup_logger
 
 logger = setup_logger()
 
-_METADATA_FLAG = "#DANSWER_METADATA="
-
-
-def _get_files_from_zip(
-    zip_location: str | Path,
-) -> Generator[tuple[str, IO[Any]], None, None]:
-    with zipfile.ZipFile(zip_location, "r") as zip_file:
-        for file_name in zip_file.namelist():
-            with zip_file.open(file_name, "r") as file:
-                yield os.path.basename(file_name), file
-
 
 def _open_files_at_location(
     file_path: str | Path,
@@ -39,7 +28,8 @@
     extension = get_file_ext(file_path)
 
     if extension == ".zip":
-        yield from _get_files_from_zip(file_path)
+        for file_info, file in load_files_from_zip(file_path, ignore_dirs=True):
+            yield file_info.filename, file
     elif extension == ".txt" or extension == ".pdf":
         mode = "r"
         if extension == ".pdf":
@@ -56,7 +46,7 @@ def _process_file(file_name: str, file: IO[Any]) -> list[Document]:
         logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
         return []
 
-    metadata = {}
+    metadata: dict[str, Any] = {}
     file_content_raw = ""
     if extension == ".pdf":
         pdf_reader = PdfReader(file)
@@ -65,15 +55,7 @@
             page.extract_text() for page in pdf_reader.pages
         )
     else:
-        for ind, line in enumerate(file):
-            if isinstance(line, bytes):
-                line = line.decode("utf-8")
-            line = str(line)
-
-            if ind == 0 and line.startswith(_METADATA_FLAG):
-                metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
-            else:
-                file_content_raw += line
+        file_content_raw, metadata = read_file(file)
 
     return [
         Document(
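One subtle behavioral change in this refactor: the old _get_files_from_zip yielded os.path.basename(file_name), while the zip branch above yields file_info.filename, so members nested inside directories now keep their archive-relative paths. A small sketch under an invented archive layout:

    import zipfile

    from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip

    with zipfile.ZipFile("/tmp/docs.zip", "w") as zf:
        zf.writestr("guides/setup.txt", "hello")

    # Mirrors the new zip branch of _open_files_at_location.
    for file_info, file in load_files_from_zip("/tmp/docs.zip", ignore_dirs=True):
        print(file_info.filename)  # "guides/setup.txt" (previously just "setup.txt")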
diff --git a/backend/danswer/connectors/google_site/connector.py b/backend/danswer/connectors/google_site/connector.py
new file mode 100644
index 000000000000..f064db98fd0a
--- /dev/null
+++ b/backend/danswer/connectors/google_site/connector.py
@@ -0,0 +1,139 @@
+import os
+import urllib.parse
+from typing import Any
+from typing import cast
+
+from bs4 import BeautifulSoup
+from bs4 import Tag
+
+from danswer.configs.app_configs import INDEX_BATCH_SIZE
+from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
+from danswer.connectors.cross_connector_utils.file_utils import read_file
+from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
+from danswer.connectors.interfaces import GenerateDocumentsOutput
+from danswer.connectors.interfaces import LoadConnector
+from danswer.connectors.models import Document
+from danswer.connectors.models import Section
+
+
+def process_link(element: BeautifulSoup | Tag) -> str:
+    href = cast(str | None, element.get("href"))
+    if not href:
+        raise RuntimeError(f"Invalid link - {element}")
+
+    # clean up href
+    href = urllib.parse.unquote(href)
+    href = href.removesuffix(".html").lower()
+    href = href.replace("_", "")
+    href = href.replace(" ", "-")
+
+    return href
+
+
+def find_google_sites_page_path_from_navbar(
+    element: BeautifulSoup | Tag, path: str, is_initial: bool
+) -> str | None:
+    ul = cast(Tag | None, element.find("ul"))
+    if ul:
+        if not is_initial:
+            a = cast(Tag, element.find("a"))
+            new_path = f"{path}/{process_link(a)}"
+            if a.get("aria-selected") == "true":
+                return new_path
+        else:
+            new_path = ""
+        for li in ul.find_all("li", recursive=False):
+            found_link = find_google_sites_page_path_from_navbar(li, new_path, False)
+            if found_link:
+                return found_link
+    else:
+        a = cast(Tag, element.find("a"))
+        if a:
+            href = process_link(a)
+            if href and a.get("aria-selected") == "true":
+                return path + "/" + href
+
+    return None
+
+
+class GoogleSitesConnector(LoadConnector):
+    def __init__(
+        self,
+        zip_path: str,
+        base_url: str,
+        batch_size: int = INDEX_BATCH_SIZE,
+    ):
+        self.zip_path = zip_path
+        self.base_url = base_url
+        self.batch_size = batch_size
+
+    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
+        pass
+
+    def load_from_state(self) -> GenerateDocumentsOutput:
+        documents: list[Document] = []
+
+        # load the HTML files
+        files = load_files_from_zip(self.zip_path)
+        for file_info, file_io in files:
+            # skip non-published files
+            if "/PUBLISHED/" not in file_info.filename:
+                continue
+
+            file_path, extension = os.path.splitext(file_info.filename)
+            if extension != ".html":
+                continue
+
+            file_content, _ = read_file(file_io)
+            soup = BeautifulSoup(file_content, "html.parser")
+
+            # get the link out of the navbar
+            header = cast(Tag, soup.find("header"))
+            nav = cast(Tag, header.find("nav"))
+            path = find_google_sites_page_path_from_navbar(nav, "", True)
+            if not path:
+                raise RuntimeError(f"Could not find path for {file_info.filename}")
+
+            # clean up the hidden `Skip to main content` and `Skip to navigation`
+            # links that appear at the top of every page
+            for div in soup.find_all("div", attrs={"data-is-touch-wrapper": "true"}):
+                div.extract()
+
+            # get the body of the page
+            parsed_html = standard_html_cleanup(
+                soup, additional_element_types_to_discard=["header", "nav"]
+            )
+
+            title = parsed_html.title or file_path.split("/")[-1]
+            documents.append(
+                Document(
+                    id=f"{DocumentSource.GOOGLE_SITES.value}:{path}",
+                    source=DocumentSource.GOOGLE_SITES,
+                    semantic_identifier=title,
+                    sections=[
+                        Section(
+                            link=self.base_url.rstrip("/") + "/" + path.lstrip("/"),
+                            text=parsed_html.cleaned_text,
+                        )
+                    ],
+                    metadata={},
+                )
+            )
+
+            if len(documents) >= self.batch_size:
+                yield documents
+                documents = []
+
+        if documents:
+            yield documents
+
+
+if __name__ == "__main__":
+    connector = GoogleSitesConnector(
+        os.environ["GOOGLE_SITES_ZIP_PATH"],
+        os.environ.get("GOOGLE_SITES_BASE_URL", ""),
+    )
+    for doc_batch in connector.load_from_state():
+        for doc in doc_batch:
+            print(doc)
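To make the navbar walk concrete: process_link normalizes an anchor's href (URL-unquote, drop a trailing ".html", lowercase, drop underscores, spaces to dashes - removesuffix rather than rstrip, since rstrip(".html") strips a character set and would mangle a page like Math.html into "ma"), and find_google_sites_page_path_from_navbar returns the accumulated path of the entry marked aria-selected="true". A toy example, assuming the package is importable; real exports nest more deeply:

    from bs4 import BeautifulSoup

    from danswer.connectors.google_site.connector import (
        find_google_sites_page_path_from_navbar,
    )

    nav_html = """
    <nav><ul>
      <li><a href="Home.html">Home</a></li>
      <li><a href="Team%20Docs.html" aria-selected="true">Team Docs</a></li>
    </ul></nav>
    """
    soup = BeautifulSoup(nav_html, "html.parser")
    print(find_google_sites_page_path_from_navbar(soup.find("nav"), "", True))
    # -> /team-docs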
diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py
index b208b5e97941..3d56f5c4374a 100644
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@@ -1,5 +1,4 @@
 import io
-from copy import copy
 from datetime import datetime
 from enum import Enum
 from typing import Any
@@ -18,25 +17,20 @@ from PyPDF2 import PdfReader
 from requests_oauthlib import OAuth2Session  # type:ignore
 
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
-from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
-from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
-from danswer.utils.text_processing import format_document_soup
 
 logger = setup_logger()
 
-MINTLIFY_UNWANTED = ["sticky", "hidden"]
-
-
 class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
     # Given a base site, index everything under that path
     RECURSIVE = "recursive"
@@ -224,36 +218,16 @@
                     if link not in visited_links:
                         to_visit.append(link)
 
-                title_tag = soup.find("title")
-                title = None
-                if title_tag and title_tag.text:
-                    title = title_tag.text
-                    title_tag.extract()
-
-                # Heuristics based cleaning of elements based on css classes
-                unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES)
-                if self.mintlify_cleanup:
-                    unwanted_classes.extend(MINTLIFY_UNWANTED)
-                for undesired_element in unwanted_classes:
-                    [
-                        tag.extract()
-                        for tag in soup.find_all(
-                            class_=lambda x: x and undesired_element in x.split()
-                        )
-                    ]
-
-                for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
-                    [tag.extract() for tag in soup.find_all(undesired_tag)]
-
-                # 200B is ZeroWidthSpace which we don't care for
-                page_text = format_document_soup(soup).replace("\u200B", "")
+                parsed_html = standard_html_cleanup(soup, self.mintlify_cleanup)
 
                 doc_batch.append(
                     Document(
                         id=current_url,
-                        sections=[Section(link=current_url, text=page_text)],
+                        sections=[
+                            Section(link=current_url, text=parsed_html.cleaned_text)
+                        ],
                         source=DocumentSource.WEB,
-                        semantic_identifier=title or current_url,
+                        semantic_identifier=parsed_html.title or current_url,
                         metadata={},
                     )
                 )
diff --git a/web/public/GoogleSites.png b/web/public/GoogleSites.png
new file mode 100644
index 0000000000000000000000000000000000000000..a01ab45deef92a45e2efedc9740a350e56c49036
[5539-byte binary PNG (Google Sites logo); literal binary patch data omitted]
diff --git a/web/src/app/admin/connectors/file/page.tsx b/web/src/app/admin/connectors/file/page.tsx
index 20752069fc28..ad8976d09b39 100644
--- a/web/src/app/admin/connectors/file/page.tsx
+++ b/web/src/app/admin/connectors/file/page.tsx
@@ -8,7 +8,6 @@ import { fetcher } from "@/lib/fetcher";
 import { HealthCheckBanner } from "@/components/health/healthcheck";
 import { ConnectorIndexingStatus, FileConfig } from "@/lib/types";
 import { linkCredential } from "@/lib/credential";
-import { FileUpload } from "./FileUpload";
 import { useState } from "react";
 import { usePopup } from "@/components/admin/connectors/Popup";
 import { createConnector, runConnector } from "@/lib/connector";
@@ -17,6 +16,7 @@
 import { SingleUseConnectorsTable } from "@/components/admin/connectors/table/SingleUseConnectorsTable";
 import { LoadingAnimation } from "@/components/Loading";
 import { Form, Formik } from "formik";
 import { TextFormField } from "@/components/admin/connectors/Field";
+import { FileUpload } from "@/components/admin/connectors/FileUpload";
 
 const getNameFromPath = (path: string) => {
   const pathParts = path.split("/");
diff --git a/web/src/app/admin/connectors/google-sites/page.tsx b/web/src/app/admin/connectors/google-sites/page.tsx
new file mode 100644
index 000000000000..61e25fcc9ac2
--- /dev/null
+++ b/web/src/app/admin/connectors/google-sites/page.tsx
@@ -0,0 +1,241 @@
+"use client";
+
+import useSWR, { useSWRConfig } from "swr";
+import * as Yup from "yup";
+
+import { LoadingAnimation } from "@/components/Loading";
+import { GoogleSitesIcon } from "@/components/icons/icons";
+import { fetcher } from "@/lib/fetcher";
+import { TextFormField } from "@/components/admin/connectors/Field";
+import { HealthCheckBanner } from "@/components/health/healthcheck";
+import { ConnectorIndexingStatus, GoogleSitesConfig } from "@/lib/types";
+import { Form, Formik } from "formik";
+import { useState } from "react";
+import { usePopup } from "@/components/admin/connectors/Popup";
+import { createConnector, runConnector } from "@/lib/connector";
+import { linkCredential } from "@/lib/credential";
+import { FileUpload } from "@/components/admin/connectors/FileUpload";
+import { SingleUseConnectorsTable } from "@/components/admin/connectors/table/SingleUseConnectorsTable";
+import { Spinner } from "@/components/Spinner";
+
+export default function GoogleSites() {
+  const { mutate } = useSWRConfig();
+  const [selectedFiles, setSelectedFiles] = useState<File[]>([]);
+  const [filesAreUploading, setFilesAreUploading] = useState(false);
+  const { popup, setPopup } = usePopup();
+
+  const {
+    data: connectorIndexingStatuses,
+    isLoading: isConnectorIndexingStatusesLoading,
+    error: isConnectorIndexingStatusesError,
+  } = useSWR<ConnectorIndexingStatus<any, any>[]>(
+    "/api/manage/admin/connector/indexing-status",
+    fetcher
+  );
+
+  const googleSitesIndexingStatuses: ConnectorIndexingStatus<
+    GoogleSitesConfig,
+    {}
+  >[] =
+    connectorIndexingStatuses?.filter(
+      (connectorIndexingStatus) =>
+        connectorIndexingStatus.connector.source === "google_sites"
+    ) ?? [];
+
+  return (
+    <>
+      {popup}
+      {filesAreUploading && <Spinner />}
+
+ +
+
+ +

Google Sites

+
+

+ For an in-depth guide on how to set up this connector, check out{" "} + + the documentation + + .

+ +
+

Upload Files

+
+          onSubmit={async (values, formikHelpers) => {
+            const uploadCreateAndTriggerConnector = async () => {
+              const formData = new FormData();
+
+              selectedFiles.forEach((file) => {
+                formData.append("files", file);
+              });
+
+              const response = await fetch(
+                "/api/manage/admin/connector/file/upload",
+                { method: "POST", body: formData }
+              );
+              const responseJson = await response.json();
+              if (!response.ok) {
+                setPopup({
+                  message: `Unable to upload files - ${responseJson.detail}`,
+                  type: "error",
+                });
+                return;
+              }
+
+              const filePaths = responseJson.file_paths as string[];
+              const [connectorErrorMsg, connector] =
+                await createConnector({
+                  name: `GoogleSitesConnector-${values.base_url}`,
+                  source: "google_sites",
+                  input_type: "load_state",
+                  connector_specific_config: {
+                    base_url: values.base_url,
+                    zip_path: filePaths[0],
+                  },
+                  refresh_freq: null,
+                  disabled: false,
+                });
+              if (connectorErrorMsg || !connector) {
+                setPopup({
+                  message: `Unable to create connector - ${connectorErrorMsg}`,
+                  type: "error",
+                });
+                return;
+              }
+
+              const credentialResponse = await linkCredential(
+                connector.id,
+                0,
+                values.base_url
+              );
+              if (!credentialResponse.ok) {
+                const credentialResponseJson =
+                  await credentialResponse.json();
+                setPopup({
+                  message: `Unable to link connector to credential - ${credentialResponseJson.detail}`,
+                  type: "error",
+                });
+                return;
+              }
+
+              const runConnectorErrorMsg = await runConnector(
+                connector.id,
+                [0]
+              );
+              if (runConnectorErrorMsg) {
+                setPopup({
+                  message: `Unable to run connector - ${runConnectorErrorMsg}`,
+                  type: "error",
+                });
+                return;
+              }
+
+              mutate("/api/manage/admin/connector/indexing-status");
+              setSelectedFiles([]);
+              formikHelpers.resetForm();
+              setPopup({
+                type: "success",
+                message: "Successfully uploaded files!",
+              });
+            };
+
+            setFilesAreUploading(true);
+            try {
+              await uploadCreateAndTriggerConnector();
+            } catch (e) {
+              console.log("Failed to index files: ", e);
+            }
+            setFilesAreUploading(false);
+          }}
+        >
+          {({ values, isSubmitting }) => (
+ + +

Files:

+ + + + )} +
+
+
+ +

+ Existing Google Site Connectors +

+ {isConnectorIndexingStatusesLoading ? ( + + ) : isConnectorIndexingStatusesError || !connectorIndexingStatuses ? ( +
Error loading indexing history
+ ) : googleSitesIndexingStatuses.length > 0 ? ( + + connectorIndexingStatuses={googleSitesIndexingStatuses} + specialColumns={[ + { + header: "Base URL", + key: "base_url", + getValue: (ccPairStatus) => { + const connectorConfig = + ccPairStatus.connector.connector_specific_config; + return ( + + {connectorConfig.base_url} + + ); + }, + }, + ]} + onUpdate={() => + mutate("/api/manage/admin/connector/indexing-status") + } + /> + ) : ( +

No indexed Google Sites found

+ )} +
+ + ); +} diff --git a/web/src/components/admin/Layout.tsx b/web/src/components/admin/Layout.tsx index d7add9dacc05..c5dfaeed29da 100644 --- a/web/src/components/admin/Layout.tsx +++ b/web/src/components/admin/Layout.tsx @@ -22,6 +22,7 @@ import { HubSpotIcon, BookmarkIcon, CPUIcon, + GoogleSitesIcon, } from "@/components/icons/icons"; import { getAuthDisabledSS, getCurrentUserSS } from "@/lib/userSS"; import { redirect } from "next/navigation"; @@ -173,6 +174,15 @@ export async function Layout({ children }: { children: React.ReactNode }) { ), link: "/admin/connectors/zulip", }, + { + name: ( +
+ +
Google Sites
+
+ ), + link: "/admin/connectors/google-sites", + }, { name: (
diff --git a/web/src/app/admin/connectors/file/FileUpload.tsx b/web/src/components/admin/connectors/FileUpload.tsx
similarity index 87%
rename from web/src/app/admin/connectors/file/FileUpload.tsx
rename to web/src/components/admin/connectors/FileUpload.tsx
index 1d1f9fbd75d4..9f6a2b0f42bd 100644
--- a/web/src/app/admin/connectors/file/FileUpload.tsx
+++ b/web/src/components/admin/connectors/FileUpload.tsx
@@ -1,16 +1,17 @@
-// components/FileUpload.tsx
-import { ChangeEvent, FC, useState } from "react";
+import { FC, useState } from "react";
 import React from "react";
 import Dropzone from "react-dropzone";
 
 interface FileUploadProps {
   selectedFiles: File[];
   setSelectedFiles: (files: File[]) => void;
+  message?: string;
 }
 
 export const FileUpload: FC<FileUploadProps> = ({
   selectedFiles,
   setSelectedFiles,
+  message,
 }) => {
   const [dragActive, setDragActive] = useState(false);
 
@@ -35,7 +36,10 @@ export const FileUpload: FC<FileUploadProps> = ({
         }
       >
-        Drag and drop some files here, or click to select files
+
+        {message ||
+          "Drag and drop some files here, or click to select files"}
+
       )}
diff --git a/web/src/components/admin/connectors/table/SingleUseConnectorsTable.tsx b/web/src/components/admin/connectors/table/SingleUseConnectorsTable.tsx
index 3e0d907b22a7..5a174d1d81ef 100644
--- a/web/src/components/admin/connectors/table/SingleUseConnectorsTable.tsx
+++ b/web/src/components/admin/connectors/table/SingleUseConnectorsTable.tsx
@@ -1,10 +1,4 @@
-import {
-  Connector,
-  ConnectorIndexingStatus,
-  Credential,
-  DeletionAttemptSnapshot,
-  ValidStatuses,
-} from "@/lib/types";
+import { DeletionAttemptSnapshot, ValidStatuses } from "@/lib/types";
 import { BasicTable } from "@/components/admin/connectors/BasicTable";
 import { Popup } from "@/components/admin/connectors/Popup";
 import { useState } from "react";
@@ -64,17 +58,19 @@
   const connectorIncludesCredential =
     getCredential !== undefined && onCredentialLink !== undefined;
 
-  const columns = [
-    {
+  const columns = [];
+
+  if (includeName) {
+    columns.push({
       header: "Name",
       key: "name",
-    },
-    ...(specialColumns ?? []),
-    {
-      header: "Status",
-      key: "status",
-    },
-  ];
+    });
+  }
+  columns.push(...(specialColumns ?? []));
+  columns.push({
+    header: "Status",
+    key: "status",
+  });
 
   if (connectorIncludesCredential) {
     columns.push({
       header: "Credential",
diff --git a/web/src/components/icons/icons.tsx b/web/src/components/icons/icons.tsx
index 50858934cfcf..f3be56a7321b 100644
--- a/web/src/components/icons/icons.tsx
+++ b/web/src/components/icons/icons.tsx
@@ -43,6 +43,7 @@ import gongIcon from "../../../public/Gong.png";
 import zulipIcon from "../../../public/Zulip.png";
 import linearIcon from "../../../public/Linear.png";
 import hubSpotIcon from "../../../public/HubSpot.png";
+import googleSitesIcon from "../../../public/GoogleSites.png";
 
 interface IconProps {
   size?: number;
@@ -450,3 +451,17 @@
   );
 };
+
+export const GoogleSitesIcon = ({
+  size = 16,
+  className = defaultTailwindCSS,
+}: IconProps) => {
+  return (
+    <Image src={googleSitesIcon} alt="Logo" />
+  );
+};
diff --git a/web/src/components/search/Filters.tsx b/web/src/components/search/Filters.tsx
index 411cf2efd491..ec5be9b17244 100644
--- a/web/src/components/search/Filters.tsx
+++ b/web/src/components/search/Filters.tsx
@@ -29,6 +29,7 @@ const sources: Source[] = [
   { displayName: "Zulip", internalName: "zulip" },
   { displayName: "Linear", internalName: "linear" },
   { displayName: "HubSpot", internalName: "hubspot" },
+  { displayName: "Google Sites", internalName: "google_sites" },
 ];
 
 interface SourceSelectorProps {
diff --git a/web/src/components/source.tsx b/web/src/components/source.tsx
index 07daccd87474..88744a28af60 100644
--- a/web/src/components/source.tsx
+++ b/web/src/components/source.tsx
@@ -16,6 +16,7 @@ import {
   SlackIcon,
   ZulipIcon,
   HubSpotIcon,
+  GoogleSitesIcon,
 } from "./icons/icons";
 
 interface SourceMetadata {
@@ -122,6 +123,12 @@ export const getSourceMetadata = (sourceType: ValidSources): SourceMetadata => {
         displayName: "HubSpot",
         adminPageLink: "/admin/connectors/hubspot",
       };
+    case "google_sites":
+      return {
+        icon: GoogleSitesIcon,
+        displayName: "Google Sites",
+        adminPageLink: "/admin/connectors/google-sites",
+      };
     default:
       throw new Error("Invalid source type");
   }
diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts
index 84c5e8074880..6264fa198aae 100644
--- a/web/src/lib/types.ts
+++ b/web/src/lib/types.ts
@@ -23,7 +23,8 @@ export type ValidSources =
   | "zulip"
   | "linear"
   | "hubspot"
-  | "file";
+  | "file"
+  | "google_sites";
 
 export type ValidInputTypes = "load_state" | "poll" | "event";
 export type ValidStatuses =
   | "success"
@@ -114,6 +115,11 @@ export interface NotionConfig {}
 
 export interface HubSpotConfig {}
 
+export interface GoogleSitesConfig {
+  zip_path: string;
+  base_url: string;
+}
+
 export interface IndexAttemptSnapshot {
   status: ValidStatuses | null;
   num_docs_indexed: number;