Add Google Sites connector (#532)
Repository: https://github.com/danswer-ai/danswer.git
@@ -61,6 +61,7 @@ class DocumentSource(str, Enum):
     LINEAR = "linear"
     HUBSPOT = "hubspot"
     GONG = "gong"
+    GOOGLE_SITES = "google_sites"


 class DocumentIndexType(str, Enum):
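Since DocumentSource subclasses both str and Enum, each member is interchangeable with its raw string value, which is what lets the frontend send "google_sites" and the backend match it. A standalone sketch of this behavior (editor's illustration, not part of the diff):

    from enum import Enum

    class DocumentSource(str, Enum):
        GOOGLE_SITES = "google_sites"

    # Members compare equal to plain strings, so API payloads like
    # {"source": "google_sites"} round-trip cleanly:
    assert DocumentSource.GOOGLE_SITES == "google_sites"
    assert DocumentSource("google_sites") is DocumentSource.GOOGLE_SITES
    # .value is used later in this commit to build IDs like "google_sites:<path>"
    assert DocumentSource.GOOGLE_SITES.value == "google_sites"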
backend/danswer/connectors/cross_connector_utils/file_utils.py (new file)
@@ -0,0 +1,49 @@
import json
import os
import zipfile
from collections.abc import Generator
from pathlib import Path
from typing import Any
from typing import IO

_METADATA_FLAG = "#DANSWER_METADATA="


def is_macos_resource_fork_file(file_name: str) -> bool:
    return os.path.basename(file_name).startswith("._") and file_name.startswith(
        "__MACOSX"
    )


def load_files_from_zip(
    zip_location: str | Path,
    ignore_macos_resource_fork_files: bool = True,
    ignore_dirs: bool = True,
) -> Generator[tuple[zipfile.ZipInfo, IO[Any]], None, None]:
    with zipfile.ZipFile(zip_location, "r") as zip_file:
        for file_info in zip_file.infolist():
            with zip_file.open(file_info.filename, "r") as file:
                if ignore_dirs and file_info.is_dir():
                    continue

                if ignore_macos_resource_fork_files and is_macos_resource_fork_file(
                    file_info.filename
                ):
                    continue
                yield file_info, file


def read_file(file_reader: IO[Any]) -> tuple[str, dict[str, Any]]:
    metadata = {}
    file_content_raw = ""
    for ind, line in enumerate(file_reader):
        if isinstance(line, bytes):
            line = line.decode("utf-8")
        line = str(line)

        if ind == 0 and line.startswith(_METADATA_FLAG):
            metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
        else:
            file_content_raw += line

    return file_content_raw, metadata
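The two helpers above compose: load_files_from_zip yields (ZipInfo, file handle) pairs with directory entries and macOS "__MACOSX/._*" resource forks filtered out by default, and read_file turns a handle into text plus optional metadata. A minimal usage sketch (editor's illustration, not part of the diff; the zip name is hypothetical):

    from danswer.connectors.cross_connector_utils.file_utils import (
        load_files_from_zip,
        read_file,
    )

    # Walk every real file in the archive; dirs and resource forks are
    # skipped by the default flags.
    for file_info, file_io in load_files_from_zip("site_export.zip"):
        content, metadata = read_file(file_io)
        print(file_info.filename, len(content), metadata)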
backend/danswer/connectors/cross_connector_utils/html_utils.py (new file)
@@ -0,0 +1,57 @@
from copy import copy
from dataclasses import dataclass

from bs4 import BeautifulSoup

from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
from danswer.utils.text_processing import format_document_soup

MINTLIFY_UNWANTED = ["sticky", "hidden"]


@dataclass
class ParsedHTML:
    title: str | None
    cleaned_text: str


def standard_html_cleanup(
    page_content: str | BeautifulSoup,
    mintlify_cleanup_enabled: bool = True,
    additional_element_types_to_discard: list[str] | None = None,
) -> ParsedHTML:
    if isinstance(page_content, str):
        soup = BeautifulSoup(page_content, "html.parser")
    else:
        soup = page_content

    title_tag = soup.find("title")
    title = None
    if title_tag and title_tag.text:
        title = title_tag.text
        title_tag.extract()

    # Heuristics based cleaning of elements based on css classes
    unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES)
    if mintlify_cleanup_enabled:
        unwanted_classes.extend(MINTLIFY_UNWANTED)
    for undesired_element in unwanted_classes:
        [
            tag.extract()
            for tag in soup.find_all(
                class_=lambda x: x and undesired_element in x.split()
            )
        ]

    for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
        [tag.extract() for tag in soup.find_all(undesired_tag)]

    if additional_element_types_to_discard:
        for undesired_tag in additional_element_types_to_discard:
            [tag.extract() for tag in soup.find_all(undesired_tag)]

    # 200B is ZeroWidthSpace which we don't care for
    page_text = format_document_soup(soup).replace("\u200B", "")

    return ParsedHTML(title=title, cleaned_text=page_text)
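standard_html_cleanup accepts either raw HTML or an existing BeautifulSoup object, pulls the <title> out so it is not duplicated in the body text, strips configured classes/tags, and returns both pieces as a ParsedHTML. A sketch (editor's illustration, not part of the diff; exact output depends on the WEB_CONNECTOR_IGNORED_* app configs):

    from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup

    html = """
    <html><head><title>FAQ</title></head>
    <body><nav class="sticky">Menu</nav><p>How do I reset my password?</p></body>
    </html>
    """
    parsed = standard_html_cleanup(html, additional_element_types_to_discard=["nav"])
    print(parsed.title)         # "FAQ"
    print(parsed.cleaned_text)  # body text; the "sticky" nav is discarded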
@@ -9,6 +9,7 @@ from danswer.connectors.file.connector import LocalFileConnector
 from danswer.connectors.github.connector import GithubConnector
 from danswer.connectors.gong.connector import GongConnector
 from danswer.connectors.google_drive.connector import GoogleDriveConnector
+from danswer.connectors.google_site.connector import GoogleSitesConnector
 from danswer.connectors.guru.connector import GuruConnector
 from danswer.connectors.hubspot.connector import HubSpotConnector
 from danswer.connectors.interfaces import BaseConnector
@@ -54,6 +55,7 @@ def identify_connector_class(
         DocumentSource.LINEAR: LinearConnector,
         DocumentSource.HUBSPOT: HubSpotConnector,
         DocumentSource.GONG: GongConnector,
+        DocumentSource.GOOGLE_SITES: GoogleSitesConnector,
     }
     connector_by_source = connector_map.get(source, {})
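For context, identify_connector_class resolves a DocumentSource to a connector class via this map, and the class is then instantiated from the connector's stored config. A condensed sketch of that dispatch (editor's illustration; assumes connector_specific_config keys match the connector's __init__ kwargs, which holds for GoogleSitesConnector as defined below; paths and URLs are hypothetical):

    from danswer.configs.constants import DocumentSource
    from danswer.connectors.google_site.connector import GoogleSitesConnector

    connector_map = {DocumentSource.GOOGLE_SITES: GoogleSitesConnector}
    connector = connector_map[DocumentSource.GOOGLE_SITES](
        zip_path="/tmp/danswer/site_export.zip",  # hypothetical upload location
        base_url="https://sites.google.com/view/your-site",
    )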
@@ -1,6 +1,4 @@
-import json
 import os
-import zipfile
 from collections.abc import Generator
 from pathlib import Path
 from typing import Any
@@ -10,6 +8,8 @@ from PyPDF2 import PdfReader

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
+from danswer.connectors.cross_connector_utils.file_utils import read_file
 from danswer.connectors.file.utils import check_file_ext_is_valid
 from danswer.connectors.file.utils import get_file_ext
 from danswer.connectors.interfaces import GenerateDocumentsOutput
@@ -21,17 +21,6 @@ from danswer.utils.logger import setup_logger

 logger = setup_logger()

-_METADATA_FLAG = "#DANSWER_METADATA="
-
-
-def _get_files_from_zip(
-    zip_location: str | Path,
-) -> Generator[tuple[str, IO[Any]], None, None]:
-    with zipfile.ZipFile(zip_location, "r") as zip_file:
-        for file_name in zip_file.namelist():
-            with zip_file.open(file_name, "r") as file:
-                yield os.path.basename(file_name), file
-
-
 def _open_files_at_location(
     file_path: str | Path,
@@ -39,7 +28,8 @@ def _open_files_at_location(
     extension = get_file_ext(file_path)

     if extension == ".zip":
-        yield from _get_files_from_zip(file_path)
+        for file_info, file in load_files_from_zip(file_path, ignore_dirs=True):
+            yield file_info.filename, file
     elif extension == ".txt" or extension == ".pdf":
         mode = "r"
         if extension == ".pdf":
@@ -56,7 +46,7 @@ def _process_file(file_name: str, file: IO[Any]) -> list[Document]:
         logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
         return []

-    metadata = {}
+    metadata: dict[str, Any] = {}
     file_content_raw = ""
     if extension == ".pdf":
         pdf_reader = PdfReader(file)
@@ -65,15 +55,7 @@ def _process_file(file_name: str, file: IO[Any]) -> list[Document]:
             page.extract_text() for page in pdf_reader.pages
         )
     else:
-        for ind, line in enumerate(file):
-            if isinstance(line, bytes):
-                line = line.decode("utf-8")
-            line = str(line)
-
-            if ind == 0 and line.startswith(_METADATA_FLAG):
-                metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
-            else:
-                file_content_raw += line
+        file_content_raw, metadata = read_file(file)

     return [
         Document(
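read_file also takes over the #DANSWER_METADATA= convention that the deleted loop implemented inline: if the first line of a file starts with that flag, the rest of the line is parsed as JSON metadata and excluded from the content. A sketch (editor's illustration, not part of the diff; the payload is invented):

    import io

    from danswer.connectors.cross_connector_utils.file_utils import read_file

    raw = b'#DANSWER_METADATA={"link": "https://example.com/doc"}\nActual document text.\n'
    content, metadata = read_file(io.BytesIO(raw))
    assert metadata == {"link": "https://example.com/doc"}
    assert content == "Actual document text.\n"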
backend/danswer/connectors/google_site/connector.py (new file, 139 lines)
@@ -0,0 +1,139 @@
import os
import urllib.parse
from typing import Any
from typing import cast

from bs4 import BeautifulSoup
from bs4 import Tag

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
from danswer.connectors.cross_connector_utils.file_utils import read_file
from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document
from danswer.connectors.models import Section


def process_link(element: BeautifulSoup | Tag) -> str:
    href = cast(str | None, element.get("href"))
    if not href:
        raise RuntimeError(f"Invalid link - {element}")

    # cleanup href
    href = urllib.parse.unquote(href)
    href = href.rstrip(".html").lower()
    href = href.replace("_", "")
    href = href.replace(" ", "-")

    return href


def find_google_sites_page_path_from_navbar(
    element: BeautifulSoup | Tag, path: str, is_initial: bool
) -> str | None:
    ul = cast(Tag | None, element.find("ul"))
    if ul:
        if not is_initial:
            a = cast(Tag, element.find("a"))
            new_path = f"{path}/{process_link(a)}"
            if a.get("aria-selected") == "true":
                return new_path
        else:
            new_path = ""
        for li in ul.find_all("li", recursive=False):
            found_link = find_google_sites_page_path_from_navbar(li, new_path, False)
            if found_link:
                return found_link
    else:
        a = cast(Tag, element.find("a"))
        if a:
            href = process_link(a)
            if href and a.get("aria-selected") == "true":
                return path + "/" + href

    return None


class GoogleSitesConnector(LoadConnector):
    def __init__(
        self,
        zip_path: str,
        base_url: str,
        batch_size: int = INDEX_BATCH_SIZE,
    ):
        self.zip_path = zip_path
        self.base_url = base_url
        self.batch_size = batch_size

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        pass

    def load_from_state(self) -> GenerateDocumentsOutput:
        documents: list[Document] = []

        # load the HTML files
        files = load_files_from_zip(self.zip_path)
        for file_info, file_io in files:
            # skip non-published files
            if "/PUBLISHED/" not in file_info.filename:
                continue

            file_path, extension = os.path.splitext(file_info.filename)
            if extension != ".html":
                continue

            file_content, _ = read_file(file_io)
            soup = BeautifulSoup(file_content, "html.parser")

            # get the link out of the navbar
            header = cast(Tag, soup.find("header"))
            nav = cast(Tag, header.find("nav"))
            path = find_google_sites_page_path_from_navbar(nav, "", True)
            if not path:
                raise RuntimeError(f"Could not find path for {file_info.filename}")

            # cleanup the hidden `Skip to main content` and `Skip to navigation` that
            # appears at the top of every page
            for div in soup.find_all("div", attrs={"data-is-touch-wrapper": "true"}):
                div.extract()

            # get the body of the page
            parsed_html = standard_html_cleanup(
                soup, additional_element_types_to_discard=["header", "nav"]
            )

            title = parsed_html.title or file_path.split("/")[-1]
            documents.append(
                Document(
                    id=f"{DocumentSource.GOOGLE_SITES.value}:{path}",
                    source=DocumentSource.GOOGLE_SITES,
                    semantic_identifier=title,
                    sections=[
                        Section(
                            link=self.base_url.rstrip("/") + "/" + path.lstrip("/"),
                            text=parsed_html.cleaned_text,
                        )
                    ],
                    metadata={},
                )
            )

            if len(documents) >= self.batch_size:
                yield documents
                documents = []

        if documents:
            yield documents


if __name__ == "__main__":
    connector = GoogleSitesConnector(
        os.environ["GOOGLE_SITES_ZIP_PATH"],
        os.environ.get("GOOGLE_SITES_BASE_URL", ""),
    )
    for doc_batch in connector.load_from_state():
        for doc in doc_batch:
            print(doc)
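The connector recovers each page's URL path by walking the exported navbar and returning the entry marked aria-selected="true", with hrefs normalized by process_link. A sketch against invented markup shaped like the <ul>/<li>/<a> nesting the walk expects (editor's illustration, not part of the diff):

    from bs4 import BeautifulSoup

    from danswer.connectors.google_site.connector import (
        find_google_sites_page_path_from_navbar,
    )

    nav = BeautifulSoup(
        """
        <nav><ul>
          <li><a href="home.html">Home</a></li>
          <li><a href="faq.html" aria-selected="true">FAQ</a></li>
        </ul></nav>
        """,
        "html.parser",
    )
    # Only the selected entry produces a path; "faq.html" normalizes to "faq"
    print(find_google_sites_page_path_from_navbar(nav, "", True))  # "/faq"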
@@ -1,5 +1,4 @@
 import io
-from copy import copy
 from datetime import datetime
 from enum import Enum
 from typing import Any
@@ -18,25 +17,20 @@ from PyPDF2 import PdfReader
 from requests_oauthlib import OAuth2Session  # type:ignore

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
-from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
-from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
-from danswer.utils.text_processing import format_document_soup

 logger = setup_logger()

-MINTLIFY_UNWANTED = ["sticky", "hidden"]
-

 class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
     # Given a base site, index everything under that path
     RECURSIVE = "recursive"
@@ -224,36 +218,16 @@ class WebConnector(LoadConnector):
                 if link not in visited_links:
                     to_visit.append(link)

-            title_tag = soup.find("title")
-            title = None
-            if title_tag and title_tag.text:
-                title = title_tag.text
-                title_tag.extract()
-
-            # Heuristics based cleaning of elements based on css classes
-            unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES)
-            if self.mintlify_cleanup:
-                unwanted_classes.extend(MINTLIFY_UNWANTED)
-            for undesired_element in unwanted_classes:
-                [
-                    tag.extract()
-                    for tag in soup.find_all(
-                        class_=lambda x: x and undesired_element in x.split()
-                    )
-                ]
-
-            for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
-                [tag.extract() for tag in soup.find_all(undesired_tag)]
-
-            # 200B is ZeroWidthSpace which we don't care for
-            page_text = format_document_soup(soup).replace("\u200B", "")
+            parsed_html = standard_html_cleanup(soup, self.mintlify_cleanup)

             doc_batch.append(
                 Document(
                     id=current_url,
-                    sections=[Section(link=current_url, text=page_text)],
+                    sections=[
+                        Section(link=current_url, text=parsed_html.cleaned_text)
+                    ],
                     source=DocumentSource.WEB,
-                    semantic_identifier=title or current_url,
+                    semantic_identifier=parsed_html.title or current_url,
                     metadata={},
                 )
             )
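With the duplicated cleanup deleted, the web connector and the new Google Sites connector both funnel through standard_html_cleanup; the second positional argument above maps to mintlify_cleanup_enabled. A condensed sketch of the resulting flow (editor's illustration, not part of the diff; the URL is hypothetical):

    import requests
    from bs4 import BeautifulSoup

    from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup

    soup = BeautifulSoup(
        requests.get("https://docs.example.com/page").text, "html.parser"
    )
    parsed_html = standard_html_cleanup(soup, mintlify_cleanup_enabled=True)
    title = parsed_html.title             # replaces the manual <title> handling
    page_text = parsed_html.cleaned_text  # replaces the inline class/tag stripping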
BIN  web/public/GoogleSites.png (new file, 5.4 KiB; binary not shown)
@@ -8,7 +8,6 @@ import { fetcher } from "@/lib/fetcher";
 import { HealthCheckBanner } from "@/components/health/healthcheck";
 import { ConnectorIndexingStatus, FileConfig } from "@/lib/types";
 import { linkCredential } from "@/lib/credential";
-import { FileUpload } from "./FileUpload";
 import { useState } from "react";
 import { usePopup } from "@/components/admin/connectors/Popup";
 import { createConnector, runConnector } from "@/lib/connector";
@@ -17,6 +16,7 @@ import { SingleUseConnectorsTable } from "@/components/admin/connectors/table/Si
 import { LoadingAnimation } from "@/components/Loading";
 import { Form, Formik } from "formik";
 import { TextFormField } from "@/components/admin/connectors/Field";
+import { FileUpload } from "@/components/admin/connectors/FileUpload";

 const getNameFromPath = (path: string) => {
   const pathParts = path.split("/");
web/src/app/admin/connectors/google-sites/page.tsx (new file, 241 lines)
@@ -0,0 +1,241 @@
"use client";

import useSWR, { useSWRConfig } from "swr";
import * as Yup from "yup";

import { LoadingAnimation } from "@/components/Loading";
import { GoogleSitesIcon } from "@/components/icons/icons";
import { fetcher } from "@/lib/fetcher";
import { TextFormField } from "@/components/admin/connectors/Field";
import { HealthCheckBanner } from "@/components/health/healthcheck";
import { ConnectorIndexingStatus, GoogleSitesConfig } from "@/lib/types";
import { Form, Formik } from "formik";
import { useState } from "react";
import { usePopup } from "@/components/admin/connectors/Popup";
import { createConnector, runConnector } from "@/lib/connector";
import { linkCredential } from "@/lib/credential";
import { FileUpload } from "@/components/admin/connectors/FileUpload";
import { SingleUseConnectorsTable } from "@/components/admin/connectors/table/SingleUseConnectorsTable";
import { Spinner } from "@/components/Spinner";

export default function GoogleSites() {
  const { mutate } = useSWRConfig();
  const [selectedFiles, setSelectedFiles] = useState<File[]>([]);
  const [filesAreUploading, setFilesAreUploading] = useState<boolean>(false);
  const { popup, setPopup } = usePopup();

  const {
    data: connectorIndexingStatuses,
    isLoading: isConnectorIndexingStatusesLoading,
    error: isConnectorIndexingStatusesError,
  } = useSWR<ConnectorIndexingStatus<any, any>[]>(
    "/api/manage/admin/connector/indexing-status",
    fetcher
  );

  const googleSitesIndexingStatuses: ConnectorIndexingStatus<
    GoogleSitesConfig,
    {}
  >[] =
    connectorIndexingStatuses?.filter(
      (connectorIndexingStatus) =>
        connectorIndexingStatus.connector.source === "google_sites"
    ) ?? [];

  return (
    <>
      {popup}
      {filesAreUploading && <Spinner />}
      <div className="mx-auto container">
        <div className="mb-4">
          <HealthCheckBanner />
        </div>
        <div className="border-solid border-gray-600 border-b pb-2 mb-4 flex">
          <GoogleSitesIcon size={32} />
          <h1 className="text-3xl font-bold pl-2">Google Sites</h1>
        </div>
        <p className="text-sm mb-2">
          For an in-depth guide on how to set up this connector, check out{" "}
          <a
            href="https://docs.danswer.dev/connectors/google-sites"
            target="_blank"
            className="text-blue-500"
          >
            the documentation
          </a>
          .
        </p>

        <div className="mt-4">
          <h2 className="font-bold text-xl mb-2">Upload Files</h2>
          <div className="mx-auto w-full">
            <Formik
              initialValues={{
                base_url: "",
              }}
              validationSchema={Yup.object().shape({
                base_url: Yup.string().required("Base URL is required"),
              })}
              onSubmit={async (values, formikHelpers) => {
                const uploadCreateAndTriggerConnector = async () => {
                  const formData = new FormData();

                  selectedFiles.forEach((file) => {
                    formData.append("files", file);
                  });

                  const response = await fetch(
                    "/api/manage/admin/connector/file/upload",
                    { method: "POST", body: formData }
                  );
                  const responseJson = await response.json();
                  if (!response.ok) {
                    setPopup({
                      message: `Unable to upload files - ${responseJson.detail}`,
                      type: "error",
                    });
                    return;
                  }

                  const filePaths = responseJson.file_paths as string[];
                  const [connectorErrorMsg, connector] =
                    await createConnector<GoogleSitesConfig>({
                      name: `GoogleSitesConnector-${values.base_url}`,
                      source: "google_sites",
                      input_type: "load_state",
                      connector_specific_config: {
                        base_url: values.base_url,
                        zip_path: filePaths[0],
                      },
                      refresh_freq: null,
                      disabled: false,
                    });
                  if (connectorErrorMsg || !connector) {
                    setPopup({
                      message: `Unable to create connector - ${connectorErrorMsg}`,
                      type: "error",
                    });
                    return;
                  }

                  const credentialResponse = await linkCredential(
                    connector.id,
                    0,
                    values.base_url
                  );
                  if (!credentialResponse.ok) {
                    const credentialResponseJson =
                      await credentialResponse.json();
                    setPopup({
                      message: `Unable to link connector to credential - ${credentialResponseJson.detail}`,
                      type: "error",
                    });
                    return;
                  }

                  const runConnectorErrorMsg = await runConnector(
                    connector.id,
                    [0]
                  );
                  if (runConnectorErrorMsg) {
                    setPopup({
                      message: `Unable to run connector - ${runConnectorErrorMsg}`,
                      type: "error",
                    });
                    return;
                  }

                  mutate("/api/manage/admin/connector/indexing-status");
                  setSelectedFiles([]);
                  formikHelpers.resetForm();
                  setPopup({
                    type: "success",
                    message: "Successfully uploaded files!",
                  });
                };

                setFilesAreUploading(true);
                try {
                  await uploadCreateAndTriggerConnector();
                } catch (e) {
                  console.log("Failed to index files: ", e);
                }
                setFilesAreUploading(false);
              }}
            >
              {({ values, isSubmitting }) => (
                <Form className="p-3 border border-gray-600 rounded">
                  <TextFormField
                    name="base_url"
                    label="Base URL:"
                    placeholder={`Base URL of your Google Site e.g. https://sites.google.com/view/your-site`}
                    subtext="This will be used to generate links for each page."
                    autoCompleteDisabled={true}
                  />

                  <p className="mb-1 font-medium">Files:</p>
                  <FileUpload
                    selectedFiles={selectedFiles}
                    setSelectedFiles={setSelectedFiles}
                    message="Upload a zip file containing the HTML of your Google Site"
                  />
                  <button
                    className={
                      "bg-slate-500 hover:bg-slate-700 text-white " +
                      "font-bold py-2 px-4 rounded focus:outline-none " +
                      "focus:shadow-outline w-full mx-auto mt-4"
                    }
                    type="submit"
                    disabled={
                      selectedFiles.length !== 1 ||
                      !values.base_url ||
                      isSubmitting
                    }
                  >
                    Upload!
                  </button>
                </Form>
              )}
            </Formik>
          </div>
        </div>

        <h2 className="font-bold mb-2 mt-6 ml-auto mr-auto">
          Existing Google Site Connectors
        </h2>
        {isConnectorIndexingStatusesLoading ? (
          <LoadingAnimation text="Loading" />
        ) : isConnectorIndexingStatusesError || !connectorIndexingStatuses ? (
          <div>Error loading indexing history</div>
        ) : googleSitesIndexingStatuses.length > 0 ? (
          <SingleUseConnectorsTable<GoogleSitesConfig, {}>
            connectorIndexingStatuses={googleSitesIndexingStatuses}
            specialColumns={[
              {
                header: "Base URL",
                key: "base_url",
                getValue: (ccPairStatus) => {
                  const connectorConfig =
                    ccPairStatus.connector.connector_specific_config;
                  return (
                    <a
                      className="text-blue-500"
                      href={connectorConfig.base_url}
                    >
                      {connectorConfig.base_url}
                    </a>
                  );
                },
              },
            ]}
            onUpdate={() =>
              mutate("/api/manage/admin/connector/indexing-status")
            }
          />
        ) : (
          <p className="text-sm">No indexed Google Sites found</p>
        )}
      </div>
    </>
  );
}
@@ -22,6 +22,7 @@ import {
   HubSpotIcon,
   BookmarkIcon,
   CPUIcon,
+  GoogleSitesIcon,
 } from "@/components/icons/icons";
 import { getAuthDisabledSS, getCurrentUserSS } from "@/lib/userSS";
 import { redirect } from "next/navigation";
@@ -173,6 +174,15 @@ export async function Layout({ children }: { children: React.ReactNode }) {
             ),
             link: "/admin/connectors/zulip",
           },
+          {
+            name: (
+              <div className="flex">
+                <GoogleSitesIcon size={16} />
+                <div className="ml-1">Google Sites</div>
+              </div>
+            ),
+            link: "/admin/connectors/google-sites",
+          },
           {
             name: (
               <div className="flex">
@@ -1,16 +1,17 @@
-// components/FileUpload.tsx
-import { ChangeEvent, FC, useState } from "react";
+import { FC, useState } from "react";
 import React from "react";
 import Dropzone from "react-dropzone";

 interface FileUploadProps {
   selectedFiles: File[];
   setSelectedFiles: (files: File[]) => void;
+  message?: string;
 }

 export const FileUpload: FC<FileUploadProps> = ({
   selectedFiles,
   setSelectedFiles,
+  message,
 }) => {
   const [dragActive, setDragActive] = useState(false);
@@ -35,7 +36,10 @@ export const FileUpload: FC<FileUploadProps> = ({
           }
         >
           <input {...getInputProps()} />
-          <b>Drag and drop some files here, or click to select files</b>
+          <b>
+            {message ||
+              "Drag and drop some files here, or click to select files"}
+          </b>
         </div>
       </section>
     )}
@@ -1,10 +1,4 @@
-import {
-  Connector,
-  ConnectorIndexingStatus,
-  Credential,
-  DeletionAttemptSnapshot,
-  ValidStatuses,
-} from "@/lib/types";
+import { DeletionAttemptSnapshot, ValidStatuses } from "@/lib/types";
 import { BasicTable } from "@/components/admin/connectors/BasicTable";
 import { Popup } from "@/components/admin/connectors/Popup";
 import { useState } from "react";
@@ -64,17 +58,19 @@ export function SingleUseConnectorsTable<
   const connectorIncludesCredential =
     getCredential !== undefined && onCredentialLink !== undefined;

-  const columns = [
-    {
-      header: "Name",
-      key: "name",
-    },
-    ...(specialColumns ?? []),
-    {
-      header: "Status",
-      key: "status",
-    },
-  ];
+  const columns = [];
+
+  if (includeName) {
+    columns.push({
+      header: "Name",
+      key: "name",
+    });
+  }
+  columns.push(...(specialColumns ?? []));
+  columns.push({
+    header: "Status",
+    key: "status",
+  });
   if (connectorIncludesCredential) {
     columns.push({
       header: "Credential",
@@ -43,6 +43,7 @@ import gongIcon from "../../../public/Gong.png";
 import zulipIcon from "../../../public/Zulip.png";
 import linearIcon from "../../../public/Linear.png";
 import hubSpotIcon from "../../../public/HubSpot.png";
+import googleSitesIcon from "../../../public/GoogleSites.png";

 interface IconProps {
   size?: number;
@@ -450,3 +451,17 @@ export const HubSpotIcon = ({
     </div>
   );
 };
+
+export const GoogleSitesIcon = ({
+  size = 16,
+  className = defaultTailwindCSS,
+}: IconProps) => {
+  return (
+    <div
+      style={{ width: `${size}px`, height: `${size}px` }}
+      className={`w-[${size}px] h-[${size}px] ` + className}
+    >
+      <Image src={googleSitesIcon} alt="Logo" width="96" height="96" />
+    </div>
+  );
+};
@@ -29,6 +29,7 @@ const sources: Source[] = [
   { displayName: "Zulip", internalName: "zulip" },
   { displayName: "Linear", internalName: "linear" },
   { displayName: "HubSpot", internalName: "hubspot" },
+  { displayName: "Google Sites", internalName: "google_sites" },
 ];

 interface SourceSelectorProps {
@@ -16,6 +16,7 @@ import {
   SlackIcon,
   ZulipIcon,
   HubSpotIcon,
+  GoogleSitesIcon,
 } from "./icons/icons";

 interface SourceMetadata {
@@ -122,6 +123,12 @@ export const getSourceMetadata = (sourceType: ValidSources): SourceMetadata => {
         displayName: "HubSpot",
         adminPageLink: "/admin/connectors/hubspot",
       };
+    case "google_sites":
+      return {
+        icon: GoogleSitesIcon,
+        displayName: "Google Sites",
+        adminPageLink: "/admin/connectors/google-sites",
+      };
     default:
       throw new Error("Invalid source type");
   }
@@ -23,7 +23,8 @@ export type ValidSources =
   | "zulip"
   | "linear"
   | "hubspot"
-  | "file";
+  | "file"
+  | "google_sites";
 export type ValidInputTypes = "load_state" | "poll" | "event";
 export type ValidStatuses =
   | "success"
@@ -114,6 +115,11 @@ export interface NotionConfig {}

 export interface HubSpotConfig {}

+export interface GoogleSitesConfig {
+  zip_path: string;
+  base_url: string;
+}
+
 export interface IndexAttemptSnapshot {
   status: ValidStatuses | null;
   num_docs_indexed: number;