From 3b1a8274a9b3365ef1dc78a98138f6edc3aca341 Mon Sep 17 00:00:00 2001 From: Chris Weaver <25087905+Weves@users.noreply.github.com> Date: Mon, 17 Jul 2023 14:51:16 -0700 Subject: [PATCH] Allow specification of specific google drive folders to index (#197) --- .../connectors/google_drive/connector.py | 97 +++++- .../admin/connectors/google-drive/page.tsx | 290 ++++-------------- web/src/app/admin/connectors/slack/page.tsx | 1 - web/src/components/Button.tsx | 5 +- .../admin/connectors/ConnectorForm.tsx | 34 +- web/src/components/admin/connectors/Field.tsx | 72 ++++- web/src/components/admin/connectors/types.ts | 13 + web/src/lib/connector.ts | 6 +- web/src/lib/types.ts | 4 + 9 files changed, 278 insertions(+), 244 deletions(-) create mode 100644 web/src/components/admin/connectors/types.ts diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py index c0ae51860..e61a61137 100644 --- a/backend/danswer/connectors/google_drive/connector.py +++ b/backend/danswer/connectors/google_drive/connector.py @@ -1,6 +1,7 @@ import datetime import io from collections.abc import Generator +from itertools import chain from typing import Any from danswer.configs.app_configs import GOOGLE_DRIVE_INCLUDE_SHARED @@ -23,25 +24,49 @@ from PyPDF2 import PdfReader logger = setup_logger() -SCOPES = ["https://www.googleapis.com/auth/drive.readonly"] +SCOPES = [ + "https://www.googleapis.com/auth/drive.readonly", + "https://www.googleapis.com/auth/drive.metadata.readonly", +] SUPPORTED_DRIVE_DOC_TYPES = [ "application/vnd.google-apps.document", "application/pdf", "application/vnd.google-apps.spreadsheet", ] +DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder" ID_KEY = "id" LINK_KEY = "link" TYPE_KEY = "type" +def get_folder_id( + service: discovery.Resource, parent_id: str, folder_name: str +) -> str | None: + """ + Get the ID of a folder given its name and the ID of its parent folder. + """ + query = f"'{parent_id}' in parents and name='{folder_name}' and mimeType='{DRIVE_FOLDER_TYPE}'" + results = ( + service.files() + .list(q=query, spaces="drive", fields="nextPageToken, files(id, name)") + .execute() + ) + items = results.get("files", []) + return items[0]["id"] if items else None + + def get_file_batches( service: discovery.Resource, include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED, batch_size: int = INDEX_BATCH_SIZE, time_range_start: SecondsSinceUnixEpoch | None = None, time_range_end: SecondsSinceUnixEpoch | None = None, + folder_id: str | None = None, # if specified, only fetches files within this folder + # if True, will fetch files in sub-folders of the specified folder ID. Only applies if folder_id is specified. + traverse_subfolders: bool = True, ) -> Generator[list[dict[str, str]], None, None]: next_page_token = "" + subfolders: list[dict[str, str]] = [] while next_page_token is not None: query = "" if time_range_start is not None: @@ -53,7 +78,10 @@ def get_file_batches( time_stop = ( datetime.datetime.utcfromtimestamp(time_range_end).isoformat() + "Z" ) - query += f"and modifiedTime <= '{time_stop}'" + query += f"and modifiedTime <= '{time_stop}' " + if folder_id: + query += f"and '{folder_id}' in parents " + query = query.rstrip() # remove the trailing space(s) results = ( service.files() @@ -69,14 +97,30 @@ def get_file_batches( next_page_token = results.get("nextPageToken") files = results["files"] valid_files: list[dict[str, str]] = [] + for file in files: if file["mimeType"] in SUPPORTED_DRIVE_DOC_TYPES: valid_files.append(file) + elif file["mimeType"] == DRIVE_FOLDER_TYPE: + subfolders.append(file) logger.info( f"Parseable Documents in batch: {[file['name'] for file in valid_files]}" ) yield valid_files + if traverse_subfolders: + for subfolder in subfolders: + logger.info("Fetching all files in subfolder: " + subfolder["name"]) + yield from get_file_batches( + service=service, + include_shared=include_shared, + batch_size=batch_size, + time_range_start=time_range_start, + time_range_end=time_range_end, + folder_id=subfolder["id"], + traverse_subfolders=traverse_subfolders, + ) + def extract_text(file: dict[str, str], service: discovery.Resource) -> str: mime_type = file["mimeType"] @@ -105,13 +149,36 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str: class GoogleDriveConnector(LoadConnector, PollConnector): def __init__( self, + # optional list of folder paths e.g. "[My Folder/My Subfolder]" + # if specified, will only index files in these folders + folder_paths: list[str] | None = None, batch_size: int = INDEX_BATCH_SIZE, include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED, ) -> None: + self.folder_paths = folder_paths or [] self.batch_size = batch_size self.include_shared = include_shared self.creds: Credentials | None = None + @staticmethod + def _process_folder_paths( + service: discovery.Resource, folder_paths: list[str] + ) -> list[str]: + """['Folder/Sub Folder'] -> ['']""" + folder_ids: list[str] = [] + for path in folder_paths: + folder_names = path.split("/") + parent_id = "root" + for folder_name in folder_names: + parent_id = get_folder_id( + service=service, parent_id=parent_id, folder_name=folder_name + ) + if parent_id is None: + raise ValueError(f"Folder path '{path}' not found in Google Drive") + folder_ids.append(parent_id) + + return folder_ids + def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: access_token_json_str = credentials[DB_CREDENTIALS_DICT_KEY] creds = get_drive_tokens(token_json_str=access_token_json_str) @@ -132,13 +199,25 @@ class GoogleDriveConnector(LoadConnector, PollConnector): raise PermissionError("Not logged into Google Drive") service = discovery.build("drive", "v3", credentials=self.creds) - for files_batch in get_file_batches( - service, - self.include_shared, - self.batch_size, - time_range_start=start, - time_range_end=end, - ): + folder_ids = self._process_folder_paths(service, self.folder_paths) + if not folder_ids: + folder_ids = [None] + + file_batches = chain( + *[ + get_file_batches( + service=service, + include_shared=self.include_shared, + batch_size=self.batch_size, + time_range_start=start, + time_range_end=end, + folder_id=folder_id, + traverse_subfolders=True, + ) + for folder_id in folder_ids + ] + ) + for files_batch in file_batches: doc_batch = [] for file in files_batch: text_contents = extract_text(file, service) diff --git a/web/src/app/admin/connectors/google-drive/page.tsx b/web/src/app/admin/connectors/google-drive/page.tsx index 990b540c7..404ff5e2c 100644 --- a/web/src/app/admin/connectors/google-drive/page.tsx +++ b/web/src/app/admin/connectors/google-drive/page.tsx @@ -1,5 +1,6 @@ "use client"; +import * as Yup from "yup"; import { GoogleDriveIcon } from "@/components/icons/icons"; import useSWR, { useSWRConfig } from "swr"; import { fetcher } from "@/lib/fetcher"; @@ -10,18 +11,19 @@ import { useState } from "react"; import { HealthCheckBanner } from "@/components/health/healthcheck"; import { Button } from "@/components/Button"; import { - Connector, - ConnectorBase, ConnectorIndexingStatus, Credential, + GoogleDriveConfig, GoogleDriveCredentialJson, } from "@/lib/types"; -import { deleteConnector, deleteConnectorIfExists } from "@/lib/connector"; +import { deleteConnector } from "@/lib/connector"; import { StatusRow } from "@/components/admin/connectors/table/ConnectorsTable"; import { setupGoogleDriveOAuth } from "@/lib/googleDrive"; import Cookies from "js-cookie"; import { GOOGLE_DRIVE_AUTH_IS_ADMIN_COOKIE_NAME } from "@/lib/constants"; import { deleteCredential, linkCredential } from "@/lib/credential"; +import { ConnectorForm } from "@/components/admin/connectors/ConnectorForm"; +import { TextArrayFieldBuilder } from "@/components/admin/connectors/Field"; const AppCredentialUpload = ({ setPopup, @@ -97,7 +99,7 @@ interface GoogleDriveConnectorManagementProps { googleDrivePublicCredential: | Credential | undefined; - googleDriveConnectorIndexingStatus: ConnectorIndexingStatus<{}> | null; + googleDriveConnectorIndexingStatus: ConnectorIndexingStatus | null; credentialIsLinked: boolean; setPopup: (popupSpec: PopupSpec | null) => void; } @@ -125,79 +127,50 @@ const GoogleDriveConnectorManagement = ({ return ( <>

- Click the button below to create a connector. We will refresh the + Fill out the form below to create a connector. We will refresh the latest documents from Google Drive every 10 minutes.

- + }} + /> + ); } @@ -231,8 +204,8 @@ const GoogleDriveConnectorManagement = ({ return (
-
-
+
+
The Google Drive connector is setup! Status:{" "}
-

+ {googleDriveConnectorIndexingStatus.connector.connector_specific_config + .folder_paths.length > 0 && ( +

+ It is setup to index the following folders:{" "} +
+ {googleDriveConnectorIndexingStatus.connector.connector_specific_config.folder_paths.map( + (path) => ( +
+ - {path} +
+ ) + )} +
+
+ )} +

Checkout the{" "} status page @@ -274,158 +262,12 @@ const GoogleDriveConnectorManagement = ({ } ); }} + className="mt-2" > Delete Connector

); - - // return ( - // <> - // {googleDrivePublicCredential ? ( - // googleDriveConnectorIndexingStatus ? ( - // credentialIsLinked ? ( - //
- // - // - //
- // ) : ( - // <> - //

- // Click the button below to link your credentials! Once this is - // done, all public documents in your Google Drive will be - // searchable. We will refresh the latest documents every 10{" "} - // minutes. - //

- // - // - // ) - // ) : ( - // <> - //

- // Click the button below to create a connector. We will refresh the - // latest documents from Google Drive every 10 minutes. - //

- // - // - // ) - // ) : ( - //

- // Please authenticate with Google Drive as described in Step 2! Once - // done with that, you can then move on to enable this connector. - //

- // )} - // - // ); }; const Main = () => { @@ -510,7 +352,7 @@ const Main = () => { (credential) => credential.credential_json?.google_drive_tokens && credential.public_doc ); - const googleDriveConnectorIndexingStatuses: ConnectorIndexingStatus<{}>[] = + const googleDriveConnectorIndexingStatuses: ConnectorIndexingStatus[] = connectorIndexingStatuses.filter( (connectorIndexingStatus) => connectorIndexingStatus.connector.source === "google_drive" diff --git a/web/src/app/admin/connectors/slack/page.tsx b/web/src/app/admin/connectors/slack/page.tsx index c621103c9..99b71f543 100644 --- a/web/src/app/admin/connectors/slack/page.tsx +++ b/web/src/app/admin/connectors/slack/page.tsx @@ -7,7 +7,6 @@ import useSWR, { useSWRConfig } from "swr"; import { LoadingAnimation } from "@/components/Loading"; import { HealthCheckBanner } from "@/components/health/healthcheck"; import { - Connector, SlackConfig, Credential, SlackCredentialJson, diff --git a/web/src/components/Button.tsx b/web/src/components/Button.tsx index 03ba92a1d..651bf340a 100644 --- a/web/src/components/Button.tsx +++ b/web/src/components/Button.tsx @@ -1,6 +1,7 @@ interface Props { children: JSX.Element | string; - onClick?: () => void; + onClick?: React.MouseEventHandler; + type?: "button" | "submit" | "reset"; disabled?: boolean; fullWidth?: boolean; className?: string; @@ -9,6 +10,7 @@ interface Props { export const Button = ({ children, onClick, + type = "submit", disabled = false, fullWidth = false, className = "", @@ -26,6 +28,7 @@ export const Button = ({ className } onClick={onClick} + type={type} disabled={disabled} > {children} diff --git a/web/src/components/admin/connectors/ConnectorForm.tsx b/web/src/components/admin/connectors/ConnectorForm.tsx index 24fa9a4a4..bc7f25332 100644 --- a/web/src/components/admin/connectors/ConnectorForm.tsx +++ b/web/src/components/admin/connectors/ConnectorForm.tsx @@ -8,6 +8,8 @@ import { ValidInputTypes, ValidSources, } from "@/lib/types"; +import { deleteConnectorIfExists } from "@/lib/connector"; +import { FormBodyBuilder, RequireAtLeastOne } from "./types"; export async function submitConnector( connector: ConnectorBase @@ -35,28 +37,36 @@ export async function submitConnector( } } -interface Props { +interface BaseProps { nameBuilder: (values: T) => string; source: ValidSources; inputType: ValidInputTypes; credentialId?: number; - formBody: JSX.Element | null; + // If both are specified, uses formBody + formBody?: JSX.Element | null; + formBodyBuilder?: FormBodyBuilder; validationSchema: Yup.ObjectSchema; initialValues: T; onSubmit: (isSuccess: boolean, responseJson?: Connector) => void; refreshFreq?: number; } +type ConnectorFormProps = RequireAtLeastOne< + BaseProps, + "formBody" | "formBodyBuilder" +>; + export function ConnectorForm({ nameBuilder, source, inputType, formBody, + formBodyBuilder, validationSchema, initialValues, refreshFreq, onSubmit, -}: Props): JSX.Element { +}: ConnectorFormProps): JSX.Element { const [popup, setPopup] = useState<{ message: string; type: "success" | "error"; @@ -71,6 +81,20 @@ export function ConnectorForm({ onSubmit={async (values, formikHelpers) => { formikHelpers.setSubmitting(true); + // best effort check to see if existing connector exists + // delete it if it does, the current assumption is that only + // one google drive connector will exist at a time + const errorMsg = await deleteConnectorIfExists({ + source, + }); + if (errorMsg) { + setPopup({ + message: `Unable to delete existing connector - ${errorMsg}`, + type: "error", + }); + return; + } + const { message, isSuccess, response } = await submitConnector({ name: nameBuilder(values), source, @@ -91,9 +115,9 @@ export function ConnectorForm({ onSubmit(isSuccess, response); }} > - {({ isSubmitting }) => ( + {({ isSubmitting, values }) => (
- {formBody} + {formBody ? formBody : formBodyBuilder && formBodyBuilder(values)}
+
+ +
+ ))} + +
+ )} + /> +
+ ); + return TextArrayField; +} diff --git a/web/src/components/admin/connectors/types.ts b/web/src/components/admin/connectors/types.ts new file mode 100644 index 000000000..2086f17a2 --- /dev/null +++ b/web/src/components/admin/connectors/types.ts @@ -0,0 +1,13 @@ +import * as Yup from "yup"; + +export type FormBodyBuilder = ( + values: T +) => JSX.Element; + +export type RequireAtLeastOne = Pick< + T, + Exclude +> & + { + [K in Keys]-?: Required> & Partial>>; + }[Keys]; diff --git a/web/src/lib/connector.ts b/web/src/lib/connector.ts index 732eb7c13..5a2bfee98 100644 --- a/web/src/lib/connector.ts +++ b/web/src/lib/connector.ts @@ -76,12 +76,12 @@ export async function deleteConnectorIfExists({ const connectorsResponse = await fetch("/api/manage/connector"); if (connectorsResponse.ok) { const connectors = (await connectorsResponse.json()) as Connector[]; - const googleDriveConnectors = connectors.filter( + const matchingConnectors = connectors.filter( (connector) => connector.source === source && (!name || connector.name === name) ); - if (googleDriveConnectors.length > 0) { - const errorMsg = await deleteConnector(googleDriveConnectors[0].id); + if (matchingConnectors.length > 0) { + const errorMsg = await deleteConnector(matchingConnectors[0].id); if (errorMsg) { return errorMsg; } diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts index 4e0be9d72..63de4d183 100644 --- a/web/src/lib/types.ts +++ b/web/src/lib/types.ts @@ -45,6 +45,10 @@ export interface GithubConfig { repo_name: string; } +export interface GoogleDriveConfig { + folder_paths: string[]; +} + export interface BookstackConfig {} export interface ConfluenceConfig {