Allow specification of specific google drive folders to index (#197)

This commit is contained in:
Chris Weaver 2023-07-17 14:51:16 -07:00 committed by GitHub
parent bc24ac53c0
commit 3b1a8274a9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 278 additions and 244 deletions

View File

@ -1,6 +1,7 @@
import datetime
import io
from collections.abc import Generator
from itertools import chain
from typing import Any
from danswer.configs.app_configs import GOOGLE_DRIVE_INCLUDE_SHARED
@ -23,25 +24,49 @@ from PyPDF2 import PdfReader
logger = setup_logger()
SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]
SCOPES = [
"https://www.googleapis.com/auth/drive.readonly",
"https://www.googleapis.com/auth/drive.metadata.readonly",
]
# MIME types of Drive files that are kept for indexing; get_file_batches only
# yields files whose "mimeType" is in this list.
SUPPORTED_DRIVE_DOC_TYPES = [
"application/vnd.google-apps.document",
"application/pdf",
"application/vnd.google-apps.spreadsheet",
]
# MIME type used to recognise folders, both when looking a folder up by name
# and when collecting sub-folders to traverse.
DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder"
# Dictionary key names. NOTE(review): their consumers are not visible in this
# excerpt - confirm where they are used before renaming.
ID_KEY = "id"
LINK_KEY = "link"
TYPE_KEY = "type"
def get_folder_id(
    service: discovery.Resource, parent_id: str, folder_name: str
) -> str | None:
    """
    Get the ID of a folder given its name and the ID of its parent folder.

    Args:
        service: Drive API client used to run the ``files().list`` query.
        parent_id: ID of the folder to search in (callers pass "root" for the
            top level of the drive).
        folder_name: Exact name of the folder to look for. May contain single
            quotes or backslashes; they are escaped before being placed in the
            query.

    Returns:
        The ID of the first matching folder, or None if no folder with that
        name exists directly under ``parent_id``.
    """
    # The Drive query language wraps string values in single quotes, so a raw
    # quote in the folder name (e.g. "Bob's Files") would terminate the literal
    # early and produce an invalid query. Backslashes are escaped first so the
    # escapes added for quotes are not themselves doubled.
    escaped_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")
    query = (
        f"'{parent_id}' in parents and name='{escaped_name}' "
        f"and mimeType='{DRIVE_FOLDER_TYPE}'"
    )
    results = (
        service.files()
        .list(q=query, spaces="drive", fields="nextPageToken, files(id, name)")
        .execute()
    )
    items = results.get("files", [])
    # If several folders share the name, the first one returned by the API wins.
    return items[0]["id"] if items else None
def get_file_batches(
service: discovery.Resource,
include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
batch_size: int = INDEX_BATCH_SIZE,
time_range_start: SecondsSinceUnixEpoch | None = None,
time_range_end: SecondsSinceUnixEpoch | None = None,
folder_id: str | None = None, # if specified, only fetches files within this folder
# if True, will fetch files in sub-folders of the specified folder ID. Only applies if folder_id is specified.
traverse_subfolders: bool = True,
) -> Generator[list[dict[str, str]], None, None]:
next_page_token = ""
subfolders: list[dict[str, str]] = []
while next_page_token is not None:
query = ""
if time_range_start is not None:
@ -53,7 +78,10 @@ def get_file_batches(
time_stop = (
datetime.datetime.utcfromtimestamp(time_range_end).isoformat() + "Z"
)
query += f"and modifiedTime <= '{time_stop}'"
query += f"and modifiedTime <= '{time_stop}' "
if folder_id:
query += f"and '{folder_id}' in parents "
query = query.rstrip() # remove the trailing space(s)
results = (
service.files()
@ -69,14 +97,30 @@ def get_file_batches(
next_page_token = results.get("nextPageToken")
files = results["files"]
valid_files: list[dict[str, str]] = []
for file in files:
if file["mimeType"] in SUPPORTED_DRIVE_DOC_TYPES:
valid_files.append(file)
elif file["mimeType"] == DRIVE_FOLDER_TYPE:
subfolders.append(file)
logger.info(
f"Parseable Documents in batch: {[file['name'] for file in valid_files]}"
)
yield valid_files
if traverse_subfolders:
for subfolder in subfolders:
logger.info("Fetching all files in subfolder: " + subfolder["name"])
yield from get_file_batches(
service=service,
include_shared=include_shared,
batch_size=batch_size,
time_range_start=time_range_start,
time_range_end=time_range_end,
folder_id=subfolder["id"],
traverse_subfolders=traverse_subfolders,
)
def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
mime_type = file["mimeType"]
@ -105,13 +149,36 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
class GoogleDriveConnector(LoadConnector, PollConnector):
def __init__(
    self,
    # Optional list of "/"-separated folder paths, e.g. ["My Folder/My Subfolder"].
    # When given, only files inside these folders are indexed.
    folder_paths: list[str] | None = None,
    batch_size: int = INDEX_BATCH_SIZE,
    include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
) -> None:
    # Credentials start out unset (presumably filled in by load_credentials).
    self.creds: Credentials | None = None
    self.include_shared = include_shared
    self.batch_size = batch_size
    # A missing (or empty) argument is normalised to an empty list.
    self.folder_paths = folder_paths if folder_paths else []
@staticmethod
def _process_folder_paths(
    service: discovery.Resource, folder_paths: list[str]
) -> list[str]:
    """['Folder/Sub Folder'] -> ['<FOLDER_ID>']

    Resolves each "/"-separated folder path to a Drive folder ID by walking it
    one folder at a time, starting from the drive root.

    Args:
        service: Drive API client passed through to ``get_folder_id``.
        folder_paths: Paths such as "Engineering/Materials". Leading, trailing
            and repeated slashes are ignored.

    Returns:
        One folder ID per input path, in the same order.

    Raises:
        ValueError: if a path contains no folder names, or if any folder along
            a path cannot be found.
    """
    folder_ids: list[str] = []
    for path in folder_paths:
        # Drop empty segments so "/Folder/Sub/" or "Folder//Sub" resolve like
        # "Folder/Sub" instead of looking up a folder with an empty name.
        folder_names = [name for name in path.split("/") if name]
        if not folder_names:
            raise ValueError(f"Folder path '{path}' not found in Google Drive")

        current_id = "root"
        for folder_name in folder_names:
            found_id = get_folder_id(
                service=service, parent_id=current_id, folder_name=folder_name
            )
            if found_id is None:
                raise ValueError(f"Folder path '{path}' not found in Google Drive")
            current_id = found_id
        folder_ids.append(current_id)
    return folder_ids
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
access_token_json_str = credentials[DB_CREDENTIALS_DICT_KEY]
creds = get_drive_tokens(token_json_str=access_token_json_str)
@ -132,13 +199,25 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
raise PermissionError("Not logged into Google Drive")
service = discovery.build("drive", "v3", credentials=self.creds)
for files_batch in get_file_batches(
service,
self.include_shared,
self.batch_size,
time_range_start=start,
time_range_end=end,
):
folder_ids = self._process_folder_paths(service, self.folder_paths)
if not folder_ids:
folder_ids = [None]
file_batches = chain(
*[
get_file_batches(
service=service,
include_shared=self.include_shared,
batch_size=self.batch_size,
time_range_start=start,
time_range_end=end,
folder_id=folder_id,
traverse_subfolders=True,
)
for folder_id in folder_ids
]
)
for files_batch in file_batches:
doc_batch = []
for file in files_batch:
text_contents = extract_text(file, service)

View File

@ -1,5 +1,6 @@
"use client";
import * as Yup from "yup";
import { GoogleDriveIcon } from "@/components/icons/icons";
import useSWR, { useSWRConfig } from "swr";
import { fetcher } from "@/lib/fetcher";
@ -10,18 +11,19 @@ import { useState } from "react";
import { HealthCheckBanner } from "@/components/health/healthcheck";
import { Button } from "@/components/Button";
import {
Connector,
ConnectorBase,
ConnectorIndexingStatus,
Credential,
GoogleDriveConfig,
GoogleDriveCredentialJson,
} from "@/lib/types";
import { deleteConnector, deleteConnectorIfExists } from "@/lib/connector";
import { deleteConnector } from "@/lib/connector";
import { StatusRow } from "@/components/admin/connectors/table/ConnectorsTable";
import { setupGoogleDriveOAuth } from "@/lib/googleDrive";
import Cookies from "js-cookie";
import { GOOGLE_DRIVE_AUTH_IS_ADMIN_COOKIE_NAME } from "@/lib/constants";
import { deleteCredential, linkCredential } from "@/lib/credential";
import { ConnectorForm } from "@/components/admin/connectors/ConnectorForm";
import { TextArrayFieldBuilder } from "@/components/admin/connectors/Field";
const AppCredentialUpload = ({
setPopup,
@ -97,7 +99,7 @@ interface GoogleDriveConnectorManagementProps {
googleDrivePublicCredential:
| Credential<GoogleDriveCredentialJson>
| undefined;
googleDriveConnectorIndexingStatus: ConnectorIndexingStatus<{}> | null;
googleDriveConnectorIndexingStatus: ConnectorIndexingStatus<GoogleDriveConfig> | null;
credentialIsLinked: boolean;
setPopup: (popupSpec: PopupSpec | null) => void;
}
@ -125,79 +127,50 @@ const GoogleDriveConnectorManagement = ({
return (
<>
<p className="text-sm mb-2">
Click the button below to create a connector. We will refresh the
Fill out the form below to create a connector. We will refresh the
latest documents from Google Drive every <b>10</b> minutes.
</p>
<Button
onClick={async () => {
// best effort check to see if existing connector exists
// delete it if it does, the current assumption is that only
// one google drive connector will exist at a time
const errorMsg = await deleteConnectorIfExists({
source: "google_drive",
});
if (errorMsg) {
setPopup({
message: `Unable to delete existing connector - ${errorMsg}`,
type: "error",
});
return;
<div className="border-solid border-gray-600 border rounded-md p-6 mt-4">
<h2 className="font-bold mb-3">Add Connector</h2>
<ConnectorForm<GoogleDriveConfig>
nameBuilder={(values) =>
`GoogleDriveConnector-${values.folder_paths.join("_")}`
}
const connectorBase: ConnectorBase<{}> = {
name: "GoogleDriveConnector",
input_type: "poll",
source: "google_drive",
connector_specific_config: {},
refresh_freq: 60 * 10, // 10 minutes
disabled: false,
};
const connectorCreationResponse = await fetch(
`/api/manage/admin/connector`,
{
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify(connectorBase),
source="google_drive"
inputType="poll"
formBodyBuilder={TextArrayFieldBuilder({
name: "folder_paths",
label: "Folder paths:",
subtext:
"Specify 0 or more folder paths to index! For example, specifying the path " +
"'Engineering/Materials' will cause us to only index all files contained " +
"within the 'Materials' folder within the 'Engineering' folder. " +
"If no folder paths are specified, we will index all documents in your drive.",
})}
validationSchema={Yup.object().shape({
folder_paths: Yup.array()
.of(
Yup.string().required(
"Please specify a folder path for your google drive e.g. 'Engineering/Materials'"
)
)
.required(),
})}
initialValues={{
folder_paths: [],
}}
refreshFreq={10 * 60} // 10 minutes
onSubmit={async (isSuccess, responseJson) => {
if (isSuccess && responseJson) {
await linkCredential(
responseJson.id,
googleDrivePublicCredential.id
);
mutate("/api/manage/admin/connector/indexing-status");
}
);
if (!connectorCreationResponse.ok) {
setPopup({
message: `Failed to create connector - ${connectorCreationResponse.status}`,
type: "error",
});
return;
}
const connector =
(await connectorCreationResponse.json()) as Connector<{}>;
const credentialLinkResponse = await fetch(
`/api/manage/connector/${connector.id}/credential/${googleDrivePublicCredential.id}`,
{
method: "PUT",
headers: {
"Content-Type": "application/json",
},
}
);
if (!credentialLinkResponse.ok) {
setPopup({
message: `Failed to link connector to credential - ${credentialLinkResponse.status}`,
type: "error",
});
return;
}
setPopup({
message: "Successfully created connector!",
type: "success",
});
mutate("/api/manage/admin/connector/indexing-status");
}}
>
Add
</Button>
}}
/>
</div>
</>
);
}
@ -231,8 +204,8 @@ const GoogleDriveConnectorManagement = ({
return (
<div>
<div className="text-sm mb-2">
<div className="flex mb-1">
<div className="text-sm">
<div className="flex">
The Google Drive connector is setup! <b className="mx-2">Status:</b>{" "}
<StatusRow
connectorIndexingStatus={googleDriveConnectorIndexingStatus}
@ -246,7 +219,22 @@ const GoogleDriveConnectorManagement = ({
}}
/>
</div>
<p>
{googleDriveConnectorIndexingStatus.connector.connector_specific_config
.folder_paths.length > 0 && (
<div className="mt-3">
It is setup to index the following folders:{" "}
<div className="mx-2">
{googleDriveConnectorIndexingStatus.connector.connector_specific_config.folder_paths.map(
(path) => (
<div key={path}>
- <i>{path}</i>
</div>
)
)}
</div>
</div>
)}
<p className="mt-3">
Checkout the{" "}
<a href="/admin/indexing/status" className="text-blue-500">
status page
@ -274,158 +262,12 @@ const GoogleDriveConnectorManagement = ({
}
);
}}
className="mt-2"
>
Delete Connector
</Button>
</div>
);
// return (
// <>
// {googleDrivePublicCredential ? (
// googleDriveConnectorIndexingStatus ? (
// credentialIsLinked ? (
// <div>
// <div className="text-sm mb-2">
// <div className="flex mb-1">
// The Google Drive connector is setup!{" "}
// <b className="mx-2">Status:</b>{" "}
// <StatusRow
// connectorIndexingStatus={googleDriveConnectorIndexingStatus}
// hasCredentialsIssue={
// googleDriveConnectorIndexingStatus.connector
// .credential_ids.length === 0
// }
// setPopup={setPopup}
// onUpdate={() => {
// mutate("/api/manage/admin/connector/indexing-status");
// }}
// />
// </div>
// <p>
// Checkout the{" "}
// <a href="/admin/indexing/status" className="text-blue-500">
// status page
// </a>{" "}
// for the latest indexing status. We fetch the latest documents
// from Google Drive every <b>10</b> minutes.
// </p>
// </div>
// <Button
// onClick={() => {
// deleteConnector(
// googleDriveConnectorIndexingStatus.connector.id
// ).then(() => {
// setPopup({
// message: "Successfully deleted connector!",
// type: "success",
// });
// mutate("/api/manage/admin/connector/indexing-status");
// });
// }}
// >
// Delete Connector
// </Button>
// </div>
// ) : (
// <>
// <p className="text-sm mb-2">
// Click the button below to link your credentials! Once this is
// done, all public documents in your Google Drive will be
// searchable. We will refresh the latest documents every <b>10</b>{" "}
// minutes.
// </p>
// <Button
// onClick={async () => {
// await linkCredential(
// googleDriveConnectorIndexingStatus.connector.id,
// googleDrivePublicCredential.id
// );
// setPopup({
// message: "Successfully linked credentials!",
// type: "success",
// });
// mutate("/api/manage/admin/connector/indexing-status");
// }}
// >
// Link Credentials
// </Button>
// </>
// )
// ) : (
// <>
// <p className="text-sm mb-2">
// Click the button below to create a connector. We will refresh the
// latest documents from Google Drive every <b>10</b> minutes.
// </p>
// <Button
// onClick={async () => {
// // if (connector.)
// const connectorBase: ConnectorBase<{}> = {
// name: "GoogleDriveConnector",
// input_type: "load_state",
// source: "google_drive",
// connector_specific_config: {},
// refresh_freq: 60 * 10, // 10 minutes
// disabled: false,
// };
// const connectorCreationResponse = await fetch(
// `/api/manage/admin/connector`,
// {
// method: "POST",
// headers: {
// "Content-Type": "application/json",
// },
// body: JSON.stringify(connectorBase),
// }
// );
// if (!connectorCreationResponse.ok) {
// setPopup({
// message: `Failed to create connector - ${connectorCreationResponse.status}`,
// type: "error",
// });
// return;
// }
// const connector =
// (await connectorCreationResponse.json()) as Connector<{}>;
// const credentialLinkResponse = await fetch(
// `/api/manage/connector/${connector.id}/credential/${googleDrivePublicCredential.id}`,
// {
// method: "PUT",
// headers: {
// "Content-Type": "application/json",
// },
// }
// );
// if (!credentialLinkResponse.ok) {
// setPopup({
// message: `Failed to link connector to credential - ${credentialLinkResponse.status}`,
// type: "error",
// });
// return;
// }
// setPopup({
// message: "Successfully created connector!",
// type: "success",
// });
// mutate("/api/manage/admin/connector/indexing-status");
// }}
// >
// Add
// </Button>
// </>
// )
// ) : (
// <p className="text-sm">
// Please authenticate with Google Drive as described in Step 2! Once
// done with that, you can then move on to enable this connector.
// </p>
// )}
// </>
// );
};
const Main = () => {
@ -510,7 +352,7 @@ const Main = () => {
(credential) =>
credential.credential_json?.google_drive_tokens && credential.public_doc
);
const googleDriveConnectorIndexingStatuses: ConnectorIndexingStatus<{}>[] =
const googleDriveConnectorIndexingStatuses: ConnectorIndexingStatus<GoogleDriveConfig>[] =
connectorIndexingStatuses.filter(
(connectorIndexingStatus) =>
connectorIndexingStatus.connector.source === "google_drive"

View File

@ -7,7 +7,6 @@ import useSWR, { useSWRConfig } from "swr";
import { LoadingAnimation } from "@/components/Loading";
import { HealthCheckBanner } from "@/components/health/healthcheck";
import {
Connector,
SlackConfig,
Credential,
SlackCredentialJson,

View File

@ -1,6 +1,7 @@
interface Props {
children: JSX.Element | string;
onClick?: () => void;
onClick?: React.MouseEventHandler<HTMLButtonElement>;
type?: "button" | "submit" | "reset";
disabled?: boolean;
fullWidth?: boolean;
className?: string;
@ -9,6 +10,7 @@ interface Props {
export const Button = ({
children,
onClick,
type = "submit",
disabled = false,
fullWidth = false,
className = "",
@ -26,6 +28,7 @@ export const Button = ({
className
}
onClick={onClick}
type={type}
disabled={disabled}
>
{children}

View File

@ -8,6 +8,8 @@ import {
ValidInputTypes,
ValidSources,
} from "@/lib/types";
import { deleteConnectorIfExists } from "@/lib/connector";
import { FormBodyBuilder, RequireAtLeastOne } from "./types";
export async function submitConnector<T>(
connector: ConnectorBase<T>
@ -35,28 +37,36 @@ export async function submitConnector<T>(
}
}
interface Props<T extends Yup.AnyObject> {
interface BaseProps<T extends Yup.AnyObject> {
nameBuilder: (values: T) => string;
source: ValidSources;
inputType: ValidInputTypes;
credentialId?: number;
formBody: JSX.Element | null;
// If both are specified, uses formBody
formBody?: JSX.Element | null;
formBodyBuilder?: FormBodyBuilder<T>;
validationSchema: Yup.ObjectSchema<T>;
initialValues: T;
onSubmit: (isSuccess: boolean, responseJson?: Connector<T>) => void;
refreshFreq?: number;
}
/**
 * Props accepted by `ConnectorForm`: all of `BaseProps`, with the additional
 * compile-time requirement that at least one of `formBody` or
 * `formBodyBuilder` is supplied.
 */
type ConnectorFormProps<T extends Yup.AnyObject> = RequireAtLeastOne<
BaseProps<T>,
"formBody" | "formBodyBuilder"
>;
export function ConnectorForm<T extends Yup.AnyObject>({
nameBuilder,
source,
inputType,
formBody,
formBodyBuilder,
validationSchema,
initialValues,
refreshFreq,
onSubmit,
}: Props<T>): JSX.Element {
}: ConnectorFormProps<T>): JSX.Element {
const [popup, setPopup] = useState<{
message: string;
type: "success" | "error";
@ -71,6 +81,20 @@ export function ConnectorForm<T extends Yup.AnyObject>({
onSubmit={async (values, formikHelpers) => {
formikHelpers.setSubmitting(true);
// best effort check to see if a connector for this source already exists
// and delete it if it does. NOTE: this runs for every source that uses this
// form, so it assumes only one connector per source will exist at a time
const errorMsg = await deleteConnectorIfExists({
source,
});
if (errorMsg) {
setPopup({
message: `Unable to delete existing connector - ${errorMsg}`,
type: "error",
});
return;
}
const { message, isSuccess, response } = await submitConnector<T>({
name: nameBuilder(values),
source,
@ -91,9 +115,9 @@ export function ConnectorForm<T extends Yup.AnyObject>({
onSubmit(isSuccess, response);
}}
>
{({ isSubmitting }) => (
{({ isSubmitting, values }) => (
<Form>
{formBody}
{formBody ? formBody : formBodyBuilder && formBodyBuilder(values)}
<div className="flex">
<button
type="submit"

View File

@ -1,4 +1,7 @@
import { ErrorMessage, Field } from "formik";
import { Button } from "@/components/Button";
import { ErrorMessage, Field, FieldArray } from "formik";
import * as Yup from "yup";
import { FormBodyBuilder } from "./types";
interface TextFormFieldProps {
name: string;
@ -33,3 +36,70 @@ export const TextFormField = ({
</div>
);
};
/** Options for `TextArrayFieldBuilder`. */
interface TextArrayFieldProps {
/** Form field name; used as the `FieldArray` name and as the key into the form values. */
name: string;
/** Text shown in the `<label>` above the inputs. */
label: string;
/** Optional helper text rendered in small print under the label. */
subtext?: string;
/** `type` attribute passed to each generated input; the builder defaults it to "text". */
type?: string;
}
/**
 * Builds a form-body renderer for an editable list of text inputs backed by a
 * Formik `FieldArray`.
 *
 * The returned function takes the current form values and renders one input
 * (with a "Remove" button and its validation error) per entry of
 * `values[name]`, followed by an "Add New" button that appends an empty entry.
 *
 * @param name - Form field holding the string array.
 * @param label - Text for the label above the inputs.
 * @param subtext - Optional helper text shown under the label.
 * @param type - `type` attribute for each input; defaults to "text".
 * @returns A `FormBodyBuilder` to pass as `formBodyBuilder`.
 */
export function TextArrayFieldBuilder<T extends Yup.AnyObject>({
  name,
  label,
  subtext,
  type = "text",
}: TextArrayFieldProps): FormBodyBuilder<T> {
  const TextArrayField: FormBodyBuilder<T> = (values) => {
    // Treat a missing or non-array value as "no entries" rather than
    // attempting to call `.map` on it.
    const fieldValue: unknown = values[name];
    const entries: unknown[] = Array.isArray(fieldValue) ? fieldValue : [];

    return (
      <div className="mb-4">
        {/* Point the label at the first input; ids are unique per entry. */}
        <label htmlFor={`${name}.0`} className="block">
          {label}
        </label>
        {subtext && <p className="text-xs">{subtext}</p>}
        <FieldArray
          name={name}
          render={(arrayHelpers) => (
            <div>
              {entries.map((_, index) => {
                // Formik path of this entry, e.g. "folder_paths.2". Also used
                // as the element id so that ids are not duplicated in the DOM.
                const fieldPath = `${name}.${index}`;
                return (
                  // Entries are plain strings with no stable identity, so the
                  // index is the only available key.
                  <div key={index} className="mt-2">
                    <div className="flex">
                      <Field
                        type={type}
                        name={fieldPath}
                        id={fieldPath}
                        className="border bg-slate-700 text-gray-200 border-gray-300 rounded w-full py-2 px-3 mr-2"
                      />
                      <Button
                        type="button"
                        onClick={() => arrayHelpers.remove(index)}
                        className="h-8 my-auto"
                      >
                        Remove
                      </Button>
                    </div>
                    <ErrorMessage
                      name={fieldPath}
                      component="div"
                      className="text-red-500 text-sm mt-1"
                    />
                  </div>
                );
              })}
              <Button
                type="button"
                onClick={() => {
                  arrayHelpers.push("");
                }}
                className="mt-3"
              >
                Add New
              </Button>
            </div>
          )}
        />
      </div>
    );
  };
  return TextArrayField;
}

View File

@ -0,0 +1,13 @@
import * as Yup from "yup";
/**
 * A function that renders the body of a connector form from the form's
 * current values, for form bodies whose markup depends on those values.
 */
export type FormBodyBuilder<T extends Yup.AnyObject> = (
values: T
) => JSX.Element;
/**
 * Makes at least one of the properties named in `Keys` required on `T`.
 *
 * Properties of `T` outside `Keys` are kept unchanged. For `Keys`, the result
 * is a union with one member per key `K`, in which `K` is required and the
 * remaining keys of `Keys` are optional - so a value must provide at least
 * one of them. `Keys` defaults to every key of `T`.
 */
export type RequireAtLeastOne<T, Keys extends keyof T = keyof T> = Pick<
T,
Exclude<keyof T, Keys>
> &
{
[K in Keys]-?: Required<Pick<T, K>> & Partial<Pick<T, Exclude<Keys, K>>>;
}[Keys];

View File

@ -76,12 +76,12 @@ export async function deleteConnectorIfExists({
const connectorsResponse = await fetch("/api/manage/connector");
if (connectorsResponse.ok) {
const connectors = (await connectorsResponse.json()) as Connector<any>[];
const googleDriveConnectors = connectors.filter(
const matchingConnectors = connectors.filter(
(connector) =>
connector.source === source && (!name || connector.name === name)
);
if (googleDriveConnectors.length > 0) {
const errorMsg = await deleteConnector(googleDriveConnectors[0].id);
if (matchingConnectors.length > 0) {
const errorMsg = await deleteConnector(matchingConnectors[0].id);
if (errorMsg) {
return errorMsg;
}

View File

@ -45,6 +45,10 @@ export interface GithubConfig {
repo_name: string;
}
/** Connector-specific configuration for the Google Drive connector. */
export interface GoogleDriveConfig {
/**
 * "/"-separated folder paths to index, e.g. "Engineering/Materials".
 * An empty list means all documents in the drive are indexed.
 */
folder_paths: string[];
}
/** Connector-specific configuration for Bookstack; declares no fields. */
export interface BookstackConfig {}
export interface ConfluenceConfig {