mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-04-08 20:08:36 +02:00
Google drive shared files fix + shortcuts (#300)
Also fixes foreign key constraint issue when manually wiping postgres + keeps track of accessed folders
This commit is contained in:
parent
11c071da33
commit
81a4934bb8
@ -93,6 +93,7 @@ POSTGRES_DB = os.environ.get("POSTGRES_DB", "postgres")
|
||||
# Connector Configs
|
||||
#####
|
||||
GOOGLE_DRIVE_INCLUDE_SHARED = False
|
||||
GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False
|
||||
FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
|
||||
"FILE_CONNECTOR_TMP_STORAGE_PATH", "/home/file_connector_storage"
|
||||
)
|
||||
|
@ -11,6 +11,7 @@ from google.oauth2.credentials import Credentials # type: ignore
|
||||
from googleapiclient import discovery # type: ignore
|
||||
from PyPDF2 import PdfReader
|
||||
|
||||
from danswer.configs.app_configs import GOOGLE_DRIVE_FOLLOW_SHORTCUTS
|
||||
from danswer.configs.app_configs import GOOGLE_DRIVE_INCLUDE_SHARED
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
@ -40,9 +41,7 @@ SUPPORTED_DRIVE_DOC_TYPES = [
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
]
|
||||
DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder"
|
||||
ID_KEY = "id"
|
||||
LINK_KEY = "link"
|
||||
TYPE_KEY = "type"
|
||||
DRIVE_SHORTCUT_TYPE = "application/vnd.google-apps.shortcut"
|
||||
|
||||
GoogleDriveFileType = dict[str, Any]
|
||||
|
||||
@ -51,6 +50,7 @@ def _run_drive_file_query(
|
||||
service: discovery.Resource,
|
||||
query: str,
|
||||
include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
|
||||
follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
|
||||
batch_size: int = INDEX_BATCH_SIZE,
|
||||
) -> Generator[GoogleDriveFileType, None, None]:
|
||||
next_page_token = ""
|
||||
@ -61,7 +61,11 @@ def _run_drive_file_query(
|
||||
.list(
|
||||
pageSize=batch_size,
|
||||
supportsAllDrives=include_shared,
|
||||
fields="nextPageToken, files(mimeType, id, name, webViewLink)",
|
||||
includeItemsFromAllDrives=include_shared,
|
||||
fields=(
|
||||
"nextPageToken, files(mimeType, id, name, "
|
||||
"webViewLink, shortcutDetails)"
|
||||
),
|
||||
pageToken=next_page_token,
|
||||
q=query,
|
||||
)
|
||||
@ -70,45 +74,82 @@ def _run_drive_file_query(
|
||||
next_page_token = results.get("nextPageToken")
|
||||
files = results["files"]
|
||||
for file in files:
|
||||
if follow_shortcuts and "shortcutDetails" in file:
|
||||
file = service.files().get(
|
||||
fileId=file["shortcutDetails"]["targetId"],
|
||||
supportsAllDrives=include_shared,
|
||||
fields="mimeType, id, name, webViewLink, shortcutDetails",
|
||||
)
|
||||
file = file.execute()
|
||||
yield file
|
||||
|
||||
|
||||
def _get_folder_id(
|
||||
service: discovery.Resource, parent_id: str, folder_name: str
|
||||
service: discovery.Resource,
|
||||
parent_id: str,
|
||||
folder_name: str,
|
||||
include_shared: bool,
|
||||
follow_shortcuts: bool,
|
||||
) -> str | None:
|
||||
"""
|
||||
Get the ID of a folder given its name and the ID of its parent folder.
|
||||
"""
|
||||
query = (
|
||||
f"'{parent_id}' in parents and name='{folder_name}' and "
|
||||
f"mimeType='{DRIVE_FOLDER_TYPE}'"
|
||||
)
|
||||
query = f"'{parent_id}' in parents and name='{folder_name}' and "
|
||||
if follow_shortcuts:
|
||||
query += f"(mimeType='{DRIVE_FOLDER_TYPE}' or mimeType='{DRIVE_SHORTCUT_TYPE}')"
|
||||
else:
|
||||
query += f"mimeType='{DRIVE_FOLDER_TYPE}'"
|
||||
|
||||
results = (
|
||||
service.files()
|
||||
.list(q=query, spaces="drive", fields="nextPageToken, files(id, name)")
|
||||
.list(
|
||||
q=query,
|
||||
spaces="drive",
|
||||
fields="nextPageToken, files(id, name, shortcutDetails)",
|
||||
supportsAllDrives=include_shared,
|
||||
includeItemsFromAllDrives=include_shared,
|
||||
)
|
||||
.execute()
|
||||
)
|
||||
items = results.get("files", [])
|
||||
return items[0]["id"] if items else None
|
||||
|
||||
folder_id = None
|
||||
if items:
|
||||
if follow_shortcuts and "shortcutDetails" in items[0]:
|
||||
folder_id = items[0]["shortcutDetails"]["targetId"]
|
||||
else:
|
||||
folder_id = items[0]["id"]
|
||||
return folder_id
|
||||
|
||||
|
||||
def _get_folders(
|
||||
service: discovery.Resource,
|
||||
folder_id: str | None = None, # if specified, only fetches files within this folder
|
||||
include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
|
||||
follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
|
||||
batch_size: int = INDEX_BATCH_SIZE,
|
||||
) -> Generator[GoogleDriveFileType, None, None]:
|
||||
query = f"mimeType = '{DRIVE_FOLDER_TYPE}' "
|
||||
if follow_shortcuts:
|
||||
query = "(" + query + f" or mimeType = '{DRIVE_SHORTCUT_TYPE}'" + ") "
|
||||
|
||||
if folder_id:
|
||||
query += f"and '{folder_id}' in parents "
|
||||
query = query.rstrip() # remove the trailing space(s)
|
||||
|
||||
yield from _run_drive_file_query(
|
||||
for file in _run_drive_file_query(
|
||||
service=service,
|
||||
query=query,
|
||||
include_shared=include_shared,
|
||||
follow_shortcuts=follow_shortcuts,
|
||||
batch_size=batch_size,
|
||||
)
|
||||
):
|
||||
# Need to check this since file may have been a target of a shortcut
|
||||
# and not necessarily a folder
|
||||
if file["mimeType"] == DRIVE_FOLDER_TYPE:
|
||||
yield file
|
||||
else:
|
||||
pass
|
||||
|
||||
|
||||
def _get_files(
|
||||
@ -117,6 +158,7 @@ def _get_files(
|
||||
time_range_end: SecondsSinceUnixEpoch | None = None,
|
||||
folder_id: str | None = None, # if specified, only fetches files within this folder
|
||||
include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
|
||||
follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
|
||||
supported_drive_doc_types: list[str] = SUPPORTED_DRIVE_DOC_TYPES,
|
||||
batch_size: int = INDEX_BATCH_SIZE,
|
||||
) -> Generator[GoogleDriveFileType, None, None]:
|
||||
@ -137,6 +179,7 @@ def _get_files(
|
||||
service=service,
|
||||
query=query,
|
||||
include_shared=include_shared,
|
||||
follow_shortcuts=follow_shortcuts,
|
||||
batch_size=batch_size,
|
||||
)
|
||||
for file in files:
|
||||
@ -147,12 +190,15 @@ def _get_files(
|
||||
def get_all_files_batched(
|
||||
service: discovery.Resource,
|
||||
include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
|
||||
follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
|
||||
batch_size: int = INDEX_BATCH_SIZE,
|
||||
time_range_start: SecondsSinceUnixEpoch | None = None,
|
||||
time_range_end: SecondsSinceUnixEpoch | None = None,
|
||||
folder_id: str | None = None, # if specified, only fetches files within this folder
|
||||
# if True, will fetch files in sub-folders of the specified folder ID. Only applies if folder_id is specified.
|
||||
# if True, will fetch files in sub-folders of the specified folder ID.
|
||||
# Only applies if folder_id is specified.
|
||||
traverse_subfolders: bool = True,
|
||||
folder_ids_traversed: list[str] | None = None,
|
||||
) -> Generator[list[GoogleDriveFileType], None, None]:
|
||||
"""Gets all files matching the criteria specified by the args from Google Drive
|
||||
in batches of size `batch_size`.
|
||||
@ -163,6 +209,7 @@ def get_all_files_batched(
|
||||
time_range_end=time_range_end,
|
||||
folder_id=folder_id,
|
||||
include_shared=include_shared,
|
||||
follow_shortcuts=follow_shortcuts,
|
||||
batch_size=batch_size,
|
||||
)
|
||||
yield from batch_generator(
|
||||
@ -174,23 +221,33 @@ def get_all_files_batched(
|
||||
)
|
||||
|
||||
if traverse_subfolders:
|
||||
folder_ids_traversed = folder_ids_traversed or []
|
||||
subfolders = _get_folders(
|
||||
service=service,
|
||||
folder_id=folder_id,
|
||||
include_shared=include_shared,
|
||||
follow_shortcuts=follow_shortcuts,
|
||||
batch_size=batch_size,
|
||||
)
|
||||
for subfolder in subfolders:
|
||||
logger.info("Fetching all files in subfolder: " + subfolder["name"])
|
||||
yield from get_all_files_batched(
|
||||
service=service,
|
||||
include_shared=include_shared,
|
||||
batch_size=batch_size,
|
||||
time_range_start=time_range_start,
|
||||
time_range_end=time_range_end,
|
||||
folder_id=subfolder["id"],
|
||||
traverse_subfolders=traverse_subfolders,
|
||||
)
|
||||
if subfolder["id"] not in folder_ids_traversed:
|
||||
logger.info("Fetching all files in subfolder: " + subfolder["name"])
|
||||
folder_ids_traversed.append(subfolder["id"])
|
||||
yield from get_all_files_batched(
|
||||
service=service,
|
||||
include_shared=include_shared,
|
||||
follow_shortcuts=follow_shortcuts,
|
||||
batch_size=batch_size,
|
||||
time_range_start=time_range_start,
|
||||
time_range_end=time_range_end,
|
||||
folder_id=subfolder["id"],
|
||||
traverse_subfolders=traverse_subfolders,
|
||||
folder_ids_traversed=folder_ids_traversed,
|
||||
)
|
||||
else:
|
||||
logger.debug(
|
||||
"Skipping subfolder since already traversed: " + subfolder["name"]
|
||||
)
|
||||
|
||||
|
||||
def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
|
||||
@ -209,7 +266,6 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
|
||||
.execute()
|
||||
.decode("utf-8")
|
||||
)
|
||||
# Default download to PDF since most types can be exported as a PDF
|
||||
elif (
|
||||
mime_type
|
||||
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
@ -220,7 +276,7 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
|
||||
temp.write(word_stream.getvalue())
|
||||
temp_path = temp.name
|
||||
return docx2txt.process(temp_path)
|
||||
|
||||
# Default download to PDF since most types can be exported as a PDF
|
||||
else:
|
||||
response = service.files().get_media(fileId=file["id"]).execute()
|
||||
pdf_stream = io.BytesIO(response)
|
||||
@ -236,15 +292,20 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
|
||||
folder_paths: list[str] | None = None,
|
||||
batch_size: int = INDEX_BATCH_SIZE,
|
||||
include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED,
|
||||
follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS,
|
||||
) -> None:
|
||||
self.folder_paths = folder_paths or []
|
||||
self.batch_size = batch_size
|
||||
self.include_shared = include_shared
|
||||
self.follow_shortcuts = follow_shortcuts
|
||||
self.creds: Credentials | None = None
|
||||
|
||||
@staticmethod
|
||||
def _process_folder_paths(
|
||||
service: discovery.Resource, folder_paths: list[str]
|
||||
service: discovery.Resource,
|
||||
folder_paths: list[str],
|
||||
include_shared: bool,
|
||||
follow_shortcuts: bool,
|
||||
) -> list[str]:
|
||||
"""['Folder/Sub Folder'] -> ['<FOLDER_ID>']"""
|
||||
folder_ids: list[str] = []
|
||||
@ -253,10 +314,19 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
|
||||
parent_id = "root"
|
||||
for folder_name in folder_names:
|
||||
found_parent_id = _get_folder_id(
|
||||
service=service, parent_id=parent_id, folder_name=folder_name
|
||||
service=service,
|
||||
parent_id=parent_id,
|
||||
folder_name=folder_name,
|
||||
include_shared=include_shared,
|
||||
follow_shortcuts=follow_shortcuts,
|
||||
)
|
||||
if found_parent_id is None:
|
||||
raise ValueError(f"Folder path '{path}' not found in Google Drive")
|
||||
raise ValueError(
|
||||
(
|
||||
f"Folder '{folder_name}' in path '{path}' "
|
||||
"not found in Google Drive"
|
||||
)
|
||||
)
|
||||
parent_id = found_parent_id
|
||||
folder_ids.append(parent_id)
|
||||
|
||||
@ -283,7 +353,7 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
|
||||
|
||||
service = discovery.build("drive", "v3", credentials=self.creds)
|
||||
folder_ids: Sequence[str | None] = self._process_folder_paths(
|
||||
service, self.folder_paths
|
||||
service, self.folder_paths, self.include_shared, self.follow_shortcuts
|
||||
)
|
||||
if not folder_ids:
|
||||
folder_ids = [None]
|
||||
@ -293,6 +363,7 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
|
||||
get_all_files_batched(
|
||||
service=service,
|
||||
include_shared=self.include_shared,
|
||||
follow_shortcuts=self.follow_shortcuts,
|
||||
batch_size=self.batch_size,
|
||||
time_range_start=start,
|
||||
time_range_end=end,
|
||||
@ -326,9 +397,10 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
|
||||
def poll_source(
|
||||
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
|
||||
) -> GenerateDocumentsOutput:
|
||||
# need to subtract 10 minutes from start time to account for modifiedTime propogation
|
||||
# if a document is modified, it takes some time for the API to reflect these changes
|
||||
# if we do not have an offset, then we may "miss" the update when polling
|
||||
# need to subtract 10 minutes from start time to account for modifiedTime
|
||||
# propogation if a document is modified, it takes some time for the API to
|
||||
# reflect these changes if we do not have an offset, then we may "miss" the
|
||||
# update when polling
|
||||
yield from self._fetch_docs_from_drive(
|
||||
max(start - DRIVE_START_TIME_OFFSET, 0, 0), end
|
||||
)
|
||||
|
@ -34,6 +34,7 @@ def wipe_all_rows(database: str) -> None:
|
||||
cur.execute(f"DELETE FROM document")
|
||||
cur.execute(f"DELETE FROM connector_credential_pair")
|
||||
cur.execute(f"DELETE FROM index_attempt")
|
||||
cur.execute(f"DELETE FROM credential")
|
||||
conn.commit()
|
||||
|
||||
for table_name in table_names:
|
||||
|
@ -47,6 +47,10 @@ export const ConnectorEditPopup = ({ existingConnector, onSubmit }: Props) => {
|
||||
name="include_shared"
|
||||
label="Include Shared Files"
|
||||
/>
|
||||
<BooleanFormField
|
||||
name="follow_shortcuts"
|
||||
label="Follow Shortcuts"
|
||||
/>
|
||||
</div>
|
||||
)}
|
||||
validationSchema={Yup.object().shape({
|
||||
@ -58,6 +62,7 @@ export const ConnectorEditPopup = ({ existingConnector, onSubmit }: Props) => {
|
||||
)
|
||||
.required(),
|
||||
include_shared: Yup.boolean().required(),
|
||||
follow_shortcuts: Yup.boolean().required(),
|
||||
})}
|
||||
onSubmit={onSubmit}
|
||||
/>
|
||||
|
@ -90,6 +90,10 @@ export const GoogleDriveConnectorsTable = ({
|
||||
header: "Include Shared",
|
||||
key: "include_shared",
|
||||
},
|
||||
{
|
||||
header: "Follow Shortcuts",
|
||||
key: "follow_shortcuts",
|
||||
},
|
||||
{
|
||||
header: "Status",
|
||||
key: "status",
|
||||
@ -132,6 +136,16 @@ export const GoogleDriveConnectorsTable = ({
|
||||
)}
|
||||
</div>
|
||||
),
|
||||
follow_shortcuts: (
|
||||
<div>
|
||||
{connectorIndexingStatus.connector.connector_specific_config
|
||||
.follow_shortcuts ? (
|
||||
<i>Yes</i>
|
||||
) : (
|
||||
<i>No</i>
|
||||
)}
|
||||
</div>
|
||||
),
|
||||
status: (
|
||||
<StatusRow
|
||||
connectorIndexingStatus={connectorIndexingStatus}
|
||||
|
@ -281,6 +281,14 @@ const GoogleDriveConnectorManagement = ({
|
||||
"that match both criteria."
|
||||
}
|
||||
/>
|
||||
<BooleanFormField
|
||||
name="follow_shortcuts"
|
||||
label="Follow Shortcuts"
|
||||
subtext={
|
||||
"If checked, then will follow shortcuts to files and folder and " +
|
||||
"attempt to index those as well."
|
||||
}
|
||||
/>
|
||||
</>
|
||||
)}
|
||||
validationSchema={Yup.object().shape({
|
||||
@ -292,10 +300,12 @@ const GoogleDriveConnectorManagement = ({
|
||||
)
|
||||
.required(),
|
||||
include_shared: Yup.boolean().required(),
|
||||
follow_shortcuts: Yup.boolean().required(),
|
||||
})}
|
||||
initialValues={{
|
||||
folder_paths: [],
|
||||
include_shared: false,
|
||||
follow_shortcuts: false,
|
||||
}}
|
||||
refreshFreq={10 * 60} // 10 minutes
|
||||
onSubmit={async (isSuccess, responseJson) => {
|
||||
|
@ -57,6 +57,7 @@ export interface GithubConfig {
|
||||
export interface GoogleDriveConfig {
|
||||
folder_paths?: string[];
|
||||
include_shared?: boolean;
|
||||
follow_shortcuts?: boolean;
|
||||
}
|
||||
|
||||
export interface BookstackConfig {}
|
||||
|
Loading…
x
Reference in New Issue
Block a user