mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-07-17 00:23:09 +02:00
fixed shared folder issue (#4371)
* fixed shared folder issue * fix existing tests * default allow files shared with me for service account
This commit is contained in:
@ -28,7 +28,9 @@ from onyx.connectors.google_drive.doc_conversion import (
|
|||||||
)
|
)
|
||||||
from onyx.connectors.google_drive.file_retrieval import crawl_folders_for_files
|
from onyx.connectors.google_drive.file_retrieval import crawl_folders_for_files
|
||||||
from onyx.connectors.google_drive.file_retrieval import get_all_files_for_oauth
|
from onyx.connectors.google_drive.file_retrieval import get_all_files_for_oauth
|
||||||
from onyx.connectors.google_drive.file_retrieval import get_all_files_in_my_drive
|
from onyx.connectors.google_drive.file_retrieval import (
|
||||||
|
get_all_files_in_my_drive_and_shared,
|
||||||
|
)
|
||||||
from onyx.connectors.google_drive.file_retrieval import get_files_in_shared_drive
|
from onyx.connectors.google_drive.file_retrieval import get_files_in_shared_drive
|
||||||
from onyx.connectors.google_drive.file_retrieval import get_root_folder_id
|
from onyx.connectors.google_drive.file_retrieval import get_root_folder_id
|
||||||
from onyx.connectors.google_drive.models import DriveRetrievalStage
|
from onyx.connectors.google_drive.models import DriveRetrievalStage
|
||||||
@ -86,13 +88,18 @@ def _extract_ids_from_urls(urls: list[str]) -> list[str]:
|
|||||||
|
|
||||||
def _convert_single_file(
|
def _convert_single_file(
|
||||||
creds: Any,
|
creds: Any,
|
||||||
primary_admin_email: str,
|
|
||||||
allow_images: bool,
|
allow_images: bool,
|
||||||
size_threshold: int,
|
size_threshold: int,
|
||||||
|
retriever_email: str,
|
||||||
file: dict[str, Any],
|
file: dict[str, Any],
|
||||||
) -> Document | ConnectorFailure | None:
|
) -> Document | ConnectorFailure | None:
|
||||||
user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email
|
# We used to always get the user email from the file owners when available,
|
||||||
|
# but this was causing issues with shared folders where the owner was not included in the service account
|
||||||
|
# now we use the email of the account that successfully listed the file. Leaving this in case we end up
|
||||||
|
# wanting to retry with file owners and/or admin email at some point.
|
||||||
|
# user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email
|
||||||
|
|
||||||
|
user_email = retriever_email
|
||||||
# Only construct these services when needed
|
# Only construct these services when needed
|
||||||
user_drive_service = lazy_eval(
|
user_drive_service = lazy_eval(
|
||||||
lambda: get_drive_service(creds, user_email=user_email)
|
lambda: get_drive_service(creds, user_email=user_email)
|
||||||
@ -450,10 +457,11 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
|
|||||||
logger.info(f"Getting all files in my drive as '{user_email}'")
|
logger.info(f"Getting all files in my drive as '{user_email}'")
|
||||||
|
|
||||||
yield from add_retrieval_info(
|
yield from add_retrieval_info(
|
||||||
get_all_files_in_my_drive(
|
get_all_files_in_my_drive_and_shared(
|
||||||
service=drive_service,
|
service=drive_service,
|
||||||
update_traversed_ids_func=self._update_traversed_parent_ids,
|
update_traversed_ids_func=self._update_traversed_parent_ids,
|
||||||
is_slim=is_slim,
|
is_slim=is_slim,
|
||||||
|
include_shared_with_me=self.include_files_shared_with_me,
|
||||||
start=curr_stage.completed_until if resuming else start,
|
start=curr_stage.completed_until if resuming else start,
|
||||||
end=end,
|
end=end,
|
||||||
),
|
),
|
||||||
@ -916,20 +924,28 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
|
|||||||
convert_func = partial(
|
convert_func = partial(
|
||||||
_convert_single_file,
|
_convert_single_file,
|
||||||
self.creds,
|
self.creds,
|
||||||
self.primary_admin_email,
|
|
||||||
self.allow_images,
|
self.allow_images,
|
||||||
self.size_threshold,
|
self.size_threshold,
|
||||||
)
|
)
|
||||||
# Fetch files in batches
|
# Fetch files in batches
|
||||||
batches_complete = 0
|
batches_complete = 0
|
||||||
files_batch: list[GoogleDriveFileType] = []
|
files_batch: list[RetrievedDriveFile] = []
|
||||||
|
|
||||||
def _yield_batch(
|
def _yield_batch(
|
||||||
files_batch: list[GoogleDriveFileType],
|
files_batch: list[RetrievedDriveFile],
|
||||||
) -> Iterator[Document | ConnectorFailure]:
|
) -> Iterator[Document | ConnectorFailure]:
|
||||||
nonlocal batches_complete
|
nonlocal batches_complete
|
||||||
# Process the batch using run_functions_tuples_in_parallel
|
# Process the batch using run_functions_tuples_in_parallel
|
||||||
func_with_args = [(convert_func, (file,)) for file in files_batch]
|
func_with_args = [
|
||||||
|
(
|
||||||
|
convert_func,
|
||||||
|
(
|
||||||
|
file.user_email,
|
||||||
|
file.drive_file,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
for file in files_batch
|
||||||
|
]
|
||||||
results = cast(
|
results = cast(
|
||||||
list[Document | ConnectorFailure | None],
|
list[Document | ConnectorFailure | None],
|
||||||
run_functions_tuples_in_parallel(func_with_args, max_workers=8),
|
run_functions_tuples_in_parallel(func_with_args, max_workers=8),
|
||||||
@ -967,7 +983,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
|
|||||||
)
|
)
|
||||||
|
|
||||||
continue
|
continue
|
||||||
files_batch.append(retrieved_file.drive_file)
|
files_batch.append(retrieved_file)
|
||||||
|
|
||||||
if len(files_batch) < self.batch_size:
|
if len(files_batch) < self.batch_size:
|
||||||
continue
|
continue
|
||||||
|
@ -87,7 +87,6 @@ def _download_and_extract_sections_basic(
|
|||||||
mime_type = file["mimeType"]
|
mime_type = file["mimeType"]
|
||||||
link = file.get("webViewLink", "")
|
link = file.get("webViewLink", "")
|
||||||
|
|
||||||
try:
|
|
||||||
# skip images if not explicitly enabled
|
# skip images if not explicitly enabled
|
||||||
if not allow_images and is_gdrive_image_mime_type(mime_type):
|
if not allow_images and is_gdrive_image_mime_type(mime_type):
|
||||||
return []
|
return []
|
||||||
@ -140,8 +139,7 @@ def _download_and_extract_sections_basic(
|
|||||||
return [TextSection(link=link, text=text)]
|
return [TextSection(link=link, text=text)]
|
||||||
|
|
||||||
elif (
|
elif (
|
||||||
mime_type
|
mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||||
== "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
|
||||||
):
|
):
|
||||||
text = xlsx_to_text(io.BytesIO(response))
|
text = xlsx_to_text(io.BytesIO(response))
|
||||||
return [TextSection(link=link, text=text)]
|
return [TextSection(link=link, text=text)]
|
||||||
@ -203,10 +201,6 @@ def _download_and_extract_sections_basic(
|
|||||||
logger.warning(f"Failed to extract text from {file_name}: {e}")
|
logger.warning(f"Failed to extract text from {file_name}: {e}")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error processing file {file_name}: {e}")
|
|
||||||
return []
|
|
||||||
|
|
||||||
|
|
||||||
def convert_drive_item_to_document(
|
def convert_drive_item_to_document(
|
||||||
file: GoogleDriveFileType,
|
file: GoogleDriveFileType,
|
||||||
|
@ -214,10 +214,11 @@ def get_files_in_shared_drive(
|
|||||||
yield file
|
yield file
|
||||||
|
|
||||||
|
|
||||||
def get_all_files_in_my_drive(
|
def get_all_files_in_my_drive_and_shared(
|
||||||
service: GoogleDriveService,
|
service: GoogleDriveService,
|
||||||
update_traversed_ids_func: Callable,
|
update_traversed_ids_func: Callable,
|
||||||
is_slim: bool,
|
is_slim: bool,
|
||||||
|
include_shared_with_me: bool,
|
||||||
start: SecondsSinceUnixEpoch | None = None,
|
start: SecondsSinceUnixEpoch | None = None,
|
||||||
end: SecondsSinceUnixEpoch | None = None,
|
end: SecondsSinceUnixEpoch | None = None,
|
||||||
) -> Iterator[GoogleDriveFileType]:
|
) -> Iterator[GoogleDriveFileType]:
|
||||||
@ -229,6 +230,7 @@ def get_all_files_in_my_drive(
|
|||||||
# Get all folders being queried and add them to the traversed set
|
# Get all folders being queried and add them to the traversed set
|
||||||
folder_query = f"mimeType = '{DRIVE_FOLDER_TYPE}'"
|
folder_query = f"mimeType = '{DRIVE_FOLDER_TYPE}'"
|
||||||
folder_query += " and trashed = false"
|
folder_query += " and trashed = false"
|
||||||
|
if not include_shared_with_me:
|
||||||
folder_query += " and 'me' in owners"
|
folder_query += " and 'me' in owners"
|
||||||
found_folders = False
|
found_folders = False
|
||||||
for file in execute_paginated_retrieval(
|
for file in execute_paginated_retrieval(
|
||||||
@ -246,6 +248,7 @@ def get_all_files_in_my_drive(
|
|||||||
# Then get the files
|
# Then get the files
|
||||||
file_query = f"mimeType != '{DRIVE_FOLDER_TYPE}'"
|
file_query = f"mimeType != '{DRIVE_FOLDER_TYPE}'"
|
||||||
file_query += " and trashed = false"
|
file_query += " and trashed = false"
|
||||||
|
if not include_shared_with_me:
|
||||||
file_query += " and 'me' in owners"
|
file_query += " and 'me' in owners"
|
||||||
file_query += _generate_time_range_filter(start, end)
|
file_query += _generate_time_range_filter(start, end)
|
||||||
yield from execute_paginated_retrieval(
|
yield from execute_paginated_retrieval(
|
||||||
|
@ -58,6 +58,16 @@ SECTIONS_FOLDER_URL = (
|
|||||||
"https://drive.google.com/drive/u/5/folders/1loe6XJ-pJxu9YYPv7cF3Hmz296VNzA33"
|
"https://drive.google.com/drive/u/5/folders/1loe6XJ-pJxu9YYPv7cF3Hmz296VNzA33"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
EXTERNAL_SHARED_FOLDER_URL = (
|
||||||
|
"https://drive.google.com/drive/folders/1sWC7Oi0aQGgifLiMnhTjvkhRWVeDa-XS"
|
||||||
|
)
|
||||||
|
EXTERNAL_SHARED_DOCS_IN_FOLDER = [
|
||||||
|
"https://docs.google.com/document/d/1Sywmv1-H6ENk2GcgieKou3kQHR_0te1mhIUcq8XlcdY"
|
||||||
|
]
|
||||||
|
EXTERNAL_SHARED_DOC_SINGLETON = (
|
||||||
|
"https://docs.google.com/document/d/11kmisDfdvNcw5LYZbkdPVjTOdj-Uc5ma6Jep68xzeeA"
|
||||||
|
)
|
||||||
|
|
||||||
SHARED_DRIVE_3_URL = "https://drive.google.com/drive/folders/0AJYm2K_I_vtNUk9PVA"
|
SHARED_DRIVE_3_URL = "https://drive.google.com/drive/folders/0AJYm2K_I_vtNUk9PVA"
|
||||||
|
|
||||||
ADMIN_EMAIL = "admin@onyx-test.com"
|
ADMIN_EMAIL = "admin@onyx-test.com"
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
from collections.abc import Callable
|
from collections.abc import Callable
|
||||||
from unittest.mock import MagicMock
|
from unittest.mock import MagicMock
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
from onyx.connectors.google_drive.connector import GoogleDriveConnector
|
from onyx.connectors.google_drive.connector import GoogleDriveConnector
|
||||||
from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_EMAIL
|
from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_EMAIL
|
||||||
@ -9,6 +10,15 @@ from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_FOLDER_3_
|
|||||||
from tests.daily.connectors.google_drive.consts_and_utils import (
|
from tests.daily.connectors.google_drive.consts_and_utils import (
|
||||||
assert_expected_docs_in_retrieved_docs,
|
assert_expected_docs_in_retrieved_docs,
|
||||||
)
|
)
|
||||||
|
from tests.daily.connectors.google_drive.consts_and_utils import (
|
||||||
|
EXTERNAL_SHARED_DOC_SINGLETON,
|
||||||
|
)
|
||||||
|
from tests.daily.connectors.google_drive.consts_and_utils import (
|
||||||
|
EXTERNAL_SHARED_DOCS_IN_FOLDER,
|
||||||
|
)
|
||||||
|
from tests.daily.connectors.google_drive.consts_and_utils import (
|
||||||
|
EXTERNAL_SHARED_FOLDER_URL,
|
||||||
|
)
|
||||||
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_FILE_IDS
|
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_FILE_IDS
|
||||||
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_URL
|
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_URL
|
||||||
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_2_FILE_IDS
|
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_2_FILE_IDS
|
||||||
@ -100,7 +110,8 @@ def test_include_shared_drives_only_with_size_threshold(
|
|||||||
|
|
||||||
retrieved_docs = load_all_docs(connector)
|
retrieved_docs = load_all_docs(connector)
|
||||||
|
|
||||||
assert len(retrieved_docs) == 50
|
# 2 extra files from shared drive owned by non-admin and not shared with admin
|
||||||
|
assert len(retrieved_docs) == 52
|
||||||
|
|
||||||
|
|
||||||
@patch(
|
@patch(
|
||||||
@ -137,7 +148,8 @@ def test_include_shared_drives_only(
|
|||||||
+ SECTIONS_FILE_IDS
|
+ SECTIONS_FILE_IDS
|
||||||
)
|
)
|
||||||
|
|
||||||
assert len(retrieved_docs) == 51
|
# 2 extra files from shared drive owned by non-admin and not shared with admin
|
||||||
|
assert len(retrieved_docs) == 53
|
||||||
|
|
||||||
assert_expected_docs_in_retrieved_docs(
|
assert_expected_docs_in_retrieved_docs(
|
||||||
retrieved_docs=retrieved_docs,
|
retrieved_docs=retrieved_docs,
|
||||||
@ -294,6 +306,64 @@ def test_folders_only(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_shared_folder_owned_by_external_user(
|
||||||
|
google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector],
|
||||||
|
) -> None:
|
||||||
|
print("\n\nRunning test_shared_folder_owned_by_external_user")
|
||||||
|
connector = google_drive_service_acct_connector_factory(
|
||||||
|
primary_admin_email=ADMIN_EMAIL,
|
||||||
|
include_shared_drives=False,
|
||||||
|
include_my_drives=False,
|
||||||
|
include_files_shared_with_me=False,
|
||||||
|
shared_drive_urls=None,
|
||||||
|
shared_folder_urls=EXTERNAL_SHARED_FOLDER_URL,
|
||||||
|
my_drive_emails=None,
|
||||||
|
)
|
||||||
|
retrieved_docs = load_all_docs(connector)
|
||||||
|
|
||||||
|
expected_docs = EXTERNAL_SHARED_DOCS_IN_FOLDER
|
||||||
|
|
||||||
|
assert len(retrieved_docs) == len(expected_docs) # 1 for now
|
||||||
|
assert expected_docs[0] in retrieved_docs[0].id
|
||||||
|
|
||||||
|
|
||||||
|
def test_shared_with_me(
|
||||||
|
google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector],
|
||||||
|
) -> None:
|
||||||
|
print("\n\nRunning test_shared_with_me")
|
||||||
|
connector = google_drive_service_acct_connector_factory(
|
||||||
|
primary_admin_email=ADMIN_EMAIL,
|
||||||
|
include_shared_drives=False,
|
||||||
|
include_my_drives=True,
|
||||||
|
include_files_shared_with_me=True,
|
||||||
|
shared_drive_urls=None,
|
||||||
|
shared_folder_urls=None,
|
||||||
|
my_drive_emails=None,
|
||||||
|
)
|
||||||
|
retrieved_docs = load_all_docs(connector)
|
||||||
|
|
||||||
|
print(retrieved_docs)
|
||||||
|
|
||||||
|
expected_file_ids = (
|
||||||
|
ADMIN_FILE_IDS
|
||||||
|
+ ADMIN_FOLDER_3_FILE_IDS
|
||||||
|
+ TEST_USER_1_FILE_IDS
|
||||||
|
+ TEST_USER_2_FILE_IDS
|
||||||
|
+ TEST_USER_3_FILE_IDS
|
||||||
|
)
|
||||||
|
assert_expected_docs_in_retrieved_docs(
|
||||||
|
retrieved_docs=retrieved_docs,
|
||||||
|
expected_file_ids=expected_file_ids,
|
||||||
|
)
|
||||||
|
|
||||||
|
retrieved_ids = {urlparse(doc.id).path.split("/")[-2] for doc in retrieved_docs}
|
||||||
|
for id in retrieved_ids:
|
||||||
|
print(id)
|
||||||
|
|
||||||
|
assert EXTERNAL_SHARED_DOC_SINGLETON.split("/")[-1] in retrieved_ids
|
||||||
|
assert EXTERNAL_SHARED_DOCS_IN_FOLDER[0].split("/")[-1] in retrieved_ids
|
||||||
|
|
||||||
|
|
||||||
@patch(
|
@patch(
|
||||||
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
|
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
|
||||||
return_value=None,
|
return_value=None,
|
||||||
|
@ -281,7 +281,7 @@ export default function AddConnector({
|
|||||||
return (
|
return (
|
||||||
<Formik
|
<Formik
|
||||||
initialValues={{
|
initialValues={{
|
||||||
...createConnectorInitialValues(connector),
|
...createConnectorInitialValues(connector, currentCredential),
|
||||||
...Object.fromEntries(
|
...Object.fromEntries(
|
||||||
connectorConfigs[connector].advanced_values.map((field) => [
|
connectorConfigs[connector].advanced_values.map((field) => [
|
||||||
field.name,
|
field.name,
|
||||||
|
@ -1292,7 +1292,8 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
|
|||||||
},
|
},
|
||||||
};
|
};
|
||||||
export function createConnectorInitialValues(
|
export function createConnectorInitialValues(
|
||||||
connector: ConfigurableSources
|
connector: ConfigurableSources,
|
||||||
|
currentCredential: Credential<any> | null = null
|
||||||
): Record<string, any> & AccessTypeGroupSelectorFormType {
|
): Record<string, any> & AccessTypeGroupSelectorFormType {
|
||||||
const configuration = connectorConfigs[connector];
|
const configuration = connectorConfigs[connector];
|
||||||
|
|
||||||
@ -1307,7 +1308,16 @@ export function createConnectorInitialValues(
|
|||||||
} else if (field.type === "list") {
|
} else if (field.type === "list") {
|
||||||
acc[field.name] = field.default || [];
|
acc[field.name] = field.default || [];
|
||||||
} else if (field.type === "checkbox") {
|
} else if (field.type === "checkbox") {
|
||||||
|
// Special case for include_files_shared_with_me when using service account
|
||||||
|
if (
|
||||||
|
field.name === "include_files_shared_with_me" &&
|
||||||
|
currentCredential &&
|
||||||
|
!currentCredential.credential_json?.google_tokens
|
||||||
|
) {
|
||||||
|
acc[field.name] = true;
|
||||||
|
} else {
|
||||||
acc[field.name] = field.default || false;
|
acc[field.name] = field.default || false;
|
||||||
|
}
|
||||||
} else if (field.default !== undefined) {
|
} else if (field.default !== undefined) {
|
||||||
acc[field.name] = field.default;
|
acc[field.name] = field.default;
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user