fixed shared folder issue (#4371)

* fixed shared folder issue

* fix existing tests

* default allow files shared with me for service account
This commit is contained in:
evan-danswer
2025-03-27 16:39:52 -07:00
committed by GitHub
parent c554889baf
commit a123661c92
7 changed files with 219 additions and 116 deletions

View File

@ -28,7 +28,9 @@ from onyx.connectors.google_drive.doc_conversion import (
) )
from onyx.connectors.google_drive.file_retrieval import crawl_folders_for_files from onyx.connectors.google_drive.file_retrieval import crawl_folders_for_files
from onyx.connectors.google_drive.file_retrieval import get_all_files_for_oauth from onyx.connectors.google_drive.file_retrieval import get_all_files_for_oauth
from onyx.connectors.google_drive.file_retrieval import get_all_files_in_my_drive from onyx.connectors.google_drive.file_retrieval import (
get_all_files_in_my_drive_and_shared,
)
from onyx.connectors.google_drive.file_retrieval import get_files_in_shared_drive from onyx.connectors.google_drive.file_retrieval import get_files_in_shared_drive
from onyx.connectors.google_drive.file_retrieval import get_root_folder_id from onyx.connectors.google_drive.file_retrieval import get_root_folder_id
from onyx.connectors.google_drive.models import DriveRetrievalStage from onyx.connectors.google_drive.models import DriveRetrievalStage
@ -86,13 +88,18 @@ def _extract_ids_from_urls(urls: list[str]) -> list[str]:
def _convert_single_file( def _convert_single_file(
creds: Any, creds: Any,
primary_admin_email: str,
allow_images: bool, allow_images: bool,
size_threshold: int, size_threshold: int,
retriever_email: str,
file: dict[str, Any], file: dict[str, Any],
) -> Document | ConnectorFailure | None: ) -> Document | ConnectorFailure | None:
user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email # We used to always get the user email from the file owners when available,
# but this was causing issues with shared folders where the owner was not included in the service account
# now we use the email of the account that successfully listed the file. Leaving this in case we end up
# wanting to retry with file owners and/or admin email at some point.
# user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email
user_email = retriever_email
# Only construct these services when needed # Only construct these services when needed
user_drive_service = lazy_eval( user_drive_service = lazy_eval(
lambda: get_drive_service(creds, user_email=user_email) lambda: get_drive_service(creds, user_email=user_email)
@ -450,10 +457,11 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
logger.info(f"Getting all files in my drive as '{user_email}'") logger.info(f"Getting all files in my drive as '{user_email}'")
yield from add_retrieval_info( yield from add_retrieval_info(
get_all_files_in_my_drive( get_all_files_in_my_drive_and_shared(
service=drive_service, service=drive_service,
update_traversed_ids_func=self._update_traversed_parent_ids, update_traversed_ids_func=self._update_traversed_parent_ids,
is_slim=is_slim, is_slim=is_slim,
include_shared_with_me=self.include_files_shared_with_me,
start=curr_stage.completed_until if resuming else start, start=curr_stage.completed_until if resuming else start,
end=end, end=end,
), ),
@ -916,20 +924,28 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
convert_func = partial( convert_func = partial(
_convert_single_file, _convert_single_file,
self.creds, self.creds,
self.primary_admin_email,
self.allow_images, self.allow_images,
self.size_threshold, self.size_threshold,
) )
# Fetch files in batches # Fetch files in batches
batches_complete = 0 batches_complete = 0
files_batch: list[GoogleDriveFileType] = [] files_batch: list[RetrievedDriveFile] = []
def _yield_batch( def _yield_batch(
files_batch: list[GoogleDriveFileType], files_batch: list[RetrievedDriveFile],
) -> Iterator[Document | ConnectorFailure]: ) -> Iterator[Document | ConnectorFailure]:
nonlocal batches_complete nonlocal batches_complete
# Process the batch using run_functions_tuples_in_parallel # Process the batch using run_functions_tuples_in_parallel
func_with_args = [(convert_func, (file,)) for file in files_batch] func_with_args = [
(
convert_func,
(
file.user_email,
file.drive_file,
),
)
for file in files_batch
]
results = cast( results = cast(
list[Document | ConnectorFailure | None], list[Document | ConnectorFailure | None],
run_functions_tuples_in_parallel(func_with_args, max_workers=8), run_functions_tuples_in_parallel(func_with_args, max_workers=8),
@ -967,7 +983,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
) )
continue continue
files_batch.append(retrieved_file.drive_file) files_batch.append(retrieved_file)
if len(files_batch) < self.batch_size: if len(files_batch) < self.batch_size:
continue continue

View File

@ -87,7 +87,6 @@ def _download_and_extract_sections_basic(
mime_type = file["mimeType"] mime_type = file["mimeType"]
link = file.get("webViewLink", "") link = file.get("webViewLink", "")
try:
# skip images if not explicitly enabled # skip images if not explicitly enabled
if not allow_images and is_gdrive_image_mime_type(mime_type): if not allow_images and is_gdrive_image_mime_type(mime_type):
return [] return []
@ -140,8 +139,7 @@ def _download_and_extract_sections_basic(
return [TextSection(link=link, text=text)] return [TextSection(link=link, text=text)]
elif ( elif (
mime_type mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
== "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
): ):
text = xlsx_to_text(io.BytesIO(response)) text = xlsx_to_text(io.BytesIO(response))
return [TextSection(link=link, text=text)] return [TextSection(link=link, text=text)]
@ -203,10 +201,6 @@ def _download_and_extract_sections_basic(
logger.warning(f"Failed to extract text from {file_name}: {e}") logger.warning(f"Failed to extract text from {file_name}: {e}")
return [] return []
except Exception as e:
logger.error(f"Error processing file {file_name}: {e}")
return []
def convert_drive_item_to_document( def convert_drive_item_to_document(
file: GoogleDriveFileType, file: GoogleDriveFileType,

View File

@ -214,10 +214,11 @@ def get_files_in_shared_drive(
yield file yield file
def get_all_files_in_my_drive( def get_all_files_in_my_drive_and_shared(
service: GoogleDriveService, service: GoogleDriveService,
update_traversed_ids_func: Callable, update_traversed_ids_func: Callable,
is_slim: bool, is_slim: bool,
include_shared_with_me: bool,
start: SecondsSinceUnixEpoch | None = None, start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None, end: SecondsSinceUnixEpoch | None = None,
) -> Iterator[GoogleDriveFileType]: ) -> Iterator[GoogleDriveFileType]:
@ -229,6 +230,7 @@ def get_all_files_in_my_drive(
# Get all folders being queried and add them to the traversed set # Get all folders being queried and add them to the traversed set
folder_query = f"mimeType = '{DRIVE_FOLDER_TYPE}'" folder_query = f"mimeType = '{DRIVE_FOLDER_TYPE}'"
folder_query += " and trashed = false" folder_query += " and trashed = false"
if not include_shared_with_me:
folder_query += " and 'me' in owners" folder_query += " and 'me' in owners"
found_folders = False found_folders = False
for file in execute_paginated_retrieval( for file in execute_paginated_retrieval(
@ -246,6 +248,7 @@ def get_all_files_in_my_drive(
# Then get the files # Then get the files
file_query = f"mimeType != '{DRIVE_FOLDER_TYPE}'" file_query = f"mimeType != '{DRIVE_FOLDER_TYPE}'"
file_query += " and trashed = false" file_query += " and trashed = false"
if not include_shared_with_me:
file_query += " and 'me' in owners" file_query += " and 'me' in owners"
file_query += _generate_time_range_filter(start, end) file_query += _generate_time_range_filter(start, end)
yield from execute_paginated_retrieval( yield from execute_paginated_retrieval(

View File

@ -58,6 +58,16 @@ SECTIONS_FOLDER_URL = (
"https://drive.google.com/drive/u/5/folders/1loe6XJ-pJxu9YYPv7cF3Hmz296VNzA33" "https://drive.google.com/drive/u/5/folders/1loe6XJ-pJxu9YYPv7cF3Hmz296VNzA33"
) )
EXTERNAL_SHARED_FOLDER_URL = (
"https://drive.google.com/drive/folders/1sWC7Oi0aQGgifLiMnhTjvkhRWVeDa-XS"
)
EXTERNAL_SHARED_DOCS_IN_FOLDER = [
"https://docs.google.com/document/d/1Sywmv1-H6ENk2GcgieKou3kQHR_0te1mhIUcq8XlcdY"
]
EXTERNAL_SHARED_DOC_SINGLETON = (
"https://docs.google.com/document/d/11kmisDfdvNcw5LYZbkdPVjTOdj-Uc5ma6Jep68xzeeA"
)
SHARED_DRIVE_3_URL = "https://drive.google.com/drive/folders/0AJYm2K_I_vtNUk9PVA" SHARED_DRIVE_3_URL = "https://drive.google.com/drive/folders/0AJYm2K_I_vtNUk9PVA"
ADMIN_EMAIL = "admin@onyx-test.com" ADMIN_EMAIL = "admin@onyx-test.com"

View File

@ -1,6 +1,7 @@
from collections.abc import Callable from collections.abc import Callable
from unittest.mock import MagicMock from unittest.mock import MagicMock
from unittest.mock import patch from unittest.mock import patch
from urllib.parse import urlparse
from onyx.connectors.google_drive.connector import GoogleDriveConnector from onyx.connectors.google_drive.connector import GoogleDriveConnector
from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_EMAIL from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_EMAIL
@ -9,6 +10,15 @@ from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_FOLDER_3_
from tests.daily.connectors.google_drive.consts_and_utils import ( from tests.daily.connectors.google_drive.consts_and_utils import (
assert_expected_docs_in_retrieved_docs, assert_expected_docs_in_retrieved_docs,
) )
from tests.daily.connectors.google_drive.consts_and_utils import (
EXTERNAL_SHARED_DOC_SINGLETON,
)
from tests.daily.connectors.google_drive.consts_and_utils import (
EXTERNAL_SHARED_DOCS_IN_FOLDER,
)
from tests.daily.connectors.google_drive.consts_and_utils import (
EXTERNAL_SHARED_FOLDER_URL,
)
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_FILE_IDS from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_FILE_IDS
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_URL from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_URL
from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_2_FILE_IDS from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_2_FILE_IDS
@ -100,7 +110,8 @@ def test_include_shared_drives_only_with_size_threshold(
retrieved_docs = load_all_docs(connector) retrieved_docs = load_all_docs(connector)
assert len(retrieved_docs) == 50 # 2 extra files from shared drive owned by non-admin and not shared with admin
assert len(retrieved_docs) == 52
@patch( @patch(
@ -137,7 +148,8 @@ def test_include_shared_drives_only(
+ SECTIONS_FILE_IDS + SECTIONS_FILE_IDS
) )
assert len(retrieved_docs) == 51 # 2 extra files from shared drive owned by non-admin and not shared with admin
assert len(retrieved_docs) == 53
assert_expected_docs_in_retrieved_docs( assert_expected_docs_in_retrieved_docs(
retrieved_docs=retrieved_docs, retrieved_docs=retrieved_docs,
@ -294,6 +306,64 @@ def test_folders_only(
) )
def test_shared_folder_owned_by_external_user(
google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector],
) -> None:
print("\n\nRunning test_shared_folder_owned_by_external_user")
connector = google_drive_service_acct_connector_factory(
primary_admin_email=ADMIN_EMAIL,
include_shared_drives=False,
include_my_drives=False,
include_files_shared_with_me=False,
shared_drive_urls=None,
shared_folder_urls=EXTERNAL_SHARED_FOLDER_URL,
my_drive_emails=None,
)
retrieved_docs = load_all_docs(connector)
expected_docs = EXTERNAL_SHARED_DOCS_IN_FOLDER
assert len(retrieved_docs) == len(expected_docs) # 1 for now
assert expected_docs[0] in retrieved_docs[0].id
def test_shared_with_me(
google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector],
) -> None:
print("\n\nRunning test_shared_with_me")
connector = google_drive_service_acct_connector_factory(
primary_admin_email=ADMIN_EMAIL,
include_shared_drives=False,
include_my_drives=True,
include_files_shared_with_me=True,
shared_drive_urls=None,
shared_folder_urls=None,
my_drive_emails=None,
)
retrieved_docs = load_all_docs(connector)
print(retrieved_docs)
expected_file_ids = (
ADMIN_FILE_IDS
+ ADMIN_FOLDER_3_FILE_IDS
+ TEST_USER_1_FILE_IDS
+ TEST_USER_2_FILE_IDS
+ TEST_USER_3_FILE_IDS
)
assert_expected_docs_in_retrieved_docs(
retrieved_docs=retrieved_docs,
expected_file_ids=expected_file_ids,
)
retrieved_ids = {urlparse(doc.id).path.split("/")[-2] for doc in retrieved_docs}
for id in retrieved_ids:
print(id)
assert EXTERNAL_SHARED_DOC_SINGLETON.split("/")[-1] in retrieved_ids
assert EXTERNAL_SHARED_DOCS_IN_FOLDER[0].split("/")[-1] in retrieved_ids
@patch( @patch(
"onyx.file_processing.extract_file_text.get_unstructured_api_key", "onyx.file_processing.extract_file_text.get_unstructured_api_key",
return_value=None, return_value=None,

View File

@ -281,7 +281,7 @@ export default function AddConnector({
return ( return (
<Formik <Formik
initialValues={{ initialValues={{
...createConnectorInitialValues(connector), ...createConnectorInitialValues(connector, currentCredential),
...Object.fromEntries( ...Object.fromEntries(
connectorConfigs[connector].advanced_values.map((field) => [ connectorConfigs[connector].advanced_values.map((field) => [
field.name, field.name,

View File

@ -1292,7 +1292,8 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
}, },
}; };
export function createConnectorInitialValues( export function createConnectorInitialValues(
connector: ConfigurableSources connector: ConfigurableSources,
currentCredential: Credential<any> | null = null
): Record<string, any> & AccessTypeGroupSelectorFormType { ): Record<string, any> & AccessTypeGroupSelectorFormType {
const configuration = connectorConfigs[connector]; const configuration = connectorConfigs[connector];
@ -1307,7 +1308,16 @@ export function createConnectorInitialValues(
} else if (field.type === "list") { } else if (field.type === "list") {
acc[field.name] = field.default || []; acc[field.name] = field.default || [];
} else if (field.type === "checkbox") { } else if (field.type === "checkbox") {
// Special case for include_files_shared_with_me when using service account
if (
field.name === "include_files_shared_with_me" &&
currentCredential &&
!currentCredential.credential_json?.google_tokens
) {
acc[field.name] = true;
} else {
acc[field.name] = field.default || false; acc[field.name] = field.default || false;
}
} else if (field.default !== undefined) { } else if (field.default !== undefined) {
acc[field.name] = field.default; acc[field.name] = field.default;
} }