welcome to onyx

2025-09-19 20:24:32 +02:00 · 2024-12-13 09:48:43 -08:00
parent 54dcbfa288
commit 21ec5ed795
813 changed files with 7021 additions and 6824 deletions
--- a/backend/onyx/connectors/google_drive/init.py
+++ b/backend/onyx/connectors/google_drive/init.py
--- a/backend/onyx/connectors/google_drive/connector.py
+++ b/backend/onyx/connectors/google_drive/connector.py
@@ -0,0 +1,547 @@
+from collections.abc import Callable
+from collections.abc import Iterator
+from concurrent.futures import as_completed
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from typing import Any
+from typing import cast
+
+from google.oauth2.credentials import Credentials as OAuthCredentials  # type: ignore
+from google.oauth2.service_account import Credentials as ServiceAccountCredentials  # type: ignore
+
+from onyx.configs.app_configs import INDEX_BATCH_SIZE
+from onyx.configs.app_configs import MAX_FILE_SIZE_BYTES
+from onyx.configs.constants import DocumentSource
+from onyx.connectors.google_drive.doc_conversion import build_slim_document
+from onyx.connectors.google_drive.doc_conversion import (
+    convert_drive_item_to_document,
+)
+from onyx.connectors.google_drive.file_retrieval import crawl_folders_for_files
+from onyx.connectors.google_drive.file_retrieval import get_all_files_for_oauth
+from onyx.connectors.google_drive.file_retrieval import get_all_files_in_my_drive
+from onyx.connectors.google_drive.file_retrieval import get_files_in_shared_drive
+from onyx.connectors.google_drive.models import GoogleDriveFileType
+from onyx.connectors.google_utils.google_auth import get_google_creds
+from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
+from onyx.connectors.google_utils.resources import get_admin_service
+from onyx.connectors.google_utils.resources import get_drive_service
+from onyx.connectors.google_utils.resources import get_google_docs_service
+from onyx.connectors.google_utils.shared_constants import (
+    DB_CREDENTIALS_PRIMARY_ADMIN_KEY,
+)
+from onyx.connectors.google_utils.shared_constants import MISSING_SCOPES_ERROR_STR
+from onyx.connectors.google_utils.shared_constants import ONYX_SCOPE_INSTRUCTIONS
+from onyx.connectors.google_utils.shared_constants import SCOPE_DOC_URL
+from onyx.connectors.google_utils.shared_constants import SLIM_BATCH_SIZE
+from onyx.connectors.google_utils.shared_constants import USER_FIELDS
+from onyx.connectors.interfaces import GenerateDocumentsOutput
+from onyx.connectors.interfaces import GenerateSlimDocumentOutput
+from onyx.connectors.interfaces import LoadConnector
+from onyx.connectors.interfaces import PollConnector
+from onyx.connectors.interfaces import SecondsSinceUnixEpoch
+from onyx.connectors.interfaces import SlimConnector
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+# TODO: Improve this by using the batch utility: https://googleapis.github.io/google-api-python-client/docs/batch.html
+# All file retrievals could be batched and made at once
+
+
+def _extract_str_list_from_comma_str(string: str | None) -> list[str]:
+    """Split a comma-separated string into stripped, non-empty items.
+
+    Returns an empty list when ``string`` is ``None`` or empty.
+    """
+    if not string:
+        return []
+    stripped_parts = (part.strip() for part in string.split(","))
+    return [part for part in stripped_parts if part]
+
+
+def _extract_ids_from_urls(urls: list[str]) -> list[str]:
+    """Return the last path segment of each URL, which is used as the ID.
+
+    Query strings, fragments and trailing slashes are dropped first so that
+    links such as ``.../folders/<id>?usp=sharing`` or ``.../folders/<id>/``
+    still resolve to ``<id>``. A bare ID (no slashes) is returned unchanged.
+    """
+    ids = []
+    for url in urls:
+        # Strip everything that can follow the path before picking the segment
+        path = url.split("?", 1)[0].split("#", 1)[0].rstrip("/")
+        ids.append(path.split("/")[-1])
+    return ids
+
+
+def _convert_single_file(
+    creds: Any, primary_admin_email: str, file: dict[str, Any]
+) -> Any:
+    """Convert one Drive file dict into a document using its owner's services.
+
+    The Drive and Docs services are built for the first owner's email address,
+    falling back to ``primary_admin_email`` when the file lists no owner email.
+    Returns whatever ``convert_drive_item_to_document`` returns.
+    """
+    # `owners` can be missing, None or an empty list; indexing `[0]` on an
+    # empty list would raise IndexError, so normalize to a single empty owner.
+    owners = file.get("owners") or [{}]
+    user_email = owners[0].get("emailAddress") or primary_admin_email
+    user_drive_service = get_drive_service(creds, user_email=user_email)
+    docs_service = get_google_docs_service(creds, user_email=user_email)
+    return convert_drive_item_to_document(
+        file=file,
+        drive_service=user_drive_service,
+        docs_service=docs_service,
+    )
+
+
+def _process_files_batch(
+    files: list[GoogleDriveFileType], convert_func: Callable, batch_size: int
+) -> GenerateDocumentsOutput:
+    """Convert ``files`` concurrently and yield the results in batches.
+
+    ``convert_func`` is applied to every file on a thread pool of at most 16
+    workers. Falsy results are dropped; the rest are yielded, in input order,
+    as lists of up to ``batch_size`` items. Nothing is yielded for empty input.
+    """
+    if not files:
+        # ThreadPoolExecutor raises ValueError for max_workers=0
+        return
+
+    doc_batch = []
+    with ThreadPoolExecutor(max_workers=min(16, len(files))) as executor:
+        for doc in executor.map(convert_func, files):
+            if doc:
+                doc_batch.append(doc)
+                if len(doc_batch) >= batch_size:
+                    yield doc_batch
+                    doc_batch = []
+    # Flush the final, partially filled batch
+    if doc_batch:
+        yield doc_batch
+
+
+def _clean_requested_drive_ids(
+    requested_drive_ids: set[str],
+    requested_folder_ids: set[str],
+    all_drive_ids_available: set[str],
+) -> tuple[set[str], set[str]]:
+    """Split the requested IDs into usable drive IDs and folder IDs to crawl.
+
+    Requested drive IDs that are not among the available drives are logged and
+    moved into the folder set so they get tried as folders instead. Requested
+    folder IDs that are actually available drive IDs are dropped from the
+    folder set.
+
+    Returns ``(valid_drive_ids, folder_ids)``.
+    """
+    valid_drive_ids = requested_drive_ids & all_drive_ids_available
+    unknown_drive_ids = requested_drive_ids - all_drive_ids_available
+    folder_ids = requested_folder_ids - all_drive_ids_available
+
+    if unknown_drive_ids:
+        logger.warning(
+            f"Some shared drive IDs were not found. IDs: {unknown_drive_ids}"
+        )
+        logger.warning("Checking for folder access instead...")
+        folder_ids |= unknown_drive_ids
+
+    return valid_drive_ids, folder_ids
+
+
+class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector):
+    def __init__(
+        self,
+        include_shared_drives: bool = False,
+        include_my_drives: bool = False,
+        include_files_shared_with_me: bool = False,
+        shared_drive_urls: str | None = None,
+        my_drive_emails: str | None = None,
+        shared_folder_urls: str | None = None,
+        batch_size: int = INDEX_BATCH_SIZE,
+        # OLD PARAMETERS
+        folder_paths: list[str] | None = None,
+        include_shared: bool | None = None,
+        follow_shortcuts: bool | None = None,
+        only_org_public: bool | None = None,
+        continue_on_failure: bool | None = None,
+    ) -> None:
+        """Configure what the connector should index.
+
+        ``shared_drive_urls``, ``my_drive_emails`` and ``shared_folder_urls``
+        are comma-separated strings. When any of them is given, the three
+        ``include_*`` flags are forced to False so only the specific requests
+        are indexed.
+
+        Raises:
+            ValueError: if any of the old parameters is passed, or if nothing
+                at all was requested for indexing.
+        """
+        # Check for old input parameters
+        old_parameters = (
+            folder_paths,
+            include_shared,
+            follow_shortcuts,
+            only_org_public,
+            continue_on_failure,
+        )
+        if any(param is not None for param in old_parameters):
+            old_parameters_error = (
+                "Google Drive connector received old input parameters. "
+                "Please visit the docs for help with the new setup: "
+                f"{SCOPE_DOC_URL}"
+            )
+            # logger.error, not logger.exception: no exception is being
+            # handled here, so there is no traceback to attach.
+            logger.error(old_parameters_error)
+            raise ValueError(old_parameters_error)
+
+        if (
+            not include_shared_drives
+            and not include_my_drives
+            and not include_files_shared_with_me
+            and not shared_folder_urls
+            and not my_drive_emails
+            and not shared_drive_urls
+        ):
+            raise ValueError(
+                "Nothing to index. Please specify at least one of the following: "
+                "include_shared_drives, include_my_drives, include_files_shared_with_me, "
+                "shared_drive_urls, shared_folder_urls, or my_drive_emails"
+            )
+
+        self.batch_size = batch_size
+
+        # Specific requests take priority over the blanket include_* flags
+        specific_requests_made = bool(
+            shared_drive_urls or my_drive_emails or shared_folder_urls
+        )
+
+        self.include_files_shared_with_me = (
+            False if specific_requests_made else include_files_shared_with_me
+        )
+        self.include_my_drives = False if specific_requests_made else include_my_drives
+        self.include_shared_drives = (
+            False if specific_requests_made else include_shared_drives
+        )
+
+        shared_drive_url_list = _extract_str_list_from_comma_str(shared_drive_urls)
+        self._requested_shared_drive_ids = set(
+            _extract_ids_from_urls(shared_drive_url_list)
+        )
+
+        self._requested_my_drive_emails = set(
+            _extract_str_list_from_comma_str(my_drive_emails)
+        )
+
+        shared_folder_url_list = _extract_str_list_from_comma_str(shared_folder_urls)
+        self._requested_folder_ids = set(_extract_ids_from_urls(shared_folder_url_list))
+
+        # Both are populated by load_credentials
+        self._primary_admin_email: str | None = None
+
+        self._creds: OAuthCredentials | ServiceAccountCredentials | None = None
+
+        # IDs recorded through _update_traversed_parent_ids during retrieval
+        self._retrieved_ids: set[str] = set()
+
+    @property
+    def primary_admin_email(self) -> str:
+        """Primary admin email stored by ``load_credentials``.
+
+        Raises RuntimeError when read before ``load_credentials`` has set it.
+        """
+        admin_email = self._primary_admin_email
+        if admin_email is not None:
+            return admin_email
+        raise RuntimeError(
+            "Primary admin email missing, "
+            "should not call this property "
+            "before calling load_credentials"
+        )
+
+    @property
+    def google_domain(self) -> str:
+        """Text after the last ``@`` of the primary admin email.
+
+        Raises RuntimeError when read before ``load_credentials`` has set the
+        primary admin email.
+        """
+        admin_email = self._primary_admin_email
+        if admin_email is not None:
+            # rpartition keeps the whole string when it contains no "@"
+            return admin_email.rpartition("@")[2]
+        raise RuntimeError(
+            "Primary admin email missing, "
+            "should not call this property "
+            "before calling load_credentials"
+        )
+
+    @property
+    def creds(self) -> OAuthCredentials | ServiceAccountCredentials:
+        """Google credentials stored by ``load_credentials``.
+
+        Raises RuntimeError when read before ``load_credentials`` has set them.
+        """
+        loaded_creds = self._creds
+        if loaded_creds is not None:
+            return loaded_creds
+        raise RuntimeError(
+            "Creds missing, "
+            "should not call this property "
+            "before calling load_credentials"
+        )
+
+    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, str] | None:
+        """Store the primary admin email and build the Google credentials.
+
+        The admin email is read from
+        ``credentials[DB_CREDENTIALS_PRIMARY_ADMIN_KEY]`` (a missing key raises
+        KeyError). The credentials object returned by ``get_google_creds`` is
+        kept on the instance; the second value it returns is passed back to
+        the caller.
+        """
+        self._primary_admin_email = credentials[DB_CREDENTIALS_PRIMARY_ADMIN_KEY]
+
+        self._creds, new_creds_dict = get_google_creds(
+            credentials=credentials,
+            source=DocumentSource.GOOGLE_DRIVE,
+        )
+        return new_creds_dict
+
+    def _update_traversed_parent_ids(self, folder_id: str) -> None:
+        """Record ``folder_id`` in ``self._retrieved_ids``.
+
+        Passed to the file retrieval helpers as ``update_traversed_ids_func``.
+        """
+        self._retrieved_ids.add(folder_id)
+
+    def _get_all_user_emails(self) -> list[str]:
+        """Return the emails to retrieve files for, primary admin first.
+
+        With OAuth credentials only the primary admin email is returned.
+        Otherwise the admin service is queried for the users of
+        ``self.google_domain``, admins before non-admins, and each new
+        ``primaryEmail`` is appended once.
+        """
+        # Start with primary admin email
+        user_emails = [self.primary_admin_email]
+
+        # Only fetch additional users if using service account
+        if isinstance(self.creds, OAuthCredentials):
+            return user_emails
+
+        admin_service = get_admin_service(
+            creds=self.creds,
+            user_email=self.primary_admin_email,
+        )
+
+        # Set for O(1) duplicate checks; the list keeps the discovery order
+        seen_emails = set(user_emails)
+
+        # Get admins first since they're more likely to have access to most files
+        for is_admin in [True, False]:
+            query = "isAdmin=true" if is_admin else "isAdmin=false"
+            for user in execute_paginated_retrieval(
+                retrieval_function=admin_service.users().list,
+                list_key="users",
+                fields=USER_FIELDS,
+                domain=self.google_domain,
+                query=query,
+            ):
+                email = user.get("primaryEmail")
+                if email and email not in seen_emails:
+                    seen_emails.add(email)
+                    user_emails.append(email)
+        return user_emails
+
+    def _get_all_drive_ids(self) -> set[str]:
+        """Return the IDs of all drives listed for the primary admin.
+
+        ``useDomainAdminAccess`` is enabled only for service account
+        credentials. A warning is logged when no drive is returned.
+        """
+        admin_drive_service = get_drive_service(
+            creds=self.creds,
+            user_email=self.primary_admin_email,
+        )
+        using_service_account = isinstance(self.creds, ServiceAccountCredentials)
+        drive_ids = {
+            drive["id"]
+            for drive in execute_paginated_retrieval(
+                retrieval_function=admin_drive_service.drives().list,
+                list_key="drives",
+                useDomainAdminAccess=using_service_account,
+                fields="drives(id)",
+            )
+        }
+
+        if len(drive_ids) == 0:
+            logger.warning(
+                "No drives found even though we are indexing shared drives was requested."
+            )
+
+        return drive_ids
+
+    def _impersonate_user_for_retrieval(
+        self,
+        user_email: str,
+        is_slim: bool,
+        filtered_drive_ids: set[str],
+        filtered_folder_ids: set[str],
+        start: SecondsSinceUnixEpoch | None = None,
+        end: SecondsSinceUnixEpoch | None = None,
+    ) -> Iterator[GoogleDriveFileType]:
+        """Yield the files retrievable through ``user_email``'s drive service.
+
+        In order: the user's My Drive (when requested), every drive in
+        ``filtered_drive_ids`` and then every folder in ``filtered_folder_ids``
+        whose ID is not yet in ``self._retrieved_ids``. This is a generator,
+        so nothing runs until the result is iterated.
+        """
+        drive_service = get_drive_service(self.creds, user_email)
+
+        # if we are including my drives, try to get the current user's my
+        # drive if any of the following are true:
+        # - include_my_drives is true
+        # - the current user's email is in the requested emails
+        if self.include_my_drives or user_email in self._requested_my_drive_emails:
+            yield from get_all_files_in_my_drive(
+                service=drive_service,
+                update_traversed_ids_func=self._update_traversed_parent_ids,
+                is_slim=is_slim,
+                start=start,
+                end=end,
+            )
+
+        # Skip drives already recorded as retrieved
+        remaining_drive_ids = filtered_drive_ids - self._retrieved_ids
+        for drive_id in remaining_drive_ids:
+            yield from get_files_in_shared_drive(
+                service=drive_service,
+                drive_id=drive_id,
+                is_slim=is_slim,
+                update_traversed_ids_func=self._update_traversed_parent_ids,
+                start=start,
+                end=end,
+            )
+
+        remaining_folders = filtered_folder_ids - self._retrieved_ids
+        for folder_id in remaining_folders:
+            # NOTE(review): unlike the two calls above, is_slim is not
+            # forwarded here - confirm crawl_folders_for_files does not need it.
+            yield from crawl_folders_for_files(
+                service=drive_service,
+                parent_id=folder_id,
+                traversed_parent_ids=self._retrieved_ids,
+                update_traversed_ids_func=self._update_traversed_parent_ids,
+                start=start,
+                end=end,
+            )
+
+    def _manage_service_account_retrieval(
+        self,
+        is_slim: bool,
+        start: SecondsSinceUnixEpoch | None = None,
+        end: SecondsSinceUnixEpoch | None = None,
+    ) -> Iterator[GoogleDriveFileType]:
+        """Yield files for every user email using service account credentials.
+
+        Works out which drive and folder IDs to retrieve, runs
+        ``_impersonate_user_for_retrieval`` once per email from
+        ``_get_all_user_emails`` and finally logs a warning for any requested
+        drive/folder ID that never ended up in ``self._retrieved_ids``.
+        """
+        all_org_emails: list[str] = self._get_all_user_emails()
+
+        all_drive_ids: set[str] = self._get_all_drive_ids()
+
+        drive_ids_to_retrieve: set[str] = set()
+        folder_ids_to_retrieve: set[str] = set()
+        if self._requested_shared_drive_ids or self._requested_folder_ids:
+            drive_ids_to_retrieve, folder_ids_to_retrieve = _clean_requested_drive_ids(
+                requested_drive_ids=self._requested_shared_drive_ids,
+                requested_folder_ids=self._requested_folder_ids,
+                all_drive_ids_available=all_drive_ids,
+            )
+        elif self.include_shared_drives:
+            drive_ids_to_retrieve = all_drive_ids
+
+        # Process users in parallel using ThreadPoolExecutor
+        # NOTE(review): _impersonate_user_for_retrieval is a generator function,
+        # so each submitted call only creates a generator object; its body runs
+        # when `yield from future.result()` iterates it below, not in a worker
+        # thread. Confirm whether real per-user parallelism was intended.
+        with ThreadPoolExecutor(max_workers=10) as executor:
+            future_to_email = {
+                executor.submit(
+                    self._impersonate_user_for_retrieval,
+                    email,
+                    is_slim,
+                    drive_ids_to_retrieve,
+                    folder_ids_to_retrieve,
+                    start,
+                    end,
+                ): email
+                for email in all_org_emails
+            }
+
+            # Yield results as they complete
+            for future in as_completed(future_to_email):
+                yield from future.result()
+
+        # Anything requested but never recorded as retrieved is reported
+        remaining_folders = (
+            drive_ids_to_retrieve | folder_ids_to_retrieve
+        ) - self._retrieved_ids
+        if remaining_folders:
+            logger.warning(
+                f"Some folders/drives were not retrieved. IDs: {remaining_folders}"
+            )
+
+    def _manage_oauth_retrieval(
+        self,
+        is_slim: bool,
+        start: SecondsSinceUnixEpoch | None = None,
+        end: SecondsSinceUnixEpoch | None = None,
+    ) -> Iterator[GoogleDriveFileType]:
+        """Yield files using a single drive service for the primary admin.
+
+        First yields from ``get_all_files_for_oauth`` when My Drive or
+        shared-with-me files are included, and stops there if all three
+        ``include_*`` flags are set. Otherwise it goes on to the requested (or
+        all) shared drives, then the requested folders, and logs a warning for
+        any drive/folder ID that never ended up in ``self._retrieved_ids``.
+        """
+        drive_service = get_drive_service(self.creds, self.primary_admin_email)
+
+        if self.include_files_shared_with_me or self.include_my_drives:
+            yield from get_all_files_for_oauth(
+                service=drive_service,
+                include_files_shared_with_me=self.include_files_shared_with_me,
+                include_my_drives=self.include_my_drives,
+                include_shared_drives=self.include_shared_drives,
+                is_slim=is_slim,
+                start=start,
+                end=end,
+            )
+
+        all_requested = (
+            self.include_files_shared_with_me
+            and self.include_my_drives
+            and self.include_shared_drives
+        )
+        if all_requested:
+            # If all 3 are true, we already yielded from get_all_files_for_oauth
+            return
+
+        all_drive_ids = self._get_all_drive_ids()
+        drive_ids_to_retrieve: set[str] = set()
+        folder_ids_to_retrieve: set[str] = set()
+        if self._requested_shared_drive_ids or self._requested_folder_ids:
+            drive_ids_to_retrieve, folder_ids_to_retrieve = _clean_requested_drive_ids(
+                requested_drive_ids=self._requested_shared_drive_ids,
+                requested_folder_ids=self._requested_folder_ids,
+                all_drive_ids_available=all_drive_ids,
+            )
+        elif self.include_shared_drives:
+            drive_ids_to_retrieve = all_drive_ids
+
+        for drive_id in drive_ids_to_retrieve:
+            yield from get_files_in_shared_drive(
+                service=drive_service,
+                drive_id=drive_id,
+                is_slim=is_slim,
+                update_traversed_ids_func=self._update_traversed_parent_ids,
+                start=start,
+                end=end,
+            )
+
+        # Even if no folders were requested, we still check if any drives were requested
+        # that could be folders.
+        remaining_folders = folder_ids_to_retrieve - self._retrieved_ids
+        for folder_id in remaining_folders:
+            yield from crawl_folders_for_files(
+                service=drive_service,
+                parent_id=folder_id,
+                traversed_parent_ids=self._retrieved_ids,
+                update_traversed_ids_func=self._update_traversed_parent_ids,
+                start=start,
+                end=end,
+            )
+
+        # Anything requested but never recorded as retrieved is reported
+        remaining_folders = (
+            drive_ids_to_retrieve | folder_ids_to_retrieve
+        ) - self._retrieved_ids
+        if remaining_folders:
+            logger.warning(
+                f"Some folders/drives were not retrieved. IDs: {remaining_folders}"
+            )
+
+    def _fetch_drive_items(
+        self,
+        is_slim: bool,
+        start: SecondsSinceUnixEpoch | None = None,
+        end: SecondsSinceUnixEpoch | None = None,
+    ) -> Iterator[GoogleDriveFileType]:
+        """Return the file iterator matching the loaded credential type.
+
+        Service account credentials use ``_manage_service_account_retrieval``;
+        anything else uses ``_manage_oauth_retrieval``.
+        """
+        if isinstance(self.creds, ServiceAccountCredentials):
+            retrieve = self._manage_service_account_retrieval
+        else:
+            retrieve = self._manage_oauth_retrieval
+
+        return retrieve(is_slim=is_slim, start=start, end=end)
+
+    def _extract_docs_from_google_drive(
+        self,
+        start: SecondsSinceUnixEpoch | None = None,
+        end: SecondsSinceUnixEpoch | None = None,
+    ) -> GenerateDocumentsOutput:
+        """Fetch drive files and yield them as batches of converted documents.
+
+        Files whose ``size`` exceeds ``MAX_FILE_SIZE_BYTES`` are skipped with a
+        warning. The rest are collected in groups of ``4 * self.batch_size``
+        and handed to ``_process_files_batch`` for conversion.
+        """
+        # Bind the credentials and admin email once for every conversion
+        convert_func = partial(
+            _convert_single_file, self.creds, self.primary_admin_email
+        )
+
+        # Number of files gathered before a conversion round is started
+        pending_limit = self.batch_size * 4
+        pending_files = []
+        for file in self._fetch_drive_items(is_slim=False, start=start, end=end):
+            raw_size = file.get("size")
+            if raw_size and int(cast(str, raw_size)) > MAX_FILE_SIZE_BYTES:
+                logger.warning(
+                    f"Skipping file {file.get('name', 'Unknown')} as it is too large: {file.get('size')} bytes"
+                )
+                continue
+
+            pending_files.append(file)
+            if len(pending_files) < pending_limit:
+                continue
+
+            yield from _process_files_batch(
+                pending_files, convert_func, self.batch_size
+            )
+            pending_files = []
+
+        # Convert whatever is left over after the last full group
+        if pending_files:
+            yield from _process_files_batch(
+                pending_files, convert_func, self.batch_size
+            )
+
+    def load_from_state(self) -> GenerateDocumentsOutput:
+        """Yield document batches without any time bounds.
+
+        Errors whose text contains ``MISSING_SCOPES_ERROR_STR`` are re-raised
+        as ``PermissionError(ONYX_SCOPE_INSTRUCTIONS)``; all others propagate.
+        """
+        try:
+            yield from self._extract_docs_from_google_drive()
+        except Exception as err:
+            if MISSING_SCOPES_ERROR_STR not in str(err):
+                raise
+            raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from err
+
+    def poll_source(
+        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
+    ) -> GenerateDocumentsOutput:
+        """Yield document batches for the ``start``..``end`` time window.
+
+        Errors whose text contains ``MISSING_SCOPES_ERROR_STR`` are re-raised
+        as ``PermissionError(ONYX_SCOPE_INSTRUCTIONS)``; all others propagate.
+        """
+        try:
+            yield from self._extract_docs_from_google_drive(start, end)
+        except Exception as err:
+            if MISSING_SCOPES_ERROR_STR not in str(err):
+                raise
+            raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from err
+
+    def _extract_slim_docs_from_google_drive(
+        self,
+        start: SecondsSinceUnixEpoch | None = None,
+        end: SecondsSinceUnixEpoch | None = None,
+    ) -> GenerateSlimDocumentOutput:
+        """Yield slim documents for the fetched drive files in batches.
+
+        Files for which ``build_slim_document`` returns nothing are skipped.
+        A batch is yielded once it holds ``SLIM_BATCH_SIZE`` documents.
+        """
+        slim_batch = []
+        for file in self._fetch_drive_items(
+            is_slim=True,
+            start=start,
+            end=end,
+        ):
+            if doc := build_slim_document(file):
+                slim_batch.append(doc)
+            if len(slim_batch) >= SLIM_BATCH_SIZE:
+                yield slim_batch
+                slim_batch = []
+        # The last batch is yielded unconditionally, so it may be an empty list
+        yield slim_batch
+
+    def retrieve_all_slim_documents(
+        self,
+        start: SecondsSinceUnixEpoch | None = None,
+        end: SecondsSinceUnixEpoch | None = None,
+    ) -> GenerateSlimDocumentOutput:
+        """Yield slim document batches, optionally limited to a time window.
+
+        Errors whose text contains ``MISSING_SCOPES_ERROR_STR`` are re-raised
+        as ``PermissionError(ONYX_SCOPE_INSTRUCTIONS)``; all others propagate.
+        """
+        try:
+            yield from self._extract_slim_docs_from_google_drive(start, end)
+        except Exception as err:
+            if MISSING_SCOPES_ERROR_STR not in str(err):
+                raise
+            raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from err
--- a/backend/onyx/connectors/google_drive/constants.py
+++ b/backend/onyx/connectors/google_drive/constants.py
@@ -0,0 +1,4 @@
+# Section text used for files whose content is not extracted
+UNSUPPORTED_FILE_TYPE_CONTENT = ""  # keep empty for now
+# Values compared against a Drive file's "mimeType"
+DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder"
+DRIVE_SHORTCUT_TYPE = "application/vnd.google-apps.shortcut"
+DRIVE_FILE_TYPE = "application/vnd.google-apps.file"
--- a/backend/onyx/connectors/google_drive/doc_conversion.py
+++ b/backend/onyx/connectors/google_drive/doc_conversion.py
@@ -0,0 +1,260 @@
+import io
+from datetime import datetime
+from datetime import timezone
+
+from googleapiclient.discovery import build  # type: ignore
+from googleapiclient.errors import HttpError  # type: ignore
+
+from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
+from onyx.configs.constants import DocumentSource
+from onyx.configs.constants import IGNORE_FOR_QA
+from onyx.connectors.google_drive.constants import DRIVE_FOLDER_TYPE
+from onyx.connectors.google_drive.constants import DRIVE_SHORTCUT_TYPE
+from onyx.connectors.google_drive.constants import UNSUPPORTED_FILE_TYPE_CONTENT
+from onyx.connectors.google_drive.models import GDriveMimeType
+from onyx.connectors.google_drive.models import GoogleDriveFileType
+from onyx.connectors.google_drive.section_extraction import get_document_sections
+from onyx.connectors.google_utils.resources import GoogleDocsService
+from onyx.connectors.google_utils.resources import GoogleDriveService
+from onyx.connectors.models import Document
+from onyx.connectors.models import Section
+from onyx.connectors.models import SlimDocument
+from onyx.file_processing.extract_file_text import docx_to_text
+from onyx.file_processing.extract_file_text import pptx_to_text
+from onyx.file_processing.extract_file_text import read_pdf_file
+from onyx.file_processing.unstructured import get_unstructured_api_key
+from onyx.file_processing.unstructured import unstructured_to_text
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+# these errors don't represent a failure in the connector, but simply files
+# that can't / shouldn't be indexed. They are matched against the `reason` of
+# a 403 HttpError in convert_drive_item_to_document, which then skips the file.
+ERRORS_TO_CONTINUE_ON = [
+    "cannotExportFile",
+    "exportSizeLimitExceeded",
+    "cannotDownloadFile",
+]
+
+
+def _extract_sections_basic(
+    file: dict[str, str], service: GoogleDriveService
+) -> list[Section]:
+    """Extract the text of a Drive file as a list of sections.
+
+    Spreadsheets are read sheet by sheet through the Sheets API (one section
+    per non-empty sheet), falling back to a CSV export if that fails. Docs and
+    presentations are exported as plain text, plain text / markdown files are
+    downloaded as-is, and Word / PowerPoint / PDF files are downloaded and
+    parsed (through Unstructured when an API key is configured).
+
+    Unsupported mime types, and any extraction failure, produce a single
+    section with ``UNSUPPORTED_FILE_TYPE_CONTENT`` as text so the file can
+    still be found by title. Failures are logged rather than raised.
+
+    Raises:
+        KeyError: if ``file`` has no ``mimeType`` or ``webViewLink``.
+    """
+    mime_type = file["mimeType"]
+    link = file["webViewLink"]
+
+    if mime_type not in {item.value for item in GDriveMimeType}:
+        # Unsupported file types can still have a title, finding this way is still useful
+        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
+
+    try:
+        if mime_type == GDriveMimeType.SPREADSHEET.value:
+            try:
+                sheets_service = build(
+                    "sheets", "v4", credentials=service._http.credentials
+                )
+                spreadsheet = (
+                    sheets_service.spreadsheets()
+                    .get(spreadsheetId=file["id"])
+                    .execute()
+                )
+
+                sections = []
+                for sheet in spreadsheet["sheets"]:
+                    sheet_name = sheet["properties"]["title"]
+                    sheet_id = sheet["properties"]["sheetId"]
+
+                    # Get sheet dimensions
+                    grid_properties = sheet["properties"].get("gridProperties", {})
+                    row_count = grid_properties.get("rowCount", 1000)
+                    column_count = grid_properties.get("columnCount", 26)
+
+                    # Convert column count to letter (e.g., 26 -> Z, 27 -> AA)
+                    end_column = ""
+                    while column_count:
+                        column_count, remainder = divmod(column_count - 1, 26)
+                        end_column = chr(65 + remainder) + end_column
+
+                    range_name = f"'{sheet_name}'!A1:{end_column}{row_count}"
+
+                    try:
+                        result = (
+                            sheets_service.spreadsheets()
+                            .values()
+                            .get(spreadsheetId=file["id"], range=range_name)
+                            .execute()
+                        )
+                        values = result.get("values", [])
+
+                        if values:
+                            # One tab-separated line per row, built with join
+                            # instead of repeated string concatenation
+                            rows_text = "".join(
+                                "\t".join(str(cell) for cell in row) + "\n"
+                                for row in values
+                            )
+                            sections.append(
+                                Section(
+                                    link=f"{link}#gid={sheet_id}",
+                                    text=f"Sheet: {sheet_name}\n{rows_text}",
+                                )
+                            )
+                    except HttpError as e:
+                        logger.warning(
+                            f"Error fetching data for sheet '{sheet_name}': {e}"
+                        )
+                        continue
+                return sections
+
+            except Exception as e:
+                # Fall through to the CSV export below
+                logger.warning(
+                    f"Ran into exception '{e}' when pulling data from Google Sheet '{file['name']}'."
+                    " Falling back to basic extraction."
+                )
+
+        if mime_type in [
+            GDriveMimeType.DOC.value,
+            GDriveMimeType.PPT.value,
+            GDriveMimeType.SPREADSHEET.value,
+        ]:
+            export_mime_type = (
+                "text/plain"
+                if mime_type != GDriveMimeType.SPREADSHEET.value
+                else "text/csv"
+            )
+            text = (
+                service.files()
+                .export(fileId=file["id"], mimeType=export_mime_type)
+                .execute()
+                .decode("utf-8")
+            )
+            return [Section(link=link, text=text)]
+
+        elif mime_type in [
+            GDriveMimeType.PLAIN_TEXT.value,
+            GDriveMimeType.MARKDOWN.value,
+        ]:
+            return [
+                Section(
+                    link=link,
+                    text=service.files()
+                    .get_media(fileId=file["id"])
+                    .execute()
+                    .decode("utf-8"),
+                )
+            ]
+        if mime_type in [
+            GDriveMimeType.WORD_DOC.value,
+            GDriveMimeType.POWERPOINT.value,
+            GDriveMimeType.PDF.value,
+        ]:
+            response = service.files().get_media(fileId=file["id"]).execute()
+            if get_unstructured_api_key():
+                return [
+                    Section(
+                        link=link,
+                        text=unstructured_to_text(
+                            file=io.BytesIO(response),
+                            file_name=file.get("name", file["id"]),
+                        ),
+                    )
+                ]
+
+            if mime_type == GDriveMimeType.WORD_DOC.value:
+                return [
+                    Section(link=link, text=docx_to_text(file=io.BytesIO(response)))
+                ]
+            elif mime_type == GDriveMimeType.PDF.value:
+                text, _ = read_pdf_file(file=io.BytesIO(response))
+                return [Section(link=link, text=text)]
+            elif mime_type == GDriveMimeType.POWERPOINT.value:
+                return [
+                    Section(link=link, text=pptx_to_text(file=io.BytesIO(response)))
+                ]
+
+        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
+
+    except Exception as e:
+        # Extraction stays best-effort, but the failure is logged so that it
+        # is not silently indistinguishable from an unsupported file type.
+        logger.warning(
+            f"Ran into exception '{e}' when extracting text from file "
+            f"'{file.get('name')}'. Treating it as an unsupported file type."
+        )
+        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
+
+
+def convert_drive_item_to_document(
+    file: GoogleDriveFileType,
+    drive_service: GoogleDriveService,
+    docs_service: GoogleDocsService,
+) -> Document | None:
+    """Build a ``Document`` from a Drive file, or return None to skip it.
+
+    Shortcuts and folders are skipped. Google Docs are first read through
+    ``get_document_sections``; everything else, and Docs for which that
+    failed or returned nothing, goes through ``_extract_sections_basic``.
+    Documents whose sections contain no text are tagged with
+    ``IGNORE_FOR_QA``.
+
+    Any exception is re-raised unless ``CONTINUE_ON_CONNECTOR_FAILURE`` is
+    set, in which case it is logged and None is returned.
+    """
+    try:
+        # Skip files that are shortcuts
+        if file.get("mimeType") == DRIVE_SHORTCUT_TYPE:
+            logger.info("Ignoring Drive Shortcut Filetype")
+            return None
+        # Skip files that are folders
+        if file.get("mimeType") == DRIVE_FOLDER_TYPE:
+            logger.info("Ignoring Drive Folder Filetype")
+            return None
+
+        sections: list[Section] = []
+
+        # Special handling for Google Docs to preserve structure, link
+        # to headers
+        if file.get("mimeType") == GDriveMimeType.DOC.value:
+            try:
+                sections = get_document_sections(docs_service, file["id"])
+            except Exception as e:
+                logger.warning(
+                    f"Ran into exception '{e}' when pulling sections from Google Doc '{file['name']}'."
+                    " Falling back to basic extraction."
+                )
+        # NOTE: this will run for either (1) the above failed or (2) the file is not a Google Doc
+        if not sections:
+            try:
+                # For all other file types just extract the text
+                sections = _extract_sections_basic(file, drive_service)
+
+            # NOTE(review): _extract_sections_basic wraps its API calls in its
+            # own `except Exception`, so an HttpError from them does not
+            # appear to reach this handler - confirm whether that is intended.
+            except HttpError as e:
+                reason = e.error_details[0]["reason"] if e.error_details else e.reason
+                message = e.error_details[0]["message"] if e.error_details else e.reason
+                if e.status_code == 403 and reason in ERRORS_TO_CONTINUE_ON:
+                    logger.warning(
+                        f"Could not export file '{file['name']}' due to '{message}', skipping..."
+                    )
+                    return None
+
+                raise
+        if not sections:
+            return None
+
+        return Document(
+            id=file["webViewLink"],
+            sections=sections,
+            source=DocumentSource.GOOGLE_DRIVE,
+            semantic_identifier=file["name"],
+            doc_updated_at=datetime.fromisoformat(file["modifiedTime"]).astimezone(
+                timezone.utc
+            ),
+            # Documents without any section text are flagged with IGNORE_FOR_QA
+            metadata={}
+            if any(section.text for section in sections)
+            else {IGNORE_FOR_QA: "True"},
+            additional_info=file.get("id"),
+        )
+    except Exception as e:
+        if not CONTINUE_ON_CONNECTOR_FAILURE:
+            raise e
+
+        logger.exception("Ran into exception when pulling a file from Google Drive")
+    return None
+
+
+def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None:
+    """Build the ``SlimDocument`` for a Drive file.
+
+    Returns None for folders and shortcuts. The document ID is the file's
+    ``webViewLink`` (KeyError if missing); ``perm_sync_data`` carries the file
+    ID, permissions, permission IDs, name and the first owner's email.
+    """
+    # Skip files that are folders or shortcuts
+    if file.get("mimeType") in [DRIVE_FOLDER_TYPE, DRIVE_SHORTCUT_TYPE]:
+        return None
+
+    # `owners` can be missing, None or an empty list; indexing `[0]` on an
+    # empty list would raise IndexError, so normalize to a single empty owner.
+    owners = file.get("owners") or [{}]
+
+    return SlimDocument(
+        id=file["webViewLink"],
+        perm_sync_data={
+            "doc_id": file.get("id"),
+            "permissions": file.get("permissions", []),
+            "permission_ids": file.get("permissionIds", []),
+            "name": file.get("name"),
+            "owner_email": owners[0].get("emailAddress"),
+        },
+    )
--- a/backend/onyx/connectors/google_drive/file_retrieval.py
+++ b/backend/onyx/connectors/google_drive/file_retrieval.py
@@ -0,0 +1,258 @@
+from collections.abc import Callable
+from collections.abc import Iterator
+from datetime import datetime
+from datetime import timezone
+from typing import Any
+
+from googleapiclient.discovery import Resource  # type: ignore
+
+from onyx.connectors.google_drive.constants import DRIVE_FOLDER_TYPE
+from onyx.connectors.google_drive.constants import DRIVE_SHORTCUT_TYPE
+from onyx.connectors.google_drive.models import GoogleDriveFileType
+from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
+from onyx.connectors.interfaces import SecondsSinceUnixEpoch
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+# Field masks passed as the `fields` argument of the `files().list` calls in
+# this module. Each one also requests `nextPageToken`.
+
+# Mask used for regular (non-slim) file listings.
+FILE_FIELDS = (
+    "nextPageToken, files(mimeType, id, name, permissions, modifiedTime, webViewLink, "
+    "shortcutDetails, owners(emailAddress), size)"
+)
+# Mask used when `is_slim` is set: no modifiedTime/size/shortcutDetails, but
+# includes permissionIds and a trimmed permissions sub-selection.
+SLIM_FILE_FIELDS = (
+    "nextPageToken, files(mimeType, id, name, permissions(emailAddress, type), "
+    "permissionIds, webViewLink, owners(emailAddress))"
+)
+# Mask used when listing folders (and shortcuts) inside a parent folder.
+FOLDER_FIELDS = "nextPageToken, files(id, name, permissions, modifiedTime, webViewLink, shortcutDetails)"
+
+
+def _generate_time_range_filter(
+    start: SecondsSinceUnixEpoch | None = None,
+    end: SecondsSinceUnixEpoch | None = None,
+) -> str:
+    """Build the `modifiedTime` clauses to append to a Drive search query.
+
+    Args:
+        start: Inclusive lower bound in seconds since the Unix epoch, or None
+            for no lower bound.
+        end: Inclusive upper bound in seconds since the Unix epoch, or None
+            for no upper bound.
+
+    Returns:
+        A string of zero, one or two clauses, each prefixed with " and ", so
+        it can be concatenated directly onto an existing query. Empty when
+        both bounds are None.
+    """
+
+    def _as_utc_timestamp(epoch_seconds: SecondsSinceUnixEpoch) -> str:
+        # `datetime.utcfromtimestamp` is deprecated since Python 3.12. Go
+        # through an aware UTC datetime instead, then drop tzinfo so that
+        # isoformat() emits no "+00:00" offset ahead of the literal "Z".
+        utc_time = datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
+        return utc_time.replace(tzinfo=None).isoformat() + "Z"
+
+    time_range_filter = ""
+    if start is not None:
+        time_range_filter += f" and modifiedTime >= '{_as_utc_timestamp(start)}'"
+    if end is not None:
+        time_range_filter += f" and modifiedTime <= '{_as_utc_timestamp(end)}'"
+    return time_range_filter
+
+
+def _get_folders_in_parent(
+    service: Resource,
+    parent_id: str | None = None,
+) -> Iterator[GoogleDriveFileType]:
+    """Yield the non-trashed folders and shortcuts, optionally under one parent.
+
+    Args:
+        service: Drive API resource whose `files().list` is queried.
+        parent_id: When truthy, only items whose parents include this id are
+            returned; otherwise no parent restriction is added to the query.
+
+    Yields:
+        Drive file dicts restricted to FOLDER_FIELDS.
+    """
+    # Shortcuts are matched too so that shortcuts to folders can be followed
+    conditions = [
+        f"(mimeType = '{DRIVE_FOLDER_TYPE}' or mimeType = '{DRIVE_SHORTCUT_TYPE}')",
+        "trashed = false",
+    ]
+    if parent_id:
+        conditions.append(f"'{parent_id}' in parents")
+
+    yield from execute_paginated_retrieval(
+        retrieval_function=service.files().list,
+        list_key="files",
+        continue_on_404_or_403=True,
+        corpora="allDrives",
+        supportsAllDrives=True,
+        includeItemsFromAllDrives=True,
+        fields=FOLDER_FIELDS,
+        q=" and ".join(conditions),
+    )
+
+
+def _get_files_in_parent(
+    service: Resource,
+    parent_id: str,
+    start: SecondsSinceUnixEpoch | None = None,
+    end: SecondsSinceUnixEpoch | None = None,
+    is_slim: bool = False,
+) -> Iterator[GoogleDriveFileType]:
+    """Yield the non-trashed, non-folder items directly inside `parent_id`.
+
+    Args:
+        service: Drive API resource whose `files().list` is queried.
+        parent_id: Id of the folder whose direct children are listed.
+        start: Optional lower bound on modifiedTime (epoch seconds).
+        end: Optional upper bound on modifiedTime (epoch seconds).
+        is_slim: Request SLIM_FILE_FIELDS instead of FILE_FIELDS.
+
+    Yields:
+        Drive file dicts shaped by the selected field mask.
+    """
+    field_mask = SLIM_FILE_FIELDS if is_slim else FILE_FIELDS
+    search_query = "".join(
+        (
+            f"mimeType != '{DRIVE_FOLDER_TYPE}' and '{parent_id}' in parents",
+            " and trashed = false",
+            _generate_time_range_filter(start, end),
+        )
+    )
+
+    yield from execute_paginated_retrieval(
+        retrieval_function=service.files().list,
+        list_key="files",
+        continue_on_404_or_403=True,
+        corpora="allDrives",
+        supportsAllDrives=True,
+        includeItemsFromAllDrives=True,
+        fields=field_mask,
+        q=search_query,
+    )
+
+
+def crawl_folders_for_files(
+    service: Resource,
+    parent_id: str,
+    traversed_parent_ids: set[str],
+    update_traversed_ids_func: Callable[[str], None],
+    start: SecondsSinceUnixEpoch | None = None,
+    end: SecondsSinceUnixEpoch | None = None,
+) -> Iterator[GoogleDriveFileType]:
+    """
+    Recursively yield the files under `parent_id`, depth first.
+
+    Can start from any folder, at the cost of one listing per folder visited
+    (slower than the drive-wide listings in this module).
+
+    The folder is skipped entirely when its id is already in
+    `traversed_parent_ids`. `update_traversed_ids_func` is called with
+    `parent_id` only when at least one file was found directly inside it.
+    """
+    if parent_id in traversed_parent_ids:
+        logger.info(f"Skipping subfolder since already traversed: {parent_id}")
+        return
+
+    # Direct children first
+    folder_has_files = False
+    direct_files = _get_files_in_parent(
+        service=service,
+        parent_id=parent_id,
+        start=start,
+        end=end,
+    )
+    for drive_file in direct_files:
+        folder_has_files = True
+        yield drive_file
+
+    if folder_has_files:
+        update_traversed_ids_func(parent_id)
+
+    # Then descend into every child returned by the folder listing.
+    # NOTE(review): that listing also matches shortcuts, and the recursion
+    # uses the child's own "id" rather than a shortcut target - confirm
+    # whether shortcut targets are meant to be resolved here.
+    for child in _get_folders_in_parent(service=service, parent_id=parent_id):
+        logger.info("Fetching all files in subfolder: " + child["name"])
+        yield from crawl_folders_for_files(
+            service,
+            child["id"],
+            traversed_parent_ids,
+            update_traversed_ids_func,
+            start,
+            end,
+        )
+
+
+def get_files_in_shared_drive(
+    service: Resource,
+    drive_id: str,
+    is_slim: bool = False,
+    update_traversed_ids_func: Callable[[str], None] = lambda _: None,
+    start: SecondsSinceUnixEpoch | None = None,
+    end: SecondsSinceUnixEpoch | None = None,
+) -> Iterator[GoogleDriveFileType]:
+    """Yield every non-trashed, non-folder file of one shared drive.
+
+    Before any file is yielded, the id of each non-trashed folder in the
+    drive is passed to `update_traversed_ids_func`; if at least one folder
+    was found, `drive_id` itself is passed as well.
+
+    Args:
+        service: Drive API resource whose `files().list` is queried.
+        drive_id: Id of the shared drive to list.
+        is_slim: Request SLIM_FILE_FIELDS instead of FILE_FIELDS.
+        update_traversed_ids_func: Callback receiving folder/drive ids;
+            defaults to a no-op.
+        start: Optional lower bound on modifiedTime (epoch seconds).
+        end: Optional upper bound on modifiedTime (epoch seconds).
+    """
+    # Listing options shared by the folder pass and the file pass
+    drive_scope: dict[str, Any] = {
+        "list_key": "files",
+        "continue_on_404_or_403": True,
+        "corpora": "drive",
+        "driveId": drive_id,
+        "supportsAllDrives": True,
+        "includeItemsFromAllDrives": True,
+    }
+
+    # If we know we are going to folder crawl later, we can cache the folders
+    # here: report every folder in the drive as traversed
+    saw_folder = False
+    for folder in execute_paginated_retrieval(
+        retrieval_function=service.files().list,
+        fields="nextPageToken, files(id)",
+        q=f"mimeType = '{DRIVE_FOLDER_TYPE}' and trashed = false",
+        **drive_scope,
+    ):
+        update_traversed_ids_func(folder["id"])
+        saw_folder = True
+    if saw_folder:
+        update_traversed_ids_func(drive_id)
+
+    # Get all files in the shared drive
+    file_query = (
+        f"mimeType != '{DRIVE_FOLDER_TYPE}'"
+        " and trashed = false" + _generate_time_range_filter(start, end)
+    )
+    yield from execute_paginated_retrieval(
+        retrieval_function=service.files().list,
+        fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS,
+        q=file_query,
+        **drive_scope,
+    )
+
+
+def get_all_files_in_my_drive(
+    service: Any,
+    update_traversed_ids_func: Callable[[str], None],
+    is_slim: bool = False,
+    start: SecondsSinceUnixEpoch | None = None,
+    end: SecondsSinceUnixEpoch | None = None,
+) -> Iterator[GoogleDriveFileType]:
+    """Yield every non-trashed, non-folder file owned by the current user.
+
+    Before any file is yielded, the id of each non-trashed folder owned by
+    the user is passed to `update_traversed_ids_func`; if at least one folder
+    was found, the user's root folder id is passed as well.
+
+    Args:
+        service: Drive API resource whose `files().list` is queried.
+        update_traversed_ids_func: Callback receiving folder ids.
+        is_slim: Request SLIM_FILE_FIELDS instead of FILE_FIELDS for files.
+        start: Optional lower bound on modifiedTime (epoch seconds).
+        end: Optional upper bound on modifiedTime (epoch seconds).
+    """
+    # If we know we are going to folder crawl later, we can cache the folders here
+    # Get all folders being queried and add them to the traversed set
+    folder_query = f"mimeType = '{DRIVE_FOLDER_TYPE}'"
+    folder_query += " and trashed = false"
+    folder_query += " and 'me' in owners"
+    found_folders = False
+    for folder in execute_paginated_retrieval(
+        retrieval_function=service.files().list,
+        list_key="files",
+        corpora="user",
+        # Only the id is read below, so request nothing else (this matches
+        # the folder pass of get_files_in_shared_drive)
+        fields="nextPageToken, files(id)",
+        q=folder_query,
+    ):
+        update_traversed_ids_func(folder["id"])
+        found_folders = True
+    if found_folders:
+        update_traversed_ids_func(get_root_folder_id(service))
+
+    # Then get the files
+    file_query = f"mimeType != '{DRIVE_FOLDER_TYPE}'"
+    file_query += " and trashed = false"
+    file_query += " and 'me' in owners"
+    file_query += _generate_time_range_filter(start, end)
+    yield from execute_paginated_retrieval(
+        retrieval_function=service.files().list,
+        list_key="files",
+        corpora="user",
+        fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS,
+        q=file_query,
+    )
+
+
+def get_all_files_for_oauth(
+    service: Any,
+    include_files_shared_with_me: bool,
+    include_my_drives: bool,
+    # One of the above 2 should be true
+    include_shared_drives: bool,
+    is_slim: bool = False,
+    start: SecondsSinceUnixEpoch | None = None,
+    end: SecondsSinceUnixEpoch | None = None,
+) -> Iterator[GoogleDriveFileType]:
+    """Yield non-trashed, non-folder files with a single `files().list` query.
+
+    When all three include flags are set, the query runs over the
+    "allDrives" corpus with all-drives support enabled. Otherwise it runs
+    over the "user" corpus and, if exactly one of
+    `include_files_shared_with_me` / `include_my_drives` is set, is narrowed
+    by ownership accordingly.
+
+    Args:
+        service: Drive API resource whose `files().list` is queried.
+        include_files_shared_with_me: Include files not owned by the user.
+        include_my_drives: Include files owned by the user.
+        include_shared_drives: Only affects the result together with the two
+            flags above (selects the "allDrives" corpus when all are set).
+        is_slim: Request SLIM_FILE_FIELDS instead of FILE_FIELDS.
+        start: Optional lower bound on modifiedTime (epoch seconds).
+        end: Optional upper bound on modifiedTime (epoch seconds).
+    """
+    should_get_all = (
+        include_shared_drives and include_my_drives and include_files_shared_with_me
+    )
+
+    query_parts = [
+        f"mimeType != '{DRIVE_FOLDER_TYPE}'",
+        " and trashed = false",
+        _generate_time_range_filter(start, end),
+    ]
+    # The ownership clauses can only apply when the two flags differ, which
+    # already implies that not everything was requested.
+    if include_files_shared_with_me and not include_my_drives:
+        query_parts.append(" and not 'me' in owners")
+    elif include_my_drives and not include_files_shared_with_me:
+        query_parts.append(" and 'me' in owners")
+
+    yield from execute_paginated_retrieval(
+        retrieval_function=service.files().list,
+        list_key="files",
+        corpora="allDrives" if should_get_all else "user",
+        includeItemsFromAllDrives=should_get_all,
+        supportsAllDrives=should_get_all,
+        fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS,
+        q="".join(query_parts),
+    )
+
+
+# Just in case we need to get the root folder id
+def get_root_folder_id(service: Resource) -> str:
+    """Return the "id" field of the file fetched with fileId "root"."""
+    # No pagination here because there is only one root folder per user
+    # https://developers.google.com/drive/api/guides/v2-to-v3-reference
+    root_request = service.files().get(fileId="root", fields="id")
+    root_metadata = root_request.execute()
+    return root_metadata["id"]
--- a/backend/onyx/connectors/google_drive/models.py
+++ b/backend/onyx/connectors/google_drive/models.py
@@ -0,0 +1,18 @@
+from enum import Enum
+from typing import Any
+
+
+class GDriveMimeType(str, Enum):
+    """Named MIME type strings for file formats found in Google Drive.
+
+    Members subclass `str`, so each compares equal to its raw MIME string.
+    """
+
+    # Google Docs document
+    DOC = "application/vnd.google-apps.document"
+    # Google Sheets spreadsheet
+    SPREADSHEET = "application/vnd.google-apps.spreadsheet"
+    PDF = "application/pdf"
+    # Office Open XML word-processing document
+    WORD_DOC = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    # Google Slides presentation
+    PPT = "application/vnd.google-apps.presentation"
+    # Office Open XML presentation
+    POWERPOINT = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+    PLAIN_TEXT = "text/plain"
+    MARKDOWN = "text/markdown"
+
+
+# Alias for a Drive file represented as a plain dict with string keys.
+# NOTE(review): presumably the raw `files` resource from the Drive API, so
+# which keys are present depends on the requested `fields` mask - confirm.
+GoogleDriveFileType = dict[str, Any]
--- a/backend/onyx/connectors/google_drive/section_extraction.py
+++ b/backend/onyx/connectors/google_drive/section_extraction.py
@@ -0,0 +1,105 @@
+from typing import Any
+
+from pydantic import BaseModel
+
+from onyx.connectors.google_utils.resources import GoogleDocsService
+from onyx.connectors.models import Section
+
+
+class CurrentHeading(BaseModel):
+    """Heading under which section text is currently being accumulated."""
+
+    # Heading id of the heading paragraph; used to build the section link
+    id: str
+    # Text of the heading paragraph; prefixed to the section's text
+    text: str
+
+
+def _build_gdoc_section_link(doc_id: str, heading_id: str) -> str:
+    """Return a Google Docs edit URL for `doc_id` anchored at `heading_id`."""
+    # NOTE: the tab is hard-coded to t.0, so docs with multiple tabs are not
+    # supported atm, if we need that ask @Chris
+    doc_url = f"https://docs.google.com/document/d/{doc_id}/edit"
+    return f"{doc_url}?tab=t.0#heading={heading_id}"
+
+
+def _extract_id_from_heading(paragraph: dict[str, Any]) -> str:
+    """Return the `headingId` stored in a heading paragraph's style.
+
+    Raises:
+        KeyError: if the paragraph has no "paragraphStyle" or no "headingId".
+    """
+    paragraph_style = paragraph["paragraphStyle"]
+    return paragraph_style["headingId"]
+
+
+def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str:
+    """Concatenate the content of every text run in a paragraph element.
+
+    Elements without a "textRun" are ignored; a text run without "content"
+    contributes an empty string.
+    """
+    return "".join(
+        element["textRun"].get("content", "")
+        for element in paragraph.get("elements", [])
+        if "textRun" in element
+    )
+
+
+def _finalize_section(
+    doc_id: str, heading: CurrentHeading, lines: list[str]
+) -> Section:
+    """Build the Section for one heading and the paragraph texts below it."""
+    body = f"{heading.text}\n" + "\n".join(lines)
+    return Section(
+        text=body.strip(),
+        link=_build_gdoc_section_link(doc_id, heading.id),
+    )
+
+
+def get_document_sections(
+    docs_service: GoogleDocsService,
+    doc_id: str,
+) -> list[Section]:
+    """Split a Google Doc into one Section per heading that has content.
+
+    The document is fetched through `docs_service` and its body paragraphs
+    are walked in order. A paragraph whose named style starts with
+    "HEADING_" or "TITLE" opens a new section; the non-blank paragraphs that
+    follow become that section's text, prefixed by the heading text.
+
+    Paragraphs that appear before the first heading are not included, and a
+    heading with no non-blank paragraphs under it produces no Section.
+
+    Args:
+        docs_service: Google Docs API service used to fetch the document.
+        doc_id: Id of the document to fetch; also used in section links.
+
+    Returns:
+        Sections in document order, each linking to its heading.
+    """
+    # Fetch the document structure and pull out the body content
+    document = docs_service.documents().get(documentId=doc_id).execute()
+    body_elements = document.get("body", {}).get("content", [])
+
+    sections: list[Section] = []
+    pending_lines: list[str] = []
+    active_heading: CurrentHeading | None = None
+
+    for element in body_elements:
+        if "paragraph" not in element:
+            continue
+        paragraph = element["paragraph"]
+
+        paragraph_style = paragraph.get("paragraphStyle", {})
+        opens_new_section = (
+            "paragraphStyle" in paragraph
+            and "namedStyleType" in paragraph_style
+            and paragraph_style["namedStyleType"].startswith(("HEADING_", "TITLE"))
+        )
+
+        if opens_new_section:
+            # Close the section that was being built, if it has any content
+            if active_heading is not None and pending_lines:
+                sections.append(
+                    _finalize_section(doc_id, active_heading, pending_lines)
+                )
+                pending_lines = []
+
+            active_heading = CurrentHeading(
+                id=_extract_id_from_heading(paragraph),
+                text=_extract_text_from_paragraph(paragraph),
+            )
+            continue
+
+        # Text before the first heading belongs to no section
+        if active_heading is None:
+            continue
+
+        paragraph_text = _extract_text_from_paragraph(paragraph)
+        if paragraph_text.strip():
+            pending_lines.append(paragraph_text)
+
+    # Don't forget to add the last section
+    if active_heading is not None and pending_lines:
+        sections.append(_finalize_section(doc_id, active_heading, pending_lines))
+
+    return sections