From 0e42ae9024673291c520f889c8211ea2bf3cf1b4 Mon Sep 17 00:00:00 2001 From: Kaveen Jayamanna <63982009+ktjayamanna@users.noreply.github.com> Date: Wed, 26 Feb 2025 00:10:47 -0500 Subject: [PATCH] Content of .xlsl are not properly read during indexing. (#4035) --- .../connectors/google_drive/doc_conversion.py | 63 ++++++++++++++++++- .../onyx/connectors/google_drive/models.py | 4 ++ 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/backend/onyx/connectors/google_drive/doc_conversion.py b/backend/onyx/connectors/google_drive/doc_conversion.py index fc89654a43fd..a7cbb709df5b 100644 --- a/backend/onyx/connectors/google_drive/doc_conversion.py +++ b/backend/onyx/connectors/google_drive/doc_conversion.py @@ -1,7 +1,9 @@ import io from datetime import datetime from datetime import timezone +from tempfile import NamedTemporaryFile +import openpyxl # type: ignore from googleapiclient.discovery import build # type: ignore from googleapiclient.errors import HttpError # type: ignore @@ -43,12 +45,15 @@ def _extract_sections_basic( ) -> list[Section]: mime_type = file["mimeType"] link = file["webViewLink"] + supported_file_types = set(item.value for item in GDriveMimeType) - if mime_type not in set(item.value for item in GDriveMimeType): + if mime_type not in supported_file_types: # Unsupported file types can still have a title, finding this way is still useful return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)] try: + # --------------------------- + # Google Sheets extraction if mime_type == GDriveMimeType.SPREADSHEET.value: try: sheets_service = build( @@ -109,7 +114,53 @@ def _extract_sections_basic( f"Ran into exception '{e}' when pulling data from Google Sheet '{file['name']}'." " Falling back to basic extraction." ) + # --------------------------- + # Microsoft Excel (.xlsx or .xls) extraction branch + elif mime_type in [ + GDriveMimeType.SPREADSHEET_OPEN_FORMAT.value, + GDriveMimeType.SPREADSHEET_MS_EXCEL.value, + ]: + try: + response = service.files().get_media(fileId=file["id"]).execute() + with NamedTemporaryFile(suffix=".xlsx", delete=True) as tmp: + tmp.write(response) + tmp_path = tmp.name + + section_separator = "\n\n" + workbook = openpyxl.load_workbook(tmp_path, read_only=True) + + # Work similarly to the xlsx_to_text function used for file connector + # but returns Sections instead of a string + sections = [ + Section( + link=link, + text=( + f"Sheet: {sheet.title}\n\n" + + section_separator.join( + ",".join(map(str, row)) + for row in sheet.iter_rows( + min_row=1, values_only=True + ) + if row + ) + ), + ) + for sheet in workbook.worksheets + ] + + return sections + + except Exception as e: + logger.warning( + f"Error extracting data from Excel file '{file['name']}': {e}" + ) + return [ + Section(link=link, text="Error extracting data from Excel file") + ] + + # --------------------------- + # Export for Google Docs, PPT, and fallback for spreadsheets if mime_type in [ GDriveMimeType.DOC.value, GDriveMimeType.PPT.value, @@ -128,6 +179,8 @@ def _extract_sections_basic( ) return [Section(link=link, text=text)] + # --------------------------- + # Plain text and Markdown files elif mime_type in [ GDriveMimeType.PLAIN_TEXT.value, GDriveMimeType.MARKDOWN.value, @@ -141,6 +194,8 @@ def _extract_sections_basic( .decode("utf-8"), ) ] + # --------------------------- + # Word, PowerPoint, PDF files if mime_type in [ GDriveMimeType.WORD_DOC.value, GDriveMimeType.POWERPOINT.value, @@ -170,7 +225,11 @@ def _extract_sections_basic( Section(link=link, text=pptx_to_text(file=io.BytesIO(response))) ] - return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)] + # Catch-all case, should not happen since there should be specific handling + # for each of the supported file types + error_message = f"Unsupported file type: {mime_type}" + logger.error(error_message) + raise ValueError(error_message) except Exception: return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)] diff --git a/backend/onyx/connectors/google_drive/models.py b/backend/onyx/connectors/google_drive/models.py index 5bb06f3c2061..7cf32450a507 100644 --- a/backend/onyx/connectors/google_drive/models.py +++ b/backend/onyx/connectors/google_drive/models.py @@ -5,6 +5,10 @@ from typing import Any class GDriveMimeType(str, Enum): DOC = "application/vnd.google-apps.document" SPREADSHEET = "application/vnd.google-apps.spreadsheet" + SPREADSHEET_OPEN_FORMAT = ( + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + SPREADSHEET_MS_EXCEL = "application/vnd.ms-excel" PDF = "application/pdf" WORD_DOC = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" PPT = "application/vnd.google-apps.presentation"