Content of .xlsl are not properly read during indexing. (#4035)

2025-09-18 19:43:26 +02:00 · 2025-02-26 00:10:47 -05:00
parent ce2b4de245
commit 0e42ae9024
2 changed files with 65 additions and 2 deletions
--- a/backend/onyx/connectors/google_drive/doc_conversion.py
+++ b/backend/onyx/connectors/google_drive/doc_conversion.py
@@ -1,7 +1,9 @@
 import io
 from datetime import datetime
 from datetime import timezone
 from tempfile import NamedTemporaryFile
 import openpyxl  # type: ignore
 from googleapiclient.discovery import build  # type: ignore
 from googleapiclient.errors import HttpError  # type: ignore
@@ -43,12 +45,15 @@ def _extract_sections_basic(
 ) -> list[Section]:
    mime_type = file["mimeType"]
    link = file["webViewLink"]
    supported_file_types = set(item.value for item in GDriveMimeType)
-    if mime_type not in set(item.value for item in GDriveMimeType):
+    if mime_type not in supported_file_types:
        # Unsupported file types can still have a title, finding this way is still useful
        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
    try:
        # ---------------------------
        # Google Sheets extraction
        if mime_type == GDriveMimeType.SPREADSHEET.value:
            try:
                sheets_service = build(
@@ -109,7 +114,53 @@ def _extract_sections_basic(
                    f"Ran into exception '{e}' when pulling data from Google Sheet '{file['name']}'."
                    " Falling back to basic extraction."
                )
        # ---------------------------
        # Microsoft Excel (.xlsx or .xls) extraction branch
        elif mime_type in [
            GDriveMimeType.SPREADSHEET_OPEN_FORMAT.value,
            GDriveMimeType.SPREADSHEET_MS_EXCEL.value,
        ]:
            try:
                response = service.files().get_media(fileId=file["id"]).execute()
                with NamedTemporaryFile(suffix=".xlsx", delete=True) as tmp:
                    tmp.write(response)
                    tmp_path = tmp.name
                    section_separator = "\n\n"
                    workbook = openpyxl.load_workbook(tmp_path, read_only=True)
                    # Work similarly to the xlsx_to_text function used for file connector
                    # but returns Sections instead of a string
                    sections = [
                        Section(
                            link=link,
                            text=(
                                f"Sheet: {sheet.title}\n\n"
                                + section_separator.join(
                                    ",".join(map(str, row))
                                    for row in sheet.iter_rows(
                                        min_row=1, values_only=True
                                    )
                                    if row
                                )
                            ),
                        )
                        for sheet in workbook.worksheets
                    ]
                return sections
            except Exception as e:
                logger.warning(
                    f"Error extracting data from Excel file '{file['name']}': {e}"
                )
                return [
                    Section(link=link, text="Error extracting data from Excel file")
                ]
        # ---------------------------
        # Export for Google Docs, PPT, and fallback for spreadsheets
        if mime_type in [
            GDriveMimeType.DOC.value,
            GDriveMimeType.PPT.value,
@@ -128,6 +179,8 @@ def _extract_sections_basic(
            )
            return [Section(link=link, text=text)]
        # ---------------------------
        # Plain text and Markdown files
        elif mime_type in [
            GDriveMimeType.PLAIN_TEXT.value,
            GDriveMimeType.MARKDOWN.value,
@@ -141,6 +194,8 @@ def _extract_sections_basic(
                    .decode("utf-8"),
                )
            ]
        # ---------------------------
        # Word, PowerPoint, PDF files
        if mime_type in [
            GDriveMimeType.WORD_DOC.value,
            GDriveMimeType.POWERPOINT.value,
@@ -170,7 +225,11 @@ def _extract_sections_basic(
                    Section(link=link, text=pptx_to_text(file=io.BytesIO(response)))
                ]
-        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
+        # Catch-all case, should not happen since there should be specific handling
        # for each of the supported file types
        error_message = f"Unsupported file type: {mime_type}"
        logger.error(error_message)
        raise ValueError(error_message)
    except Exception:
        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
--- a/backend/onyx/connectors/google_drive/models.py
+++ b/backend/onyx/connectors/google_drive/models.py
@@ -5,6 +5,10 @@ from typing import Any
 class GDriveMimeType(str, Enum):
    DOC = "application/vnd.google-apps.document"
    SPREADSHEET = "application/vnd.google-apps.spreadsheet"
    SPREADSHEET_OPEN_FORMAT = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )
    SPREADSHEET_MS_EXCEL = "application/vnd.ms-excel"
    PDF = "application/pdf"
    WORD_DOC = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    PPT = "application/vnd.google-apps.presentation"