Contents of .xlsx files are not properly read during indexing. (#4035)

This commit is contained in:
Kaveen Jayamanna
2025-02-26 00:10:47 -05:00
committed by GitHub
parent ce2b4de245
commit 0e42ae9024
2 changed files with 65 additions and 2 deletions

View File

@@ -1,7 +1,9 @@
import io import io
from datetime import datetime from datetime import datetime
from datetime import timezone from datetime import timezone
from tempfile import NamedTemporaryFile
import openpyxl # type: ignore
from googleapiclient.discovery import build # type: ignore from googleapiclient.discovery import build # type: ignore
from googleapiclient.errors import HttpError # type: ignore from googleapiclient.errors import HttpError # type: ignore
@@ -43,12 +45,15 @@ def _extract_sections_basic(
) -> list[Section]: ) -> list[Section]:
mime_type = file["mimeType"] mime_type = file["mimeType"]
link = file["webViewLink"] link = file["webViewLink"]
supported_file_types = set(item.value for item in GDriveMimeType)
if mime_type not in set(item.value for item in GDriveMimeType): if mime_type not in supported_file_types:
# Unsupported file types can still have a title, finding this way is still useful # Unsupported file types can still have a title, finding this way is still useful
return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)] return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
try: try:
# ---------------------------
# Google Sheets extraction
if mime_type == GDriveMimeType.SPREADSHEET.value: if mime_type == GDriveMimeType.SPREADSHEET.value:
try: try:
sheets_service = build( sheets_service = build(
@@ -109,7 +114,53 @@ def _extract_sections_basic(
f"Ran into exception '{e}' when pulling data from Google Sheet '{file['name']}'." f"Ran into exception '{e}' when pulling data from Google Sheet '{file['name']}'."
" Falling back to basic extraction." " Falling back to basic extraction."
) )
# ---------------------------
# Microsoft Excel (.xlsx or .xls) extraction branch
elif mime_type in [
GDriveMimeType.SPREADSHEET_OPEN_FORMAT.value,
GDriveMimeType.SPREADSHEET_MS_EXCEL.value,
]:
try:
response = service.files().get_media(fileId=file["id"]).execute()
with NamedTemporaryFile(suffix=".xlsx", delete=True) as tmp:
tmp.write(response)
tmp_path = tmp.name
section_separator = "\n\n"
workbook = openpyxl.load_workbook(tmp_path, read_only=True)
# Works similarly to the xlsx_to_text function used for the file connector,
# but returns Sections instead of a string
sections = [
Section(
link=link,
text=(
f"Sheet: {sheet.title}\n\n"
+ section_separator.join(
",".join(map(str, row))
for row in sheet.iter_rows(
min_row=1, values_only=True
)
if row
)
),
)
for sheet in workbook.worksheets
]
return sections
except Exception as e:
logger.warning(
f"Error extracting data from Excel file '{file['name']}': {e}"
)
return [
Section(link=link, text="Error extracting data from Excel file")
]
# ---------------------------
# Export for Google Docs, PPT, and fallback for spreadsheets
if mime_type in [ if mime_type in [
GDriveMimeType.DOC.value, GDriveMimeType.DOC.value,
GDriveMimeType.PPT.value, GDriveMimeType.PPT.value,
@@ -128,6 +179,8 @@ def _extract_sections_basic(
) )
return [Section(link=link, text=text)] return [Section(link=link, text=text)]
# ---------------------------
# Plain text and Markdown files
elif mime_type in [ elif mime_type in [
GDriveMimeType.PLAIN_TEXT.value, GDriveMimeType.PLAIN_TEXT.value,
GDriveMimeType.MARKDOWN.value, GDriveMimeType.MARKDOWN.value,
@@ -141,6 +194,8 @@ def _extract_sections_basic(
.decode("utf-8"), .decode("utf-8"),
) )
] ]
# ---------------------------
# Word, PowerPoint, PDF files
if mime_type in [ if mime_type in [
GDriveMimeType.WORD_DOC.value, GDriveMimeType.WORD_DOC.value,
GDriveMimeType.POWERPOINT.value, GDriveMimeType.POWERPOINT.value,
@@ -170,7 +225,11 @@ def _extract_sections_basic(
Section(link=link, text=pptx_to_text(file=io.BytesIO(response))) Section(link=link, text=pptx_to_text(file=io.BytesIO(response)))
] ]
return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)] # Catch-all case, should not happen since there should be specific handling
# for each of the supported file types
error_message = f"Unsupported file type: {mime_type}"
logger.error(error_message)
raise ValueError(error_message)
except Exception: except Exception:
return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)] return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]

View File

@@ -5,6 +5,10 @@ from typing import Any
class GDriveMimeType(str, Enum): class GDriveMimeType(str, Enum):
DOC = "application/vnd.google-apps.document" DOC = "application/vnd.google-apps.document"
SPREADSHEET = "application/vnd.google-apps.spreadsheet" SPREADSHEET = "application/vnd.google-apps.spreadsheet"
SPREADSHEET_OPEN_FORMAT = (
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
SPREADSHEET_MS_EXCEL = "application/vnd.ms-excel"
PDF = "application/pdf" PDF = "application/pdf"
WORD_DOC = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" WORD_DOC = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
PPT = "application/vnd.google-apps.presentation" PPT = "application/vnd.google-apps.presentation"