From 4b0ff95b26fddee59a14e71d1a034c3bb4d5dead Mon Sep 17 00:00:00 2001 From: hagen-danswer Date: Fri, 14 Jun 2024 01:50:28 -0400 Subject: [PATCH] added pptx to drive reader (#1634) --- backend/danswer/connectors/google_drive/connector.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py index c90655246..939682ef9 100644 --- a/backend/danswer/connectors/google_drive/connector.py +++ b/backend/danswer/connectors/google_drive/connector.py @@ -42,6 +42,7 @@ from danswer.connectors.models import Document from danswer.connectors.models import Section from danswer.file_processing.extract_file_text import docx_to_text from danswer.file_processing.extract_file_text import pdf_to_text +from danswer.file_processing.extract_file_text import pptx_to_text from danswer.utils.batching import batch_generator from danswer.utils.logger import setup_logger @@ -57,6 +58,9 @@ class GDriveMimeType(str, Enum): SPREADSHEET = "application/vnd.google-apps.spreadsheet" PDF = "application/pdf" WORD_DOC = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + POWERPOINT = ( + "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ) GoogleDriveFileType = dict[str, Any] @@ -325,6 +329,9 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str: elif mime_type == GDriveMimeType.PDF.value: response = service.files().get_media(fileId=file["id"]).execute() return pdf_to_text(file=io.BytesIO(response)) + elif mime_type == GDriveMimeType.POWERPOINT.value: + response = service.files().get_media(fileId=file["id"]).execute() + return pptx_to_text(file=io.BytesIO(response)) return UNSUPPORTED_FILE_TYPE_CONTENT