Added PPTX support to the Google Drive reader (#1634)

This commit is contained in:
hagen-danswer 2024-06-14 01:50:28 -04:00 committed by GitHub
parent ff06d62acf
commit 4b0ff95b26
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -42,6 +42,7 @@ from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.extract_file_text import docx_to_text
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.file_processing.extract_file_text import pptx_to_text
from danswer.utils.batching import batch_generator
from danswer.utils.logger import setup_logger
@ -57,6 +58,9 @@ class GDriveMimeType(str, Enum):
SPREADSHEET = "application/vnd.google-apps.spreadsheet"
PDF = "application/pdf"
WORD_DOC = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
POWERPOINT = (
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
)
GoogleDriveFileType = dict[str, Any]
@ -325,6 +329,9 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
elif mime_type == GDriveMimeType.PDF.value:
response = service.files().get_media(fileId=file["id"]).execute()
return pdf_to_text(file=io.BytesIO(response))
elif mime_type == GDriveMimeType.POWERPOINT.value:
response = service.files().get_media(fileId=file["id"]).execute()
return pptx_to_text(file=io.BytesIO(response))
return UNSUPPORTED_FILE_TYPE_CONTENT