From d729066194662f8cecdf5e17a23b08790fa663aa Mon Sep 17 00:00:00 2001 From: Matthew Holland Date: Fri, 10 May 2024 19:06:13 -0700 Subject: [PATCH] Feature: Added File connector support for .docx, .pptx, .xlsx, .csv, .eml, and .epub file types (#1284) --- backend/danswer/connectors/file/connector.py | 73 +++++++++++++++++++- backend/danswer/connectors/file/utils.py | 14 +++- web/src/app/admin/connectors/file/page.tsx | 7 +- 3 files changed, 89 insertions(+), 5 deletions(-) diff --git a/backend/danswer/connectors/file/connector.py b/backend/danswer/connectors/file/connector.py index 156494856..37a916382 100644 --- a/backend/danswer/connectors/file/connector.py +++ b/backend/danswer/connectors/file/connector.py @@ -1,11 +1,19 @@ +import csv # type: ignore +import io import os +import zipfile from collections.abc import Iterator from datetime import datetime from datetime import timezone +from email.parser import Parser as EmailParser from pathlib import Path from typing import Any from typing import IO +import docx2txt # type: ignore +import openpyxl # type: ignore +import pptx # type: ignore +from bs4 import BeautifulSoup from sqlalchemy.orm import Session from danswer.configs.app_configs import INDEX_BATCH_SIZE @@ -46,7 +54,18 @@ def _read_files_and_metadata( file_content, ignore_dirs=True ): yield os.path.join(directory_path, file_info.filename), file, metadata - elif extension in [".txt", ".md", ".mdx", ".pdf"]: + elif extension in [ + ".txt", + ".md", + ".mdx", + ".pdf", + ".docx", + ".pptx", + ".xlsx", + ".csv", + ".eml", + ".epub", + ]: yield file_name, file_content, metadata else: logger.warning(f"Skipping file '{file_name}' with extension '{extension}'") @@ -69,6 +88,58 @@ def _process_file( file_content_raw = read_pdf_file( file=file, file_name=file_name, pdf_pass=pdf_pass ) + + elif extension == ".docx": + file_content_raw = docx2txt.process(file) + + elif extension == ".pptx": + presentation = pptx.Presentation(file) + text_content = [] + for 
slide_number, slide in enumerate(presentation.slides, start=1): + extracted_text = f"\nSlide {slide_number}:\n" + for shape in slide.shapes: + if hasattr(shape, "text"): + extracted_text += shape.text + "\n" + + text_content.append(extracted_text) + file_content_raw = "\n\n".join(text_content) + + elif extension == ".xlsx": + workbook = openpyxl.load_workbook(file) + text_content = [] + for sheet in workbook.worksheets: + sheet_string = "\n".join( + ",".join(map(str, row)) + for row in sheet.iter_rows(min_row=1, values_only=True) + ) + text_content.append(sheet_string) + file_content_raw = "\n\n".join(text_content) + + elif extension == ".csv": + text_file = io.TextIOWrapper(file, encoding=detect_encoding(file)) + reader = csv.reader(text_file) + file_content_raw = "\n".join([",".join(row) for row in reader]) + + elif extension == ".eml": + text_file = io.TextIOWrapper(file, encoding=detect_encoding(file)) + parser = EmailParser() + message = parser.parse(text_file) + + text_content = [] + for part in message.walk(): + if part.get_content_type().startswith("text/plain"): + text_content.append(part.get_payload()) + file_content_raw = "\n\n".join(text_content) + + elif extension == ".epub": + with zipfile.ZipFile(file) as epub: + text_content = [] + for item in epub.infolist(): + if item.filename.endswith(".xhtml") or item.filename.endswith(".html"): + with epub.open(item) as html_file: + soup = BeautifulSoup(html_file, "html.parser") + text_content.append(soup.get_text()) + file_content_raw = "\n\n".join(text_content) else: encoding = detect_encoding(file) file_content_raw, file_metadata = read_file(file, encoding=encoding) diff --git a/backend/danswer/connectors/file/utils.py b/backend/danswer/connectors/file/utils.py index cb1f26f63..e5f6d61a9 100644 --- a/backend/danswer/connectors/file/utils.py +++ b/backend/danswer/connectors/file/utils.py @@ -8,7 +8,19 @@ from typing import IO from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH 
-_VALID_FILE_EXTENSIONS = [".txt", ".zip", ".pdf", ".md", ".mdx"] +_VALID_FILE_EXTENSIONS = [ + ".txt", + ".zip", + ".pdf", + ".md", + ".mdx", + ".docx", + ".pptx", + ".xlsx", + ".csv", + ".eml", + ".epub", +] def get_file_ext(file_path_or_name: str | Path) -> str: diff --git a/web/src/app/admin/connectors/file/page.tsx b/web/src/app/admin/connectors/file/page.tsx index b963b2d94..a92ff8a16 100644 --- a/web/src/app/admin/connectors/file/page.tsx +++ b/web/src/app/admin/connectors/file/page.tsx @@ -52,9 +52,10 @@ const Main = () => { {filesAreUploading && } Specify files below, click the Upload button, and the contents of - these files will be searchable via Danswer! Currently only .txt,{" "} - .pdf and .zip files (containing only .txt files) - are supported. + these files will be searchable via Danswer! Currently .txt,{" "} + .pdf, .docx, .pptx, .xlsx, .csv,{" "} + .eml, .epub, and .zip files (containing supported + file types) are supported. NOTE: if the original document is accessible via a link, you can