From d729066194662f8cecdf5e17a23b08790fa663aa Mon Sep 17 00:00:00 2001
From: Matthew Holland <mlholland@ucsd.edu>
Date: Fri, 10 May 2024 19:06:13 -0700
Subject: [PATCH] Feature: Added File connector support for .docx, .pptx,
 .xlsx, .csv, .eml, and .epub file types (#1284)

---
 backend/danswer/connectors/file/connector.py | 73 +++++++++++++++++++-
 backend/danswer/connectors/file/utils.py     | 14 +++-
 web/src/app/admin/connectors/file/page.tsx   |  7 +-
 3 files changed, 89 insertions(+), 5 deletions(-)

diff --git a/backend/danswer/connectors/file/connector.py b/backend/danswer/connectors/file/connector.py
index 1564948567..37a9163821 100644
--- a/backend/danswer/connectors/file/connector.py
+++ b/backend/danswer/connectors/file/connector.py
@@ -1,11 +1,19 @@
+import csv  # type: ignore
+import io
 import os
+import zipfile
 from collections.abc import Iterator
 from datetime import datetime
 from datetime import timezone
+from email.parser import Parser as EmailParser
 from pathlib import Path
 from typing import Any
 from typing import IO
 
+import docx2txt  # type: ignore
+import openpyxl  # type: ignore
+import pptx  # type: ignore
+from bs4 import BeautifulSoup
 from sqlalchemy.orm import Session
 
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
@@ -46,7 +54,18 @@ def _read_files_and_metadata(
             file_content, ignore_dirs=True
         ):
             yield os.path.join(directory_path, file_info.filename), file, metadata
-    elif extension in [".txt", ".md", ".mdx", ".pdf"]:
+    elif extension in [
+        ".txt",
+        ".md",
+        ".mdx",
+        ".pdf",
+        ".docx",
+        ".pptx",
+        ".xlsx",
+        ".csv",
+        ".eml",
+        ".epub",
+    ]:
         yield file_name, file_content, metadata
     else:
         logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
@@ -69,6 +88,58 @@ def _process_file(
         file_content_raw = read_pdf_file(
             file=file, file_name=file_name, pdf_pass=pdf_pass
         )
+
+    elif extension == ".docx":
+        file_content_raw = docx2txt.process(file)
+
+    elif extension == ".pptx":
+        presentation = pptx.Presentation(file)
+        text_content = []
+        for slide_number, slide in enumerate(presentation.slides, start=1):
+            extracted_text = f"\nSlide {slide_number}:\n"
+            for shape in slide.shapes:
+                if hasattr(shape, "text"):
+                    extracted_text += shape.text + "\n"
+
+            text_content.append(extracted_text)
+        file_content_raw = "\n\n".join(text_content)
+
+    elif extension == ".xlsx":
+        workbook = openpyxl.load_workbook(file)
+        text_content = []
+        for sheet in workbook.worksheets:
+            sheet_string = "\n".join(
+                ",".join(map(str, row))
+                for row in sheet.iter_rows(min_row=1, values_only=True)
+            )
+            text_content.append(sheet_string)
+        file_content_raw = "\n\n".join(text_content)
+
+    elif extension == ".csv":
+        text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
+        reader = csv.reader(text_file)
+        file_content_raw = "\n".join([",".join(row) for row in reader])
+
+    elif extension == ".eml":
+        text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
+        parser = EmailParser()
+        message = parser.parse(text_file)
+
+        text_content = []
+        for part in message.walk():
+            if part.get_content_type().startswith("text/plain"):
+                text_content.append(part.get_payload())
+        file_content_raw = "\n\n".join(text_content)
+
+    elif extension == ".epub":
+        with zipfile.ZipFile(file) as epub:
+            text_content = []
+            for item in epub.infolist():
+                if item.filename.endswith(".xhtml") or item.filename.endswith(".html"):
+                    with epub.open(item) as html_file:
+                        soup = BeautifulSoup(html_file, "html.parser")
+                        text_content.append(soup.get_text())
+            file_content_raw = "\n\n".join(text_content)
     else:
         encoding = detect_encoding(file)
         file_content_raw, file_metadata = read_file(file, encoding=encoding)
diff --git a/backend/danswer/connectors/file/utils.py b/backend/danswer/connectors/file/utils.py
index cb1f26f638..e5f6d61a91 100644
--- a/backend/danswer/connectors/file/utils.py
+++ b/backend/danswer/connectors/file/utils.py
@@ -8,7 +8,19 @@ from typing import IO
 
 from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
 
-_VALID_FILE_EXTENSIONS = [".txt", ".zip", ".pdf", ".md", ".mdx"]
+_VALID_FILE_EXTENSIONS = [
+    ".txt",
+    ".zip",
+    ".pdf",
+    ".md",
+    ".mdx",
+    ".docx",
+    ".pptx",
+    ".xlsx",
+    ".csv",
+    ".eml",
+    ".epub",
+]
 
 
 def get_file_ext(file_path_or_name: str | Path) -> str:
diff --git a/web/src/app/admin/connectors/file/page.tsx b/web/src/app/admin/connectors/file/page.tsx
index b963b2d94d..a92ff8a16f 100644
--- a/web/src/app/admin/connectors/file/page.tsx
+++ b/web/src/app/admin/connectors/file/page.tsx
@@ -52,9 +52,10 @@ const Main = () => {
       {filesAreUploading && <Spinner />}
       <Text className="mb-2">
         Specify files below, click the <b>Upload</b> button, and the contents of
-        these files will be searchable via Danswer! Currently only <i>.txt</i>,{" "}
-        <i>.pdf</i> and <i>.zip</i> files (containing only <i>.txt</i> files)
-        are supported.
+        these files will be searchable via Danswer! Currently <i>.txt</i>,{" "}
+        <i>.pdf</i>, <i>.docx</i>, <i>.pptx</i>, <i>.xlsx</i>, <i>.csv</i>,{" "}
+        <i>.eml</i>, <i>.epub</i>, and <i>.zip</i> files (containing supported
+        file types) are supported.
       </Text>
       <Text className="mb-3">
         <b>NOTE:</b> if the original document is accessible via a link, you can