Feature: Added File connector support for .docx, .pptx, .xlsx, .csv, .eml, and .epub file types (#1284)

2025-03-29 11:12:02 +01:00 · 2024-05-10 19:06:13 -07:00 · 2024-05-10 19:06:13 -07:00 · d729066194
commit d729066194
parent c6b45a550f
3 changed files with 89 additions and 5 deletions
--- a/backend/danswer/connectors/file/connector.py
+++ b/backend/danswer/connectors/file/connector.py
@ -1,11 +1,19 @@
+import csv  # type: ignore
+import io
 import os
+import zipfile
 from collections.abc import Iterator
 from datetime import datetime
 from datetime import timezone
+from email.parser import Parser as EmailParser
 from pathlib import Path
 from typing import Any
 from typing import IO

+import docx2txt  # type: ignore
+import openpyxl  # type: ignore
+import pptx  # type: ignore
+from bs4 import BeautifulSoup
 from sqlalchemy.orm import Session

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
@ -46,7 +54,18 @@ def _read_files_and_metadata(
            file_content, ignore_dirs=True
        ):
            yield os.path.join(directory_path, file_info.filename), file, metadata
-    elif extension in [".txt", ".md", ".mdx", ".pdf"]:
+    elif extension in [
+        ".txt",
+        ".md",
+        ".mdx",
+        ".pdf",
+        ".docx",
+        ".pptx",
+        ".xlsx",
+        ".csv",
+        ".eml",
+        ".epub",
+    ]:
        yield file_name, file_content, metadata
    else:
        logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
@ -69,6 +88,58 @@ def _process_file(
        file_content_raw = read_pdf_file(
            file=file, file_name=file_name, pdf_pass=pdf_pass
        )
+
+    elif extension == ".docx":
+        file_content_raw = docx2txt.process(file)
+
+    elif extension == ".pptx":
+        presentation = pptx.Presentation(file)
+        text_content = []
+        for slide_number, slide in enumerate(presentation.slides, start=1):
+            extracted_text = f"\nSlide {slide_number}:\n"
+            for shape in slide.shapes:
+                if hasattr(shape, "text"):
+                    extracted_text += shape.text + "\n"
+
+            text_content.append(extracted_text)
+        file_content_raw = "\n\n".join(text_content)
+
+    elif extension == ".xlsx":
+        workbook = openpyxl.load_workbook(file)
+        text_content = []
+        for sheet in workbook.worksheets:
+            sheet_string = "\n".join(
+                ",".join(map(str, row))
+                for row in sheet.iter_rows(min_row=1, values_only=True)
+            )
+            text_content.append(sheet_string)
+        file_content_raw = "\n\n".join(text_content)
+
+    elif extension == ".csv":
+        text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
+        reader = csv.reader(text_file)
+        file_content_raw = "\n".join([",".join(row) for row in reader])
+
+    elif extension == ".eml":
+        text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
+        parser = EmailParser()
+        message = parser.parse(text_file)
+
+        text_content = []
+        for part in message.walk():
+            if part.get_content_type().startswith("text/plain"):
+                text_content.append(part.get_payload())
+        file_content_raw = "\n\n".join(text_content)
+
+    elif extension == ".epub":
+        with zipfile.ZipFile(file) as epub:
+            text_content = []
+            for item in epub.infolist():
+                if item.filename.endswith(".xhtml") or item.filename.endswith(".html"):
+                    with epub.open(item) as html_file:
+                        soup = BeautifulSoup(html_file, "html.parser")
+                        text_content.append(soup.get_text())
+            file_content_raw = "\n\n".join(text_content)
    else:
        encoding = detect_encoding(file)
        file_content_raw, file_metadata = read_file(file, encoding=encoding)
--- a/backend/danswer/connectors/file/utils.py
+++ b/backend/danswer/connectors/file/utils.py
@ -8,7 +8,19 @@ from typing import IO

 from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH

-_VALID_FILE_EXTENSIONS = [".txt", ".zip", ".pdf", ".md", ".mdx"]
+_VALID_FILE_EXTENSIONS = [
+    ".txt",
+    ".zip",
+    ".pdf",
+    ".md",
+    ".mdx",
+    ".docx",
+    ".pptx",
+    ".xlsx",
+    ".csv",
+    ".eml",
+    ".epub",
+]


 def get_file_ext(file_path_or_name: str | Path) -> str:
--- a/web/src/app/admin/connectors/file/page.tsx
+++ b/web/src/app/admin/connectors/file/page.tsx
@ -52,9 +52,10 @@ const Main = () => {
      {filesAreUploading && <Spinner />}
      <Text className="mb-2">
        Specify files below, click the <b>Upload</b> button, and the contents of
-        these files will be searchable via Danswer! Currently only <i>.txt</i>,{" "}
-        <i>.pdf</i> and <i>.zip</i> files (containing only <i>.txt</i> files)
-        are supported.
+        these files will be searchable via Danswer! Currently <i>.txt</i>,{" "}
+        <i>.pdf</i>, <i>.docx</i>, <i>.pptx</i>, <i>.xlsx</i>, <i>.csv</i>,{" "}
+        <i>.eml</i>, <i>.epub</i>, and <i>.zip</i> files (containing supported
+        file types) are supported.
      </Text>
      <Text className="mb-3">
        <b>NOTE:</b> if the original document is accessible via a link, you can