Feature: Added File connector support for .docx, .pptx, .xlsx, .csv, .eml, and .epub file types (#1284)

2025-09-28 04:49:21 +02:00 · 2024-05-10 19:06:13 -07:00
parent c6b45a550f
commit d729066194
3 changed files with 89 additions and 5 deletions
--- a/backend/danswer/connectors/file/connector.py
+++ b/backend/danswer/connectors/file/connector.py
@@ -1,11 +1,19 @@
 import csv  # type: ignore
 import io
 import os
 import zipfile
 from collections.abc import Iterator
 from datetime import datetime
 from datetime import timezone
 from email.parser import Parser as EmailParser
 from pathlib import Path
 from typing import Any
 from typing import IO
 import docx2txt  # type: ignore
 import openpyxl  # type: ignore
 import pptx  # type: ignore
 from bs4 import BeautifulSoup
 from sqlalchemy.orm import Session
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
@@ -46,7 +54,18 @@ def _read_files_and_metadata(
            file_content, ignore_dirs=True
        ):
            yield os.path.join(directory_path, file_info.filename), file, metadata
-    elif extension in [".txt", ".md", ".mdx", ".pdf"]:
+    elif extension in [
        ".txt",
        ".md",
        ".mdx",
        ".pdf",
        ".docx",
        ".pptx",
        ".xlsx",
        ".csv",
        ".eml",
        ".epub",
    ]:
        yield file_name, file_content, metadata
    else:
        logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
@@ -69,6 +88,58 @@ def _process_file(
        file_content_raw = read_pdf_file(
            file=file, file_name=file_name, pdf_pass=pdf_pass
        )
    elif extension == ".docx":
        file_content_raw = docx2txt.process(file)
    elif extension == ".pptx":
        presentation = pptx.Presentation(file)
        text_content = []
        for slide_number, slide in enumerate(presentation.slides, start=1):
            extracted_text = f"\nSlide {slide_number}:\n"
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    extracted_text += shape.text + "\n"
            text_content.append(extracted_text)
        file_content_raw = "\n\n".join(text_content)
    elif extension == ".xlsx":
        workbook = openpyxl.load_workbook(file)
        text_content = []
        for sheet in workbook.worksheets:
            sheet_string = "\n".join(
                ",".join(map(str, row))
                for row in sheet.iter_rows(min_row=1, values_only=True)
            )
            text_content.append(sheet_string)
        file_content_raw = "\n\n".join(text_content)
    elif extension == ".csv":
        text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
        reader = csv.reader(text_file)
        file_content_raw = "\n".join([",".join(row) for row in reader])
    elif extension == ".eml":
        text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
        parser = EmailParser()
        message = parser.parse(text_file)
        text_content = []
        for part in message.walk():
            if part.get_content_type().startswith("text/plain"):
                text_content.append(part.get_payload())
        file_content_raw = "\n\n".join(text_content)
    elif extension == ".epub":
        with zipfile.ZipFile(file) as epub:
            text_content = []
            for item in epub.infolist():
                if item.filename.endswith(".xhtml") or item.filename.endswith(".html"):
                    with epub.open(item) as html_file:
                        soup = BeautifulSoup(html_file, "html.parser")
                        text_content.append(soup.get_text())
            file_content_raw = "\n\n".join(text_content)
    else:
        encoding = detect_encoding(file)
        file_content_raw, file_metadata = read_file(file, encoding=encoding)
--- a/backend/danswer/connectors/file/utils.py
+++ b/backend/danswer/connectors/file/utils.py
@@ -8,7 +8,19 @@ from typing import IO
 from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
-_VALID_FILE_EXTENSIONS = [".txt", ".zip", ".pdf", ".md", ".mdx"]
+_VALID_FILE_EXTENSIONS = [
    ".txt",
    ".zip",
    ".pdf",
    ".md",
    ".mdx",
    ".docx",
    ".pptx",
    ".xlsx",
    ".csv",
    ".eml",
    ".epub",
 ]
 def get_file_ext(file_path_or_name: str | Path) -> str:
--- a/web/src/app/admin/connectors/file/page.tsx
+++ b/web/src/app/admin/connectors/file/page.tsx
@@ -52,9 +52,10 @@ const Main = () => {
      {filesAreUploading && <Spinner />}
      <Text className="mb-2">
        Specify files below, click the <b>Upload</b> button, and the contents of
-        these files will be searchable via Danswer! Currently only <i>.txt</i>,{" "}
+        these files will be searchable via Danswer! Currently <i>.txt</i>,{" "}
-        <i>.pdf</i> and <i>.zip</i> files (containing only <i>.txt</i> files)
+        <i>.pdf</i>, <i>.docx</i>, <i>.pptx</i>, <i>.xlsx</i>, <i>.csv</i>,{" "}
-        are supported.
+        <i>.eml</i>, <i>.epub</i>, and <i>.zip</i> files (containing supported
        file types) are supported.
      </Text>
      <Text className="mb-3">
        <b>NOTE:</b> if the original document is accessible via a link, you can