Feature: Added File connector support for .docx, .pptx, .xlsx, .csv, .eml, and .epub file types (#1284)

This commit is contained in:
Matthew Holland 2024-05-10 19:06:13 -07:00 committed by GitHub
parent c6b45a550f
commit d729066194
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 89 additions and 5 deletions

View File

@ -1,11 +1,19 @@
import csv # type: ignore
import io
import os
import zipfile
from collections.abc import Iterator
from datetime import datetime
from datetime import timezone
from email.parser import Parser as EmailParser
from pathlib import Path
from typing import Any
from typing import IO
import docx2txt # type: ignore
import openpyxl # type: ignore
import pptx # type: ignore
from bs4 import BeautifulSoup
from sqlalchemy.orm import Session
from danswer.configs.app_configs import INDEX_BATCH_SIZE
@ -46,7 +54,18 @@ def _read_files_and_metadata(
file_content, ignore_dirs=True
):
yield os.path.join(directory_path, file_info.filename), file, metadata
elif extension in [".txt", ".md", ".mdx", ".pdf"]:
elif extension in [
".txt",
".md",
".mdx",
".pdf",
".docx",
".pptx",
".xlsx",
".csv",
".eml",
".epub",
]:
yield file_name, file_content, metadata
else:
logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
@ -69,6 +88,58 @@ def _process_file(
file_content_raw = read_pdf_file(
file=file, file_name=file_name, pdf_pass=pdf_pass
)
elif extension == ".docx":
file_content_raw = docx2txt.process(file)
elif extension == ".pptx":
presentation = pptx.Presentation(file)
text_content = []
for slide_number, slide in enumerate(presentation.slides, start=1):
extracted_text = f"\nSlide {slide_number}:\n"
for shape in slide.shapes:
if hasattr(shape, "text"):
extracted_text += shape.text + "\n"
text_content.append(extracted_text)
file_content_raw = "\n\n".join(text_content)
elif extension == ".xlsx":
workbook = openpyxl.load_workbook(file)
text_content = []
for sheet in workbook.worksheets:
sheet_string = "\n".join(
",".join(map(str, row))
for row in sheet.iter_rows(min_row=1, values_only=True)
)
text_content.append(sheet_string)
file_content_raw = "\n\n".join(text_content)
elif extension == ".csv":
text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
reader = csv.reader(text_file)
file_content_raw = "\n".join([",".join(row) for row in reader])
elif extension == ".eml":
text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
parser = EmailParser()
message = parser.parse(text_file)
text_content = []
for part in message.walk():
if part.get_content_type().startswith("text/plain"):
text_content.append(part.get_payload())
file_content_raw = "\n\n".join(text_content)
elif extension == ".epub":
with zipfile.ZipFile(file) as epub:
text_content = []
for item in epub.infolist():
if item.filename.endswith(".xhtml") or item.filename.endswith(".html"):
with epub.open(item) as html_file:
soup = BeautifulSoup(html_file, "html.parser")
text_content.append(soup.get_text())
file_content_raw = "\n\n".join(text_content)
else:
encoding = detect_encoding(file)
file_content_raw, file_metadata = read_file(file, encoding=encoding)

View File

@ -8,7 +8,19 @@ from typing import IO
from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
_VALID_FILE_EXTENSIONS = [".txt", ".zip", ".pdf", ".md", ".mdx"]
_VALID_FILE_EXTENSIONS = [
".txt",
".zip",
".pdf",
".md",
".mdx",
".docx",
".pptx",
".xlsx",
".csv",
".eml",
".epub",
]
def get_file_ext(file_path_or_name: str | Path) -> str:

View File

@ -52,9 +52,10 @@ const Main = () => {
{filesAreUploading && <Spinner />}
<Text className="mb-2">
Specify files below, click the <b>Upload</b> button, and the contents of
these files will be searchable via Danswer! Currently only <i>.txt</i>,{" "}
<i>.pdf</i> and <i>.zip</i> files (containing only <i>.txt</i> files)
are supported.
these files will be searchable via Danswer! Currently <i>.txt</i>,{" "}
<i>.pdf</i>, <i>.docx</i>, <i>.pptx</i>, <i>.xlsx</i>, <i>.csv</i>,{" "}
<i>.eml</i>, <i>.epub</i>, and <i>.zip</i> files (containing supported
file types) are supported.
</Text>
<Text className="mb-3">
<b>NOTE:</b> if the original document is accessible via a link, you can