mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-03-29 11:12:02 +01:00
Feature: Added File connector support for .docx, .pptx, .xlsx, .csv, .eml, and .epub file types (#1284)
This commit is contained in:
parent
c6b45a550f
commit
d729066194
@ -1,11 +1,19 @@
|
||||
import csv # type: ignore
|
||||
import io
|
||||
import os
|
||||
import zipfile
|
||||
from collections.abc import Iterator
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from email.parser import Parser as EmailParser
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import IO
|
||||
|
||||
import docx2txt # type: ignore
|
||||
import openpyxl # type: ignore
|
||||
import pptx # type: ignore
|
||||
from bs4 import BeautifulSoup
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
@ -46,7 +54,18 @@ def _read_files_and_metadata(
|
||||
file_content, ignore_dirs=True
|
||||
):
|
||||
yield os.path.join(directory_path, file_info.filename), file, metadata
|
||||
elif extension in [".txt", ".md", ".mdx", ".pdf"]:
|
||||
elif extension in [
|
||||
".txt",
|
||||
".md",
|
||||
".mdx",
|
||||
".pdf",
|
||||
".docx",
|
||||
".pptx",
|
||||
".xlsx",
|
||||
".csv",
|
||||
".eml",
|
||||
".epub",
|
||||
]:
|
||||
yield file_name, file_content, metadata
|
||||
else:
|
||||
logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
|
||||
@ -69,6 +88,58 @@ def _process_file(
|
||||
file_content_raw = read_pdf_file(
|
||||
file=file, file_name=file_name, pdf_pass=pdf_pass
|
||||
)
|
||||
|
||||
elif extension == ".docx":
|
||||
file_content_raw = docx2txt.process(file)
|
||||
|
||||
elif extension == ".pptx":
|
||||
presentation = pptx.Presentation(file)
|
||||
text_content = []
|
||||
for slide_number, slide in enumerate(presentation.slides, start=1):
|
||||
extracted_text = f"\nSlide {slide_number}:\n"
|
||||
for shape in slide.shapes:
|
||||
if hasattr(shape, "text"):
|
||||
extracted_text += shape.text + "\n"
|
||||
|
||||
text_content.append(extracted_text)
|
||||
file_content_raw = "\n\n".join(text_content)
|
||||
|
||||
elif extension == ".xlsx":
|
||||
workbook = openpyxl.load_workbook(file)
|
||||
text_content = []
|
||||
for sheet in workbook.worksheets:
|
||||
sheet_string = "\n".join(
|
||||
",".join(map(str, row))
|
||||
for row in sheet.iter_rows(min_row=1, values_only=True)
|
||||
)
|
||||
text_content.append(sheet_string)
|
||||
file_content_raw = "\n\n".join(text_content)
|
||||
|
||||
elif extension == ".csv":
|
||||
text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
|
||||
reader = csv.reader(text_file)
|
||||
file_content_raw = "\n".join([",".join(row) for row in reader])
|
||||
|
||||
elif extension == ".eml":
|
||||
text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
|
||||
parser = EmailParser()
|
||||
message = parser.parse(text_file)
|
||||
|
||||
text_content = []
|
||||
for part in message.walk():
|
||||
if part.get_content_type().startswith("text/plain"):
|
||||
text_content.append(part.get_payload())
|
||||
file_content_raw = "\n\n".join(text_content)
|
||||
|
||||
elif extension == ".epub":
|
||||
with zipfile.ZipFile(file) as epub:
|
||||
text_content = []
|
||||
for item in epub.infolist():
|
||||
if item.filename.endswith(".xhtml") or item.filename.endswith(".html"):
|
||||
with epub.open(item) as html_file:
|
||||
soup = BeautifulSoup(html_file, "html.parser")
|
||||
text_content.append(soup.get_text())
|
||||
file_content_raw = "\n\n".join(text_content)
|
||||
else:
|
||||
encoding = detect_encoding(file)
|
||||
file_content_raw, file_metadata = read_file(file, encoding=encoding)
|
||||
|
@ -8,7 +8,19 @@ from typing import IO
|
||||
|
||||
from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
|
||||
|
||||
_VALID_FILE_EXTENSIONS = [".txt", ".zip", ".pdf", ".md", ".mdx"]
|
||||
_VALID_FILE_EXTENSIONS = [
|
||||
".txt",
|
||||
".zip",
|
||||
".pdf",
|
||||
".md",
|
||||
".mdx",
|
||||
".docx",
|
||||
".pptx",
|
||||
".xlsx",
|
||||
".csv",
|
||||
".eml",
|
||||
".epub",
|
||||
]
|
||||
|
||||
|
||||
def get_file_ext(file_path_or_name: str | Path) -> str:
|
||||
|
@ -52,9 +52,10 @@ const Main = () => {
|
||||
{filesAreUploading && <Spinner />}
|
||||
<Text className="mb-2">
|
||||
Specify files below, click the <b>Upload</b> button, and the contents of
|
||||
these files will be searchable via Danswer! Currently only <i>.txt</i>,{" "}
|
||||
<i>.pdf</i> and <i>.zip</i> files (containing only <i>.txt</i> files)
|
||||
are supported.
|
||||
these files will be searchable via Danswer! Currently <i>.txt</i>,{" "}
|
||||
<i>.pdf</i>, <i>.docx</i>, <i>.pptx</i>, <i>.xlsx</i>, <i>.csv</i>,{" "}
|
||||
<i>.eml</i>, <i>.epub</i>, and <i>.zip</i> files (containing supported
|
||||
file types) are supported.
|
||||
</Text>
|
||||
<Text className="mb-3">
|
||||
<b>NOTE:</b> if the original document is accessible via a link, you can
|
||||
|
Loading…
x
Reference in New Issue
Block a user