mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-28 04:49:21 +02:00
Feature: Added File connector support for .docx, .pptx, .xlsx, .csv, .eml, and .epub file types (#1284)
This commit is contained in:
@@ -1,11 +1,19 @@
|
|||||||
|
import csv # type: ignore
|
||||||
|
import io
|
||||||
import os
|
import os
|
||||||
|
import zipfile
|
||||||
from collections.abc import Iterator
|
from collections.abc import Iterator
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from datetime import timezone
|
from datetime import timezone
|
||||||
|
from email.parser import Parser as EmailParser
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from typing import IO
|
from typing import IO
|
||||||
|
|
||||||
|
import docx2txt # type: ignore
|
||||||
|
import openpyxl # type: ignore
|
||||||
|
import pptx # type: ignore
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||||
@@ -46,7 +54,18 @@ def _read_files_and_metadata(
|
|||||||
file_content, ignore_dirs=True
|
file_content, ignore_dirs=True
|
||||||
):
|
):
|
||||||
yield os.path.join(directory_path, file_info.filename), file, metadata
|
yield os.path.join(directory_path, file_info.filename), file, metadata
|
||||||
elif extension in [".txt", ".md", ".mdx", ".pdf"]:
|
elif extension in [
|
||||||
|
".txt",
|
||||||
|
".md",
|
||||||
|
".mdx",
|
||||||
|
".pdf",
|
||||||
|
".docx",
|
||||||
|
".pptx",
|
||||||
|
".xlsx",
|
||||||
|
".csv",
|
||||||
|
".eml",
|
||||||
|
".epub",
|
||||||
|
]:
|
||||||
yield file_name, file_content, metadata
|
yield file_name, file_content, metadata
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
|
logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
|
||||||
@@ -69,6 +88,58 @@ def _process_file(
|
|||||||
file_content_raw = read_pdf_file(
|
file_content_raw = read_pdf_file(
|
||||||
file=file, file_name=file_name, pdf_pass=pdf_pass
|
file=file, file_name=file_name, pdf_pass=pdf_pass
|
||||||
)
|
)
|
||||||
|
|
||||||
|
elif extension == ".docx":
|
||||||
|
file_content_raw = docx2txt.process(file)
|
||||||
|
|
||||||
|
elif extension == ".pptx":
|
||||||
|
presentation = pptx.Presentation(file)
|
||||||
|
text_content = []
|
||||||
|
for slide_number, slide in enumerate(presentation.slides, start=1):
|
||||||
|
extracted_text = f"\nSlide {slide_number}:\n"
|
||||||
|
for shape in slide.shapes:
|
||||||
|
if hasattr(shape, "text"):
|
||||||
|
extracted_text += shape.text + "\n"
|
||||||
|
|
||||||
|
text_content.append(extracted_text)
|
||||||
|
file_content_raw = "\n\n".join(text_content)
|
||||||
|
|
||||||
|
elif extension == ".xlsx":
|
||||||
|
workbook = openpyxl.load_workbook(file)
|
||||||
|
text_content = []
|
||||||
|
for sheet in workbook.worksheets:
|
||||||
|
sheet_string = "\n".join(
|
||||||
|
",".join(map(str, row))
|
||||||
|
for row in sheet.iter_rows(min_row=1, values_only=True)
|
||||||
|
)
|
||||||
|
text_content.append(sheet_string)
|
||||||
|
file_content_raw = "\n\n".join(text_content)
|
||||||
|
|
||||||
|
elif extension == ".csv":
|
||||||
|
text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
|
||||||
|
reader = csv.reader(text_file)
|
||||||
|
file_content_raw = "\n".join([",".join(row) for row in reader])
|
||||||
|
|
||||||
|
elif extension == ".eml":
|
||||||
|
text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
|
||||||
|
parser = EmailParser()
|
||||||
|
message = parser.parse(text_file)
|
||||||
|
|
||||||
|
text_content = []
|
||||||
|
for part in message.walk():
|
||||||
|
if part.get_content_type().startswith("text/plain"):
|
||||||
|
text_content.append(part.get_payload())
|
||||||
|
file_content_raw = "\n\n".join(text_content)
|
||||||
|
|
||||||
|
elif extension == ".epub":
|
||||||
|
with zipfile.ZipFile(file) as epub:
|
||||||
|
text_content = []
|
||||||
|
for item in epub.infolist():
|
||||||
|
if item.filename.endswith(".xhtml") or item.filename.endswith(".html"):
|
||||||
|
with epub.open(item) as html_file:
|
||||||
|
soup = BeautifulSoup(html_file, "html.parser")
|
||||||
|
text_content.append(soup.get_text())
|
||||||
|
file_content_raw = "\n\n".join(text_content)
|
||||||
else:
|
else:
|
||||||
encoding = detect_encoding(file)
|
encoding = detect_encoding(file)
|
||||||
file_content_raw, file_metadata = read_file(file, encoding=encoding)
|
file_content_raw, file_metadata = read_file(file, encoding=encoding)
|
||||||
|
@@ -8,7 +8,19 @@ from typing import IO
|
|||||||
|
|
||||||
from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
|
from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
|
||||||
|
|
||||||
_VALID_FILE_EXTENSIONS = [".txt", ".zip", ".pdf", ".md", ".mdx"]
|
_VALID_FILE_EXTENSIONS = [
|
||||||
|
".txt",
|
||||||
|
".zip",
|
||||||
|
".pdf",
|
||||||
|
".md",
|
||||||
|
".mdx",
|
||||||
|
".docx",
|
||||||
|
".pptx",
|
||||||
|
".xlsx",
|
||||||
|
".csv",
|
||||||
|
".eml",
|
||||||
|
".epub",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def get_file_ext(file_path_or_name: str | Path) -> str:
|
def get_file_ext(file_path_or_name: str | Path) -> str:
|
||||||
|
@@ -52,9 +52,10 @@ const Main = () => {
|
|||||||
{filesAreUploading && <Spinner />}
|
{filesAreUploading && <Spinner />}
|
||||||
<Text className="mb-2">
|
<Text className="mb-2">
|
||||||
Specify files below, click the <b>Upload</b> button, and the contents of
|
Specify files below, click the <b>Upload</b> button, and the contents of
|
||||||
these files will be searchable via Danswer! Currently only <i>.txt</i>,{" "}
|
these files will be searchable via Danswer! Currently <i>.txt</i>,{" "}
|
||||||
<i>.pdf</i> and <i>.zip</i> files (containing only <i>.txt</i> files)
|
<i>.pdf</i>, <i>.docx</i>, <i>.pptx</i>, <i>.xlsx</i>, <i>.csv</i>,{" "}
|
||||||
are supported.
|
<i>.eml</i>, <i>.epub</i>, and <i>.zip</i> files (containing supported
|
||||||
|
file types) are supported.
|
||||||
</Text>
|
</Text>
|
||||||
<Text className="mb-3">
|
<Text className="mb-3">
|
||||||
<b>NOTE:</b> if the original document is accessible via a link, you can
|
<b>NOTE:</b> if the original document is accessible via a link, you can
|
||||||
|
Reference in New Issue
Block a user