import io
import json
import os
import re
import zipfile
from collections.abc import Callable
from collections.abc import Iterator
from email.parser import Parser as EmailParser
from io import BytesIO
from pathlib import Path
from typing import Any
from typing import IO

import chardet
import docx  # type: ignore
import openpyxl  # type: ignore
import pptx  # type: ignore
from docx import Document
from fastapi import UploadFile
from pypdf import PdfReader
from pypdf.errors import PdfStreamError

from onyx.configs.constants import DANSWER_METADATA_FILENAME
from onyx.configs.constants import FileOrigin
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.file_processing.unstructured import get_unstructured_api_key
from onyx.file_processing.unstructured import unstructured_to_text
from onyx.file_store.file_store import FileStore
from onyx.utils.logger import setup_logger

logger = setup_logger()


TEXT_SECTION_SEPARATOR = "\n\n"


PLAIN_TEXT_FILE_EXTENSIONS = [
    ".txt",
    ".md",
    ".mdx",
    ".conf",
    ".log",
    ".json",
    ".csv",
    ".tsv",
    ".xml",
    ".yml",
    ".yaml",
]


VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
    ".pdf",
    ".docx",
    ".pptx",
    ".xlsx",
    ".eml",
    ".epub",
    ".html",
]


def is_text_file_extension(file_name: str) -> bool:
    return any(file_name.endswith(ext) for ext in PLAIN_TEXT_FILE_EXTENSIONS)


def get_file_ext(file_path_or_name: str | Path) -> str:
    _, extension = os.path.splitext(file_path_or_name)
    # standardize all extensions to be lowercase so that checks against
    # VALID_FILE_EXTENSIONS and similar will work as intended
    return extension.lower()


def is_valid_file_ext(ext: str) -> bool:
    return ext in VALID_FILE_EXTENSIONS


def is_text_file(file: IO[bytes]) -> bool:
    """
    Checks whether the first 1024 bytes contain only printable or whitespace
    characters; if so, we treat the file as plain text.
    """
    raw_data = file.read(1024)
    file.seek(0)  # reset so downstream readers see the full file
    text_chars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F})
    return all(c in text_chars for c in raw_data)


def detect_encoding(file: IO[bytes]) -> str:
    raw_data = file.read(50000)
    encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
    file.seek(0)
    return encoding


def is_macos_resource_fork_file(file_name: str) -> bool:
    return os.path.basename(file_name).startswith("._") and file_name.startswith(
        "__MACOSX"
    )


# To include additional metadata in the search index, add a .onyx_metadata.json file
# to the zip file. This file should contain a list of objects with the following format:
# [{ "filename": "file1.txt", "link": "https://example.com/file1.txt" }]
def load_files_from_zip(
    zip_file_io: IO,
    ignore_macos_resource_fork_files: bool = True,
    ignore_dirs: bool = True,
) -> Iterator[tuple[zipfile.ZipInfo, IO[Any], dict[str, Any]]]:
    with zipfile.ZipFile(zip_file_io, "r") as zip_file:
        zip_metadata = {}
        try:
            metadata_file_info = zip_file.getinfo(DANSWER_METADATA_FILENAME)
            with zip_file.open(metadata_file_info, "r") as metadata_file:
                try:
                    zip_metadata = json.load(metadata_file)
                    if isinstance(zip_metadata, list):
                        # convert list of dicts to dict of dicts
                        zip_metadata = {d["filename"]: d for d in zip_metadata}
                except json.JSONDecodeError:
                    logger.warning(f"Unable to load {DANSWER_METADATA_FILENAME}")
        except KeyError:
            logger.info(f"No {DANSWER_METADATA_FILENAME} file")

        for file_info in zip_file.infolist():
            if ignore_dirs and file_info.is_dir():
                continue

            if (
                ignore_macos_resource_fork_files
                and is_macos_resource_fork_file(file_info.filename)
            ) or file_info.filename == DANSWER_METADATA_FILENAME:
                continue

            # only open files that will actually be yielded
            with zip_file.open(file_info.filename, "r") as file:
                yield file_info, file, zip_metadata.get(file_info.filename, {})
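

# Example usage (illustrative; the archive path and metadata key are hypothetical):
#
#     with open("docs.zip", "rb") as f:
#         for file_info, file_io, metadata in load_files_from_zip(f):
#             print(file_info.filename, metadata.get("link"))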


def _extract_onyx_metadata(line: str) -> dict | None:
    html_comment_pattern = r"<!--\s*DANSWER_METADATA=\{(.*?)\}\s*-->"
    hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}"

    html_comment_match = re.search(html_comment_pattern, line)
    hashtag_match = re.search(hashtag_pattern, line)

    if html_comment_match:
        json_str = html_comment_match.group(1)
    elif hashtag_match:
        json_str = hashtag_match.group(1)
    else:
        return None

    try:
        return json.loads("{" + json_str + "}")
    except json.JSONDecodeError:
        return None
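

# Both metadata forms above are recognized on the first line of a file
# (see read_text_file below); the values here are illustrative:
#     <!-- DANSWER_METADATA={"link": "https://example.com/doc"} -->
#     #DANSWER_METADATA={"link": "https://example.com/doc"}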


def read_text_file(
    file: IO,
    encoding: str = "utf-8",
    errors: str = "replace",
    ignore_onyx_metadata: bool = True,
) -> tuple[str, dict]:
    metadata = {}
    file_content_raw = ""
    for ind, line in enumerate(file):
        try:
            line = line.decode(encoding) if isinstance(line, bytes) else line
        except UnicodeDecodeError:
            line = (
                line.decode(encoding, errors=errors)
                if isinstance(line, bytes)
                else line
            )

        if ind == 0:
            metadata_or_none = (
                None if ignore_onyx_metadata else _extract_onyx_metadata(line)
            )
            if metadata_or_none is not None:
                metadata = metadata_or_none
            else:
                file_content_raw += line
        else:
            file_content_raw += line

    return file_content_raw, metadata
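

# Example usage (illustrative; the file name is hypothetical):
#
#     with open("notes.md", "rb") as f:
#         content, metadata = read_text_file(f, ignore_onyx_metadata=False)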


def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
    """Extract text from a PDF file."""
    # Return only the extracted text from read_pdf_file
    text, _ = read_pdf_file(file, pdf_pass)
    return text


def read_pdf_file(
    file: IO[Any],
    pdf_pass: str | None = None,
) -> tuple[str, dict]:
    metadata: dict[str, Any] = {}
    try:
        pdf_reader = PdfReader(file)

        # If the PDF is encrypted, try to decrypt it with the provided password
        if pdf_reader.is_encrypted:
            decrypt_success = False
            if pdf_pass is not None:
                try:
                    # decrypt() returns a non-zero value on success
                    decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
                except Exception:
                    logger.error("Unable to decrypt pdf")
            else:
                logger.warning("No password available to decrypt pdf, returning empty")

            if not decrypt_success:
                # By user request, keep files that are unreadable just so they
                # can be discoverable by title.
                return "", metadata

        # Extract metadata from the PDF, removing leading '/' from keys if present
        # This standardizes the metadata keys for consistency
        if pdf_reader.metadata is not None:
            for key, value in pdf_reader.metadata.items():
                clean_key = key.lstrip("/")
                if isinstance(value, str) and value.strip():
                    metadata[clean_key] = value
                elif isinstance(value, list) and all(
                    isinstance(item, str) for item in value
                ):
                    metadata[clean_key] = ", ".join(value)

        return (
            TEXT_SECTION_SEPARATOR.join(
                page.extract_text() for page in pdf_reader.pages
            ),
            metadata,
        )
    except PdfStreamError:
        logger.exception("PDF file is not a valid PDF")
    except Exception:
        logger.exception("Failed to read PDF")

    # File is still discoverable by title
    # but the contents are not included as they cannot be parsed
    return "", metadata


def docx_to_text(file: IO[Any]) -> str:
    def is_simple_table(table: docx.table.Table) -> bool:
        for row in table.rows:
            # No omitted cells
            if row.grid_cols_before > 0 or row.grid_cols_after > 0:
                return False

            # No nested tables
            if any(cell.tables for cell in row.cells):
                return False

        return True

    def extract_cell_text(cell: docx.table._Cell) -> str:
        cell_paragraphs = [para.text.strip() for para in cell.paragraphs]
        return " ".join(p for p in cell_paragraphs if p) or "N/A"

    paragraphs = []
    doc = docx.Document(file)
    for item in doc.iter_inner_content():
        if isinstance(item, docx.text.paragraph.Paragraph):
            paragraphs.append(item.text)

        elif isinstance(item, docx.table.Table):
            if not item.rows or not is_simple_table(item):
                continue

            # Every row is a new line, joined with a single newline
            table_content = "\n".join(
                [
                    ",\t".join(extract_cell_text(cell) for cell in row.cells)
                    for row in item.rows
                ]
            )
            paragraphs.append(table_content)

    # Docx already has good spacing between paragraphs
    return "\n".join(paragraphs)


def pptx_to_text(file: IO[Any]) -> str:
    presentation = pptx.Presentation(file)
    text_content = []
    for slide_number, slide in enumerate(presentation.slides, start=1):
        extracted_text = f"\nSlide {slide_number}:\n"
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                extracted_text += shape.text + "\n"
        text_content.append(extracted_text)
    return TEXT_SECTION_SEPARATOR.join(text_content)


def xlsx_to_text(file: IO[Any]) -> str:
    workbook = openpyxl.load_workbook(file, read_only=True)
    text_content = []
    for sheet in workbook.worksheets:
        sheet_string = "\n".join(
            ",".join(map(str, row))
            for row in sheet.iter_rows(min_row=1, values_only=True)
        )
        text_content.append(sheet_string)
    return TEXT_SECTION_SEPARATOR.join(text_content)


def eml_to_text(file: IO[Any]) -> str:
    text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
    parser = EmailParser()
    message = parser.parse(text_file)
    text_content = []
    for part in message.walk():
        if part.get_content_type().startswith("text/plain"):
            text_content.append(part.get_payload())
    return TEXT_SECTION_SEPARATOR.join(text_content)


def epub_to_text(file: IO[Any]) -> str:
    with zipfile.ZipFile(file) as epub:
        text_content = []
        for item in epub.infolist():
            if item.filename.endswith(".xhtml") or item.filename.endswith(".html"):
                with epub.open(item) as html_file:
                    text_content.append(parse_html_page_basic(html_file))
        return TEXT_SECTION_SEPARATOR.join(text_content)


def file_io_to_text(file: IO[Any]) -> str:
    encoding = detect_encoding(file)
    file_content_raw, _ = read_text_file(file, encoding=encoding)
    return file_content_raw


def extract_file_text(
    file: IO[Any],
    file_name: str,
    break_on_unprocessable: bool = True,
    extension: str | None = None,
) -> str:
    extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
        ".pdf": pdf_to_text,
        ".docx": docx_to_text,
        ".pptx": pptx_to_text,
        ".xlsx": xlsx_to_text,
        ".eml": eml_to_text,
        ".epub": epub_to_text,
        ".html": parse_html_page_basic,
    }

    try:
        if get_unstructured_api_key():
            try:
                return unstructured_to_text(file, file_name)
            except Exception as unstructured_error:
                logger.error(
                    f"Failed to process with Unstructured: {str(unstructured_error)}. "
                    "Falling back to normal processing."
                )
                # Fall through to normal processing

        if file_name or extension:
            final_extension = (
                extension if extension is not None else get_file_ext(file_name)
            )

            if is_valid_file_ext(final_extension):
                return extension_to_function.get(final_extension, file_io_to_text)(file)

        # Either the file somehow has no name or the extension is not one that we recognize
        if is_text_file(file):
            return file_io_to_text(file)

        raise ValueError("Unknown file extension and unknown text encoding")

    except Exception as e:
        if break_on_unprocessable:
            raise RuntimeError(
                f"Failed to process file {file_name or 'Unknown'}: {str(e)}"
            ) from e
        logger.warning(f"Failed to process file {file_name or 'Unknown'}: {str(e)}")
        return ""


def convert_docx_to_txt(
    file: UploadFile, file_store: FileStore, file_path: str
) -> None:
    file.file.seek(0)
    docx_content = file.file.read()
    doc = Document(BytesIO(docx_content))

    # Extract and join the text from each paragraph in the document
    full_text = [para.text for para in doc.paragraphs]
    text_content = "\n".join(full_text)

    txt_file_path = docx_to_txt_filename(file_path)
    file_store.save_file(
        file_name=txt_file_path,
        content=BytesIO(text_content.encode("utf-8")),
        display_name=file.filename,
        file_origin=FileOrigin.CONNECTOR,
        file_type="text/plain",
    )


def docx_to_txt_filename(file_path: str) -> str:
    """
    Convert a .docx file path to its corresponding .txt file path.
    """
    return file_path.rsplit(".", 1)[0] + ".txt"
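

# Example (illustrative): docx_to_txt_filename("folder/report.docx") -> "folder/report.txt"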