Accept files with character issues (#781)

This commit is contained in:
Yuhong Sun
2023-11-28 22:43:58 -08:00
committed by GitHub
parent 429016d4a2
commit fcb7f6fcc0
3 changed files with 26 additions and 10 deletions

View File

@@ -7,6 +7,7 @@ from pathlib import Path
from typing import Any
from typing import IO
import chardet
from pypdf import PdfReader
from danswer.utils.logger import setup_logger
@@ -85,13 +86,26 @@ def load_files_from_zip(
yield file_info, file
def read_file(file_reader: IO[Any]) -> tuple[str, dict]:
def detect_encoding(file_path: str | Path) -> str:
    """Guess the text encoding of the file at ``file_path``.

    Only a leading sample of the file (at most 50000 bytes) is read and
    handed to ``chardet.detect``; the whole file is never loaded.

    Args:
        file_path: Location of the file to inspect.

    Returns:
        The value stored under the ``"encoding"`` key of chardet's result,
        or ``"utf-8"`` when that value is falsy (e.g. ``None``).
    """
    # Binary mode: the raw bytes are needed, since the encoding is unknown.
    with open(file_path, "rb") as raw_file:
        sample = raw_file.read(50000)

    guess = chardet.detect(sample)
    return guess["encoding"] or "utf-8"
def read_file(
file_reader: IO[Any], encoding: str = "utf-8", errors: str = "replace"
) -> tuple[str, dict]:
metadata = {}
file_content_raw = ""
for ind, line in enumerate(file_reader):
if isinstance(line, bytes):
line = line.decode("utf-8")
line = str(line)
try:
line = line.decode(encoding) if isinstance(line, bytes) else line
except UnicodeDecodeError:
line = (
line.decode(encoding, errors=errors)
if isinstance(line, bytes)
else line
)
if ind == 0:
metadata_or_none = extract_metadata(line)
@@ -99,7 +113,6 @@ def read_file(file_reader: IO[Any]) -> tuple[str, dict]:
metadata = metadata_or_none
else:
file_content_raw += line
else:
file_content_raw += line

View File

@@ -8,6 +8,7 @@ from typing import IO
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.file_utils import detect_encoding
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
from danswer.connectors.cross_connector_utils.file_utils import read_file
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
@@ -31,11 +32,12 @@ def _open_files_at_location(
if extension == ".zip":
for file_info, file in load_files_from_zip(file_path, ignore_dirs=True):
yield file_info.filename, file
elif extension in [".txt", ".pdf", ".md", ".mdx"]:
mode = "r"
if extension == ".pdf":
mode = "rb"
with open(file_path, mode) as file:
elif extension in [".txt", ".md", ".mdx"]:
encoding = detect_encoding(file_path)
with open(file_path, "r", encoding=encoding, errors="replace") as file:
yield os.path.basename(file_path), file
elif extension == ".pdf":
with open(file_path, "rb") as file:
yield os.path.basename(file_path), file
else:
logger.warning(f"Skipping file '{file_path}' with extension '{extension}'")

View File

@@ -3,6 +3,7 @@ asyncpg==0.27.0
atlassian-python-api==3.37.0
beautifulsoup4==4.12.2
celery==5.3.4
chardet==5.2.0
dask==2023.8.1
distributed==2023.8.1
python-dateutil==2.8.2