mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-19 20:24:32 +02:00
Accept files with character issues (#781)
This commit is contained in:
@@ -7,6 +7,7 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import IO
|
||||
|
||||
import chardet
|
||||
from pypdf import PdfReader
|
||||
|
||||
from danswer.utils.logger import setup_logger
|
||||
@@ -85,13 +86,26 @@ def load_files_from_zip(
|
||||
yield file_info, file
|
||||
|
||||
|
||||
def read_file(file_reader: IO[Any]) -> tuple[str, dict]:
|
||||
def detect_encoding(file_path: str | Path) -> str:
    """Guess the text encoding of the file at ``file_path``.

    Only the first 50000 bytes of the file are sampled and handed to
    ``chardet.detect``; the rest of the file is never inspected.

    Args:
        file_path: Location of the file to inspect. It is opened in binary
            mode, so no decoding happens here.

    Returns:
        The encoding name reported by chardet, or ``"utf-8"`` when chardet
        reports no encoding or reports plain ASCII.
    """
    with open(file_path, "rb") as file:
        raw_data = file.read(50000)  # Read a portion of the file to guess encoding

    encoding = chardet.detect(raw_data)["encoding"]

    # An all-ASCII sample says nothing about the bytes after the sampled
    # prefix. UTF-8 decodes ASCII identically, so promoting the guess keeps
    # later non-ASCII characters intact instead of having an "ascii" decode
    # replace every one of them.
    if not encoding or encoding.lower() == "ascii":
        return "utf-8"
    return encoding
|
||||
|
||||
|
||||
def read_file(
|
||||
file_reader: IO[Any], encoding: str = "utf-8", errors: str = "replace"
|
||||
) -> tuple[str, dict]:
|
||||
metadata = {}
|
||||
file_content_raw = ""
|
||||
for ind, line in enumerate(file_reader):
|
||||
if isinstance(line, bytes):
|
||||
line = line.decode("utf-8")
|
||||
line = str(line)
|
||||
try:
|
||||
line = line.decode(encoding) if isinstance(line, bytes) else line
|
||||
except UnicodeDecodeError:
|
||||
line = (
|
||||
line.decode(encoding, errors=errors)
|
||||
if isinstance(line, bytes)
|
||||
else line
|
||||
)
|
||||
|
||||
if ind == 0:
|
||||
metadata_or_none = extract_metadata(line)
|
||||
@@ -99,7 +113,6 @@ def read_file(file_reader: IO[Any]) -> tuple[str, dict]:
|
||||
metadata = metadata_or_none
|
||||
else:
|
||||
file_content_raw += line
|
||||
|
||||
else:
|
||||
file_content_raw += line
|
||||
|
||||
|
@@ -8,6 +8,7 @@ from typing import IO
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.cross_connector_utils.file_utils import detect_encoding
|
||||
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
|
||||
from danswer.connectors.cross_connector_utils.file_utils import read_file
|
||||
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
|
||||
@@ -31,11 +32,12 @@ def _open_files_at_location(
|
||||
if extension == ".zip":
|
||||
for file_info, file in load_files_from_zip(file_path, ignore_dirs=True):
|
||||
yield file_info.filename, file
|
||||
elif extension in [".txt", ".pdf", ".md", ".mdx"]:
|
||||
mode = "r"
|
||||
if extension == ".pdf":
|
||||
mode = "rb"
|
||||
with open(file_path, mode) as file:
|
||||
elif extension in [".txt", ".md", ".mdx"]:
|
||||
encoding = detect_encoding(file_path)
|
||||
with open(file_path, "r", encoding=encoding, errors="replace") as file:
|
||||
yield os.path.basename(file_path), file
|
||||
elif extension == ".pdf":
|
||||
with open(file_path, "rb") as file:
|
||||
yield os.path.basename(file_path), file
|
||||
else:
|
||||
logger.warning(f"Skipping file '{file_path}' with extension '{extension}'")
|
||||
|
@@ -3,6 +3,7 @@ asyncpg==0.27.0
|
||||
atlassian-python-api==3.37.0
|
||||
beautifulsoup4==4.12.2
|
||||
celery==5.3.4
|
||||
chardet==5.2.0
|
||||
dask==2023.8.1
|
||||
distributed==2023.8.1
|
||||
python-dateutil==2.8.2
|
||||
|
Reference in New Issue
Block a user