From fcb7f6fcc0fef2fe3c3492af3ba2c4d27c99f04e Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Tue, 28 Nov 2023 22:43:58 -0800 Subject: [PATCH] Accept files with character issues (#781) --- .../cross_connector_utils/file_utils.py | 23 +++++++++++++++---- backend/danswer/connectors/file/connector.py | 12 ++++++---- backend/requirements/default.txt | 1 + 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/backend/danswer/connectors/cross_connector_utils/file_utils.py b/backend/danswer/connectors/cross_connector_utils/file_utils.py index a86c5aa6f7f6..3bab0d160eaa 100644 --- a/backend/danswer/connectors/cross_connector_utils/file_utils.py +++ b/backend/danswer/connectors/cross_connector_utils/file_utils.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Any from typing import IO +import chardet from pypdf import PdfReader from danswer.utils.logger import setup_logger @@ -85,13 +86,26 @@ def load_files_from_zip( yield file_info, file -def read_file(file_reader: IO[Any]) -> tuple[str, dict]: +def detect_encoding(file_path: str | Path) -> str: + with open(file_path, "rb") as file: + raw_data = file.read(50000) # Read a portion of the file to guess encoding + return chardet.detect(raw_data)["encoding"] or "utf-8" + + +def read_file( + file_reader: IO[Any], encoding: str = "utf-8", errors: str = "replace" +) -> tuple[str, dict]: metadata = {} file_content_raw = "" for ind, line in enumerate(file_reader): - if isinstance(line, bytes): - line = line.decode("utf-8") - line = str(line) + try: + line = line.decode(encoding) if isinstance(line, bytes) else line + except UnicodeDecodeError: + line = ( + line.decode(encoding, errors=errors) + if isinstance(line, bytes) + else line + ) if ind == 0: metadata_or_none = extract_metadata(line) @@ -99,7 +113,6 @@ def read_file(file_reader: IO[Any]) -> tuple[str, dict]: metadata = metadata_or_none else: file_content_raw += line - else: file_content_raw += line diff --git a/backend/danswer/connectors/file/connector.py b/backend/danswer/connectors/file/connector.py index 3c3418d4c046..df28d7ed4d47 100644 --- a/backend/danswer/connectors/file/connector.py +++ b/backend/danswer/connectors/file/connector.py @@ -8,6 +8,7 @@ from typing import IO from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource +from danswer.connectors.cross_connector_utils.file_utils import detect_encoding from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip from danswer.connectors.cross_connector_utils.file_utils import read_file from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file @@ -31,11 +32,12 @@ def _open_files_at_location( if extension == ".zip": for file_info, file in load_files_from_zip(file_path, ignore_dirs=True): yield file_info.filename, file - elif extension in [".txt", ".pdf", ".md", ".mdx"]: - mode = "r" - if extension == ".pdf": - mode = "rb" - with open(file_path, mode) as file: + elif extension in [".txt", ".md", ".mdx"]: + encoding = detect_encoding(file_path) + with open(file_path, "r", encoding=encoding, errors="replace") as file: + yield os.path.basename(file_path), file + elif extension == ".pdf": + with open(file_path, "rb") as file: yield os.path.basename(file_path), file else: logger.warning(f"Skipping file '{file_path}' with extension '{extension}'") diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt index 2834cf204924..630fac83eccd 100644 --- a/backend/requirements/default.txt +++ b/backend/requirements/default.txt @@ -3,6 +3,7 @@ asyncpg==0.27.0 atlassian-python-api==3.37.0 beautifulsoup4==4.12.2 celery==5.3.4 +chardet==5.2.0 dask==2023.8.1 distributed==2023.8.1 python-dateutil==2.8.2