From 061dab7f3755f077954de60cb66834e0b49a5fcb Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Tue, 25 Jun 2024 10:34:03 -0700
Subject: [PATCH] Touchup (#1702)

---
 .../file_processing/extract_file_text.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/backend/danswer/file_processing/extract_file_text.py b/backend/danswer/file_processing/extract_file_text.py
index 4a56eeec4..f96d4a415 100644
--- a/backend/danswer/file_processing/extract_file_text.py
+++ b/backend/danswer/file_processing/extract_file_text.py
@@ -69,7 +69,7 @@ def check_file_ext_is_valid(ext: str) -> bool:
 def is_text_file(file: IO[bytes]) -> bool:
     """
     checks if the first 1024 bytes only contain printable or whitespace characters
-    if it does, then we say its a 'txt'
+    if it does, then we say its a plaintext file
     """
     raw_data = file.read(1024)
     text_chars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F})
@@ -282,24 +282,22 @@ def extract_file_text(
         ".html": parse_html_page_basic,
     }
 
-    try:
+    def _process_file() -> str:
         if file_name:
             extension = get_file_ext(file_name)
             if check_file_ext_is_valid(extension):
                 return extension_to_function.get(extension, file_io_to_text)(file)
 
+        # Either the file somehow has no name or the extension is not one that we are familiar with
         if is_text_file(file):
             return file_io_to_text(file)
 
-        failure_string = "No file_name or known text encoding"
-        if break_on_unprocessable:
-            raise RuntimeError(failure_string)
-
-        logger.warning(failure_string)
-        return ""
+        raise ValueError("Unknown file extension and unknown text encoding")
 
+    try:
+        return _process_file()
     except Exception as e:
         if break_on_unprocessable:
-            raise RuntimeError(failure_string)
-        logger.warning(str(e))
+            raise RuntimeError(f"Failed to process file: {str(e)}") from e
+        logger.warning(f"Failed to process file: {str(e)}")
         return ""