From 061dab7f3755f077954de60cb66834e0b49a5fcb Mon Sep 17 00:00:00 2001
From: Yuhong Sun <yuhongsun96@gmail.com>
Date: Tue, 25 Jun 2024 10:34:03 -0700
Subject: [PATCH] Touchup (#1702)

---
 .../file_processing/extract_file_text.py       | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/backend/danswer/file_processing/extract_file_text.py b/backend/danswer/file_processing/extract_file_text.py
index 4a56eeec4..f96d4a415 100644
--- a/backend/danswer/file_processing/extract_file_text.py
+++ b/backend/danswer/file_processing/extract_file_text.py
@@ -69,7 +69,7 @@ def check_file_ext_is_valid(ext: str) -> bool:
 def is_text_file(file: IO[bytes]) -> bool:
     """
     checks if the first 1024 bytes only contain printable or whitespace characters
-    if it does, then we say its a 'txt'
+    if it does, then we say it's a plaintext file
     """
     raw_data = file.read(1024)
     text_chars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F})
@@ -282,24 +282,22 @@ def extract_file_text(
         ".html": parse_html_page_basic,
     }
 
-    try:
+    def _process_file() -> str:
         if file_name:
             extension = get_file_ext(file_name)
             if check_file_ext_is_valid(extension):
                 return extension_to_function.get(extension, file_io_to_text)(file)
 
+        # Either the file somehow has no name or the extension is not one that we are familiar with
         if is_text_file(file):
             return file_io_to_text(file)
 
-        failure_string = "No file_name or known text encoding"
-        if break_on_unprocessable:
-            raise RuntimeError(failure_string)
-
-        logger.warning(failure_string)
-        return ""
+        raise ValueError("Unknown file extension and unknown text encoding")
 
+    try:
+        return _process_file()
     except Exception as e:
         if break_on_unprocessable:
-            raise RuntimeError(failure_string)
-        logger.warning(str(e))
+            raise RuntimeError(f"Failed to process file: {str(e)}") from e
+        logger.warning(f"Failed to process file: {str(e)}")
         return ""