diff --git a/backend/onyx/file_processing/extract_file_text.py b/backend/onyx/file_processing/extract_file_text.py
index fbc4fdc52..efdeeb546 100644
--- a/backend/onyx/file_processing/extract_file_text.py
+++ b/backend/onyx/file_processing/extract_file_text.py
@@ -358,7 +358,17 @@ def extract_file_text(
 
     try:
         if get_unstructured_api_key():
-            return unstructured_to_text(file, file_name)
+            try:
+                return unstructured_to_text(file, file_name)
+            except Exception as unstructured_error:
+                logger.error(
+                    f"Failed to process with Unstructured: {str(unstructured_error)}. Falling back to normal processing."
+                )
+                # The Unstructured attempt may have read from the stream
+                # before failing, so rewind it; otherwise the fallback
+                # parsers below would start at EOF and see an empty file.
+                file.seek(0)
+                # Fall through to normal processing
 
         if file_name or extension:
             if extension is not None:
diff --git a/backend/onyx/file_processing/unstructured.py b/backend/onyx/file_processing/unstructured.py
index 4faa14a1a..3827c6891 100644
--- a/backend/onyx/file_processing/unstructured.py
+++ b/backend/onyx/file_processing/unstructured.py
@@ -52,7 +52,7 @@ def _sdk_partition_request(
 
 def unstructured_to_text(file: IO[Any], file_name: str) -> str:
     logger.debug(f"Starting to read file: {file_name}")
-    req = _sdk_partition_request(file, file_name, strategy="auto")
+    req = _sdk_partition_request(file, file_name, strategy="fast")
 
     unstructured_client = UnstructuredClient(api_key_auth=get_unstructured_api_key())
 
diff --git a/backend/onyx/server/query_and_chat/chat_backend.py b/backend/onyx/server/query_and_chat/chat_backend.py
index 32c278dcd..f671ccfed 100644
--- a/backend/onyx/server/query_and_chat/chat_backend.py
+++ b/backend/onyx/server/query_and_chat/chat_backend.py
@@ -672,23 +672,28 @@ def upload_files_for_chat(
             else ChatFileType.PLAIN_TEXT
         )
 
+        # Read the upload once. Every consumer below gets its own BytesIO
+        # over these bytes, so none depends on the shared stream position.
+        file_content = file.file.read()
+
         if file_type == ChatFileType.IMAGE:
-            file_content = file.file
+            # file.file is at EOF after the read above; wrap the bytes instead.
+            file_content_io = io.BytesIO(file_content)
             # NOTE: Image conversion to JPEG used to be enforced here.
             # This was removed to:
             # 1. Preserve original file content for downloads
             # 2. Maintain transparency in formats like PNG
             # 3. Ameliorate issue with file conversion
         else:
-            file_content = io.BytesIO(file.file.read())
+            file_content_io = io.BytesIO(file_content)
 
         new_content_type = file.content_type
 
-        # store the file (now JPEG for images)
+        # Store the file normally
         file_id = str(uuid.uuid4())
         file_store.save_file(
             file_name=file_id,
-            content=file_content,
+            content=file_content_io,
             display_name=file.filename,
             file_origin=FileOrigin.CHAT_UPLOAD,
             file_type=new_content_type or file_type.value,
@@ -698,7 +703,7 @@ def upload_files_for_chat(
         # to re-extract it every time we send a message
         if file_type == ChatFileType.DOC:
             extracted_text = extract_file_text(
-                file=file.file,
+                file=io.BytesIO(file_content),  # use the bytes we already read
                 file_name=file.filename or "",
             )
             text_file_id = str(uuid.uuid4())