Unstructured fix (#3809)

* fix v1
* temporary patch for pdfs
* nit
2025-04-07 11:28:09 +02:00 · 2025-01-28 08:46:27 -08:00 · 2025-01-28 08:46:27 -08:00 · 2ad86aa9a6
commit 2ad86aa9a6
parent 74a472ece7
3 changed files with 15 additions and 7 deletions
--- a/backend/onyx/file_processing/extract_file_text.py
+++ b/backend/onyx/file_processing/extract_file_text.py
@@ -358,7 +358,13 @@ def extract_file_text(

    try:
        if get_unstructured_api_key():
-            return unstructured_to_text(file, file_name)
+            try:
+                return unstructured_to_text(file, file_name)
+            except Exception as unstructured_error:
+                logger.error(
+                    f"Failed to process with Unstructured: {str(unstructured_error)}. Falling back to normal processing."
+                )
+                # Fall through to normal processing

        if file_name or extension:
            if extension is not None:
--- a/backend/onyx/file_processing/unstructured.py
+++ b/backend/onyx/file_processing/unstructured.py
@@ -52,7 +52,7 @@ def _sdk_partition_request(

 def unstructured_to_text(file: IO[Any], file_name: str) -> str:
    logger.debug(f"Starting to read file: {file_name}")
-    req = _sdk_partition_request(file, file_name, strategy="auto")
+    req = _sdk_partition_request(file, file_name, strategy="fast")

    unstructured_client = UnstructuredClient(api_key_auth=get_unstructured_api_key())

--- a/backend/onyx/server/query_and_chat/chat_backend.py
+++ b/backend/onyx/server/query_and_chat/chat_backend.py
@@ -672,23 +672,25 @@ def upload_files_for_chat(
            else ChatFileType.PLAIN_TEXT
        )

+        file_content = file.file.read()  # Read the file content
+
        if file_type == ChatFileType.IMAGE:
-            file_content = file.file
+            file_content_io = file.file
            # NOTE: Image conversion to JPEG used to be enforced here.
            # This was removed to:
            # 1. Preserve original file content for downloads
            # 2. Maintain transparency in formats like PNG
            # 3. Ameliorate issue with file conversion
        else:
-            file_content = io.BytesIO(file.file.read())
+            file_content_io = io.BytesIO(file_content)

        new_content_type = file.content_type

-        # store the file (now JPEG for images)
+        # Store the file normally
        file_id = str(uuid.uuid4())
        file_store.save_file(
            file_name=file_id,
-            content=file_content,
+            content=file_content_io,
            display_name=file.filename,
            file_origin=FileOrigin.CHAT_UPLOAD,
            file_type=new_content_type or file_type.value,
@@ -698,7 +700,7 @@ def upload_files_for_chat(
        # to re-extract it every time we send a message
        if file_type == ChatFileType.DOC:
            extracted_text = extract_file_text(
-                file=file.file,
+                file=io.BytesIO(file_content),  # use the bytes we already read
                file_name=file.filename or "",
            )
            text_file_id = str(uuid.uuid4())