Bugfix/chat images 2 (#4630)

* don't hardcode -1 * extra spaces * fix binary data in blurb * add note to binary handling --------- Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>
2025-09-13 18:12:14 +02:00 · 2025-04-29 18:29:10 -07:00
parent dd242c9926
commit 94de23fe87
6 changed files with 54 additions and 13 deletions
--- a/backend/onyx/chat/process_message.py
+++ b/backend/onyx/chat/process_message.py
@@ -96,9 +96,9 @@ from onyx.document_index.factory import get_default_document_index
 from onyx.file_store.models import ChatFileType
 from onyx.file_store.models import FileDescriptor
 from onyx.file_store.models import InMemoryChatFile
+from onyx.file_store.utils import get_user_files
 from onyx.file_store.utils import load_all_chat_files
-from onyx.file_store.utils import load_all_user_file_files
-from onyx.file_store.utils import load_all_user_files
+from onyx.file_store.utils import load_in_memory_chat_files
 from onyx.file_store.utils import save_files
 from onyx.llm.exceptions import GenAIDisabledException
 from onyx.llm.factory import get_llms_for_persona
@@ -849,12 +849,12 @@ def stream_chat_message_objects(
        user_file_files: list[UserFile] | None = None
        if user_file_ids or user_folder_ids:
            # Load user files
-            user_files = load_all_user_files(
+            user_files = load_in_memory_chat_files(
                user_file_ids or [],
                user_folder_ids or [],
                db_session,
            )
-            user_file_files = load_all_user_file_files(
+            user_file_files = get_user_files(
                user_file_ids or [],
                user_folder_ids or [],
                db_session,
--- a/backend/onyx/chat/prune_and_merge.py
+++ b/backend/onyx/chat/prune_and_merge.py
@@ -155,6 +155,7 @@ def _apply_pruning(

    section_idx_token_count: dict[int, int] = {}

+    ind = 0
    final_section_ind = None
    total_tokens = 0
    for ind, section in enumerate(sections):
--- a/backend/onyx/db/chat.py
+++ b/backend/onyx/db/chat.py
@@ -870,7 +870,10 @@ def create_search_doc_from_user_file(
            content_sample = associated_chat_file.content[:100]
            # Remove null bytes which can cause SQL errors
            content_sample = content_sample.replace(b"\x00", b"")
-            blurb = content_sample.decode("utf-8", errors="replace")
+
+            # NOTE(rkuo): this used to be errors="replace", but "replace" never
+            # raises, so the binary-file fallback in the except below never ran
+            blurb = content_sample.decode("utf-8", errors="strict")
        except Exception:
            # If decoding fails completely, provide a generic description
            blurb = f"[Binary file: {db_user_file.name}]"
--- a/backend/onyx/file_store/utils.py
+++ b/backend/onyx/file_store/utils.py
@@ -157,16 +157,32 @@ def load_user_file(file_id: int, db_session: Session) -> InMemoryChatFile:
        )


-def load_all_user_files(
+def load_in_memory_chat_files(
    user_file_ids: list[int],
    user_folder_ids: list[int],
    db_session: Session,
 ) -> list[InMemoryChatFile]:
+    """
+    Loads the actual content of user files specified by individual IDs and those
+    within specified folder IDs into memory.
+
+    Args:
+        user_file_ids: A list of specific UserFile IDs to load.
+        user_folder_ids: A list of UserFolder IDs. All UserFiles within these folders will be loaded.
+        db_session: The SQLAlchemy database session.
+
+    Returns:
+        A list of InMemoryChatFile objects, each containing the file content (as bytes),
+        file ID, file type, and filename. Prioritizes loading plaintext versions if available.
+    """
+    # Use parallel execution to load files concurrently
    return cast(
        list[InMemoryChatFile],
        run_functions_tuples_in_parallel(
+            # 1. Load files specified by individual IDs
            [(load_user_file, (file_id, db_session)) for file_id in user_file_ids]
        )
+        # 2. Load all files within specified folders
        + [
            file
            for folder_id in user_folder_ids
@@ -175,24 +191,47 @@ def load_all_user_files(
    )


-def load_all_user_file_files(
+def get_user_files(
    user_file_ids: list[int],
    user_folder_ids: list[int],
    db_session: Session,
 ) -> list[UserFile]:
+    """
+    Fetches UserFile database records based on provided file and folder IDs.
+
+    Args:
+        user_file_ids: A list of specific UserFile IDs to fetch.
+        user_folder_ids: A list of UserFolder IDs. All UserFiles within these folders will be fetched.
+        db_session: The SQLAlchemy database session.
+
+    Returns:
+        A list containing UserFile SQLAlchemy model objects corresponding to the
+        specified file IDs and all files within the specified folder IDs.
+        It does NOT return the actual file content.
+    """
    user_files: list[UserFile] = []
+
+    # 1. Fetch UserFile records for specific file IDs
    for user_file_id in user_file_ids:
+        # Query the database for a UserFile with the matching ID
        user_file = (
            db_session.query(UserFile).filter(UserFile.id == user_file_id).first()
        )
+        # If found, add it to the list
        if user_file is not None:
            user_files.append(user_file)
+
+    # 2. Fetch UserFile records for all files within specified folder IDs
    for user_folder_id in user_folder_ids:
+        # Query the database for all UserFiles belonging to the current folder ID
+        # and extend the list with the results
        user_files.extend(
            db_session.query(UserFile)
            .filter(UserFile.folder_id == user_folder_id)
            .all()
        )
+
+    # 3. Return the combined list of UserFile database objects
    return user_files


--- a/backend/onyx/server/user_documents/api.py
+++ b/backend/onyx/server/user_documents/api.py
@@ -42,6 +42,7 @@ from onyx.file_processing.html_utils import web_html_cleanup
 from onyx.server.documents.connector import trigger_indexing_for_cc_pair
 from onyx.server.documents.models import ConnectorBase
 from onyx.server.documents.models import CredentialBase
+from onyx.server.query_and_chat.chat_backend import RECENT_DOCS_FOLDER_ID
 from onyx.server.user_documents.models import MessageResponse
 from onyx.server.user_documents.models import UserFileSnapshot
 from onyx.server.user_documents.models import UserFolderSnapshot
@@ -141,9 +142,6 @@ def get_folder(
    return folder_snapshot


-RECENT_DOCS_FOLDER_ID = -1
-
-
@router.post("/user/file/upload")
 def upload_user_files(
    files: List[UploadFile] = File(...),
@@ -157,7 +155,7 @@ def upload_user_files(
    try:
        # Use our consolidated function that handles indexing properly
        user_files = upload_files_to_user_files_with_indexing(
-            files, folder_id or -1, user, db_session
+            files, folder_id or RECENT_DOCS_FOLDER_ID, user, db_session
        )

        return [UserFileSnapshot.from_model(user_file) for user_file in user_files]