Bugfix/chat images 2 (#4630)

* don't hardcode -1

* extra spaces

* fix binary data in blurb

* add note to binary handling

---------

Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>
This commit is contained in:
rkuo-danswer
2025-04-29 18:29:10 -07:00
committed by GitHub
parent dd242c9926
commit 94de23fe87
6 changed files with 54 additions and 13 deletions

View File

@@ -96,9 +96,9 @@ from onyx.document_index.factory import get_default_document_index
from onyx.file_store.models import ChatFileType
from onyx.file_store.models import FileDescriptor
from onyx.file_store.models import InMemoryChatFile
from onyx.file_store.utils import get_user_files
from onyx.file_store.utils import load_all_chat_files
from onyx.file_store.utils import load_all_user_file_files
from onyx.file_store.utils import load_all_user_files
from onyx.file_store.utils import load_in_memory_chat_files
from onyx.file_store.utils import save_files
from onyx.llm.exceptions import GenAIDisabledException
from onyx.llm.factory import get_llms_for_persona
@@ -849,12 +849,12 @@ def stream_chat_message_objects(
user_file_files: list[UserFile] | None = None
if user_file_ids or user_folder_ids:
# Load user files
user_files = load_all_user_files(
user_files = load_in_memory_chat_files(
user_file_ids or [],
user_folder_ids or [],
db_session,
)
user_file_files = load_all_user_file_files(
user_file_files = get_user_files(
user_file_ids or [],
user_folder_ids or [],
db_session,

View File

@@ -155,6 +155,7 @@ def _apply_pruning(
section_idx_token_count: dict[int, int] = {}
ind = 0
final_section_ind = None
total_tokens = 0
for ind, section in enumerate(sections):

View File

@@ -870,7 +870,10 @@ def create_search_doc_from_user_file(
content_sample = associated_chat_file.content[:100]
# Remove null bytes which can cause SQL errors
content_sample = content_sample.replace(b"\x00", b"")
blurb = content_sample.decode("utf-8", errors="replace")
# NOTE(rkuo): this used to be errors="replace" instead of "strict", but
# "replace" never raises, so the binary-file fallback below was never reached
blurb = content_sample.decode("utf-8", errors="strict")
except Exception:
# If decoding fails completely, provide a generic description
blurb = f"[Binary file: {db_user_file.name}]"

View File

@@ -157,16 +157,32 @@ def load_user_file(file_id: int, db_session: Session) -> InMemoryChatFile:
)
def load_all_user_files(
def load_in_memory_chat_files(
user_file_ids: list[int],
user_folder_ids: list[int],
db_session: Session,
) -> list[InMemoryChatFile]:
"""
Loads the actual content of user files specified by individual IDs and those
within specified folder IDs into memory.
Args:
user_file_ids: A list of specific UserFile IDs to load.
user_folder_ids: A list of UserFolder IDs. All UserFiles within these folders will be loaded.
db_session: The SQLAlchemy database session.
Returns:
A list of InMemoryChatFile objects, each containing the file content (as bytes),
file ID, file type, and filename. Prioritizes loading plaintext versions if available.
"""
# Use parallel execution to load files concurrently
return cast(
list[InMemoryChatFile],
run_functions_tuples_in_parallel(
# 1. Load files specified by individual IDs
[(load_user_file, (file_id, db_session)) for file_id in user_file_ids]
)
# 2. Load all files within specified folders
+ [
file
for folder_id in user_folder_ids
@@ -175,24 +191,47 @@ def load_all_user_files(
)
def load_all_user_file_files(
def get_user_files(
    user_file_ids: list[int],
    user_folder_ids: list[int],
    db_session: Session,
) -> list[UserFile]:
    """
    Look up UserFile rows by explicit file ID and by containing folder ID.

    Only database records are returned; no file content is read here.

    Args:
        user_file_ids: IDs of individual UserFile rows to look up. IDs that
            match no row are skipped silently.
        user_folder_ids: Folder IDs; every UserFile whose ``folder_id``
            equals one of these is included.
        db_session: The SQLAlchemy session used to run the queries.

    Returns:
        The rows found by ID (in the order the IDs were given), followed by
        the rows of each folder (in the order the folder IDs were given).
        Results are not de-duplicated, so a file that is requested by ID and
        also lives in a requested folder appears more than once.
    """
    # One lookup per requested file ID; a miss yields None.
    looked_up = [
        db_session.query(UserFile).filter(UserFile.id == file_id).first()
        for file_id in user_file_ids
    ]
    # Drop the misses while keeping the caller's ID order.
    records: list[UserFile] = [row for row in looked_up if row is not None]

    # Append every file that belongs to each requested folder.
    for folder_id in user_folder_ids:
        folder_query = db_session.query(UserFile).filter(
            UserFile.folder_id == folder_id
        )
        records += folder_query.all()

    return records

View File

@@ -42,6 +42,7 @@ from onyx.file_processing.html_utils import web_html_cleanup
from onyx.server.documents.connector import trigger_indexing_for_cc_pair
from onyx.server.documents.models import ConnectorBase
from onyx.server.documents.models import CredentialBase
from onyx.server.query_and_chat.chat_backend import RECENT_DOCS_FOLDER_ID
from onyx.server.user_documents.models import MessageResponse
from onyx.server.user_documents.models import UserFileSnapshot
from onyx.server.user_documents.models import UserFolderSnapshot
@@ -141,9 +142,6 @@ def get_folder(
return folder_snapshot
RECENT_DOCS_FOLDER_ID = -1
@router.post("/user/file/upload")
def upload_user_files(
files: List[UploadFile] = File(...),
@@ -157,7 +155,7 @@ def upload_user_files(
try:
# Use our consolidated function that handles indexing properly
user_files = upload_files_to_user_files_with_indexing(
files, folder_id or -1, user, db_session
files, folder_id or RECENT_DOCS_FOLDER_ID, user, db_session
)
return [UserFileSnapshot.from_model(user_file) for user_file in user_files]