From b9561fc46ceacd57215de1be7ed7eb689dfb2ae1 Mon Sep 17 00:00:00 2001 From: pablonyx Date: Fri, 24 Jan 2025 12:52:58 -0800 Subject: [PATCH] Unzip files + no double x (#3767) * unzip files * quick nit * quick nit * nit --- backend/onyx/server/documents/connector.py | 36 +++++++++++++++++++++ web/src/components/chat_search/TextView.tsx | 6 +++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/backend/onyx/server/documents/connector.py b/backend/onyx/server/documents/connector.py index 4182568fbcbf..21c5c586b833 100644 --- a/backend/onyx/server/documents/connector.py +++ b/backend/onyx/server/documents/connector.py @@ -1,5 +1,8 @@ +import mimetypes import os import uuid +import zipfile +from io import BytesIO from typing import cast from fastapi import APIRouter @@ -386,10 +389,43 @@ def upload_files( for file in files: if not file.filename: raise HTTPException(status_code=400, detail="File name cannot be empty") + + # Skip directories and known macOS metadata entries + def should_process_file(file_path: str) -> bool: + normalized_path = os.path.normpath(file_path) + return not any(part.startswith(".") for part in normalized_path.split(os.sep)) + try: file_store = get_default_file_store(db_session) deduped_file_paths = [] + for file in files: + if file.content_type and file.content_type.startswith("application/zip"): + with zipfile.ZipFile(file.file, "r") as zf: + for file_info in zf.namelist(): + if zf.getinfo(file_info).is_dir(): + continue + + if not should_process_file(file_info): + continue + + sub_file_bytes = zf.read(file_info) + sub_file_name = os.path.join(str(uuid.uuid4()), file_info) + deduped_file_paths.append(sub_file_name) + + mime_type, __ = mimetypes.guess_type(file_info) + if mime_type is None: + mime_type = "application/octet-stream" + + file_store.save_file( + file_name=sub_file_name, + content=BytesIO(sub_file_bytes), + display_name=os.path.basename(file_info), + file_origin=FileOrigin.CONNECTOR, + file_type=mime_type, + ) + continue + file_path = os.path.join(str(uuid.uuid4()), cast(str, file.filename)) deduped_file_paths.append(file_path) file_store.save_file( diff --git a/web/src/components/chat_search/TextView.tsx b/web/src/components/chat_search/TextView.tsx index 59f01204ec78..a70da405f4e7 100644 --- a/web/src/components/chat_search/TextView.tsx +++ b/web/src/components/chat_search/TextView.tsx @@ -36,6 +36,7 @@ export default function TextView({ "text/plain", "text/x-rst", "text/x-org", + "txt", ]; return markdownFormats.some((format) => mimeType.startsWith(format)); }; @@ -117,7 +118,10 @@ export default function TextView({ return ( - + {fileName}