From 0acd50b75d0f9d96cd1e93f81cabc956738ac7fc Mon Sep 17 00:00:00 2001 From: pablonyx Date: Fri, 4 Apr 2025 18:20:31 -0700 Subject: [PATCH] docx bugfix --- backend/onyx/file_processing/extract_file_text.py | 8 ++++---- backend/onyx/server/documents/connector.py | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/backend/onyx/file_processing/extract_file_text.py b/backend/onyx/file_processing/extract_file_text.py index bab357aca..0905ada98 100644 --- a/backend/onyx/file_processing/extract_file_text.py +++ b/backend/onyx/file_processing/extract_file_text.py @@ -2,6 +2,7 @@ import io import json import os import re +import uuid import zipfile from collections.abc import Callable from collections.abc import Iterator @@ -567,9 +568,7 @@ def extract_text_and_images( return ExtractionResult(text_content="", embedded_images=[], metadata={}) -def convert_docx_to_txt( - file: UploadFile, file_store: FileStore, file_path: str -) -> None: +def convert_docx_to_txt(file: UploadFile, file_store: FileStore) -> str: """ Helper to convert docx to a .txt file in the same filestore. """ @@ -581,7 +580,7 @@ def convert_docx_to_txt( all_paras = [p.text for p in doc.paragraphs] text_content = "\n".join(all_paras) - text_file_name = docx_to_txt_filename(file_path) + text_file_name = docx_to_txt_filename(file.filename or f"docx_{uuid.uuid4()}") file_store.save_file( file_name=text_file_name, content=BytesIO(text_content.encode("utf-8")), @@ -589,6 +588,7 @@ def convert_docx_to_txt( file_origin=FileOrigin.CONNECTOR, file_type="text/plain", ) + return text_file_name def docx_to_txt_filename(file_path: str) -> str: diff --git a/backend/onyx/server/documents/connector.py b/backend/onyx/server/documents/connector.py index c2da0d7aa..fa8078457 100644 --- a/backend/onyx/server/documents/connector.py +++ b/backend/onyx/server/documents/connector.py @@ -435,8 +435,7 @@ def upload_files(files: list[UploadFile], db_session: Session) -> FileUploadResp if file.content_type and file.content_type.startswith( "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ): - file_path = os.path.join(str(uuid.uuid4()), cast(str, file.filename)) - convert_docx_to_txt(file, file_store, file_path) + file_path = convert_docx_to_txt(file, file_store) deduped_file_paths.append(file_path) continue