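docx bugfix: convert_docx_to_txt now derives the .txt file name from the uploaded file's name (falling back to a generated "docx_<uuid>" name) and returns it, so upload_files records the name that was actually saved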

This commit is contained in:
pablonyx 2025-04-04 18:20:31 -07:00
parent c3c9a0e57c
commit 0acd50b75d
2 changed files with 5 additions and 6 deletions

View File

@ -2,6 +2,7 @@ import io
import json
import os
import re
import uuid
import zipfile
from collections.abc import Callable
from collections.abc import Iterator
@ -567,9 +568,7 @@ def extract_text_and_images(
return ExtractionResult(text_content="", embedded_images=[], metadata={})
def convert_docx_to_txt(
file: UploadFile, file_store: FileStore, file_path: str
) -> None:
def convert_docx_to_txt(file: UploadFile, file_store: FileStore) -> str:
"""
Helper to convert docx to a .txt file in the same filestore.
"""
@ -581,7 +580,7 @@ def convert_docx_to_txt(
all_paras = [p.text for p in doc.paragraphs]
text_content = "\n".join(all_paras)
text_file_name = docx_to_txt_filename(file_path)
text_file_name = docx_to_txt_filename(file.filename or f"docx_{uuid.uuid4()}")
file_store.save_file(
file_name=text_file_name,
content=BytesIO(text_content.encode("utf-8")),
@ -589,6 +588,7 @@ def convert_docx_to_txt(
file_origin=FileOrigin.CONNECTOR,
file_type="text/plain",
)
return text_file_name
def docx_to_txt_filename(file_path: str) -> str:

View File

@ -435,8 +435,7 @@ def upload_files(files: list[UploadFile], db_session: Session) -> FileUploadResp
if file.content_type and file.content_type.startswith(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
file_path = os.path.join(str(uuid.uuid4()), cast(str, file.filename))
convert_docx_to_txt(file, file_store, file_path)
file_path = convert_docx_to_txt(file, file_store)
deduped_file_paths.append(file_path)
continue