docx bugfix

This commit is contained in:
pablonyx 2025-04-04 18:20:31 -07:00
parent c3c9a0e57c
commit 0acd50b75d
2 changed files with 5 additions and 6 deletions

View File

@ -2,6 +2,7 @@ import io
import json import json
import os import os
import re import re
import uuid
import zipfile import zipfile
from collections.abc import Callable from collections.abc import Callable
from collections.abc import Iterator from collections.abc import Iterator
@ -567,9 +568,7 @@ def extract_text_and_images(
return ExtractionResult(text_content="", embedded_images=[], metadata={}) return ExtractionResult(text_content="", embedded_images=[], metadata={})
def convert_docx_to_txt( def convert_docx_to_txt(file: UploadFile, file_store: FileStore) -> str:
file: UploadFile, file_store: FileStore, file_path: str
) -> None:
""" """
Helper to convert docx to a .txt file in the same filestore. Helper to convert docx to a .txt file in the same filestore.
""" """
@ -581,7 +580,7 @@ def convert_docx_to_txt(
all_paras = [p.text for p in doc.paragraphs] all_paras = [p.text for p in doc.paragraphs]
text_content = "\n".join(all_paras) text_content = "\n".join(all_paras)
text_file_name = docx_to_txt_filename(file_path) text_file_name = docx_to_txt_filename(file.filename or f"docx_{uuid.uuid4()}")
file_store.save_file( file_store.save_file(
file_name=text_file_name, file_name=text_file_name,
content=BytesIO(text_content.encode("utf-8")), content=BytesIO(text_content.encode("utf-8")),
@ -589,6 +588,7 @@ def convert_docx_to_txt(
file_origin=FileOrigin.CONNECTOR, file_origin=FileOrigin.CONNECTOR,
file_type="text/plain", file_type="text/plain",
) )
return text_file_name
def docx_to_txt_filename(file_path: str) -> str: def docx_to_txt_filename(file_path: str) -> str:

View File

@ -435,8 +435,7 @@ def upload_files(files: list[UploadFile], db_session: Session) -> FileUploadResp
if file.content_type and file.content_type.startswith( if file.content_type and file.content_type.startswith(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
): ):
file_path = os.path.join(str(uuid.uuid4()), cast(str, file.filename)) file_path = convert_docx_to_txt(file, file_store)
convert_docx_to_txt(file, file_store, file_path)
deduped_file_paths.append(file_path) deduped_file_paths.append(file_path)
continue continue