mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-05-30 01:30:21 +02:00
docx bugfix
This commit is contained in:
parent
c3c9a0e57c
commit
0acd50b75d
@ -2,6 +2,7 @@ import io
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
import zipfile
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Iterator
|
||||
@ -567,9 +568,7 @@ def extract_text_and_images(
|
||||
return ExtractionResult(text_content="", embedded_images=[], metadata={})
|
||||
|
||||
|
||||
def convert_docx_to_txt(
|
||||
file: UploadFile, file_store: FileStore, file_path: str
|
||||
) -> None:
|
||||
def convert_docx_to_txt(file: UploadFile, file_store: FileStore) -> str:
|
||||
"""
|
||||
Helper to convert docx to a .txt file in the same filestore.
|
||||
"""
|
||||
@ -581,7 +580,7 @@ def convert_docx_to_txt(
|
||||
all_paras = [p.text for p in doc.paragraphs]
|
||||
text_content = "\n".join(all_paras)
|
||||
|
||||
text_file_name = docx_to_txt_filename(file_path)
|
||||
text_file_name = docx_to_txt_filename(file.filename or f"docx_{uuid.uuid4()}")
|
||||
file_store.save_file(
|
||||
file_name=text_file_name,
|
||||
content=BytesIO(text_content.encode("utf-8")),
|
||||
@ -589,6 +588,7 @@ def convert_docx_to_txt(
|
||||
file_origin=FileOrigin.CONNECTOR,
|
||||
file_type="text/plain",
|
||||
)
|
||||
return text_file_name
|
||||
|
||||
|
||||
def docx_to_txt_filename(file_path: str) -> str:
|
||||
|
@ -435,8 +435,7 @@ def upload_files(files: list[UploadFile], db_session: Session) -> FileUploadResp
|
||||
if file.content_type and file.content_type.startswith(
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
):
|
||||
file_path = os.path.join(str(uuid.uuid4()), cast(str, file.filename))
|
||||
convert_docx_to_txt(file, file_store, file_path)
|
||||
file_path = convert_docx_to_txt(file, file_store)
|
||||
deduped_file_paths.append(file_path)
|
||||
continue
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user