Unstructured fix (#3809)

* fix v1

* temporary patch for pdfs

* nit
This commit is contained in:
pablonyx 2025-01-28 08:46:27 -08:00 committed by GitHub
parent 74a472ece7
commit 2ad86aa9a6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 15 additions and 7 deletions

View File

@@ -358,7 +358,13 @@ def extract_file_text(
try:
if get_unstructured_api_key():
return unstructured_to_text(file, file_name)
try:
return unstructured_to_text(file, file_name)
except Exception as unstructured_error:
logger.error(
f"Failed to process with Unstructured: {str(unstructured_error)}. Falling back to normal processing."
)
# Fall through to normal processing
if file_name or extension:
if extension is not None:

View File

@@ -52,7 +52,7 @@ def _sdk_partition_request(
def unstructured_to_text(file: IO[Any], file_name: str) -> str:
logger.debug(f"Starting to read file: {file_name}")
req = _sdk_partition_request(file, file_name, strategy="auto")
req = _sdk_partition_request(file, file_name, strategy="fast")
unstructured_client = UnstructuredClient(api_key_auth=get_unstructured_api_key())

View File

@@ -672,23 +672,25 @@ def upload_files_for_chat(
else ChatFileType.PLAIN_TEXT
)
file_content = file.file.read() # Read the file content
if file_type == ChatFileType.IMAGE:
file_content = file.file
file_content_io = file.file
# NOTE: Image conversion to JPEG used to be enforced here.
# This was removed to:
# 1. Preserve original file content for downloads
# 2. Maintain transparency in formats like PNG
# 3. Ameliorate issue with file conversion
else:
file_content = io.BytesIO(file.file.read())
file_content_io = io.BytesIO(file_content)
new_content_type = file.content_type
# store the file (now JPEG for images)
# Store the file normally
file_id = str(uuid.uuid4())
file_store.save_file(
file_name=file_id,
content=file_content,
content=file_content_io,
display_name=file.filename,
file_origin=FileOrigin.CHAT_UPLOAD,
file_type=new_content_type or file_type.value,
@@ -698,7 +700,7 @@ def upload_files_for_chat(
# to re-extract it every time we send a message
if file_type == ChatFileType.DOC:
extracted_text = extract_file_text(
file=file.file,
file=io.BytesIO(file_content), # use the bytes we already read
file_name=file.filename or "",
)
text_file_id = str(uuid.uuid4())