From 9951fe13ba0d8d1cc743cc5a6d4523e9dd74af31 Mon Sep 17 00:00:00 2001 From: pablonyx Date: Sun, 30 Mar 2025 12:28:49 -0700 Subject: [PATCH] Fix image input processing without LLMs (#4390) * quick fix * quick fix * Revert "quick fix" This reverts commit 906b29bd9b666884a3983ecdeb3f238bb535f1b0. * nit --- backend/onyx/indexing/chunker.py | 23 +--------------------- backend/onyx/indexing/indexing_pipeline.py | 20 +++++++++++++------ 2 files changed, 15 insertions(+), 28 deletions(-) diff --git a/backend/onyx/indexing/chunker.py b/backend/onyx/indexing/chunker.py index e84f40f0c..3054378ff 100644 --- a/backend/onyx/indexing/chunker.py +++ b/backend/onyx/indexing/chunker.py @@ -246,27 +246,6 @@ class Chunker: ) chunks_list.append(new_chunk) - def _chunk_document( - self, - document: IndexingDocument, - title_prefix: str, - metadata_suffix_semantic: str, - metadata_suffix_keyword: str, - content_token_limit: int, - ) -> list[DocAwareChunk]: - """ - Legacy method for backward compatibility. - Calls _chunk_document_with_sections with document.sections. - """ - return self._chunk_document_with_sections( - document, - document.processed_sections, - title_prefix, - metadata_suffix_semantic, - metadata_suffix_keyword, - content_token_limit, - ) - def _chunk_document_with_sections( self, document: IndexingDocument, @@ -286,7 +265,7 @@ class Chunker: for section_idx, section in enumerate(sections): # Get section text and other attributes - section_text = clean_text(section.text or "") + section_text = clean_text(str(section.text or "")) section_link_text = section.link or "" image_url = section.image_file_name diff --git a/backend/onyx/indexing/indexing_pipeline.py b/backend/onyx/indexing/indexing_pipeline.py index 99401c709..5dc6d4b25 100644 --- a/backend/onyx/indexing/indexing_pipeline.py +++ b/backend/onyx/indexing/indexing_pipeline.py @@ -464,7 +464,7 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]: **document.dict(), processed_sections=[ Section( - text=section.text if isinstance(section, TextSection) else None, + text=section.text if isinstance(section, TextSection) else "", link=section.link, image_file_name=section.image_file_name if isinstance(section, ImageSection) @@ -484,11 +484,11 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]: for section in document.sections: # For ImageSection, process and create base Section with both text and image_file_name if isinstance(section, ImageSection): - # Default section with image path preserved + # Default section with image path preserved - ensure text is always a string processed_section = Section( link=section.link, image_file_name=section.image_file_name, - text=None, # Will be populated if summarization succeeds + text="", # Initialize with empty string ) # Try to get image summary @@ -531,13 +531,21 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]: # For TextSection, create a base Section with text and link elif isinstance(section, TextSection): processed_section = Section( - text=section.text, link=section.link, image_file_name=None + text=section.text or "", # Ensure text is always a string, not None + link=section.link, + image_file_name=None, ) processed_sections.append(processed_section) - # If it's already a base Section (unlikely), just append it + # If it's already a base Section (unlikely), just append it with text validation else: - processed_sections.append(section) + # Ensure text is always a string + processed_section = Section( + text=section.text if section.text is not None else "", + link=section.link, + image_file_name=section.image_file_name, + ) + processed_sections.append(processed_section) # Create IndexingDocument with original sections and processed_sections indexed_document = IndexingDocument(