From 9951fe13ba0d8d1cc743cc5a6d4523e9dd74af31 Mon Sep 17 00:00:00 2001
From: pablonyx <pablo@danswer.ai>
Date: Sun, 30 Mar 2025 12:28:49 -0700
Subject: [PATCH] Fix image input processing without LLMs (#4390)

* quick fix

* quick fix

* Revert "quick fix"

This reverts commit 906b29bd9b666884a3983ecdeb3f238bb535f1b0.

* nit
---
 backend/onyx/indexing/chunker.py           | 23 +---------------------
 backend/onyx/indexing/indexing_pipeline.py | 20 +++++++++++++------
 2 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/backend/onyx/indexing/chunker.py b/backend/onyx/indexing/chunker.py
index e84f40f0c..3054378ff 100644
--- a/backend/onyx/indexing/chunker.py
+++ b/backend/onyx/indexing/chunker.py
@@ -246,27 +246,6 @@ class Chunker:
         )
         chunks_list.append(new_chunk)
 
-    def _chunk_document(
-        self,
-        document: IndexingDocument,
-        title_prefix: str,
-        metadata_suffix_semantic: str,
-        metadata_suffix_keyword: str,
-        content_token_limit: int,
-    ) -> list[DocAwareChunk]:
-        """
-        Legacy method for backward compatibility.
-        Calls _chunk_document_with_sections with document.sections.
-        """
-        return self._chunk_document_with_sections(
-            document,
-            document.processed_sections,
-            title_prefix,
-            metadata_suffix_semantic,
-            metadata_suffix_keyword,
-            content_token_limit,
-        )
-
     def _chunk_document_with_sections(
         self,
         document: IndexingDocument,
@@ -286,7 +265,7 @@ class Chunker:
 
         for section_idx, section in enumerate(sections):
             # Get section text and other attributes
-            section_text = clean_text(section.text or "")
+            section_text = clean_text(str(section.text or ""))
             section_link_text = section.link or ""
             image_url = section.image_file_name
 
diff --git a/backend/onyx/indexing/indexing_pipeline.py b/backend/onyx/indexing/indexing_pipeline.py
index 99401c709..5dc6d4b25 100644
--- a/backend/onyx/indexing/indexing_pipeline.py
+++ b/backend/onyx/indexing/indexing_pipeline.py
@@ -464,7 +464,7 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
                 **document.dict(),
                 processed_sections=[
                     Section(
-                        text=section.text if isinstance(section, TextSection) else None,
+                        text=section.text if isinstance(section, TextSection) else "",
                         link=section.link,
                         image_file_name=section.image_file_name
                         if isinstance(section, ImageSection)
@@ -484,11 +484,11 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
         for section in document.sections:
             # For ImageSection, process and create base Section with both text and image_file_name
             if isinstance(section, ImageSection):
-                # Default section with image path preserved
+                # Default section with image path preserved - ensure text is always a string
                 processed_section = Section(
                     link=section.link,
                     image_file_name=section.image_file_name,
-                    text=None,  # Will be populated if summarization succeeds
+                    text="",  # Initialize with empty string
                 )
 
                 # Try to get image summary
@@ -531,13 +531,21 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
             # For TextSection, create a base Section with text and link
             elif isinstance(section, TextSection):
                 processed_section = Section(
-                    text=section.text, link=section.link, image_file_name=None
+                    text=section.text or "",  # Ensure text is always a string, not None
+                    link=section.link,
+                    image_file_name=None,
                 )
                 processed_sections.append(processed_section)
 
-            # If it's already a base Section (unlikely), just append it
+            # If it's already a base Section (unlikely), just append it with text validation
             else:
-                processed_sections.append(section)
+                # Ensure text is always a string
+                processed_section = Section(
+                    text=section.text if section.text is not None else "",
+                    link=section.link,
+                    image_file_name=section.image_file_name,
+                )
+                processed_sections.append(processed_section)
 
         # Create IndexingDocument with original sections and processed_sections
         indexed_document = IndexingDocument(