Fix image input processing without LLMs (#4390)

* quick fix
* quick fix
* Revert "quick fix"

  This reverts commit 906b29bd9b666884a3983ecdeb3f238bb535f1b0.
* nit
2025-04-22 22:44:21 +02:00 · 2025-03-30 12:28:49 -07:00 · 2025-03-30 12:28:49 -07:00 · 9951fe13ba
commit 9951fe13ba
parent 56f8ab927b
2 changed files with 15 additions and 28 deletions
--- a/backend/onyx/indexing/chunker.py
+++ b/backend/onyx/indexing/chunker.py
@@ -246,27 +246,6 @@ class Chunker:
        )
        chunks_list.append(new_chunk)

-    def _chunk_document(
-        self,
-        document: IndexingDocument,
-        title_prefix: str,
-        metadata_suffix_semantic: str,
-        metadata_suffix_keyword: str,
-        content_token_limit: int,
-    ) -> list[DocAwareChunk]:
-        """
-        Legacy method for backward compatibility.
-        Calls _chunk_document_with_sections with document.sections.
-        """
-        return self._chunk_document_with_sections(
-            document,
-            document.processed_sections,
-            title_prefix,
-            metadata_suffix_semantic,
-            metadata_suffix_keyword,
-            content_token_limit,
-        )
-
    def _chunk_document_with_sections(
        self,
        document: IndexingDocument,
@@ -286,7 +265,7 @@ class Chunker:

        for section_idx, section in enumerate(sections):
            # Get section text and other attributes
-            section_text = clean_text(section.text or "")
+            section_text = clean_text(str(section.text or ""))
            section_link_text = section.link or ""
            image_url = section.image_file_name

--- a/backend/onyx/indexing/indexing_pipeline.py
+++ b/backend/onyx/indexing/indexing_pipeline.py
@@ -464,7 +464,7 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
                **document.dict(),
                processed_sections=[
                    Section(
-                        text=section.text if isinstance(section, TextSection) else None,
+                        text=section.text if isinstance(section, TextSection) else "",
                        link=section.link,
                        image_file_name=section.image_file_name
                        if isinstance(section, ImageSection)
@@ -484,11 +484,11 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
        for section in document.sections:
            # For ImageSection, process and create base Section with both text and image_file_name
            if isinstance(section, ImageSection):
-                # Default section with image path preserved
+                # Default section with image path preserved - ensure text is always a string
                processed_section = Section(
                    link=section.link,
                    image_file_name=section.image_file_name,
-                    text=None,  # Will be populated if summarization succeeds
+                    text="",  # Initialize with empty string
                )

                # Try to get image summary
@@ -531,13 +531,21 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
            # For TextSection, create a base Section with text and link
            elif isinstance(section, TextSection):
                processed_section = Section(
-                    text=section.text, link=section.link, image_file_name=None
+                    text=section.text or "",  # Ensure text is always a string, not None
+                    link=section.link,
+                    image_file_name=None,
                )
                processed_sections.append(processed_section)

-            # If it's already a base Section (unlikely), just append it
+            # If it's already a base Section (unlikely), just append it with text validation
            else:
-                processed_sections.append(section)
+                # Ensure text is always a string
+                processed_section = Section(
+                    text=section.text if section.text is not None else "",
+                    link=section.link,
+                    image_file_name=section.image_file_name,
+                )
+                processed_sections.append(processed_section)

        # Create IndexingDocument with original sections and processed_sections
        indexed_document = IndexingDocument(