mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-04-22 22:44:21 +02:00
Fix image input processing without LLMs (#4390)
* quick fix * quick fix * Revert "quick fix" This reverts commit 906b29bd9b666884a3983ecdeb3f238bb535f1b0. * nit
This commit is contained in:
parent
56f8ab927b
commit
9951fe13ba
@ -246,27 +246,6 @@ class Chunker:
|
||||
)
|
||||
chunks_list.append(new_chunk)
|
||||
|
||||
def _chunk_document(
|
||||
self,
|
||||
document: IndexingDocument,
|
||||
title_prefix: str,
|
||||
metadata_suffix_semantic: str,
|
||||
metadata_suffix_keyword: str,
|
||||
content_token_limit: int,
|
||||
) -> list[DocAwareChunk]:
|
||||
"""
|
||||
Legacy method for backward compatibility.
|
||||
Calls _chunk_document_with_sections with document.sections.
|
||||
"""
|
||||
return self._chunk_document_with_sections(
|
||||
document,
|
||||
document.processed_sections,
|
||||
title_prefix,
|
||||
metadata_suffix_semantic,
|
||||
metadata_suffix_keyword,
|
||||
content_token_limit,
|
||||
)
|
||||
|
||||
def _chunk_document_with_sections(
|
||||
self,
|
||||
document: IndexingDocument,
|
||||
@ -286,7 +265,7 @@ class Chunker:
|
||||
|
||||
for section_idx, section in enumerate(sections):
|
||||
# Get section text and other attributes
|
||||
section_text = clean_text(section.text or "")
|
||||
section_text = clean_text(str(section.text or ""))
|
||||
section_link_text = section.link or ""
|
||||
image_url = section.image_file_name
|
||||
|
||||
|
@ -464,7 +464,7 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
|
||||
**document.dict(),
|
||||
processed_sections=[
|
||||
Section(
|
||||
text=section.text if isinstance(section, TextSection) else None,
|
||||
text=section.text if isinstance(section, TextSection) else "",
|
||||
link=section.link,
|
||||
image_file_name=section.image_file_name
|
||||
if isinstance(section, ImageSection)
|
||||
@ -484,11 +484,11 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
|
||||
for section in document.sections:
|
||||
# For ImageSection, process and create base Section with both text and image_file_name
|
||||
if isinstance(section, ImageSection):
|
||||
# Default section with image path preserved
|
||||
# Default section with image path preserved - ensure text is always a string
|
||||
processed_section = Section(
|
||||
link=section.link,
|
||||
image_file_name=section.image_file_name,
|
||||
text=None, # Will be populated if summarization succeeds
|
||||
text="", # Initialize with empty string
|
||||
)
|
||||
|
||||
# Try to get image summary
|
||||
@ -531,13 +531,21 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
|
||||
# For TextSection, create a base Section with text and link
|
||||
elif isinstance(section, TextSection):
|
||||
processed_section = Section(
|
||||
text=section.text, link=section.link, image_file_name=None
|
||||
text=section.text or "", # Ensure text is always a string, not None
|
||||
link=section.link,
|
||||
image_file_name=None,
|
||||
)
|
||||
processed_sections.append(processed_section)
|
||||
|
||||
# If it's already a base Section (unlikely), just append it
|
||||
# If it's already a base Section (unlikely), just append it with text validation
|
||||
else:
|
||||
processed_sections.append(section)
|
||||
# Ensure text is always a string
|
||||
processed_section = Section(
|
||||
text=section.text if section.text is not None else "",
|
||||
link=section.link,
|
||||
image_file_name=section.image_file_name,
|
||||
)
|
||||
processed_sections.append(processed_section)
|
||||
|
||||
# Create IndexingDocument with original sections and processed_sections
|
||||
indexed_document = IndexingDocument(
|
||||
|
Loading…
x
Reference in New Issue
Block a user