Add image support for search (#4090)

* add support for image search * quick fix up * k * k * k * k * nit * quick fix for connector tests
2025-07-28 13:53:28 +02:00 · 2025-03-05 09:44:18 -08:00
parent f731beca1f
commit 20f2b9b2bb
36 changed files with 1857 additions and 589 deletions
--- a/backend/onyx/file_processing/extract_file_text.py
+++ b/backend/onyx/file_processing/extract_file_text.py
@@ -9,15 +9,17 @@ from email.parser import Parser as EmailParser
 from io import BytesIO
 from pathlib import Path
 from typing import Any
-from typing import Dict
 from typing import IO
+from typing import List
+from typing import Tuple

 import chardet
 import docx  # type: ignore
 import openpyxl  # type: ignore
 import pptx  # type: ignore
-from docx import Document
+from docx import Document as DocxDocument
 from fastapi import UploadFile
+from PIL import Image
 from pypdf import PdfReader
 from pypdf.errors import PdfStreamError

@@ -31,10 +33,8 @@ from onyx.utils.logger import setup_logger

 logger = setup_logger()

-
 TEXT_SECTION_SEPARATOR = "\n\n"

-
 PLAIN_TEXT_FILE_EXTENSIONS = [
    ".txt",
    ".md",
@@ -49,7 +49,6 @@ PLAIN_TEXT_FILE_EXTENSIONS = [
    ".yaml",
 ]

-
 VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
    ".pdf",
    ".docx",
@@ -58,6 +57,16 @@ VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
    ".eml",
    ".epub",
    ".html",
+    ".png",
+    ".jpg",
+    ".jpeg",
+    ".webp",
+]
+
+IMAGE_MEDIA_TYPES = [
+    "image/png",
+    "image/jpeg",
+    "image/webp",
 ]


@@ -67,11 +76,13 @@ def is_text_file_extension(file_name: str) -> bool:

 def get_file_ext(file_path_or_name: str | Path) -> str:
    _, extension = os.path.splitext(file_path_or_name)
-    # standardize all extensions to be lowercase so that checks against
-    # VALID_FILE_EXTENSIONS and similar will work as intended
    return extension.lower()


+def is_valid_media_type(media_type: str) -> bool:
+    """Return True if media_type is one of the image MIME types in IMAGE_MEDIA_TYPES."""
+    return media_type in IMAGE_MEDIA_TYPES
+
+
 def is_valid_file_ext(ext: str) -> bool:
    return ext in VALID_FILE_EXTENSIONS

@@ -79,17 +90,18 @@ def is_valid_file_ext(ext: str) -> bool:
 def is_text_file(file: IO[bytes]) -> bool:
    """
    checks if the first 1024 bytes only contain printable or whitespace characters
-    if it does, then we say its a plaintext file
+    if it does, then we say it's a plaintext file
    """
    raw_data = file.read(1024)
+    file.seek(0)
    text_chars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F})
    return all(c in text_chars for c in raw_data)


 def detect_encoding(file: IO[bytes]) -> str:
    raw_data = file.read(50000)
-    encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
    file.seek(0)
+    encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
    return encoding


@@ -99,14 +111,14 @@ def is_macos_resource_fork_file(file_name: str) -> bool:
    )


-# To include additional metadata in the search index, add a .onyx_metadata.json file
-# to the zip file. This file should contain a list of objects with the following format:
-# [{ "filename": "file1.txt", "link": "https://example.com/file1.txt" }]
 def load_files_from_zip(
    zip_file_io: IO,
    ignore_macos_resource_fork_files: bool = True,
    ignore_dirs: bool = True,
 ) -> Iterator[tuple[zipfile.ZipInfo, IO[Any], dict[str, Any]]]:
+    """
+    If the zip contains a .onyx_metadata.json, yield each subfile together with its metadata entry from that file.
+    """
    with zipfile.ZipFile(zip_file_io, "r") as zip_file:
        zip_metadata = {}
        try:
@@ -118,24 +130,31 @@ def load_files_from_zip(
                        # convert list of dicts to dict of dicts
                        zip_metadata = {d["filename"]: d for d in zip_metadata}
                except json.JSONDecodeError:
-                    logger.warn(f"Unable to load {DANSWER_METADATA_FILENAME}")
+                    logger.warning(f"Unable to load {DANSWER_METADATA_FILENAME}")
        except KeyError:
            logger.info(f"No {DANSWER_METADATA_FILENAME} file")

        for file_info in zip_file.infolist():
-            with zip_file.open(file_info.filename, "r") as file:
-                if ignore_dirs and file_info.is_dir():
-                    continue
+            if ignore_dirs and file_info.is_dir():
+                continue

-                if (
-                    ignore_macos_resource_fork_files
-                    and is_macos_resource_fork_file(file_info.filename)
-                ) or file_info.filename == DANSWER_METADATA_FILENAME:
-                    continue
-                yield file_info, file, zip_metadata.get(file_info.filename, {})
+            if (
+                ignore_macos_resource_fork_files
+                and is_macos_resource_fork_file(file_info.filename)
+            ) or file_info.filename == DANSWER_METADATA_FILENAME:
+                continue
+
+            with zip_file.open(file_info.filename, "r") as subfile:
+                yield file_info, subfile, zip_metadata.get(file_info.filename, {})


 def _extract_onyx_metadata(line: str) -> dict | None:
+    """
+    Example: first line has:
+        <!-- DANSWER_METADATA={"title": "..."} -->
+      or
+        #DANSWER_METADATA={"title":"..."}
+    """
    html_comment_pattern = r"<!--\s*DANSWER_METADATA=\{(.*?)\}\s*-->"
    hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}"

@@ -161,9 +180,13 @@ def read_text_file(
    errors: str = "replace",
    ignore_onyx_metadata: bool = True,
 ) -> tuple[str, dict]:
+    """
+    For plain text files. Optionally extracts Onyx metadata from the first line.
+    """
    metadata = {}
    file_content_raw = ""
    for ind, line in enumerate(file):
+        # decode
        try:
            line = line.decode(encoding) if isinstance(line, bytes) else line
        except UnicodeDecodeError:
@@ -173,131 +196,132 @@ def read_text_file(
                else line
            )

-        if ind == 0:
-            metadata_or_none = (
-                None if ignore_onyx_metadata else _extract_onyx_metadata(line)
-            )
-            if metadata_or_none is not None:
-                metadata = metadata_or_none
-            else:
-                file_content_raw += line
-        else:
-            file_content_raw += line
+        # optionally parse metadata in the first line
+        if ind == 0 and not ignore_onyx_metadata:
+            potential_meta = _extract_onyx_metadata(line)
+            if potential_meta is not None:
+                metadata = potential_meta
+                continue
+
+        file_content_raw += line

    return file_content_raw, metadata


 def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
-    """Extract text from a PDF file."""
-    # Return only the extracted text from read_pdf_file
-    text, _ = read_pdf_file(file, pdf_pass)
+    """
+    Extract text from a PDF. For embedded images, a more complex approach is needed.
+    This is a minimal approach returning text only.
+    """
+    text, _, _ = read_pdf_file(file, pdf_pass)
    return text


 def read_pdf_file(
-    file: IO[Any],
-    pdf_pass: str | None = None,
-) -> tuple[str, dict]:
-    metadata: Dict[str, Any] = {}
+    file: IO[Any], pdf_pass: str | None = None, extract_images: bool = False
+) -> tuple[str, dict, list[tuple[bytes, str]]]:
+    """
+    Returns the text, basic PDF metadata, and optionally extracted images.
+    """
+    metadata: dict[str, Any] = {}
+    extracted_images: list[tuple[bytes, str]] = []
    try:
        pdf_reader = PdfReader(file)

-        # If marked as encrypted and a password is provided, try to decrypt
        if pdf_reader.is_encrypted and pdf_pass is not None:
            decrypt_success = False
-            if pdf_pass is not None:
-                try:
-                    decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
-                except Exception:
-                    logger.error("Unable to decrypt pdf")
+            try:
+                decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
+            except Exception:
+                logger.error("Unable to decrypt pdf")

            if not decrypt_success:
-                # By user request, keep files that are unreadable just so they
-                # can be discoverable by title.
-                return "", metadata
+                return "", metadata, []
        elif pdf_reader.is_encrypted:
-            logger.warning("No Password available to decrypt pdf, returning empty")
-            return "", metadata
+            logger.warning("No Password for an encrypted PDF, returning empty text.")
+            return "", metadata, []

-        # Extract metadata from the PDF, removing leading '/' from keys if present
-        # This standardizes the metadata keys for consistency
-        metadata = {}
+        # Basic PDF metadata
        if pdf_reader.metadata is not None:
            for key, value in pdf_reader.metadata.items():
                clean_key = key.lstrip("/")
                if isinstance(value, str) and value.strip():
                    metadata[clean_key] = value
-
                elif isinstance(value, list) and all(
                    isinstance(item, str) for item in value
                ):
                    metadata[clean_key] = ", ".join(value)

-        return (
-            TEXT_SECTION_SEPARATOR.join(
-                page.extract_text() for page in pdf_reader.pages
-            ),
-            metadata,
+        text = TEXT_SECTION_SEPARATOR.join(
+            page.extract_text() for page in pdf_reader.pages
        )
+
+        if extract_images:
+            for page_num, page in enumerate(pdf_reader.pages):
+                for image_file_object in page.images:
+                    image = Image.open(io.BytesIO(image_file_object.data))
+                    img_byte_arr = io.BytesIO()
+                    image.save(img_byte_arr, format=image.format)
+                    img_bytes = img_byte_arr.getvalue()
+
+                    image_name = (
+                        f"page_{page_num + 1}_image_{image_file_object.name}."
+                        f"{image.format.lower() if image.format else 'png'}"
+                    )
+                    extracted_images.append((img_bytes, image_name))
+
+        return text, metadata, extracted_images
+
    except PdfStreamError:
-        logger.exception("PDF file is not a valid PDF")
+        logger.exception("Invalid PDF file")
    except Exception:
        logger.exception("Failed to read PDF")

-    # File is still discoverable by title
-    # but the contents are not included as they cannot be parsed
-    return "", metadata
+    return "", metadata, []


-def docx_to_text(file: IO[Any]) -> str:
-    def is_simple_table(table: docx.table.Table) -> bool:
-        for row in table.rows:
-            # No omitted cells
-            if row.grid_cols_before > 0 or row.grid_cols_after > 0:
-                return False
-
-            # No nested tables
-            if any(cell.tables for cell in row.cells):
-                return False
-
-        return True
-
-    def extract_cell_text(cell: docx.table._Cell) -> str:
-        cell_paragraphs = [para.text.strip() for para in cell.paragraphs]
-        return " ".join(p for p in cell_paragraphs if p) or "N/A"
-
+def docx_to_text_and_images(
+    file: IO[Any],
+) -> Tuple[str, List[Tuple[bytes, str]]]:
+    """
+    Extract the paragraph text of a docx together with every image embedded in it.
+    Return (text_content, list_of_images).
+    """
    paragraphs = []
+    embedded_images: List[Tuple[bytes, str]] = []
+
    doc = docx.Document(file)
-    for item in doc.iter_inner_content():
-        if isinstance(item, docx.text.paragraph.Paragraph):
-            paragraphs.append(item.text)

-        elif isinstance(item, docx.table.Table):
-            if not item.rows or not is_simple_table(item):
-                continue
+    # Grab text from paragraphs
+    for paragraph in doc.paragraphs:
+        paragraphs.append(paragraph.text)

-            # Every row is a new line, joined with a single newline
-            table_content = "\n".join(
-                [
-                    ",\t".join(extract_cell_text(cell) for cell in row.cells)
-                    for row in item.rows
-                ]
-            )
-            paragraphs.append(table_content)
+    # Embedded images are taken from the relationships of the already-parsed document,
+    # so the input stream does not need to be rewound or read a second time here.
+    # Each image is returned as (raw bytes, base name of its part inside the docx).
+    # NOTE: only doc.paragraphs is read above, so text inside tables is not extracted.

-    # Docx already has good spacing between paragraphs
-    return "\n".join(paragraphs)
+    for rel_id, rel in doc.part.rels.items():
+        if "image" in rel.reltype:
+            # image is typically in rel.target_part.blob
+            image_bytes = rel.target_part.blob
+            image_name = rel.target_part.partname
+            # store
+            embedded_images.append((image_bytes, os.path.basename(str(image_name))))
+
+    text_content = "\n".join(paragraphs)
+    return text_content, embedded_images


 def pptx_to_text(file: IO[Any]) -> str:
    presentation = pptx.Presentation(file)
    text_content = []
    for slide_number, slide in enumerate(presentation.slides, start=1):
-        extracted_text = f"\nSlide {slide_number}:\n"
+        slide_text = f"\nSlide {slide_number}:\n"
        for shape in slide.shapes:
            if hasattr(shape, "text"):
-                extracted_text += shape.text + "\n"
-        text_content.append(extracted_text)
+                slide_text += shape.text + "\n"
+        text_content.append(slide_text)
    return TEXT_SECTION_SEPARATOR.join(text_content)


@@ -305,18 +329,21 @@ def xlsx_to_text(file: IO[Any]) -> str:
    workbook = openpyxl.load_workbook(file, read_only=True)
    text_content = []
    for sheet in workbook.worksheets:
-        sheet_string = "\n".join(
-            ",".join(map(str, row))
-            for row in sheet.iter_rows(min_row=1, values_only=True)
-        )
-        text_content.append(sheet_string)
+        rows = []
+        for row in sheet.iter_rows(min_row=1, values_only=True):
+            row_str = ",".join(str(cell) if cell is not None else "" for cell in row)
+            rows.append(row_str)
+        sheet_str = "\n".join(rows)
+        text_content.append(sheet_str)
    return TEXT_SECTION_SEPARATOR.join(text_content)


 def eml_to_text(file: IO[Any]) -> str:
-    text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
+    encoding = detect_encoding(file)
+    text_file = io.TextIOWrapper(file, encoding=encoding)
    parser = EmailParser()
    message = parser.parse(text_file)
+
    text_content = []
    for part in message.walk():
        if part.get_content_type().startswith("text/plain"):
@@ -342,8 +369,8 @@ def epub_to_text(file: IO[Any]) -> str:

 def file_io_to_text(file: IO[Any]) -> str:
    encoding = detect_encoding(file)
-    file_content_raw, _ = read_text_file(file, encoding=encoding)
-    return file_content_raw
+    file_content, _ = read_text_file(file, encoding=encoding)
+    return file_content


 def extract_file_text(
@@ -352,9 +379,13 @@ def extract_file_text(
    break_on_unprocessable: bool = True,
    extension: str | None = None,
 ) -> str:
+    """
+    Legacy function that returns *only text*, ignoring embedded images.
+    For backward-compatibility in code that only wants text.
+    """
    extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
        ".pdf": pdf_to_text,
-        ".docx": docx_to_text,
+        ".docx": lambda f: docx_to_text_and_images(f)[0],  # no images
        ".pptx": pptx_to_text,
        ".xlsx": xlsx_to_text,
        ".eml": eml_to_text,
@@ -368,24 +399,23 @@ def extract_file_text(
                return unstructured_to_text(file, file_name)
            except Exception as unstructured_error:
                logger.error(
-                    f"Failed to process with Unstructured: {str(unstructured_error)}. Falling back to normal processing."
+                    f"Failed to process with Unstructured: {str(unstructured_error)}. "
+                    "Falling back to normal processing."
                )
-                # Fall through to normal processing
-        final_extension: str
-        if file_name or extension:
-            if extension is not None:
-                final_extension = extension
-            elif file_name is not None:
-                final_extension = get_file_ext(file_name)
+        if extension is None:
+            extension = get_file_ext(file_name)

-            if is_valid_file_ext(final_extension):
-                return extension_to_function.get(final_extension, file_io_to_text)(file)
+        if is_valid_file_ext(extension):
+            func = extension_to_function.get(extension, file_io_to_text)
+            file.seek(0)
+            return func(file)

-        # Either the file somehow has no name or the extension is not one that we recognize
+        # If unknown extension, maybe it's a text file
+        file.seek(0)
        if is_text_file(file):
            return file_io_to_text(file)

-        raise ValueError("Unknown file extension and unknown text encoding")
+        raise ValueError("Unknown file extension or not recognized as text data")

    except Exception as e:
        if break_on_unprocessable:
@@ -396,20 +426,93 @@ def extract_file_text(
        return ""


+def extract_text_and_images(
+    file: IO[Any],
+    file_name: str,
+    pdf_pass: str | None = None,
+) -> Tuple[str, List[Tuple[bytes, str]]]:
+    """
+    Extract the text of a file together with any images embedded in it.
+
+    Embedded images are only collected for .docx and .pdf files. Every other
+    supported type yields its text with an empty image list, and anything that
+    is not recognized (including standalone image files) yields ("", []).
+
+    Args:
+        file: Seekable binary stream; it is rewound before every read attempt.
+        file_name: Name of the file, used to determine its extension.
+        pdf_pass: Optional password, used only when the file is a PDF.
+
+    Returns:
+        (text_content, [(embedded_img_bytes, embedded_img_name), ...]).
+        Any unexpected failure is logged and reported as ("", []).
+    """
+    # Formats for which only text is extracted; embedded images are not collected
+    text_only_extractors: dict[str, Callable[[IO[Any]], str]] = {
+        ".pptx": pptx_to_text,
+        ".xlsx": xlsx_to_text,
+        ".eml": eml_to_text,
+        ".epub": epub_to_text,
+        ".html": parse_html_page_basic,
+    }
+
+    try:
+        # Prefer Unstructured when an API key is configured. It returns text only,
+        # so no embedded images are reported on this path.
+        if get_unstructured_api_key():
+            try:
+                file.seek(0)
+                return (unstructured_to_text(file, file_name), [])
+            except Exception as unstructured_error:
+                # Same policy as extract_file_text: a failing Unstructured call
+                # must not discard a file that can still be parsed locally.
+                logger.error(
+                    f"Failed to process with Unstructured: {str(unstructured_error)}. "
+                    "Falling back to normal processing."
+                )
+
+        extension = get_file_ext(file_name)
+
+        # docx: paragraph text plus the images stored in the document package
+        if extension == ".docx":
+            file.seek(0)
+            return docx_to_text_and_images(file)
+
+        # pdf: text plus the images found on each page
+        if extension == ".pdf":
+            file.seek(0)
+            text_content, _, images = read_pdf_file(file, pdf_pass, extract_images=True)
+            return (text_content, images)
+
+        text_extractor = text_only_extractors.get(extension)
+        if text_extractor is not None:
+            file.seek(0)
+            return (text_extractor(file), [])
+
+        # Plain-text extensions: an Onyx metadata first line, if present, is
+        # parsed and left out of the returned text
+        if is_text_file_extension(file_name):
+            file.seek(0)
+            encoding = detect_encoding(file)
+            text_content_raw, _ = read_text_file(
+                file, encoding=encoding, ignore_onyx_metadata=False
+            )
+            return (text_content_raw, [])
+
+        # Standalone image files and unknown types carry no extractable text here
+        file.seek(0)
+        return ("", [])
+
+    except Exception as e:
+        logger.exception(f"Failed to extract text/images from {file_name}: {e}")
+        return ("", [])
+
+
 def convert_docx_to_txt(
    file: UploadFile, file_store: FileStore, file_path: str
 ) -> None:
+    """
+    Helper to convert docx to a .txt file in the same filestore.
+    """
    file.file.seek(0)
    docx_content = file.file.read()
-    doc = Document(BytesIO(docx_content))
+    doc = DocxDocument(BytesIO(docx_content))

    # Extract text from the document
-    full_text = []
-    for para in doc.paragraphs:
-        full_text.append(para.text)
-
-    # Join the extracted text
-    text_content = "\n".join(full_text)
+    all_paras = [p.text for p in doc.paragraphs]
+    text_content = "\n".join(all_paras)

    txt_file_path = docx_to_txt_filename(file_path)
    file_store.save_file(
@@ -422,7 +525,4 @@ def convert_docx_to_txt(


 def docx_to_txt_filename(file_path: str) -> str:
-    """
-    Convert a .docx file path to its corresponding .txt file path.
-    """
    return file_path.rsplit(".", 1)[0] + ".txt"
--- a/backend/onyx/file_processing/file_validation.py
+++ b/backend/onyx/file_processing/file_validation.py
@@ -0,0 +1,46 @@
+"""
+Centralized file type validation utilities.
+"""
+# Standard image MIME types supported by most vision LLMs.
+# This is the exact list checked by is_supported_by_vision_llm.
+# NOTE(review): "image/jpg" is presumably a tolerated alias of "image/jpeg" - confirm
+IMAGE_MIME_TYPES = [
+    "image/png",
+    "image/jpeg",
+    "image/jpg",
+    "image/webp",
+]
+
+# Image types that should be excluded from processing.
+# is_valid_image_type rejects these even though they start with "image/".
+EXCLUDED_IMAGE_TYPES = [
+    "image/bmp",
+    "image/tiff",
+    "image/gif",
+    "image/svg+xml",
+]
+
+
+def is_valid_image_type(mime_type: str) -> bool:
+    """
+    Tell whether a MIME type names an image format that is not excluded.
+
+    Args:
+        mime_type: The MIME type to check; an empty or missing value is rejected
+
+    Returns:
+        True if mime_type starts with "image/" and is not listed in
+        EXCLUDED_IMAGE_TYPES, False otherwise
+    """
+    is_image = bool(mime_type) and mime_type.startswith("image/")
+    return is_image and mime_type not in EXCLUDED_IMAGE_TYPES
+
+
+def is_supported_by_vision_llm(mime_type: str) -> bool:
+    """
+    Check if this image type can be processed by vision LLMs.
+
+    The check is an exact membership test against IMAGE_MIME_TYPES, so the
+    value must match one of the listed strings character for character.
+
+    Args:
+        mime_type: The MIME type to check
+
+    Returns:
+        True if the MIME type is listed in IMAGE_MIME_TYPES, False otherwise
+    """
+    return mime_type in IMAGE_MIME_TYPES
--- a/backend/onyx/file_processing/image_summarization.py
+++ b/backend/onyx/file_processing/image_summarization.py
@@ -0,0 +1,129 @@
+import base64
+from io import BytesIO
+
+from langchain_core.messages import BaseMessage
+from langchain_core.messages import HumanMessage
+from langchain_core.messages import SystemMessage
+from PIL import Image
+
+from onyx.llm.interfaces import LLM
+from onyx.llm.utils import message_to_string
+from onyx.prompts.image_analysis import IMAGE_SUMMARIZATION_SYSTEM_PROMPT
+from onyx.prompts.image_analysis import IMAGE_SUMMARIZATION_USER_PROMPT
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def prepare_image_bytes(image_data: bytes) -> str:
+    """Turn raw image bytes into the encoded string used in LLM prompts.
+
+    The bytes first go through _resize_image_if_needed, which shrinks images over
+    its size limit (20MB by default), and the result is then encoded by
+    _encode_image_for_llm_prompt."""
+    return _encode_image_for_llm_prompt(_resize_image_if_needed(image_data))
+
+
+def summarize_image_pipeline(
+    llm: LLM,
+    image_data: bytes,
+    query: str | None = None,
+    system_prompt: str | None = None,
+) -> str:
+    """Generate a textual summary of an image.
+
+    The image is prepared with prepare_image_bytes (resized if it is bigger than
+    20MB, then base64-encoded) and handed to the given LLM together with the
+    optional query and system prompt. Returns the summary text."""
+    return _summarize_image(
+        prepare_image_bytes(image_data),
+        llm,
+        query,
+        system_prompt,
+    )
+
+
+def summarize_image_with_error_handling(
+    llm: LLM | None,
+    image_data: bytes,
+    context_name: str,
+    system_prompt: str = IMAGE_SUMMARIZATION_SYSTEM_PROMPT,
+    user_prompt_template: str = IMAGE_SUMMARIZATION_USER_PROMPT,
+) -> str | None:
+    """Summarize an image when an LLM is available, using the configured prompts.
+
+    Args:
+        llm: The LLM with vision capabilities to use for summarization, or None
+        image_data: The raw image bytes
+        context_name: Name or title of the image, substituted into the user prompt
+        system_prompt: System prompt to use for the LLM
+        user_prompt_template: Template for the user prompt, formatted with title=context_name
+
+    Returns:
+        The image summary text, or None when no LLM was given.
+
+    Raises:
+        Exceptions from summarize_image_pipeline are not caught here and
+        propagate to the caller.
+    """
+    if llm is None:
+        return None
+
+    return summarize_image_pipeline(
+        llm,
+        image_data,
+        user_prompt_template.format(title=context_name),
+        system_prompt,
+    )
+
+
+def _summarize_image(
+    encoded_image: str,
+    llm: LLM,
+    query: str | None = None,
+    system_prompt: str | None = None,
+) -> str:
+    """Ask the LLM (which must be multimodal) for a summary of an encoded image.
+
+    Args:
+        encoded_image: The image as a URL string (a base64 data URL in this module)
+        llm: The LLM to invoke
+        query: Text sent alongside the image in the human message
+        system_prompt: Optional system message placed before the human message
+
+    Returns:
+        The LLM response converted to a string.
+
+    Raises:
+        ValueError: If invoking the LLM or converting its response fails; the
+            original exception is attached as the cause.
+    """
+
+    messages: list[BaseMessage] = []
+
+    if system_prompt:
+        messages.append(SystemMessage(content=system_prompt))
+
+    messages.append(
+        HumanMessage(
+            content=[
+                {"type": "text", "text": query},
+                {"type": "image_url", "image_url": {"url": encoded_image}},
+            ],
+        ),
+    )
+
+    try:
+        return message_to_string(llm.invoke(messages))
+
+    except Exception as e:
+        # Report only the prompts: the messages also hold the complete base64
+        # image, which would put megabytes of data into the error text and logs.
+        raise ValueError(
+            f"Summarization failed. System prompt: {system_prompt!r}. Query: {query!r}"
+        ) from e
+
+
+def _encode_image_for_llm_prompt(image_data: bytes) -> str:
+    """Build a base64 data URL for the image.
+
+    The media type in the URL is taken from the leading magic bytes of the data
+    (PNG, WebP or GIF), so that it matches the payload. JPEG data, and anything
+    that is not recognized, is declared as image/jpeg.
+    """
+    if image_data.startswith(b"\x89PNG\r\n\x1a\n"):
+        media_type = "image/png"
+    elif image_data[:4] == b"RIFF" and image_data[8:12] == b"WEBP":
+        media_type = "image/webp"
+    elif image_data.startswith((b"GIF87a", b"GIF89a")):
+        media_type = "image/gif"
+    else:
+        # JPEG, and the fallback for unrecognized data
+        media_type = "image/jpeg"
+
+    base64_encoded_data = base64.b64encode(image_data).decode("utf-8")
+
+    return f"data:{media_type};base64,{base64_encoded_data}"
+
+
+def _resize_image_if_needed(image_data: bytes, max_size_mb: int = 20) -> bytes:
+    """Shrink an image that is larger than the specified max size in MB.
+
+    Data at or below the limit is returned unchanged. Larger images are scaled
+    down to fit within 1024x1024 and re-encoded as JPEG with quality 85.
+    """
+    max_size_bytes = max_size_mb * 1024 * 1024
+
+    if len(image_data) <= max_size_bytes:
+        return image_data
+
+    with Image.open(BytesIO(image_data)) as img:
+        # JPEG cannot store alpha or palette data, so images in such modes
+        # (e.g. RGBA or P, common for PNG) are converted before saving.
+        if img.mode in ("RGB", "L", "CMYK"):
+            jpeg_ready = img
+        else:
+            jpeg_ready = img.convert("RGB")
+
+        # Reduce dimensions for better size reduction
+        jpeg_ready.thumbnail((1024, 1024), Image.Resampling.LANCZOS)
+
+        output = BytesIO()
+        jpeg_ready.save(output, format="JPEG", quality=85)
+        return output.getvalue()
--- a/backend/onyx/file_processing/image_utils.py
+++ b/backend/onyx/file_processing/image_utils.py
@@ -0,0 +1,70 @@
+from typing import Tuple
+
+from sqlalchemy.orm import Session
+
+from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
+from onyx.configs.constants import FileOrigin
+from onyx.connectors.models import Section
+from onyx.db.pg_file_store import save_bytes_to_pgfilestore
+from onyx.file_processing.image_summarization import summarize_image_with_error_handling
+from onyx.llm.interfaces import LLM
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def store_image_and_create_section(
+    db_session: Session,
+    image_data: bytes,
+    file_name: str,
+    display_name: str,
+    media_type: str = "image/unknown",
+    llm: LLM | None = None,
+    file_origin: FileOrigin = FileOrigin.OTHER,
+) -> Tuple[Section, str | None]:
+    """
+    Save an image to PGFileStore and build a Section that references it.
+
+    Args:
+        db_session: Database session
+        image_data: Raw image bytes
+        file_name: Base identifier for the file
+        display_name: Human-readable name for the image, also passed to the summarizer
+        media_type: MIME type of the image
+        llm: Optional LLM with vision capabilities; when given, the image is summarized
+        file_origin: Origin of the file (e.g., CONFLUENCE, GOOGLE_DRIVE, etc.)
+
+    Returns:
+        Tuple containing:
+        - Section whose text is the summary ("" when there is none) and which
+          references the stored image
+        - The file_name in PGFileStore
+        If storing fails and CONTINUE_ON_CONNECTOR_FAILURE is set, the result is
+        (Section(text=""), None); otherwise the storage error is re-raised.
+    """
+    # Persist the image first; nothing else happens if this step fails
+    try:
+        stored_file_name = save_bytes_to_pgfilestore(
+            db_session=db_session,
+            raw_bytes=image_data,
+            media_type=media_type,
+            identifier=file_name,
+            display_name=display_name,
+            file_origin=file_origin,
+        ).file_name
+    except Exception as e:
+        logger.error(f"Failed to store image: {e}")
+        if not CONTINUE_ON_CONNECTOR_FAILURE:
+            raise
+        return Section(text=""), None
+
+    # Summarize only when an LLM was supplied; a missing summary becomes ""
+    if llm:
+        summary_text = (
+            summarize_image_with_error_handling(llm, image_data, display_name) or ""
+        )
+    else:
+        summary_text = ""
+
+    section = Section(text=summary_text, image_file_name=stored_file_name)
+    return section, stored_file_name