From 6f34126dcc67ad8a1419f97c46b3b75fee506b01 Mon Sep 17 00:00:00 2001
From: Bruce MacDonald
Date: Thu, 20 Mar 2025 15:15:04 -0700
Subject: [PATCH] image processing

---
 .../models/{pixtral => mistral3}/imageproc.go |  2 +-
 .../{pixtral => mistral3}/imageproc_test.go   |  2 +-
 model/models/mistral3/model.go                | 94 +++++++++++++++----
 3 files changed, 76 insertions(+), 22 deletions(-)
 rename model/models/{pixtral => mistral3}/imageproc.go (98%)
 rename model/models/{pixtral => mistral3}/imageproc_test.go (99%)

diff --git a/model/models/pixtral/imageproc.go b/model/models/mistral3/imageproc.go
similarity index 98%
rename from model/models/pixtral/imageproc.go
rename to model/models/mistral3/imageproc.go
index 16ec0c410..78c1ddf7c 100644
--- a/model/models/pixtral/imageproc.go
+++ b/model/models/mistral3/imageproc.go
@@ -1,4 +1,4 @@
-package pixtral
+package mistral3
 
 import (
 	"fmt"
diff --git a/model/models/pixtral/imageproc_test.go b/model/models/mistral3/imageproc_test.go
similarity index 99%
rename from model/models/pixtral/imageproc_test.go
rename to model/models/mistral3/imageproc_test.go
index 1d9e4ffe5..2ec634132 100644
--- a/model/models/pixtral/imageproc_test.go
+++ b/model/models/mistral3/imageproc_test.go
@@ -1,4 +1,4 @@
-package pixtral
+package mistral3
 
 import (
 	"bytes"
diff --git a/model/models/mistral3/model.go b/model/models/mistral3/model.go
index e8f7d2a6e..e22c2c95d 100644
--- a/model/models/mistral3/model.go
+++ b/model/models/mistral3/model.go
@@ -1,9 +1,14 @@
 package mistral3
 
 import (
+	"image"
+	_ "image/jpeg"
+	_ "image/png"
+
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/imageproc"
 	"github.com/ollama/ollama/model/input"
 )
 
@@ -11,14 +16,46 @@ type Model struct {
 	model.Base
 	*TextModel
+	ImageProcessor
+
 	// TODO: Add VisionModel field
 	// *VisionModel `gguf:"v,vision"`
 
 	// TODO: Add MultiModalProjector field for combining vision and text features
 	// *MultiModalProjector `gguf:"mm"`
+}
 
-	// TODO: Add ImageProcessor field
-	// ImageProcessor
+// Adding ImageProcessor struct
+type ImageProcessor struct {
+	imageSize   int
+	patchSize   int
+	numChannels int
+	longestEdge int
+}
+
+// Function to create a new ImageProcessor
+func newImageProcessor(c ml.Config) ImageProcessor {
+	return ImageProcessor{
+		imageSize:   int(c.Uint("vision.image_size", 1024)),
+		patchSize:   int(c.Uint("vision.patch_size", 16)),
+		numChannels: int(c.Uint("vision.num_channels", 3)),
+		longestEdge: int(c.Uint("vision.longest_edge", 1024)),
+	}
+}
+
+// Method to process images for the model
+func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
+	// Get output size based on longest edge and patch size
+	outputSize := getResizeOutputImageSize(img, p.longestEdge, image.Point{p.patchSize, p.patchSize})
+
+	// Resize the image
+	newImage := imageproc.Composite(img)
+	newImage = imageproc.Resize(newImage, outputSize, imageproc.ResizeBilinear)
+
+	// Normalize image data
+	data := imageproc.Normalize(newImage, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
+
+	return data, nil
 }
 
 // TODO: Implement MultimodalProcessor interface
@@ -32,12 +69,12 @@ func New(c ml.Config) (model.Model, error) {
 	m := &Model{
 		TextModel: textModel,
 
+		// Initialize the ImageProcessor
+		ImageProcessor: newImageProcessor(c),
+
 		// TODO: Initialize VisionModel if present
 		// VisionModel: newVisionModel(c),
 
-		// TODO: Initialize ImageProcessor
-		// ImageProcessor: newImageProcessor(c),
-
 		// TODO: Initialize MultiModalProjector
 		// MultiModalProjector: &MultiModalProjector{...},
 	}
@@ -47,21 +84,38 @@ func New(c ml.Config) (model.Model, error) {
 	return m, nil
 }
 
-// TODO: Implement EncodeMultimodal method for processing images
-// func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
-// 	// Check if vision model is available
-// 	// Decode image
-// 	// Process the image
-// 	// Pass through vision model
-// 	// Project vision outputs to text embedding space
-// 	// Return vision embeddings
-// }
+// Implement EncodeMultimodal method for processing images
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+	// Check if vision model exists - return error for now
+	return nil, model.ErrNoVisionModel
 
-// TODO: Implement PostTokenize method to handle vision tokens
-// func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
-// 	// Add special tokens around image data
-// 	// Insert placeholders for image tokens
-// }
+	// This will be implemented when adding the vision model:
+	/*
+		image, _, err := image.Decode(bytes.NewReader(multimodalData))
+		if err != nil {
+			return nil, err
+		}
+
+		f32s, err := m.ImageProcessor.ProcessImage(image)
+		if err != nil {
+			return nil, err
+		}
+
+		pixelValues, err := ctx.Input().FromFloatSlice(f32s,
+			m.ImageProcessor.imageSize,
+			m.ImageProcessor.imageSize,
+			m.ImageProcessor.numChannels,
+		)
+		if err != nil {
+			return nil, err
+		}
+
+		// Will need VisionModel to process this
+		// visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
+		// visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs)
+		// return visionOutputs, nil
+	*/
+}
 
 func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
 	inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs))
@@ -79,7 +133,7 @@ func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
 		return nil, err
 	}
 
-	// TODO: Add handling of multimodal inputs
+	// TODO: Add handling of multimodal inputs when vision model is added
 	// Set image embeddings into hidden state if present in opts.Multimodal
 
 	return m.TextModel.Forward(ctx, inputs, positions, outputs, opts, m.Cache), nil
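
A rough way to exercise the new preprocessing path, outside of this patch: ImageProcessor, ProcessImage, and the helpers from the renamed imageproc.go all live in package mistral3, so an in-package test can call ProcessImage directly on a synthetic image. The sketch below is illustrative only; the test name and the 64x64 input size are invented, the struct literal mirrors the defaults newImageProcessor falls back to, and newImageProcessor itself is skipped because it needs a real ml.Config from a loaded model.

package mistral3

import (
	"image"
	"testing"
)

// TestProcessImageSketch is a hypothetical test, not part of the patch. The
// field values below mirror the config defaults used by newImageProcessor.
func TestProcessImageSketch(t *testing.T) {
	p := ImageProcessor{
		imageSize:   1024,
		patchSize:   16,
		numChannels: 3,
		longestEdge: 1024,
	}

	// A blank 64x64 RGBA image stands in for a decoded JPEG or PNG.
	img := image.NewRGBA(image.Rect(0, 0, 64, 64))

	data, err := p.ProcessImage(img)
	if err != nil {
		t.Fatal(err)
	}

	// ProcessImage flattens the resized image into numChannels float32 values
	// per pixel, so the output should be a non-zero multiple of the channel count.
	if len(data) == 0 || len(data)%p.numChannels != 0 {
		t.Fatalf("unexpected output length %d", len(data))
	}
}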