image processing

2025-04-16 07:31:35 +02:00 · 2025-03-20 15:15:04 -07:00 · 2025-03-20 15:15:04 -07:00 · 6f34126dcc
commit 6f34126dcc
parent ecc0ef468f
3 changed files with 76 additions and 22 deletions
--- a/model/models/mistral3/imageproc.go
+++ b/model/models/mistral3/imageproc.go
@ -1,4 +1,4 @@
-package pixtral
+package mistral3

 import (
 	"fmt"
--- a/model/models/mistral3/imageproc_test.go
+++ b/model/models/mistral3/imageproc_test.go
@ -1,4 +1,4 @@
-package pixtral
+package mistral3

 import (
 	"bytes"
--- a/model/models/mistral3/model.go
+++ b/model/models/mistral3/model.go
@ -1,9 +1,14 @@
 package mistral3

 import (
+	"image"
+	_ "image/jpeg"
+	_ "image/png"
+
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/imageproc"
 	"github.com/ollama/ollama/model/input"
 )

@ -11,14 +16,46 @@ type Model struct {
 	model.Base
 	*TextModel

+	ImageProcessor
+
 	// TODO: Add VisionModel field
 	// *VisionModel `gguf:"v,vision"`

 	// TODO: Add MultiModalProjector field for combining vision and text features
 	// *MultiModalProjector `gguf:"mm"`
+}

-	// TODO: Add ImageProcessor field
-	// ImageProcessor
+// ImageProcessor holds the image preprocessing settings that newImageProcessor reads from the model config.
+type ImageProcessor struct {
+	imageSize   int // read from "vision.image_size"
+	patchSize   int // read from "vision.patch_size"; ProcessImage uses it for both components of an image.Point
+	numChannels int // read from "vision.num_channels"
+	longestEdge int // read from "vision.longest_edge"; ProcessImage passes it to getResizeOutputImageSize
+}
+
+// newImageProcessor builds an ImageProcessor from the "vision.*" keys of c (second c.Uint argument is presumably the fallback for an absent key; confirm against ml.Config).
+func newImageProcessor(c ml.Config) ImageProcessor {
+	return ImageProcessor{
+		imageSize:   int(c.Uint("vision.image_size", 1024)),
+		patchSize:   int(c.Uint("vision.patch_size", 16)),
+		numChannels: int(c.Uint("vision.num_channels", 3)),
+		longestEdge: int(c.Uint("vision.longest_edge", 1024)),
+	}
+}
+
+// ProcessImage composites, resizes and normalizes img and returns the resulting []float32; the returned error is always nil.
+func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
+	// Target size is computed by getResizeOutputImageSize from longestEdge and a patchSize x patchSize point
+	outputSize := getResizeOutputImageSize(img, p.longestEdge, image.Point{p.patchSize, p.patchSize})
+
+	// Composite first, then resize to outputSize using imageproc.ResizeBilinear
+	newImage := imageproc.Composite(img)
+	newImage = imageproc.Resize(newImage, outputSize, imageproc.ResizeBilinear)
+
+	// Normalize with imageproc.ClipDefaultMean/ClipDefaultSTD; NOTE(review): meaning of the two true flags is not visible here, check imageproc.Normalize
+	data := imageproc.Normalize(newImage, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
+
+	return data, nil
 }

 // TODO: Implement MultimodalProcessor interface
@ -32,12 +69,12 @@ func New(c ml.Config) (model.Model, error) {

 	m := &Model{
 		TextModel: textModel,
+		// Initialize the ImageProcessor
+		ImageProcessor: newImageProcessor(c),
+
 		// TODO: Initialize VisionModel if present
 		// VisionModel: newVisionModel(c),

-		// TODO: Initialize ImageProcessor
-		// ImageProcessor: newImageProcessor(c),
-
 		// TODO: Initialize MultiModalProjector
 		// MultiModalProjector: &MultiModalProjector{...},
 	}
@ -47,21 +84,38 @@ func New(c ml.Config) (model.Model, error) {
 	return m, nil
 }

-// TODO: Implement EncodeMultimodal method for processing images
-// func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
-//     // Check if vision model is available
-//     // Decode image
-//     // Process the image
-//     // Pass through vision model
-//     // Project vision outputs to text embedding space
-//     // Return vision embeddings
-// }
+// EncodeMultimodal is a placeholder: it ignores ctx and multimodalData and always returns (nil, model.ErrNoVisionModel).
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+	// No check is performed yet; the error is returned unconditionally until a vision model is added
+	return nil, model.ErrNoVisionModel

-// TODO: Implement PostTokenize method to handle vision tokens
-// func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
-//     // Add special tokens around image data
-//     // Insert placeholders for image tokens
-// }
+	// Disabled sketch for later (needs a "bytes" import and shadows package image; NOTE(review): it reshapes with imageSize while ProcessImage resizes to outputSize, confirm they agree):
+	/*
+		image, _, err := image.Decode(bytes.NewReader(multimodalData))
+		if err != nil {
+			return nil, err
+		}
+
+		f32s, err := m.ImageProcessor.ProcessImage(image)
+		if err != nil {
+			return nil, err
+		}
+
+		pixelValues, err := ctx.Input().FromFloatSlice(f32s,
+			m.ImageProcessor.imageSize,
+			m.ImageProcessor.imageSize,
+			m.ImageProcessor.numChannels,
+		)
+		if err != nil {
+			return nil, err
+		}
+
+		// Will need VisionModel to process this
+		// visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
+		// visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs)
+		// return visionOutputs, nil
+	*/
+}

 func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
 	inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs))
@ -79,7 +133,7 @@ func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
 		return nil, err
 	}

-	// TODO: Add handling of multimodal inputs
+	// TODO: Add handling of multimodal inputs when vision model is added
 	// Set image embeddings into hidden state if present in opts.Multimodal

 	return m.TextModel.Forward(ctx, inputs, positions, outputs, opts, m.Cache), nil