From 6f34126dcc67ad8a1419f97c46b3b75fee506b01 Mon Sep 17 00:00:00 2001
From: Bruce MacDonald
Date: Thu, 20 Mar 2025 15:15:04 -0700
Subject: [PATCH] image processing

---
 .../models/{pixtral => mistral3}/imageproc.go |  2 +-
 .../{pixtral => mistral3}/imageproc_test.go   |  2 +-
 model/models/mistral3/model.go                | 94 +++++++++++++++----
 3 files changed, 76 insertions(+), 22 deletions(-)
 rename model/models/{pixtral => mistral3}/imageproc.go (98%)
 rename model/models/{pixtral => mistral3}/imageproc_test.go (99%)

diff --git a/model/models/pixtral/imageproc.go b/model/models/mistral3/imageproc.go
similarity index 98%
rename from model/models/pixtral/imageproc.go
rename to model/models/mistral3/imageproc.go
index 16ec0c410..78c1ddf7c 100644
--- a/model/models/pixtral/imageproc.go
+++ b/model/models/mistral3/imageproc.go
@@ -1,4 +1,4 @@
-package pixtral
+package mistral3
 
 import (
 	"fmt"
diff --git a/model/models/pixtral/imageproc_test.go b/model/models/mistral3/imageproc_test.go
similarity index 99%
rename from model/models/pixtral/imageproc_test.go
rename to model/models/mistral3/imageproc_test.go
index 1d9e4ffe5..2ec634132 100644
--- a/model/models/pixtral/imageproc_test.go
+++ b/model/models/mistral3/imageproc_test.go
@@ -1,4 +1,4 @@
-package pixtral
+package mistral3
 
 import (
 	"bytes"
diff --git a/model/models/mistral3/model.go b/model/models/mistral3/model.go
index e8f7d2a6e..e22c2c95d 100644
--- a/model/models/mistral3/model.go
+++ b/model/models/mistral3/model.go
@@ -1,9 +1,14 @@
 package mistral3
 
 import (
+	"image"
+	_ "image/jpeg"
+	_ "image/png"
+
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/imageproc"
 	"github.com/ollama/ollama/model/input"
 )
 
@@ -11,14 +16,46 @@ type Model struct {
 	model.Base
 	*TextModel
+	ImageProcessor
+
 	// TODO: Add VisionModel field
 	// *VisionModel `gguf:"v,vision"`
 
 	// TODO: Add MultiModalProjector field for combining vision and text features
 	// *MultiModalProjector `gguf:"mm"`
+}
 
-	// TODO: Add ImageProcessor field
-	// ImageProcessor
+// Adding ImageProcessor struct
+type ImageProcessor struct {
+	imageSize   int
+	patchSize   int
+	numChannels int
+	longestEdge int
+}
+
+// Function to create a new ImageProcessor
+func newImageProcessor(c ml.Config) ImageProcessor {
+	return ImageProcessor{
+		imageSize:   int(c.Uint("vision.image_size", 1024)),
+		patchSize:   int(c.Uint("vision.patch_size", 16)),
+		numChannels: int(c.Uint("vision.num_channels", 3)),
+		longestEdge: int(c.Uint("vision.longest_edge", 1024)),
+	}
+}
+
+// Method to process images for the model
+func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
+	// Get output size based on longest edge and patch size
+	outputSize := getResizeOutputImageSize(img, p.longestEdge, image.Point{p.patchSize, p.patchSize})
+
+	// Resize the image
+	newImage := imageproc.Composite(img)
+	newImage = imageproc.Resize(newImage, outputSize, imageproc.ResizeBilinear)
+
+	// Normalize image data
+	data := imageproc.Normalize(newImage, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
+
+	return data, nil
 }
 
 // TODO: Implement MultimodalProcessor interface
@@ -32,12 +69,12 @@ func New(c ml.Config) (model.Model, error) {
 	m := &Model{
 		TextModel: textModel,
 
+		// Initialize the ImageProcessor
+		ImageProcessor: newImageProcessor(c),
+
 		// TODO: Initialize VisionModel if present
 		// VisionModel: newVisionModel(c),
 
-		// TODO: Initialize ImageProcessor
-		// ImageProcessor: newImageProcessor(c),
-
 		// TODO: Initialize MultiModalProjector
 		// MultiModalProjector: &MultiModalProjector{...},
 	}
@@ -47,21 +84,38 @@ func New(c ml.Config) (model.Model, error) {
 	return m, nil
 }
 
-// TODO: Implement EncodeMultimodal method for processing images
-// func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
-// 	// Check if vision model is available
-// 	// Decode image
-// 	// Process the image
-// 	// Pass through vision model
-// 	// Project vision outputs to text embedding space
-// 	// Return vision embeddings
-// }
+// Implement EncodeMultimodal method for processing images
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+	// Check if vision model exists - return error for now
+	return nil, model.ErrNoVisionModel
 
-// TODO: Implement PostTokenize method to handle vision tokens
-// func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
-// 	// Add special tokens around image data
-// 	// Insert placeholders for image tokens
-// }
+	// This will be implemented when adding the vision model:
+	/*
+		image, _, err := image.Decode(bytes.NewReader(multimodalData))
+		if err != nil {
+			return nil, err
+		}
+
+		f32s, err := m.ImageProcessor.ProcessImage(image)
+		if err != nil {
+			return nil, err
+		}
+
+		pixelValues, err := ctx.Input().FromFloatSlice(f32s,
+			m.ImageProcessor.imageSize,
+			m.ImageProcessor.imageSize,
+			m.ImageProcessor.numChannels,
+		)
+		if err != nil {
+			return nil, err
+		}
+
+		// Will need VisionModel to process this
+		// visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
+		// visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs)
+		// return visionOutputs, nil
+	*/
+}
 
 func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
 	inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs))
@@ -79,7 +133,7 @@ func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
 		return nil, err
 	}
 
-	// TODO: Add handling of multimodal inputs
+	// TODO: Add handling of multimodal inputs when vision model is added
 	// Set image embeddings into hidden state if present in opts.Multimodal
 
 	return m.TextModel.Forward(ctx, inputs, positions, outputs, opts, m.Cache), nil
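
A rough way to exercise the new preprocessing path, outside of this patch: ImageProcessor, ProcessImage, and the helpers from the renamed imageproc.go all live in package mistral3, so an in-package test can call ProcessImage directly on a synthetic image. The sketch below is illustrative only; the test name and the 64x64 input size are invented, the struct literal mirrors the defaults newImageProcessor falls back to, and newImageProcessor itself is skipped because it needs a real ml.Config from a loaded model.

package mistral3

import (
	"image"
	"testing"
)

// TestProcessImageSketch is a hypothetical test, not part of the patch. The
// field values below mirror the config defaults used by newImageProcessor.
func TestProcessImageSketch(t *testing.T) {
	p := ImageProcessor{
		imageSize:   1024,
		patchSize:   16,
		numChannels: 3,
		longestEdge: 1024,
	}

	// A blank 64x64 RGBA image stands in for a decoded JPEG or PNG.
	img := image.NewRGBA(image.Rect(0, 0, 64, 64))

	data, err := p.ProcessImage(img)
	if err != nil {
		t.Fatal(err)
	}

	// ProcessImage flattens the resized image into numChannels float32 values
	// per pixel, so the output should be a non-zero multiple of the channel count.
	if len(data) == 0 || len(data)%p.numChannels != 0 {
		t.Fatalf("unexpected output length %d", len(data))
	}
}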