diff --git a/model/models/gemma3/model_text.go b/model/models/gemma3/model_text.go index 765fb111f..f1644e215 100644 --- a/model/models/gemma3/model_text.go +++ b/model/models/gemma3/model_text.go @@ -180,7 +180,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor if multimodal != nil { visionOutputs := multimodal[0].Multimodal.(ml.Tensor) offset := multimodal[0].Index - 1 - visionOutputs.Dim(1) - hiddenState = hiddenState.Set(ctx, visionOutputs, offset*hiddenState.Stride(0)) + hiddenState = hiddenState.Set(ctx, visionOutputs, offset*hiddenState.Stride(1)) } for i, layer := range m.Layers { diff --git a/model/models/gemma3/process_image.go b/model/models/gemma3/process_image.go index 961794044..fe8269a3b 100644 --- a/model/models/gemma3/process_image.go +++ b/model/models/gemma3/process_image.go @@ -20,11 +20,11 @@ func newImageProcessor(c ml.Config) ImageProcessor { } func (p *ImageProcessor) pack(img image.Image, mean, std [3]float32) []float32 { - var pixelVals []float32 + var pixelVals, rVals, gVals, bVals []float32 bounds := img.Bounds() - for x := bounds.Min.X; x < bounds.Max.X; x++ { - for y := bounds.Min.Y; y < bounds.Max.Y; y++ { + for y := bounds.Min.Y; y < bounds.Max.Y; y++ { + for x := bounds.Min.X; x < bounds.Max.X; x++ { c := img.At(x, y) r, g, b, _ := c.RGBA() rVal := float32(r>>8) / 255.0 @@ -35,10 +35,16 @@ func (p *ImageProcessor) pack(img image.Image, mean, std [3]float32) []float32 { gVal = (gVal - mean[1]) / std[1] bVal = (bVal - mean[2]) / std[2] - pixelVals = append(pixelVals, rVal, gVal, bVal) + rVals = append(rVals, rVal) + gVals = append(gVals, gVal) + bVals = append(bVals, bVal) } } + pixelVals = append(pixelVals, rVals...) + pixelVals = append(pixelVals, gVals...) + pixelVals = append(pixelVals, bVals...) + return pixelVals }