From ef202789fad6b8a8ab51f4d2ff5450067e3d1f65 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Thu, 15 May 2025 13:44:44 -0700
Subject: [PATCH] fix pixel values padding (#10718)

* panic if trying to pad 4d

* fix pixel values padding
---
 ml/backend/ggml/ggml.go      |  2 ++
 model/models/mllama/model.go | 11 ++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 1ba079838d..2821ad119e 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -915,6 +915,8 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
 func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
 	if len(shape) != 4 {
 		panic("expected 4 dimensions")
+	} else if shape[3] != 0 {
+		panic("cuda does not support 4d tensors")
 	}
 
 	return &Tensor{
diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go
index 93b443ef1e..4d5bdd4a18 100644
--- a/model/models/mllama/model.go
+++ b/model/models/mllama/model.go
@@ -3,6 +3,7 @@ package mllama
 import (
 	"bytes"
 	"image"
+	"slices"
 
 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@@ -73,13 +74,17 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 		return nil, err
 	}
 
-	pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles())
+	if ratio.numTiles() < m.maxNumTiles {
+		// Pad tiles to maxNumTiles
+		f32s = slices.Grow(f32s, m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles)
+		f32s = f32s[:m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles]
+	}
+
+	pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles)
 	if err != nil {
 		return nil, err
 	}
 
-	pixelValues = pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles())
-
 	aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
 	if err != nil {
 		return nil, err