diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index 1ba079838d..2821ad119e 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -915,6 +915,8 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
 func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
 	if len(shape) != 4 {
 		panic("expected 4 dimensions")
+	} else if shape[3] != 0 {
+		panic("cuda does not support 4d tensors")
 	}
 
 	return &Tensor{
diff --git a/model/models/mllama/model.go b/model/models/mllama/model.go
index 93b443ef1e..4d5bdd4a18 100644
--- a/model/models/mllama/model.go
+++ b/model/models/mllama/model.go
@@ -3,6 +3,7 @@ package mllama
 import (
 	"bytes"
 	"image"
+	"slices"
 
 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@@ -73,13 +74,17 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 		return nil, err
 	}
 
-	pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles())
+	if ratio.numTiles() < m.maxNumTiles {
+		// Pad tiles to maxNumTiles
+		f32s = slices.Grow(f32s, m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles)
+		f32s = f32s[:m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles]
+	}
+
+	pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles)
 	if err != nil {
 		return nil, err
 	}
 
-	pixelValues = pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles())
-
 	aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
 	if err != nil {
 		return nil, err