mirror of
https://github.com/ollama/ollama.git
synced 2025-08-31 01:01:17 +02:00
fix pixel values padding (#10718)
* panic if trying to pad 4d * fix pixel values padding
This commit is contained in:
@@ -915,6 +915,8 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
|
||||
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
|
||||
if len(shape) != 4 {
|
||||
panic("expected 4 dimensions")
|
||||
} else if shape[3] != 0 {
|
||||
panic("cuda does not support 4d tensors")
|
||||
}
|
||||
|
||||
return &Tensor{
|
||||
|
@@ -3,6 +3,7 @@ package mllama
|
||||
import (
|
||||
"bytes"
|
||||
"image"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
@@ -73,13 +74,17 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles())
|
||||
if ratio.numTiles() < m.maxNumTiles {
|
||||
// Pad tiles to maxNumTiles
|
||||
f32s = slices.Grow(f32s, m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles)
|
||||
f32s = f32s[:m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles]
|
||||
}
|
||||
|
||||
pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pixelValues = pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles())
|
||||
|
||||
aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
Reference in New Issue
Block a user