wip

2025-04-12 13:49:43 +02:00 · 2025-03-23 21:41:18 -07:00 · 2025-03-23 21:41:18 -07:00 · 4586e137fe
commit 4586e137fe
parent cfeca27133
9 changed files with 89 additions and 688 deletions
--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@ -467,6 +467,14 @@ func (t *testTensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, di
 	panic("not implemented")
 }

+// RoPEMulti is not implemented for testTensor; calling it panics.
+func (t *testTensor) RoPEMulti(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, sections [4]int, ropeType uint32, base, scale float32) ml.Tensor {
+	panic("not implemented")
+}
+
+// IM2Col is not implemented for testTensor; calling it panics.
+func (t *testTensor) IM2Col(ctx ml.Context, weight ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
+	panic("not implemented")
+}
+
 func (t *testTensor) Tanh(ctx ml.Context) ml.Tensor {
 	panic("not implemented")
 }
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
@ -2186,10 +2186,6 @@ static void ggml_metal_encode_node(
            } break;
        case GGML_OP_MUL_MAT:
            {
-                if (ne00 != ne10) {
-                    printf("mul_mat, ne00: %d, ne01: %d, ne02: %d, ne03: %d, ne10: %d, ne11: %d, ne12: %d, ne13: %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13);
-                }
-
                GGML_ASSERT(ne00 == ne10);

                GGML_ASSERT(ne12 % ne02 == 0);
--- a/model/models/mistral3/imageproc.go
+++ b/model/models/mistral3/imageproc.go
@ -1,72 +1,15 @@
 package mistral3

 import (
-	"fmt"
 	"image"
 	_ "image/jpeg"
 	_ "image/png"
-	"io"
 	"math"

 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/model/imageproc"
 )

-func getNumImageTokens(imageSize, patchSize image.Point) image.Point {
-	return image.Point{
-		(imageSize.X-1)/patchSize.X + 1,
-		(imageSize.Y-1)/patchSize.Y + 1,
-	}
-}
-
-func getResizeOutputImageSize(img image.Image, longestEdge int, patchSize image.Point) image.Point {
-	b := img.Bounds()
-	ratio := math.Max(float64(b.Max.Y)/float64(longestEdge), float64(b.Max.X)/float64(longestEdge))
-
-	newSize := img.Bounds().Max
-
-	if ratio > 1.0 {
-		newSize = image.Point{
-			int(math.Floor(float64(b.Max.X) / ratio)),
-			int(math.Floor(float64(b.Max.Y) / ratio)),
-		}
-	}
-
-	tokens := getNumImageTokens(newSize, patchSize)
-	return image.Point{
-		tokens.X * patchSize.X,
-		tokens.Y * patchSize.Y,
-	}
-}
-
-func resizeImage(img image.Image, format string, longestEdge int, patchSize image.Point) image.Image {
-	if format == "png" {
-		img = imageproc.Composite(img)
-	}
-
-	newSize := getResizeOutputImageSize(img, longestEdge, patchSize)
-
-	// todo should be ResizeBicubic, but it doesn't exist
-	return imageproc.Resize(img, newSize, imageproc.ResizeBilinear)
-}
-
-func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
-	img, format, err := image.Decode(imageData)
-	if err != nil {
-		return nil, nil, fmt.Errorf("failed to decode image: %w", err)
-	}
-
-	longestEdge := 1024
-	patchSize := image.Point{16, 16}
-
-	img = resizeImage(img, format, longestEdge, patchSize)
-
-	data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
-
-	opts := map[string]any{}
-	return data, opts, nil
-}
-
 type ImageProcessor struct {
 	imageSize   int
 	patchSize   int
@ -83,10 +26,31 @@ func newImageProcessor(c ml.Config) ImageProcessor {
 	}
 }

-func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
-	outputSize := getResizeOutputImageSize(img, p.longestEdge, image.Point{p.patchSize, p.patchSize})
-	newImage := imageproc.Composite(img)
-	newImage = imageproc.Resize(newImage, outputSize, imageproc.ResizeBilinear)
-	data := imageproc.Normalize(newImage, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
-	return data, nil
+// ProcessImage turns img into the float32 pixel data fed to the vision model.
+//
+// The image is passed through imageproc.Composite, scaled down (aspect ratio
+// preserved) when either edge exceeds p.longestEdge, rounded up to a multiple
+// of p.patchSize on each edge, resized bilinearly to that size, and normalized
+// with imageproc.ClipDefaultMean / imageproc.ClipDefaultSTD.
+//
+// It returns the normalized data together with the final size in pixels. The
+// error result is always nil in this implementation.
+func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, image.Point, error) {
+	img = imageproc.Composite(img)
+	dims := img.Bounds().Size()
+
+	// Only ever shrink: a scale of at most 1.0 means both edges already fit.
+	longest := float64(p.longestEdge)
+	scale := max(float64(dims.Y)/longest, float64(dims.X)/longest)
+	if scale > 1.0 {
+		dims.X = int(math.Floor(float64(dims.X) / scale))
+		dims.Y = int(math.Floor(float64(dims.Y) / scale))
+	}
+
+	// For positive sizes this is ceil(edge/patchSize) * patchSize, so the
+	// resized image divides evenly into patches.
+	dims.X = ((dims.X-1)/p.patchSize + 1) * p.patchSize
+	dims.Y = ((dims.Y-1)/p.patchSize + 1) * p.patchSize
+
+	img = imageproc.Resize(img, dims, imageproc.ResizeBilinear)
+	pixels := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
+	return pixels, dims, nil
 }
--- a/model/models/mistral3/imageproc_test.go
+++ b/model/models/mistral3/imageproc_test.go
@ -1,219 +0,0 @@
-package mistral3
-
-import (
-	"bytes"
-	"encoding/binary"
-	"image"
-	"image/png"
-	"math"
-	"os"
-	"testing"
-
-	"github.com/google/go-cmp/cmp"
-)
-
-func TestGetNumImageTokens(t *testing.T) {
-	type numImageTokensCase struct {
-		ImageSize image.Point
-		PatchSize image.Point
-		Expected  image.Point
-	}
-
-	cases := []numImageTokensCase{
-		{
-			ImageSize: image.Point{1024, 764},
-			PatchSize: image.Point{16, 16},
-			Expected:  image.Point{64, 48},
-		},
-		{
-			ImageSize: image.Point{800, 600},
-			PatchSize: image.Point{16, 16},
-			Expected:  image.Point{50, 38},
-		},
-		{
-			ImageSize: image.Point{640, 480},
-			PatchSize: image.Point{16, 16},
-			Expected:  image.Point{40, 30},
-		},
-		{
-			ImageSize: image.Point{320, 200},
-			PatchSize: image.Point{16, 16},
-			Expected:  image.Point{20, 13},
-		},
-		{
-			ImageSize: image.Point{1320, 200},
-			PatchSize: image.Point{16, 16},
-			Expected:  image.Point{83, 13},
-		},
-		{
-			ImageSize: image.Point{2000, 200},
-			PatchSize: image.Point{16, 16},
-			Expected:  image.Point{125, 13},
-		},
-		{
-			ImageSize: image.Point{10000, 200},
-			PatchSize: image.Point{16, 16},
-			Expected:  image.Point{625, 13},
-		},
-		{
-			ImageSize: image.Point{1131, 577},
-			PatchSize: image.Point{16, 16},
-			Expected:  image.Point{71, 37},
-		},
-		{
-			ImageSize: image.Point{16, 16},
-			PatchSize: image.Point{16, 16},
-			Expected:  image.Point{1, 1},
-		},
-	}
-
-	for _, c := range cases {
-		actual := getNumImageTokens(c.ImageSize, c.PatchSize)
-
-		if diff := cmp.Diff(actual, c.Expected); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-	}
-}
-
-func TestGetResizeOutputImageSize(t *testing.T) {
-	type resizeCase struct {
-		Image       image.Image
-		LongestEdge int
-		PatchSize   image.Point
-		Expected    image.Point
-	}
-
-	cases := []resizeCase{
-		{
-			Image:       image.NewRGBA(image.Rect(0, 0, 1024, 768)),
-			LongestEdge: 1024,
-			PatchSize:   image.Point{16, 16},
-			Expected:    image.Point{1024, 768},
-		},
-		{
-			Image:       image.NewRGBA(image.Rect(0, 0, 1162, 690)),
-			LongestEdge: 1024,
-			PatchSize:   image.Point{16, 16},
-			Expected:    image.Point{1024, 624},
-		},
-		{
-			Image:       image.NewRGBA(image.Rect(0, 0, 300, 200)),
-			LongestEdge: 1024,
-			PatchSize:   image.Point{16, 16},
-			Expected:    image.Point{304, 208},
-		},
-		{
-			Image:       image.NewRGBA(image.Rect(0, 0, 1862, 522)),
-			LongestEdge: 1024,
-			PatchSize:   image.Point{16, 16},
-			Expected:    image.Point{1024, 288},
-		},
-	}
-
-	for _, c := range cases {
-		actual := getResizeOutputImageSize(c.Image, c.LongestEdge, c.PatchSize)
-
-		if diff := cmp.Diff(actual, c.Expected); diff != "" {
-			t.Errorf("mismatch (-got +want):\n%s", diff)
-		}
-	}
-}
-
-func TestResize(t *testing.T) {
-	type resizeCase struct {
-		Image       image.Image
-		LongestEdge int
-		PatchSize   image.Point
-		Expected    image.Image
-	}
-
-	cases := []resizeCase{
-		{
-			Image:       image.NewRGBA(image.Rect(0, 0, 1862, 522)),
-			LongestEdge: 1024,
-			PatchSize:   image.Point{16, 16},
-			Expected:    image.NewRGBA(image.Rect(0, 0, 1024, 288)),
-		},
-		{
-			Image:       image.NewRGBA(image.Rect(0, 0, 10, 10)),
-			LongestEdge: 1024,
-			PatchSize:   image.Point{16, 16},
-			Expected:    image.NewRGBA(image.Rect(0, 0, 16, 16)),
-		},
-	}
-
-	for _, c := range cases {
-		actual := resizeImage(c.Image, "png", c.LongestEdge, c.PatchSize)
-
-		if actual.Bounds() != c.Expected.Bounds() {
-			t.Errorf("image size incorrect: '%#v': expected: '%#v'", actual.Bounds(), c.Expected.Bounds())
-		}
-	}
-}
-
-func TestPreprocess(t *testing.T) {
-	type preprocessCase struct {
-		TestImage   image.Image
-		ExpectedLen int
-	}
-
-	cases := []preprocessCase{
-		{
-			TestImage:   image.NewRGBA(image.Rect(0, 0, 10, 10)),
-			ExpectedLen: 16 * 16 * 3 * 1,
-		},
-		{
-			TestImage:   image.NewRGBA(image.Rect(0, 0, 2000, 2000)),
-			ExpectedLen: 1024 * 1024 * 3 * 1,
-		},
-	}
-
-	for _, c := range cases {
-		var buf bytes.Buffer
-		err := png.Encode(&buf, c.TestImage)
-		if err != nil {
-			t.Fatal(err)
-		}
-
-		imgData, _, err := Preprocess(&buf)
-		if err != nil {
-			t.Fatalf("error processing: %q", err)
-		}
-
-		switch len(imgData) {
-		case 0:
-			t.Errorf("no image data returned")
-		case c.ExpectedLen:
-			// ok
-		default:
-			t.Errorf("unexpected image data length: %d, expected: %d", len(imgData), c.ExpectedLen)
-		}
-	}
-}
-
-func TestPreprocessImages(t *testing.T) {
-	for _, testFile := range []string{"flight.png", "sportsball.png"} {
-		f, err := os.Open(testFile)
-		if err != nil {
-			t.Skipf("skipping test, no test image found at %s", testFile)
-		}
-		defer f.Close()
-
-		imgData, _, err := Preprocess(f)
-		if err != nil {
-			t.Fatalf("error processing: %q", err)
-		}
-
-		byteData := make([]byte, len(imgData)*4) // float32 is 4 bytes
-		for i, f := range imgData {
-			binary.LittleEndian.PutUint32(byteData[i*4:], math.Float32bits(f))
-		}
-
-		outputPath := "processed_" + testFile + ".bin"
-		err = os.WriteFile(outputPath, byteData, 0o644)
-		if err != nil {
-			t.Fatalf("error writing processed image: %q", err)
-		}
-	}
-}
--- a/model/models/mistral3/model.go
+++ b/model/models/mistral3/model.go
@ -51,36 +51,34 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 		return nil, err
 	}

-	f32s, err := m.ImageProcessor.ProcessImage(image)
+	f32s, size, err := m.ImageProcessor.ProcessImage(image)
 	if err != nil {
 		return nil, err
 	}

 	// Create tensor from image data
-	pixelValues, err := ctx.Input().FromFloatSlice(f32s,
-		m.ImageProcessor.imageSize,
-		1036, // TODO (jmorganca): this should be returned from ProcessImage
-		m.ImageProcessor.numChannels,
-	)
+	pixelValues, err := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)
 	if err != nil {
 		return nil, err
 	}

-	// fmt.Println("pixelValues", "shape", pixelValues.Shape(), "data", ml.Dump(ctx, pixelValues))
-
-	// Forward pass through vision model
 	visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
+	features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size)

-	// fmt.Println("visionOutputs", "shape", visionOutputs.Shape(), "data", ml.Dump(ctx, visionOutputs))
+	// split into patches to be sent to the text transformer
+	var rows []ml.Tensor
+	for i := 0; i < size.Y; i++ {
+		view := features.View(ctx, features.Dim(0)*i, features.Dim(0), features.Dim(0)*4, size.X)
+		rows = append(rows, view)
+	}

-	// Project to text embedding space
-	visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.VisionModel.eps)
-
-	// fmt.Println("visionOutputs after projector", "shape", visionOutputs.Shape(), "data", ml.Dump(ctx, visionOutputs))
-
-	return visionOutputs, nil
+	return rows, nil
 }

+// PostTokenize arranges Mistral 3's inputs for the forward pass
+// In Mistral 3 and Pixtral, the input patches are arranged as follows:
+// [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END]
+// Each sequence of [IMG]...[IMG] is a single patch or "row" of vision embeddings
 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 	var result []input.Input

@ -88,13 +86,16 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 		if inp.Multimodal == nil {
 			result = append(result, inp)
 		} else {
-			inputMultimodal := inp.Multimodal.(ml.Tensor)
-
-			// Add special image tokens - using the imageTokenIndex from config
-			result = append(result, input.Input{Token: 10})                                                       // [IMG]
-			result = append(result, input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}) // image data
-			result = append(result, slices.Repeat([]input.Input{{Token: 10}}, inputMultimodal.Dim(1)-1)...)       // [IMG] placeholders
-			result = append(result, input.Input{Token: 13})                                                       // [IMG_END]
+			inputMultimodal := inp.Multimodal.([]ml.Tensor)
+			for i, row := range inputMultimodal {
+				result = append(result, input.Input{Multimodal: row, MultimodalHash: inp.MultimodalHash, SameBatch: row.Dim(1)}) // Image data
+				result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Dim(1))...)                                // [IMG]
+				if i == len(inputMultimodal)-1 {
+					result = append(result, input.Input{Token: 13}) // [IMG_END]
+				} else {
+					result = append(result, input.Input{Token: 12}) // [IMG_BREAK]
+				}
+			}
 		}
 	}

--- a/model/models/mistral3/model_text.go
+++ b/model/models/mistral3/model_text.go
@ -41,40 +41,29 @@ type SelfAttention struct {
+// Forward runs self-attention over hiddenState and returns the result of the
+// output projection.
+//
+// The head dimension is opts.headDim, falling back to
+// opts.hiddenSize/opts.numHeads when that is zero. hiddenState is projected
+// through Query, Key and Value; the results are reshaped to
+// (headDim, numHeads, batchSize) for the query and
+// (headDim, numKVHeads, batchSize) for the key and value, with batchSize taken
+// from hiddenState.Dim(1). RoPE is applied to the query and key only, using
+// positionIDs, sa.RopeFactors and rope type 0. nn.Attention is then called with
+// scale 1/sqrt(headDim) and the supplied cache, and its output is reshaped to
+// (headDim*numHeads, batchSize) before being passed to Output.
 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	ropeType := uint32(0)
-	// Get head dimension - use explicit value if available, otherwise calculate
 	headDim := opts.headDim
 	if headDim == 0 {
 		headDim = opts.hiddenSize / opts.numHeads
 	}

-	// Query projection and reshape
 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
 	q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

-	// Key projection and reshape
 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
 	k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)

-	// Value projection and reshape
 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

-	// Attention computation
-	scaleFactor := 1.0 / math.Sqrt(float64(headDim))
-	kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
-
-	// Reshape attention output for final projection
-	outputDim := headDim * opts.numHeads
-	kqv = kqv.Reshape(ctx, outputDim, batchSize)
-
-	// Apply output projection
+	kqv := nn.Attention(ctx, q, k, v, 1.0/math.Sqrt(float64(headDim)), cache)
+	kqv = kqv.Reshape(ctx, headDim*opts.numHeads, batchSize)
 	return sa.Output.Forward(ctx, kqv)
 }

 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
+	return key.RoPE(ctx, shift, nil, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
 }

 type MLP struct {
@ -117,10 +106,14 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
 }

 func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) ml.Tensor {
-	// Process text inputs
 	hiddenState := m.TokenEmbedding.Forward(ctx, inputs)

-	// Process through text transformer layers
+	// image embeddings
+	for _, image := range batch.Multimodal {
+		visionOutputs := image.Multimodal.(ml.Tensor)
+		ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Dim(0), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
+	}
+
 	for i, layer := range m.Layers {
 		cache.SetLayer(i)

--- a/model/models/mistral3/model_vision.go
+++ b/model/models/mistral3/model_vision.go
@ -1,7 +1,7 @@
 package mistral3

 import (
-	"fmt"
+	"image"
 	"math"

 	"github.com/ollama/ollama/ml"
@ -14,32 +14,12 @@ type PatchMerger struct {
 	MergingLayer *nn.Linear `gguf:"merging_layer"`
 }

-func (pm *PatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor) ml.Tensor {
-	// TODO: pass these in
-	w := 110
-	h := 74
-	// tokensPerImage := w * h
+// Forward merges spatialMergeSize x spatialMergeSize groups of vision patches
+// and projects each group through MergingLayer.
+//
+// size is the patch grid (X columns, Y rows), not a pixel size. visionOutputs
+// is reshaped to (size.Y, size.X, d) with d = visionOutputs.Dim(0). IM2Col is
+// then called on an empty F32 tensor of shape
+// (spatialMergeSize, spatialMergeSize, d, 1) with s0/s1 = spatialMergeSize,
+// p0/p1 = 0 and d0/d1 = 1. The result is reshaped to
+// (d*spatialMergeSize*spatialMergeSize, patches.Dim(1)*patches.Dim(2)) before
+// the linear projection.
+//
+// NOTE(review): the empty tensor presumably only conveys the window shape to
+// IM2Col, and s/p/d are presumably stride/padding/dilation - confirm against
+// the backend implementation.
+func (pm *PatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, size image.Point, spatialMergeSize int) ml.Tensor {
 	d := visionOutputs.Dim(0)
-
-	// TODO: handle multiple images, this currently assumes one
-	// fmt.Println("patchmerger visionOutputs", "shape", visionOutputs.Shape(), "data", ml.Dump(ctx, visionOutputs))
-
-	// Reshape to [h, w, hidden_size]
-	imageGrid := visionOutputs.Reshape(ctx, h, w, d)
-	// fmt.Println("imageGrid", "shape", imageGrid.Shape(), "data", ml.Dump(ctx, imageGrid))
-
-	// TODO: load from config
-	spatialMergeSize := 2
+	imageGrid := visionOutputs.Reshape(ctx, size.Y, size.X, d)
 	kernel := ctx.Input().Empty(ml.DTypeF32, spatialMergeSize, spatialMergeSize, d, 1)
-	// fmt.Println("kernel", "shape", kernel.Shape(), "data", ml.Dump(ctx, kernel))
-
 	patches := kernel.IM2Col(ctx, imageGrid, spatialMergeSize, spatialMergeSize, 0, 0, 1, 1)
-	// fmt.Println("patches", "shape", patches.Shape(), "data", ml.Dump(ctx, patches))
-
-	// fmt.Println("creating reshaped", d*spatialMergeSize*spatialMergeSize, "x", patches.Dim(1)*patches.Dim(2))
 	reshaped := patches.Reshape(ctx, d*spatialMergeSize*spatialMergeSize, patches.Dim(1)*patches.Dim(2))
-	// fmt.Println("reshaped", "shape", reshaped.Shape(), "data", ml.Dump(ctx, reshaped))
-
 	return pm.MergingLayer.Forward(ctx, reshaped)
 }

@ -50,23 +30,24 @@ type MultiModalProjector struct {
 	PatchMerger *PatchMerger `gguf:"patch_merger"`

 	spatialMergeSize int
-	imageTokenIndex  int
-	hasBias          bool
+	eps              float32
+	patchSize        int
 }

-func (p *MultiModalProjector) Forward(ctx ml.Context, visionOutputs ml.Tensor, eps float32) ml.Tensor {
-	visionOutputs = p.Norm.Forward(ctx, visionOutputs, eps)
-	visionOutputs = p.PatchMerger.Forward(ctx, visionOutputs)
+// Forward projects vision encoder outputs for the text model.
+//
+// size is the processed image size in pixels. The outputs are normalized with
+// p.eps, merged by PatchMerger over a grid of size/p.patchSize patches, then
+// passed through Linear1, GELU and Linear2. The second result is the grid size
+// after merging, i.e. the patch grid divided by p.spatialMergeSize.
+//
+// NOTE(review): both divisions are integer divisions, so any remainder is
+// dropped - confirm callers only pass sizes that divide evenly.
+func (p *MultiModalProjector) Forward(ctx ml.Context, visionOutputs ml.Tensor, size image.Point) (ml.Tensor, image.Point) {
+	visionOutputs = p.Norm.Forward(ctx, visionOutputs, p.eps)
+	patchSizes := image.Point{size.X / p.patchSize, size.Y / p.patchSize}
+	visionOutputs = p.PatchMerger.Forward(ctx, visionOutputs, patchSizes, p.spatialMergeSize)
 	visionOutputs = p.Linear1.Forward(ctx, visionOutputs)
 	visionOutputs = visionOutputs.GELU(ctx)
-	return p.Linear2.Forward(ctx, visionOutputs)
+	return p.Linear2.Forward(ctx, visionOutputs), image.Point{patchSizes.X / p.spatialMergeSize, patchSizes.Y / p.spatialMergeSize}
 }

+// newMultiModalProjector returns a projector with only its scalar settings
+// filled in from c: spatialMergeSize ("spatial_merge_size", 2), eps
+// ("text_config.rms_norm_eps", 1e-5) and patchSize ("vision.patch_size", 14).
+// The layer fields are not set here.
+//
+// NOTE(review): the second argument to each lookup is presumably the fallback
+// used when the key is absent - confirm against ml.Config. The eps key is
+// spelled differently from the "vision."-prefixed patch size key; verify it
+// exists in the model metadata, otherwise 1e-5 is always used.
 func newMultiModalProjector(c ml.Config) *MultiModalProjector {
 	return &MultiModalProjector{
 		spatialMergeSize: int(c.Uint("spatial_merge_size", 2)),
-		imageTokenIndex:  int(c.Uint("image_token_index", 10)),
-		hasBias:          c.Bool("mm.projector_bias", false),
+		eps:              c.Float("text_config.rms_norm_eps", 1e-5),
+		patchSize:        int(c.Uint("vision.patch_size", 14)),
 	}
 }

@ -115,9 +96,7 @@ type VisionEncoderLayer struct {

 func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
 	residual := hiddenState
-
 	hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
-	fmt.Println("after attention norm", "shape", hiddenState.Shape(), "data", ml.Dump(ctx, hiddenState, ml.DumpOptions{Items: 3, Precision: 6}))
 	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, positionIDs, opts)
 	hiddenState = hiddenState.Add(ctx, residual)
 	residual = hiddenState
@ -149,22 +128,23 @@ type VisionModel struct {
 }

 func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
-	numPatchesH := pixelValues.Dim(1) / m.patchSize
 	numPatchesW := pixelValues.Dim(0) / m.patchSize
-	numPatches := numPatchesH * numPatchesW
+	numPatchesH := pixelValues.Dim(1) / m.patchSize
+	numPatches := numPatchesW * numPatchesH
 	hiddenState := m.PatchEmbedding.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
 	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize)
 	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
 	hiddenState = m.EncoderNorm.Forward(ctx, hiddenState, m.VisionModelOptions.eps)

-	totalPositions := numPatchesH * numPatchesW
-	positions := make([]int32, totalPositions*4)
-
+	// Prepare position IDs for 2D rope
+	positions := make([]int32, numPatches*4)
 	for h := 0; h < numPatchesH; h++ {
 		for w := 0; w < numPatchesW; w++ {
-			index := h*numPatchesW + w
-			positions[totalPositions+index] = int32(h)
-			positions[totalPositions*2+index] = int32(w)
+			idx := h*numPatchesW + w
+			positions[idx] = 0                     // time (unused)
+			positions[numPatches+idx] = int32(h)   // height
+			positions[numPatches*2+idx] = int32(w) // width
+			positions[numPatches*3+idx] = 0        // extra (unused)
 		}
 	}

@ -177,8 +157,6 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
 		hiddenState = layer.Forward(ctx, hiddenState, positionIDs, m.VisionModelOptions)
 	}

-	// fmt.Println("after layers", "shape", hiddenState.Shape(), "data", ml.Dump(ctx, hiddenState))
-
 	return hiddenState
 }

--- a/model/process_text_test.go
+++ b/model/process_text_test.go
@ -209,322 +209,6 @@ func TestLlama(t *testing.T) {
 	})
 }

-// tekken loads the Tekken tokenizer for testing
-func tekken(t testing.TB) TextProcessor {
-	t.Helper()
-
-	// Load tokenizer config from mistral-small
-	tokenizerConfigPath := filepath.Join("testdata", "mistral-small", "tokenizer_config.json")
-	configFile, err := os.Open(tokenizerConfigPath)
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer configFile.Close()
-
-	var config struct {
-		AddBosToken bool   `json:"add_bos_token"`
-		AddEosToken bool   `json:"add_eos_token"`
-		BosToken    string `json:"bos_token"`
-		EosToken    string `json:"eos_token"`
-	}
-	if err := json.NewDecoder(configFile).Decode(&config); err != nil {
-		t.Fatal(err)
-	}
-
-	// Load tokenizer.json which contains the vocabulary and other settings
-	tokenizerJsonPath := filepath.Join("testdata", "mistral-small", "tokenizer.json")
-	tokenizerFile, err := os.Open(tokenizerJsonPath)
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer tokenizerFile.Close()
-
-	var tokenizerData struct {
-		Model struct {
-			Type   string           `json:"type"`
-			Vocab  map[string]int32 `json:"vocab"`
-			Merges []string         `json:"merges"`
-		} `json:"model"`
-		AddedTokens []struct {
-			Id      int32  `json:"id"`
-			Content string `json:"content"`
-			Special bool   `json:"special"`
-		} `json:"added_tokens"`
-		PreTokenizer struct {
-			Type          string `json:"type"`
-			Pretokenizers []struct {
-				Type    string `json:"type"`
-				Pattern struct {
-					String string `json:"String"`
-				} `json:"pattern"`
-				Behavior string `json:"behavior"`
-			} `json:"pretokenizers"`
-		} `json:"pre_tokenizer"`
-	}
-	if err := json.NewDecoder(tokenizerFile).Decode(&tokenizerData); err != nil {
-		t.Fatal(err)
-	}
-
-	// Extract the pattern from pre_tokenizer if available
-	var pattern string
-	if tokenizerData.PreTokenizer.Type == "Sequence" && len(tokenizerData.PreTokenizer.Pretokenizers) > 0 {
-		pattern = tokenizerData.PreTokenizer.Pretokenizers[0].Pattern.String
-	}
-
-	// Combine regular vocab and added tokens
-	vocab := tokenizerData.Model.Vocab
-
-	// Add special tokens from added_tokens
-	for _, token := range tokenizerData.AddedTokens {
-		vocab[token.Content] = token.Id
-	}
-
-	// Create vocabulary arrays
-	maxId := int32(-1)
-	for _, id := range vocab {
-		if id > maxId {
-			maxId = id
-		}
-	}
-
-	vocabSize := int(maxId + 1)
-	types := make([]uint32, vocabSize)
-	tokens := make([]string, vocabSize)
-	scores := make([]float32, vocabSize)
-
-	for token, id := range vocab {
-		tokens[id] = token
-		types[id] = TOKEN_TYPE_NORMAL
-
-		// Assign appropriate token types for special tokens
-		if token == "<s>" {
-			types[id] = TOKEN_TYPE_CONTROL
-		} else if token == "</s>" {
-			types[id] = TOKEN_TYPE_CONTROL
-		} else if token == "[INST]" || token == "[/INST]" {
-			types[id] = TOKEN_TYPE_CONTROL
-		}
-	}
-
-	// In Tekken, we don't need to load merges separately as they're part of the model
-	var merges []string
-
-	// Create vocabulary object
-	vocabObj := &Vocabulary{
-		Values: tokens,
-		Types:  types,
-		Scores: scores,
-		Merges: merges,
-		BOS:    vocab[config.BosToken],
-		EOS:    vocab[config.EosToken],
-		AddBOS: config.AddBosToken,
-		AddEOS: config.AddEosToken,
-	}
-
-	// Use pattern from tokenizer.json if available
-	if pattern != "" {
-		// Ensure pattern has proper escaping for Go regexp
-		pattern = strings.ReplaceAll(pattern, "p{", "\\p{")
-		return NewBytePairEncoding(pattern, vocabObj)
-	}
-
-	// Fallback pattern if not found
-	return NewBytePairEncoding(
-		`\p{L}+|\p{N}+|[^\s\p{L}\p{N}]+|\s+`,
-		vocabObj,
-	)
-}
-
-func TestTekken(t *testing.T) {
-	// Skip if the test data isn't available
-	if _, err := os.Stat(filepath.Join("testdata", "mistral-small")); os.IsNotExist(err) {
-		t.Skip("Mistral-small test data not available")
-	}
-
-	tokenizer := tekken(t)
-
-	t.Run("whitespace_handling", func(t *testing.T) {
-		t.Parallel()
-
-		// The key difference from SentencePiece is that Tekken doesn't prepend whitespace
-		cases := []struct {
-			input    string
-			expected string
-		}{
-			{" hello", " hello"},
-			{"hello ", "hello "},
-			{"hello world", "hello world"},
-			{" hello world ", " hello world "},
-		}
-
-		for _, tc := range cases {
-			ids, err := tokenizer.Encode(tc.input, false)
-			if err != nil {
-				t.Errorf("Failed to encode %q: %v", tc.input, err)
-				continue
-			}
-
-			decoded, err := tokenizer.Decode(ids)
-			if err != nil {
-				t.Errorf("Failed to decode tokens for %q: %v", tc.input, err)
-				continue
-			}
-
-			if decoded != tc.expected {
-				t.Errorf("Whitespace handling: got %q, want %q", decoded, tc.expected)
-			}
-		}
-	})
-
-	t.Run("chat_templates", func(t *testing.T) {
-		t.Parallel()
-
-		// Test the Tekken chat template format which doesn't have spaces after special tokens
-		templates := []struct {
-			input       string
-			expectSpace bool // whether we expect a space after special tokens
-		}{
-			{"<s>[INST]user message[/INST]", false},
-			{"<s>[INST] user message[/INST]", true},
-			{"<s>[INST]user message [/INST]", true},
-		}
-
-		for _, tc := range templates {
-			ids, err := tokenizer.Encode(tc.input, false)
-			if err != nil {
-				t.Errorf("Failed to encode %q: %v", tc.input, err)
-				continue
-			}
-
-			decoded, err := tokenizer.Decode(ids)
-			if err != nil {
-				t.Errorf("Failed to decode tokens for %q: %v", tc.input, err)
-				continue
-			}
-
-			// Check if there's a space after special tokens
-			hasSpaceAfterINST := strings.Contains(decoded, "[INST] ")
-
-			if hasSpaceAfterINST != tc.expectSpace {
-				t.Errorf("Chat template space handling: got space=%v, want space=%v for %q",
-					hasSpaceAfterINST, tc.expectSpace, tc.input)
-			}
-		}
-	})
-
-	t.Run("special_tokens", func(t *testing.T) {
-		t.Parallel()
-
-		// Test how Tekken handles special tokens
-		cases := []struct {
-			input    string
-			expected []string // We'll check if these tokens are in the decoded output
-		}{
-			{"<s>[INST]hello[/INST]", []string{"<s>", "[INST]", "hello", "[/INST]"}},
-			{"[INST]hello[/INST]</s>", []string{"[INST]", "hello", "[/INST]", "</s>"}},
-			{"<s>[INST]hello[/INST]</s>[INST]again[/INST]", []string{"<s>", "[INST]", "hello", "[/INST]", "</s>", "[INST]", "again", "[/INST]"}},
-		}
-
-		for _, tc := range cases {
-			ids, err := tokenizer.Encode(tc.input, false)
-			if err != nil {
-				t.Errorf("Failed to encode %q: %v", tc.input, err)
-				continue
-			}
-
-			decoded, err := tokenizer.Decode(ids)
-			if err != nil {
-				t.Errorf("Failed to decode tokens for %q: %v", tc.input, err)
-				continue
-			}
-
-			for _, expected := range tc.expected {
-				if !strings.Contains(decoded, expected) {
-					t.Errorf("Special token handling: %q missing in decoded output %q", expected, decoded)
-				}
-			}
-		}
-	})
-
-	t.Run("vocabulary_coverage", func(t *testing.T) {
-		t.Parallel()
-
-		// Tekken has a larger vocabulary, so test coverage of various token types
-		samples := []string{
-			"Hello world!",
-			"This is a test of the Tekken tokenizer.",
-			"It has a considerably larger vocabulary size.",
-			"Special characters: !@#$%^&*()",
-			"Numbers: 1234567890",
-			"Multiple languages: こんにちは 你好 안녕하세요",
-			"Code snippets: def function(): return True",
-		}
-
-		for _, sample := range samples {
-			ids, err := tokenizer.Encode(sample, false)
-			if err != nil {
-				t.Errorf("Failed to encode %q: %v", sample, err)
-				continue
-			}
-
-			decoded, err := tokenizer.Decode(ids)
-			if err != nil {
-				t.Errorf("Failed to decode tokens for %q: %v", sample, err)
-				continue
-			}
-
-			if decoded != sample {
-				t.Errorf("Vocabulary coverage: got %q, want %q", decoded, sample)
-			}
-		}
-	})
-
-	t.Run("splitting_behavior", func(t *testing.T) {
-		t.Parallel()
-
-		// Test the splitting behavior which might differ from SentencePiece
-		cases := map[string][]string{
-			"Hello World!": {"Hello", " World", "!"},
-			"user message": {"user", " message"},
-			"[INST]hello":  {"[INST]", "hello"},
-			"hello[/INST]": {"hello", "[/INST]"},
-		}
-
-		for s, want := range cases {
-			got := slices.Collect(tokenizer.(*BytePairEncoding).split(s))
-			if diff := cmp.Diff(want, got); diff != "" {
-				t.Errorf("Splitting behavior no match (-want +got):\n%s", diff)
-			}
-		}
-	})
-
-	t.Run("full_chat_sequence", func(t *testing.T) {
-		t.Parallel()
-
-		// Test a complete chat sequence with Tekken's format
-		chatSequence := "<s>[INST]user message[/INST]assistant message</s>[INST]new user message[/INST]"
-
-		ids, err := tokenizer.Encode(chatSequence, false)
-		if err != nil {
-			t.Fatalf("Failed to encode chat sequence: %v", err)
-		}
-
-		decoded, err := tokenizer.Decode(ids)
-		if err != nil {
-			t.Fatalf("Failed to decode chat sequence tokens: %v", err)
-		}
-
-		// In Tekken, the whitespace shouldn't be added after special tokens
-		if strings.Contains(decoded, "[INST] ") {
-			t.Errorf("Tekken chat sequence has unexpected space after [INST]: %q", decoded)
-		}
-
-		if strings.Contains(decoded, "[/INST] ") {
-			t.Errorf("Tekken chat sequence has unexpected space after [/INST]: %q", decoded)
-		}
-	})
-}
-
 func BenchmarkBytePairEncoding(b *testing.B) {
 	tokenizer := llama(b)
 	bts, err := os.ReadFile(filepath.Join("testdata", "war-and-peace.txt"))
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@ -182,10 +182,6 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, *
 			return nil, nil, err
 		}

-		for _, t := range tokens {
-			decoded, _ := s.model.(model.TextProcessor).Decode([]int32{t})
-			fmt.Println("token", t, "decoded", decoded)
-		}
 		for _, t := range tokens {
 			inputs = append(inputs, input.Input{Token: t})
 		}