fixes

2025-04-12 13:49:43 +02:00 · 2025-03-24 12:16:58 -07:00 · 2025-03-24 12:16:58 -07:00 · 863ba57477
commit 863ba57477
parent dce7cf2a1a
6 changed files with 19 additions and 14 deletions
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -169,7 +169,7 @@ var (
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// ContextLength sets the default context length
-	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 2048)
+	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 8<<10)
 )

 func String(s string) func() string {
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -165,6 +165,7 @@ type Tensor interface {
 	Concat(ctx Context, t2 Tensor, dim int) Tensor
 	Rows(ctx Context, t2 Tensor) Tensor
 	Copy(ctx Context, t2 Tensor) Tensor
+	Duplicate(ctx Context) Tensor
 }

 // ScaledDotProductAttention implements a fused attention
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -986,10 +986,10 @@ func (t *Tensor) RoPEMulti(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, r
 	}
 }

-func (t *Tensor) IM2Col(ctx ml.Context, weight ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
+func (t *Tensor) IM2Col(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
 	return &Tensor{
 		b: t.b,
-		t: C.ggml_im2col(ctx.(*Context).ctx, t.t, weight.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1), true, C.GGML_TYPE_F32),
+		t: C.ggml_im2col(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1), true, C.GGML_TYPE_F32),
 	}
 }

@@ -1061,3 +1061,10 @@ func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask ml.T
 		return kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
 	}
 }
+
+func (t *Tensor) Duplicate(ctx ml.Context) ml.Tensor {
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_dup(ctx.(*Context).ctx, t.t),
+	}
+}
--- a/model/models/mistral3/model.go
+++ b/model/models/mistral3/model.go
@@ -65,10 +65,9 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 	features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size)

 	// split into patches to be sent to the text transformer
-	var rows []ml.Tensor
-	for i := 0; i < size.Y; i++ {
-		view := features.View(ctx, features.Dim(0)*i, features.Dim(0), features.Stride(1), size.X)
-		rows = append(rows, view)
+	rows := make([]ml.Tensor, size.Y)
+	for i := range rows {
+		rows[i] = features.View(ctx, features.Stride(1)*size.X*i, features.Dim(0), features.Stride(1), size.X) // row i starts size.X*i columns into features
+	}

 	return rows, nil
@@ -88,8 +87,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 		} else {
 			inputMultimodal := inp.Multimodal.([]ml.Tensor)
 			for i, row := range inputMultimodal {
-				result = append(result, input.Input{Multimodal: row, MultimodalHash: inp.MultimodalHash, SameBatch: row.Dim(1)}) // Image data
-				result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Dim(1))...)                                // [IMG]
+				result = append(result, input.Input{Token: 10, Multimodal: row, MultimodalHash: inp.MultimodalHash, SameBatch: row.Dim(1)}) // Image data
+				result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Dim(1)-1)...)                                         // [IMG]
 				if i == len(inputMultimodal)-1 {
 					result = append(result, input.Input{Token: 13}) // [IMG_END]
 				} else {
--- a/model/models/mistral3/model_text.go
+++ b/model/models/mistral3/model_text.go
@@ -106,14 +106,12 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
 }

 func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) ml.Tensor {
-	hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
+	hiddenState := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)

 	// image embeddings
 	for _, image := range batch.Multimodal {
 		visionOutputs := image.Multimodal.(ml.Tensor)
 		// TODO (jmorganca): this fails on metal
-		// TODO (jmorganca): should this be image.Index*hiddenState.Dim(0)
-		// instead of image.Index*hiddenState.Stride(1)?
 		ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
 	}

--- a/model/models/mistral3/model_vision.go
+++ b/model/models/mistral3/model_vision.go
@@ -16,8 +16,8 @@ type PatchMerger struct {

 func (pm *PatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, size image.Point, spatialMergeSize int) ml.Tensor {
 	d := visionOutputs.Dim(0)
-	imageGrid := visionOutputs.Reshape(ctx, size.Y, size.X, d)
-	kernel := ctx.Input().Empty(ml.DTypeF32, spatialMergeSize, spatialMergeSize, d, 1)
+	imageGrid := visionOutputs.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx).Reshape(ctx, size.X, size.Y, d)
+	kernel := ctx.Input().Empty(ml.DTypeF32, spatialMergeSize, spatialMergeSize, d)
 	patches := kernel.IM2Col(ctx, imageGrid, spatialMergeSize, spatialMergeSize, 0, 0, 1, 1)
 	reshaped := patches.Reshape(ctx, d*spatialMergeSize*spatialMergeSize, patches.Dim(1)*patches.Dim(2))
 	return pm.MergingLayer.Forward(ctx, reshaped)