This commit is contained in:
Michael Yang 2025-03-24 12:16:58 -07:00
parent dce7cf2a1a
commit 863ba57477
6 changed files with 19 additions and 14 deletions

View File

@ -169,7 +169,7 @@ var (
// Enable the new Ollama engine
NewEngine = Bool("OLLAMA_NEW_ENGINE")
// ContextLength sets the default context length
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 2048)
ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 8<<10)
)
func String(s string) func() string {

View File

@ -165,6 +165,7 @@ type Tensor interface {
Concat(ctx Context, t2 Tensor, dim int) Tensor
Rows(ctx Context, t2 Tensor) Tensor
Copy(ctx Context, t2 Tensor) Tensor
Duplicate(ctx Context) Tensor
}
// ScaledDotProductAttention implements a fused attention

View File

@ -986,10 +986,10 @@ func (t *Tensor) RoPEMulti(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, r
}
}
func (t *Tensor) IM2Col(ctx ml.Context, weight ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
func (t *Tensor) IM2Col(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_im2col(ctx.(*Context).ctx, t.t, weight.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1), true, C.GGML_TYPE_F32),
t: C.ggml_im2col(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1), true, C.GGML_TYPE_F32),
}
}
@ -1061,3 +1061,10 @@ func (t *Tensor) ScaledDotProductAttention(ctx ml.Context, key, value, mask ml.T
return kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
}
}
// Duplicate returns a new Tensor wrapping the result of calling ggml_dup on
// t's underlying tensor within ctx. The returned Tensor carries the same b
// field as t.
//
// ctx must be this package's *Context; any other ml.Context implementation
// makes the type assertion panic.
func (t *Tensor) Duplicate(ctx ml.Context) ml.Tensor {
	dup := C.ggml_dup(ctx.(*Context).ctx, t.t)
	return &Tensor{b: t.b, t: dup}
}

View File

@ -65,10 +65,9 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size)
// split into patches to be sent to the text transformer
var rows []ml.Tensor
for i := 0; i < size.Y; i++ {
view := features.View(ctx, features.Dim(0)*i, features.Dim(0), features.Stride(1), size.X)
rows = append(rows, view)
rows := make([]ml.Tensor, size.Y)
for i := range rows {
rows[i] = features.View(ctx, features.Stride(1)*(i+size.X), features.Dim(0), features.Stride(1), size.X)
}
return rows, nil
@ -88,8 +87,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
} else {
inputMultimodal := inp.Multimodal.([]ml.Tensor)
for i, row := range inputMultimodal {
result = append(result, input.Input{Multimodal: row, MultimodalHash: inp.MultimodalHash, SameBatch: row.Dim(1)}) // Image data
result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Dim(1))...) // [IMG]
result = append(result, input.Input{Token: 10, Multimodal: row, MultimodalHash: inp.MultimodalHash, SameBatch: row.Dim(1)}) // Image data
result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Dim(1)-1)...) // [IMG]
if i == len(inputMultimodal)-1 {
result = append(result, input.Input{Token: 13}) // [IMG_END]
} else {

View File

@ -106,14 +106,12 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
}
func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) ml.Tensor {
hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
hiddenState := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
// image embeddings
for _, image := range batch.Multimodal {
visionOutputs := image.Multimodal.(ml.Tensor)
// TODO (jmorganca): this fails on metal
// TODO (jmorganca): should this be image.Index*hiddenState.Dim(0)
// instead of image.Index*hiddenState.Stride(1)?
ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
}

View File

@ -16,8 +16,8 @@ type PatchMerger struct {
func (pm *PatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, size image.Point, spatialMergeSize int) ml.Tensor {
d := visionOutputs.Dim(0)
imageGrid := visionOutputs.Reshape(ctx, size.Y, size.X, d)
kernel := ctx.Input().Empty(ml.DTypeF32, spatialMergeSize, spatialMergeSize, d, 1)
imageGrid := visionOutputs.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx).Reshape(ctx, size.X, size.Y, d)
kernel := ctx.Input().Empty(ml.DTypeF32, spatialMergeSize, spatialMergeSize, d)
patches := kernel.IM2Col(ctx, imageGrid, spatialMergeSize, spatialMergeSize, 0, 0, 1, 1)
reshaped := patches.Reshape(ctx, d*spatialMergeSize*spatialMergeSize, patches.Dim(1)*patches.Dim(2))
return pm.MergingLayer.Forward(ctx, reshaped)