Gemma3 uses sliding-window attention for its context on 5 of every 6 layers, which significantly reduces memory usage but leads to uneven KV cache usage across layers and makes it difficult to allocate layers to the correct GPU. We currently estimate very conservatively by assuming every layer is the maximum size. Llama3.2-vision is also inconsistent between self-attention and cross-attention layers; at the moment we calculate the correct total size and then average it across layers. In some cases this can lead to crashes if a large layer is placed on a GPU sized by the average.

This change allows memory estimation to calculate the KV cache size per layer and take it into account when placing layers onto GPUs. We already do this for weights that vary per tensor, so this is a logical extension.

Fixes #9730
Fixes #9890
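To illustrate the idea (this is a minimal sketch, not the actual ollama implementation; the helper kvCacheBytesPerLayer, the layer type, and its slidingWindow field are hypothetical names chosen for the example):

// Sketch: per-layer KV cache sizing vs. the old "every layer at max size" estimate.
package main

import "fmt"

// layer marks whether a block uses sliding-window attention
// (as 5 of every 6 Gemma3 layers do) or full attention.
type layer struct {
	slidingWindow bool
}

// kvCacheBytesPerLayer returns an f16 KV cache size for one layer.
// Sliding-window layers only cache up to windowLen tokens; full-attention
// layers cache the whole context.
func kvCacheBytesPerLayer(l layer, ctxLen, windowLen, headDim, kvHeads uint64) uint64 {
	tokens := ctxLen
	if l.slidingWindow && windowLen < ctxLen {
		tokens = windowLen
	}
	// 2 bytes per f16 element, times K and V tensors
	return 2 * 2 * tokens * headDim * kvHeads
}

func main() {
	layers := make([]layer, 6)
	for i := range layers {
		layers[i].slidingWindow = i%6 != 5 // 5 of every 6 layers use the window
	}

	var total, maxLayer uint64
	for _, l := range layers {
		sz := kvCacheBytesPerLayer(l, 32768, 1024, 128, 8)
		total += sz
		if sz > maxLayer {
			maxLayer = sz
		}
	}

	// Summing per-layer sizes gives a much smaller budget than assuming
	// every layer needs the maximum, which is what the placement code
	// can now exploit when assigning layers to GPUs.
	fmt.Printf("per-layer total: %d bytes, conservative estimate: %d bytes\n",
		total, maxLayer*uint64(len(layers)))
}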
132 lines
4.4 KiB
Go
package llm

import (
	"bytes"
	"fmt"
	"os"
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/fs/ggml"
)

func TestEstimateGPULayers(t *testing.T) {
	t.Setenv("OLLAMA_DEBUG", "1")
	t.Setenv("OLLAMA_KV_CACHE_TYPE", "") // Ensure default f16
	t.Setenv("OLLAMA_CONTEXT_LENGTH", "2048")

	modelName := "dummy"
	f, err := os.CreateTemp(t.TempDir(), modelName)
	require.NoError(t, err)
	defer f.Close()
	inputLayerCount := 5

	tensors := []ggml.Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
	}
	assert.Len(t, tensors, inputLayerCount+1)
	err = ggml.WriteGGUF(f, ggml.KV{
		"general.architecture":          "llama",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(inputLayerCount),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, tensors)
	require.NoError(t, err)

	ggml, err := LoadModel(f.Name(), 0)
	if err != nil {
		t.Fatal(err)
	}

	// Simple CPU scenario
	gpus := []discover.GpuInfo{
		{
			Library: "cpu",
		},
	}
	projectors := []string{}
	opts := api.DefaultOptions()
	t.Run("cpu", func(t *testing.T) {
		estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1)
		assert.Equal(t, 0, estimate.Layers)
		assert.Equal(t, uint64(0), estimate.Graph)
	})

	// derived from the dummy ggml file above
	graphPartialOffload := uint64(202377216)
	graphFullOffload := uint64(171968512)
	layerSize := uint64(33554436)
	projectorSize := uint64(0)
	memoryLayerOutput := uint64(4)

	// Dual CUDA scenario with asymmetry
	gpuMinimumMemory := uint64(2048)
	gpus = []discover.GpuInfo{
		{
			Library:       "cuda",
			MinimumMemory: gpuMinimumMemory,
		},
		{
			Library:       "cuda",
			MinimumMemory: gpuMinimumMemory,
		},
	}
	// Test table: free layer space on GPU0 and GPU1, and the expected number of layers placed on each
	for i, s := range []struct {
		layer0, layer1   uint64
		expect0, expect1 uint64
	}{
		{1, 1, 1, 1},
		{2, 1, 2, 1},
		{2, 2, 2, 2},
		{1, 2, 1, 2},
		{3, 3, 3, 3},
		{4, 4, 3, 3},
		{6, 6, 3, 3},
		{0, 3, 0, 3},
	} {
		t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
			// Build each GPU's free-memory budget from the pieces the estimator
			// accounts for: projector, output layer, reserved minimum, per-layer
			// weights, and the compute graph.
			gpus[0].FreeMemory = 0
			gpus[1].FreeMemory = 0
			gpus[0].FreeMemory += projectorSize
			if s.layer0 > 0 {
				gpus[0].FreeMemory += memoryLayerOutput
			} else {
				gpus[1].FreeMemory += memoryLayerOutput
			}
			gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
			gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
			gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
			gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
			estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1)
			assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
			assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
			var layerSums uint64
			for _, b := range estimate.GPUSizes {
				layerSums += b
			}
			if estimate.Layers < inputLayerCount+1 {
				assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
				assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
			} else {
				assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
				assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
			}
		})
	}
}