From 033cec232a7e6702d6a79ca22ca81200fad5873b Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 12 Mar 2025 14:18:06 -0700 Subject: [PATCH 1/5] count gemma3 vision tensors --- fs/ggml/ggml.go | 8 ++++++++ llm/memory.go | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index d32296d9c..00392b4af 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -611,6 +611,14 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { embeddingLength*numPatches*maxNumTiles + 9*embeddingLength*numPaddedPatches*maxNumTiles + numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount) + case "gemma3": + for name, layer := range llm.Tensors().GroupLayers() { + if strings.HasPrefix(name, "v.") { + for _, tensor := range layer { + weights += tensor.Size() + } + } + } } return weights, graphSize } diff --git a/llm/memory.go b/llm/memory.go index 40104eca9..ac830ee84 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -218,8 +218,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { layerSize = blk.Size() layerSize += kv / f.KV().BlockCount() + memoryWeights += blk.Size() } - memoryWeights += layerSize if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { // Stop allocating on GPU(s) once we hit the users target NumGPU @@ -376,7 +376,7 @@ func (m MemoryEstimate) LogValue() slog.Value { // memory of the weights "total", format.HumanBytes2(m.memoryWeights), // memory of repeating layers - "repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput), + "repeating", format.HumanBytes2(m.memoryWeights), // memory of non-repeating layers "nonrepeating", format.HumanBytes2(m.memoryLayerOutput), ), From d2ec22371edba325903588b9515dd94b15b80d76 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 12 Mar 2025 16:08:24 -0700 Subject: [PATCH 2/5] count all vision tensors --- fs/ggml/ggml.go | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 00392b4af..da3ee0a79 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -579,12 +579,16 @@ func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialO } func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { + for name, layer := range llm.Tensors().GroupLayers() { + if strings.HasPrefix(name, "v.") { + for _, tensor := range layer { + weights += tensor.Size() + } + } + } + switch llm.KV().Architecture() { case "mllama": - for _, layer := range llm.Tensors().GroupLayers()["v"] { - weights += layer.Size() - } - kv := func(n string) uint64 { if v, ok := llm.KV()["mllama.vision."+n].(uint32); ok { return uint64(v) @@ -611,15 +615,8 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { embeddingLength*numPatches*maxNumTiles + 9*embeddingLength*numPaddedPatches*maxNumTiles + numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount) - case "gemma3": - for name, layer := range llm.Tensors().GroupLayers() { - if strings.HasPrefix(name, "v.") { - for _, tensor := range layer { - weights += tensor.Size() - } - } - } } + return weights, graphSize } From a422ba39c94adc870da84e5fa442c0bf81c77f27 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 13 Mar 2025 14:41:57 -0700 Subject: [PATCH 3/5] roughly count gemma3 graph the largest operation is by far (q @ k) so just count that for simplicity --- fs/ggml/ggml.go | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index da3ee0a79..be1dffe0d 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -587,34 +587,32 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { } } + imageSize := uint64(llm.KV().Uint("vision.image_size")) + patchSize := uint64(llm.KV().Uint("vision.patch_size")) + + numPatches := (imageSize / patchSize) * (imageSize / patchSize) + if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok { + numPatches++ + } + + headCount := uint64(llm.KV().Uint("vision.attention.head_count")) + switch llm.KV().Architecture() { case "mllama": - kv := func(n string) uint64 { - if v, ok := llm.KV()["mllama.vision."+n].(uint32); ok { - return uint64(v) - } - - return 0 - } - - imageSize := kv("image_size") - - maxNumTiles := kv("max_num_tiles") - embeddingLength := kv("embedding_length") - headCount := kv("attention.head_count") - - numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size")) - if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok { - numPatches++ - } numPaddedPatches := numPatches + 8 - (numPatches%8)%8 + maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles")) + numChannels := uint64(llm.KV().Uint("vision.num_channels")) + embeddingLength := uint64(llm.KV().Uint("vision.embedding_length")) + graphSize = 4 * (8 + - imageSize*imageSize*kv("num_channels")*maxNumTiles + + imageSize*imageSize*numChannels*maxNumTiles + embeddingLength*numPatches*maxNumTiles + 9*embeddingLength*numPaddedPatches*maxNumTiles + numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount) + case "gemma3": + graphSize = 4 * (numPatches * numPatches * headCount) } return weights, graphSize From 65b88c544f08ce3e5b1d193e82b72735095f795c Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 13 Mar 2025 15:05:42 -0700 Subject: [PATCH 4/5] fix divide by zero --- fs/ggml/ggml.go | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index be1dffe0d..2c04559f2 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -579,6 +579,10 @@ func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialO } func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { + if llm.KV().Uint("vision.block_count") == 0 { + return + } + for name, layer := range llm.Tensors().GroupLayers() { if strings.HasPrefix(name, "v.") { for _, tensor := range layer { @@ -589,6 +593,12 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { imageSize := uint64(llm.KV().Uint("vision.image_size")) patchSize := uint64(llm.KV().Uint("vision.patch_size")) + if patchSize == 0 { + slog.Warn("unknown patch size for vision model") + return + } + + numChannels := uint64(llm.KV().Uint("vision.num_channels")) numPatches := (imageSize / patchSize) * (imageSize / patchSize) if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok { @@ -596,15 +606,13 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { } headCount := uint64(llm.KV().Uint("vision.attention.head_count")) + embeddingLength := uint64(llm.KV().Uint("vision.embedding_length")) switch llm.KV().Architecture() { case "mllama": - numPaddedPatches := numPatches + 8 - (numPatches%8)%8 maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles")) - numChannels := uint64(llm.KV().Uint("vision.num_channels")) - embeddingLength := uint64(llm.KV().Uint("vision.embedding_length")) graphSize = 4 * (8 + imageSize*imageSize*numChannels*maxNumTiles + @@ -612,7 +620,9 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { 9*embeddingLength*numPaddedPatches*maxNumTiles + numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount) case "gemma3": - graphSize = 4 * (numPatches * numPatches * headCount) + graphSize = 4 * (imageSize*imageSize*numChannels + + embeddingLength*patchSize + + numPatches*numPatches*headCount) } return weights, graphSize From 8d76fa23ef058d98cb66ad9c3ecc5fff6d380ca1 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 13 Mar 2025 16:53:22 -0700 Subject: [PATCH 5/5] count non-repeating vision layers --- fs/ggml/ggml.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 2c04559f2..dd84380b5 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -584,7 +584,7 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { } for name, layer := range llm.Tensors().GroupLayers() { - if strings.HasPrefix(name, "v.") { + if name == "v" || strings.HasPrefix(name, "v.") { for _, tensor := range layer { weights += tensor.Size() }