mirror of
https://github.com/ollama/ollama.git
synced 2025-07-14 22:05:09 +02:00
fs/ggml: add multiplier in graph estimates (#11208)
This commit is contained in:
@ -555,7 +555,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
|||||||
// vocab graph
|
// vocab graph
|
||||||
4*batch*(embedding+vocab)+embedding*vocab*105/128,
|
4*batch*(embedding+vocab)+embedding*vocab*105/128,
|
||||||
)
|
)
|
||||||
case "gemma", "gemma2", "gemma3":
|
case "gemma", "gemma2", "gemma3", "gemma3n":
|
||||||
fullOffload = max(
|
fullOffload = max(
|
||||||
4*batch*(embedding+vocab),
|
4*batch*(embedding+vocab),
|
||||||
4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
|
4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
|
||||||
@ -568,6 +568,11 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
|
|||||||
embedding*embeddingHeadsK*heads*9/16,
|
embedding*embeddingHeadsK*heads*9/16,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if f.KV().Architecture() == "gemma3n" {
|
||||||
|
fullOffload *= 4
|
||||||
|
partialOffload *= 4
|
||||||
|
}
|
||||||
|
|
||||||
// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
|
// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
|
||||||
// engine. Gemma3 always uses the Ollama engine.
|
// engine. Gemma3 always uses the Ollama engine.
|
||||||
if f.KV().Architecture() == "gemma3" {
|
if f.KV().Architecture() == "gemma3" {
|
||||||
|
Reference in New Issue
Block a user