Mirror of https://github.com/ollama/ollama.git (synced 2025-07-08 21:00:48 +02:00)
Increase minimum CUDA memory allocation overhead and fix minimum overhead for multi-gpu (#1896)
* increase minimum cuda overhead and fix minimum overhead for multi-gpu
* fix multi gpu overhead
* limit overhead to 10% of all gpus
* better wording
* allocate fixed amount before layers
* fixed only includes graph alloc
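In practical terms, the change makes the reserved overhead scale with the number of GPUs. As a hypothetical illustration (numbers not from the PR): with two GPUs exposing 16 GiB of free VRAM in total and a 512 MiB graph allocation, the old code checked the 512 MiB against the 16 GiB total and subtracted it once, while the new code checks it against the 8 GiB per-device average and reserves 512 MiB on every device (1 GiB total) before deciding how many layers to offload.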
llm/llm.go (22 lines changed)
@@ -95,20 +95,26 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 			break
 		}
 
-		// no offloading required
-		if requiredTotal <= available {
-			break
-		}
-
-		// requiredAlloc is always loaded for the CUDA runner, so don't load it if it won't fit
-		if requiredAlloc > available {
+		// alloc buffer and kv cache is allocated as a fixed amount on the main gpu
+		// TODO: find the largest GPU and only reserve memory there
+		avgAvailable := available / int64(info.DeviceCount)
+		if requiredAlloc > avgAvailable {
 			log.Printf("not enough vram available, falling back to CPU only")
 			library = "cpu"
 			opts.NumGPU = 0
 			break
 		}
 
-		available -= requiredAlloc
+		// we don't know which GPU will be used, so estimate
+		// the scratch buffer space on all of them
+		// TODO: allocate less layers to the GPU with the scratch buffer
+		// and more to the others (based on their available memory)
+		available -= requiredAlloc * int64(info.DeviceCount)
+
+		// no offloading required
+		if requiredModel+requiredKv <= available {
+			break
+		}
 
 		// fill remaining vram with layers
 		log.Println("splitting", available, "of available memory bytes into layers")
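To make the new accounting easier to follow, here is a self-contained sketch of the same bookkeeping with made-up sizes (two 8 GiB GPUs, a roughly 38 GiB quantized 70B model, 2.5 GiB of kv cache, a 512 MiB graph allocation). The variable names mirror the diff, but the surrounding program, the numbers, and the final even-split layer estimate are illustrative assumptions rather than ollama's actual implementation:

	package main

	import "log"

	// Illustrative numbers only: two 8 GiB GPUs, a ~38 GiB quantized 70B model,
	// 2.5 GiB of kv cache, and a 512 MiB graph/alloc buffer per device.
	func main() {
		const gib = int64(1 << 30)
		deviceCount := int64(2)
		available := 2 * 8 * gib // free VRAM summed across both devices
		requiredModel := 38 * gib
		requiredKv := 5 * gib / 2
		requiredAlloc := gib / 2
		numLayers := int64(80)

		// The alloc buffer is a fixed cost on whichever GPU ends up hosting it,
		// so compare it against the average per-device free memory.
		avgAvailable := available / deviceCount
		if requiredAlloc > avgAvailable {
			log.Println("not enough vram available, falling back to CPU only")
			return
		}

		// We don't know which GPU the scratch buffer lands on, so reserve
		// space for it on every device before splitting layers.
		available -= requiredAlloc * deviceCount

		// No offloading math is needed if the model and kv cache fit outright.
		if requiredModel+requiredKv <= available {
			log.Println("model and kv cache fit entirely; offloading all layers")
			return
		}

		// Otherwise fill the remaining vram with as many layers as fit.
		// (Hypothetical even split; the real layer math lives outside the hunk.)
		perLayer := (requiredModel + requiredKv) / numLayers
		log.Println("offloading", available/perLayer, "of", numLayers, "layers")
	}

With these assumed sizes the sketch reserves 1 GiB for graph allocations across the two devices, finds that the 40.5 GiB of model plus kv cache does not fit in the remaining 15 GiB, and estimates a partial offload of about 29 of 80 layers.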