Mirror of https://github.com/ollama/ollama.git (synced 2025-07-08 21:00:48 +02:00)
Increase minimum CUDA memory allocation overhead and fix minimum overhead for multi-gpu (#1896)
* increase minimum cuda overhead and fix minimum overhead for multi-gpu
* fix multi gpu overhead
* limit overhead to 10% of all gpus
* better wording
* allocate fixed amount before layers
* fixed only includes graph alloc
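In practical terms, the change makes the reserved overhead scale with the number of GPUs. As a hypothetical illustration (numbers not from the PR): with two GPUs exposing 16 GiB of free VRAM in total and a 512 MiB graph allocation, the old code checked the 512 MiB against the 16 GiB total and subtracted it once, while the new code checks it against the 8 GiB per-device average and reserves 512 MiB on every device (1 GiB total) before deciding how many layers to offload.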
llm/llm.go (22 lines changed)
@@ -95,20 +95,26 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 			break
 		}
 
-		// no offloading required
-		if requiredTotal <= available {
-			break
-		}
-
-		// requiredAlloc is always loaded for the CUDA runner, so don't load it if it won't fit
-		if requiredAlloc > available {
+		// alloc buffer and kv cache is allocated as a fixed amount on the main gpu
+		// TODO: find the largest GPU and only reserve memory there
+		avgAvailable := available / int64(info.DeviceCount)
+		if requiredAlloc > avgAvailable {
 			log.Printf("not enough vram available, falling back to CPU only")
 			library = "cpu"
 			opts.NumGPU = 0
 			break
 		}
 
-		available -= requiredAlloc
+		// we don't know which GPU will be used, so estimate
+		// the scratch buffer space on all of them
+		// TODO: allocate less layers to the GPU with the scratch buffer
+		// and more to the others (based on their available memory)
+		available -= requiredAlloc * int64(info.DeviceCount)
+
+		// no offloading required
+		if requiredModel+requiredKv <= available {
+			break
+		}
 
 		// fill remaining vram with layers
 		log.Println("splitting", available, "of available memory bytes into layers")
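To make the new accounting easier to follow, here is a self-contained sketch of the same bookkeeping with made-up sizes (two 8 GiB GPUs, a roughly 38 GiB quantized 70B model, 2.5 GiB of kv cache, a 512 MiB graph allocation). The variable names mirror the diff, but the surrounding program, the numbers, and the final even-split layer estimate are illustrative assumptions rather than ollama's actual implementation:

	package main

	import "log"

	// Illustrative numbers only: two 8 GiB GPUs, a ~38 GiB quantized 70B model,
	// 2.5 GiB of kv cache, and a 512 MiB graph/alloc buffer per device.
	func main() {
		const gib = int64(1 << 30)
		deviceCount := int64(2)
		available := 2 * 8 * gib // free VRAM summed across both devices
		requiredModel := 38 * gib
		requiredKv := 5 * gib / 2
		requiredAlloc := gib / 2
		numLayers := int64(80)

		// The alloc buffer is a fixed cost on whichever GPU ends up hosting it,
		// so compare it against the average per-device free memory.
		avgAvailable := available / deviceCount
		if requiredAlloc > avgAvailable {
			log.Println("not enough vram available, falling back to CPU only")
			return
		}

		// We don't know which GPU the scratch buffer lands on, so reserve
		// space for it on every device before splitting layers.
		available -= requiredAlloc * deviceCount

		// No offloading math is needed if the model and kv cache fit outright.
		if requiredModel+requiredKv <= available {
			log.Println("model and kv cache fit entirely; offloading all layers")
			return
		}

		// Otherwise fill the remaining vram with as many layers as fit.
		// (Hypothetical even split; the real layer math lives outside the hunk.)
		perLayer := (requiredModel + requiredKv) / numLayers
		log.Println("offloading", available/perLayer, "of", numLayers, "layers")
	}

With these assumed sizes the sketch reserves 1 GiB for graph allocations across the two devices, finds that the 40.5 GiB of model plus kv cache does not fit in the remaining 15 GiB, and estimates a partial offload of about 29 of 80 layers.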