update memory calculations

count each layer independently when deciding gpu offloading
Michael Yang
2024-03-18 10:45:22 +01:00
parent d338d70492
commit 91b3e4d282
7 changed files with 125 additions and 89 deletions
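
The commit replaces a uniform per-GPU VRAM heuristic with per-library minimum-memory floors and, per the message above, sizes each layer on its own when deciding how many layers to offload. Only the GPU-discovery side of the change is reproduced below; the layer-counting itself lives in the other changed files. A minimal sketch of the idea, assuming a hypothetical countOffloadLayers helper and layerSizes slice (illustrative only, not the actual ollama API):

package sketch

// countOffloadLayers walks the layers in order and counts how many fit
// into the VRAM left after the library-specific floor is reserved,
// sizing each layer independently rather than assuming a uniform
// per-layer cost.
func countOffloadLayers(freeVRAM, minimumMemory int64, layerSizes []int64) int {
	available := freeVRAM - minimumMemory
	n := 0
	for _, size := range layerSizes {
		if available < size {
			break
		}
		available -= size
		n++
	}
	return n
}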

View File

@@ -20,6 +20,8 @@ import (
 	"strings"
 	"sync"
 	"unsafe"
+
+	"github.com/ollama/ollama/format"
 )
 
 type handles struct {
@@ -27,6 +29,11 @@ type handles struct {
 	cudart *C.cudart_handle_t
 }
 
+const (
+	cudaMinimumMemory = 377 * format.MebiByte
+	rocmMinimumMemory = 377 * format.MebiByte
+)
+
 var gpuMutex sync.Mutex
 var gpuHandles *handles = nil
@@ -168,6 +175,7 @@ func GetGPUInfo() GpuInfo {
 		} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
 			slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 			resp.Library = "cuda"
+			resp.MinimumMemory = cudaMinimumMemory
 		} else {
 			slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 		}
@@ -187,6 +195,7 @@ func GetGPUInfo() GpuInfo {
 		} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
 			slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 			resp.Library = "cuda"
+			resp.MinimumMemory = cudaMinimumMemory
 		} else {
 			slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 		}
@@ -194,6 +203,7 @@ func GetGPUInfo() GpuInfo {
 	} else {
 		AMDGetGPUInfo(&resp)
 		if resp.Library != "" {
+			resp.MinimumMemory = rocmMinimumMemory
 			return resp
 		}
 	}
@@ -239,20 +249,7 @@ func CheckVRAM() (int64, error) {
 	}
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		// leave 10% or 1024MiB of VRAM free per GPU to handle unaccounted-for overhead
-		overhead := gpuInfo.FreeMemory / 10
-		gpus := uint64(gpuInfo.DeviceCount)
-		if overhead < gpus*1024*1024*1024 {
-			overhead = gpus * 1024 * 1024 * 1024
-		}
-		// assign the full reported free memory on Tegra, where the OS controls caching
-		if CudaTegra != "" {
-			// zero the overhead reservation for Tegra devices
-			overhead = 0
-		}
-		avail := int64(gpuInfo.FreeMemory - overhead)
-		slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
-		return avail, nil
+		return int64(gpuInfo.FreeMemory), nil
 	}
 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determination
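
The deleted block above was the only consumer of the reserve-10%-or-1 GiB-per-GPU heuristic: CheckVRAM now reports free memory as-is, and the fixed 377 MiB floor attached to GpuInfo is left for callers to subtract. A rough before/after comparison for a single GPU reporting 8 GiB free, assuming format.GibiByte exists alongside format.MebiByte:

free := int64(8 * format.GibiByte)
// before: overhead = max(free/10, 1 GiB per device) -> avail ≈ 7.0 GiB
// after:  CheckVRAM returns free unchanged and the caller
//         applies the MinimumMemory floor            -> avail ≈ 7.6 GiB
avail := free - 377*format.MebiByte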

View File

@@ -14,6 +14,9 @@ type GpuInfo struct {
 	// Optional variant to select (e.g. versions, cpu feature flags)
 	Variant string `json:"variant,omitempty"`
 
+	// MinimumMemory represents the minimum memory required to use the GPU
+	MinimumMemory int64 `json:"-"`
+
 	// TODO add other useful attributes about the card here for discovery information
 }
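
The json:"-" tag keeps the new field out of serialized GPU info: it is internal bookkeeping for offload decisions, not part of the discovery payload. A small sketch, assuming the GpuInfo type above is in scope and its other fields are zero-valued:

// assumes: import ("encoding/json"; "github.com/ollama/ollama/format")
func example() {
	info := GpuInfo{Library: "cuda", MinimumMemory: 377 * format.MebiByte}
	b, _ := json.Marshal(info)
	// The output names Library but never MinimumMemory: json:"-"
	// excludes the field from marshaling and unmarshaling entirely.
	_ = b
}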