Use runners for GPU discovery (#12090)

This revamps how we discover GPUs in the system by leveraging the Ollama
runner. This should eliminate inconsistency between our GPU discovery and the
runner's capabilities at runtime, particularly for cases where we try to
filter out unsupported GPUs. Now the runner does that implicitly based on the
actual device list. In some cases free VRAM reporting can be unreliable, which
can lead to scheduling mistakes, so this also includes a patch to leverage
more reliable VRAM reporting libraries if available.
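A minimal sketch of the resulting flow, assuming the scheduler holds a running
server and that ml.DeviceInfo embeds ml.DeviceID (the refreshGPUs helper is
illustrative; only GetDeviceInfos is part of this change):

```go
// refreshGPUs asks the live runner for its current device list instead of
// re-probing vendor libraries up front. Unsupported GPUs never appear here,
// because the runner only reports devices it can actually use.
func refreshGPUs(ctx context.Context, s llm.LlamaServer) []ml.DeviceInfo {
	devices := s.GetDeviceInfos(ctx)
	for _, d := range devices {
		slog.Debug("runner device", "id", d.ID, "library", d.Library)
	}
	return devices
}
```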

Automatic workarounds have been removed, as only one GPU relied on them; that
workaround is now documented. This GPU will soon fall off the support matrix
with the next ROCm bump.

Additional cleanup of the scheduler and discovery packages can be done in the
future, once we have switched on the new memory management code and removed
support for the llama runner.
Daniel Hiltgen
2025-10-01 15:12:32 -07:00
committed by GitHub
parent 6b50f2b9cd
commit bc8909fb38
57 changed files with 3288 additions and 3819 deletions


@@ -66,7 +66,7 @@ func (e filteredEnv) LogValue() slog.Value {
type LlamaServer interface {
ModelPath() string
Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error
Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error)
Ping(ctx context.Context) error
WaitUntilRunning(ctx context.Context) error
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
@@ -76,8 +76,11 @@ type LlamaServer interface {
Close() error
VRAMSize() uint64 // Total VRAM across all GPUs
TotalSize() uint64
VRAMByGPU(gpuID string) uint64
VRAMByGPU(id ml.DeviceID) uint64
Pid() int
GetPort() int
GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
HasExited() bool
}
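With the interface changes above, Load now reports which devices the final
allocation actually used, and per-GPU VRAM lookups are keyed by ml.DeviceID
rather than a bare string ID. A hedged caller-side sketch (the field names on
ml.DeviceID are assumed from the logging calls later in this diff):

```go
// Track VRAM usage for exactly the devices the load landed on.
gpuIDs, err := server.Load(ctx, gpus, false)
if err != nil {
	return err
}
for _, id := range gpuIDs {
	slog.Info("loaded on device", "id", id.ID, "library", id.Library,
		"vram", format.HumanBytes2(server.VRAMByGPU(id)))
}
```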
// llmServer is an instance of a runner hosting a single model
@@ -331,6 +334,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
if gpu.DependencyPath != nil {
slog.Debug("adding gpu dependency paths", "paths", gpu.DependencyPath)
libraryPaths = append(gpu.DependencyPath, libraryPaths...)
ggmlPaths = append(ggmlPaths, gpu.DependencyPath...)
}
}
@@ -361,12 +365,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))
envWorkarounds := []string{}
for _, gpu := range gpus {
envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
}
// Always filter down the set of GPUs in case there are any unsupported devices that might crash
envWorkarounds = append(envWorkarounds, gpus.GetVisibleDevicesEnv()...)
envWorkarounds := gpus.GetVisibleDevicesEnv()
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
// Update or add the path variable with our adjusted version
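The visible-devices filter works by restricting which devices the vendor
library exposes to the runner process, so unsupported devices are never
enumerated at all. A sketch of the idea for the CUDA case (the exact entries
returned by GetVisibleDevicesEnv are backend-specific; the form below is an
assumption):

```go
// Restrict the child process to an allow-list of device indices; the CUDA
// runtime inside the runner will then only ever see devices 0 and 2.
ids := []string{"0", "2"}
cmd.Env = append(cmd.Env, "CUDA_VISIBLE_DEVICES="+strings.Join(ids, ","))
```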
@@ -496,7 +496,7 @@ type LoadResponse struct {
var ErrLoadRequiredFull = errors.New("unable to load full model on GPU")
func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
systemInfo := discover.GetSystemInfo()
systemTotalMemory := systemInfo.System.TotalMemory
systemFreeMemory := systemInfo.System.FreeMemory
@@ -509,7 +509,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
} else {
slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
return ErrLoadRequiredFull
return nil, ErrLoadRequiredFull
}
}
@@ -518,13 +518,13 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
if len(gpus) > 1 || gpus[0].Library != "cpu" {
switch {
case gpus[0].Library == "metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
// disable partial offloading when model is greater than total system memory as this
// can lead to locking up the system
s.options.NumGPU = 0
case gpus[0].Library != "metal" && s.estimate.Layers == 0:
case gpus[0].Library != "Metal" && s.estimate.Layers == 0:
// Don't bother loading into the GPU if no layers can fit
gpus = discover.GetCPUInfo()
gpus = discover.GpuInfoList{discover.GetCPUInfo()}
case s.options.NumGPU < 0 && s.estimate.Layers > 0 && gpus[0].Library != "cpu":
s.options.NumGPU = s.estimate.Layers
}
@@ -537,7 +537,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
available := systemInfo.System.FreeMemory + systemInfo.System.FreeSwap
if systemMemoryRequired > available {
slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.System.TotalMemory), "free", format.HumanBytes2(systemInfo.System.FreeMemory), "swap", format.HumanBytes2(systemInfo.System.FreeSwap))
return fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
}
}
@@ -552,7 +552,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
// mmap has issues with partial offloading on metal
for _, g := range gpus {
if g.Library == "metal" &&
if g.Library == "Metal" &&
uint64(s.options.NumGPU) > 0 &&
uint64(s.options.NumGPU) < s.ggml.KV().BlockCount()+1 {
s.options.UseMMap = new(bool)
@@ -563,7 +563,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
// Windows CUDA should not use mmap for best performance
// Linux with a model larger than free space, mmap leads to thrashing
// For CPU loads we want the memory to be allocated, not FS cache
if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && s.options.UseMMap == nil) ||
if (runtime.GOOS == "windows" && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
(runtime.GOOS == "linux" && systemInfo.System.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) ||
(gpus[0].Library == "cpu" && s.options.UseMMap == nil) ||
(s.options.UseMMap != nil && !*s.options.UseMMap) {
@@ -572,12 +572,12 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
}
if err := s.waitUntilRunnerLaunched(ctx); err != nil {
return err
return nil, err
}
resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
if err != nil {
return err
return nil, err
}
// On the Ollama engine, we can print out a summary of the memory allocations.
@@ -588,16 +588,16 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
if !resp.Success {
slog.Warn("failed to allocate memory for model", "memory", resp.Memory)
return errors.New("failed to allocate memory for model")
return nil, errors.New("failed to allocate memory for model")
}
// The llama engine does its memory allocations together with model loading, so we
// need to wait until it is done to ensure that we have accurate memory data before
// loading the next model
if s.textProcessor == nil {
return s.WaitUntilRunning(ctx)
return uniqueDeviceIDs(s.loadRequest.GPULayers), s.WaitUntilRunning(ctx)
} else {
return nil
return uniqueDeviceIDs(s.loadRequest.GPULayers), nil
}
}
@@ -610,7 +610,7 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu
gpuLayers := make(ml.GPULayersList, len(gpus))
for i := range gpuLayers {
gpuLayers[i].ID = gpus[i].ID
gpuLayers[i].DeviceID = gpus[i].DeviceID
}
var sum float32
@@ -658,7 +658,9 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu
//
// This process is repeated for higher levels of loading the model (fit, allocate, commit). The earlier levels are quicker,
// allowing for faster iteration, but may return less information.
func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
//
// Returns the list of GPU IDs that were used in the final allocation on success
func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
var success bool
defer func() {
if !success {
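The fit/allocate/commit progression mentioned in the comment can be pictured
as a loop over increasingly expensive operations. A sketch, under the
assumption that sibling constants such as LoadOperationFit and
LoadOperationAlloc exist alongside the LoadOperationCommit seen in this diff:

```go
// Cheaper operations run first so the layout can be iterated quickly;
// only a successful commit actually loads the weights.
for _, op := range []LoadOperation{LoadOperationFit, LoadOperationAlloc, LoadOperationCommit} {
	s.loadRequest.GPULayers = gpuLayers
	resp, err := s.initModel(ctx, s.loadRequest, op)
	if err != nil {
		return nil, err
	}
	if !resp.Success {
		// recompute gpuLayers from resp.Memory and retry, as below
	}
}
```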
@@ -683,7 +685,7 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory {
available = 0
}
slog.Info("gpu memory", "id", gpu.ID,
slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library,
"available", format.HumanBytes2(available),
"free", format.HumanBytes2(gpu.FreeMemory),
"minimum", format.HumanBytes2(gpu.MinimumMemory),
@@ -696,11 +698,11 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
if err != nil {
return err
return nil, err
}
if err := s.waitUntilRunnerLaunched(ctx); err != nil {
return err
return nil, err
}
nextOperation:
@@ -710,7 +712,7 @@ nextOperation:
s.loadRequest.GPULayers = gpuLayers
resp, err := s.initModel(ctx, s.loadRequest, operation)
if err != nil {
return err
return nil, err
}
resp.Memory.Log(slog.LevelDebug)
@@ -722,7 +724,7 @@ nextOperation:
for {
newGPULayers, err := s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
if err != nil {
return err
return nil, err
}
slog.Debug("new layout created", "layers", newGPULayers)
@@ -756,7 +758,7 @@ nextOperation:
newGPULayers, err = s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
s.options.NumGPU = -1
if err != nil {
return err
return nil, err
}
slog.Debug("new layout created", "layers", newGPULayers)
@@ -764,7 +766,7 @@ nextOperation:
s.loadRequest.GPULayers = newGPULayers
resp, err = s.initModel(ctx, s.loadRequest, operation)
if err != nil {
return err
return nil, err
}
resp.Memory.Log(slog.LevelDebug)
@@ -773,7 +775,7 @@ nextOperation:
if resp.Success {
verifyGPULayers, err := s.createLayout(systemInfo, gpus, &resp.Memory, requireFull, backoff)
if err != nil {
return err
return nil, err
}
slog.Debug("verifying layout", "layers", verifyGPULayers)
@@ -798,7 +800,7 @@ nextOperation:
}
if s.options.NumGPU >= 0 {
return fmt.Errorf("memory layout cannot be allocated with num_gpu = %v", s.options.NumGPU)
return nil, fmt.Errorf("memory layout cannot be allocated with num_gpu = %v", s.options.NumGPU)
}
// Memory allocation failed even though we created a layout that we thought should
@@ -808,7 +810,7 @@ nextOperation:
// space.
if backoff > 1 {
slog.Warn("memory layout cannot be allocated", "memory", resp.Memory)
return errors.New("memory layout cannot be allocated")
return nil, errors.New("memory layout cannot be allocated")
} else if backoff == 0 {
backoff = 0.01
} else {
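The resulting retry schedule, sketched below (the growth branch is cut off by
the hunk above, so the doubling step here is an assumption): start with no
reserve, jump to 1%, grow geometrically, and give up once more than 100% of
free memory would be held back.

```go
// Illustrative backoff progression: 0 -> 0.01 -> 0.02 -> ... -> >1 (fail).
backoff := float32(0)
for {
	if backoff > 1 {
		return nil, errors.New("memory layout cannot be allocated")
	}
	// ... retry createLayout with each GPU's free memory reduced by backoff ...
	if backoff == 0 {
		backoff = 0.01
	} else {
		backoff *= 2 // assumed growth step
	}
}
```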
@@ -823,7 +825,7 @@ nextOperation:
s.loadRequest.GPULayers = gpuLayers
resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
if err != nil {
return err
return nil, err
}
success = resp.Success
@@ -831,10 +833,27 @@ nextOperation:
if !success {
slog.Warn("failed to commit memory for model", "memory", resp.Memory)
return errors.New("failed to commit memory for model")
return nil, errors.New("failed to commit memory for model")
}
return nil
return uniqueDeviceIDs(gpuLayers), nil
}
func uniqueDeviceIDs(gpuLayers ml.GPULayersList) []ml.DeviceID {
devices := []ml.DeviceID{}
for _, layer := range gpuLayers {
new := true
for _, ID := range devices {
if layer.DeviceID == ID {
new = false
break
}
}
if new {
devices = append(devices, layer.DeviceID)
}
}
return devices
}
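uniqueDeviceIDs preserves first-seen order using a nested scan. Since
ml.DeviceID is compared with == above, it is a comparable struct, so an
equivalent set-based version is also possible, sketched here:

```go
// Equivalent dedup with a map; DeviceID must be comparable, which the
// == comparisons in this file already require.
func uniqueDeviceIDsSet(gpuLayers ml.GPULayersList) []ml.DeviceID {
	seen := make(map[ml.DeviceID]struct{}, len(gpuLayers))
	devices := make([]ml.DeviceID, 0, len(gpuLayers))
	for _, layer := range gpuLayers {
		if _, ok := seen[layer.DeviceID]; !ok {
			seen[layer.DeviceID] = struct{}{}
			devices = append(devices, layer.DeviceID)
		}
	}
	return devices
}
```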
// createLayout uses the current best view of memory requirements and creates a layout of model layers on GPUs.
@@ -879,7 +898,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
for i := range gl {
found := false
for j := range memory.GPUs {
if gl[i].ID == memory.GPUs[j].ID {
if gl[i].DeviceID == memory.GPUs[j].DeviceID {
if memory.GPUs[j].Graph != 0 {
lastUsedGPU = i
}
@@ -891,7 +910,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
gl[i].FreeMemory = 0
}
slog.Debug("available gpu", "id", gl[i].ID,
slog.Debug("available gpu", "id", gl[i].ID, "library", gl[i].Library,
"available layer vram", format.HumanBytes2(gl[i].FreeMemory),
"backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory),
"overhead", format.HumanBytes2(envconfig.GpuOverhead()),
@@ -918,7 +937,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
var vramSize uint64
for _, gl := range gpuLayers {
for _, gpu := range memory.GPUs {
if gl.ID == gpu.ID {
if gl.DeviceID == gpu.DeviceID {
vramSize += gpu.Graph
break
}
@@ -1039,7 +1058,7 @@ func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int
// greedyFit assigns layers incrementally to GPUs, spilling over as each runs out of free space
func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) {
device := len(gpus) - 1
gpuLayers = ml.GPULayersList{{ID: gpus[device].ID}}
gpuLayers = ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}
freeSpace := uint64(float32(gpus[device].FreeMemory) * capacity)
for i := len(layers) - 1; i >= 0; i-- {
if requestedLayers >= 0 && len(layers)-1-i >= requestedLayers {
@@ -1057,7 +1076,7 @@ func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, req
if device < 0 {
return gpuLayers
}
gpuLayers = append(ml.GPULayersList{{ID: gpus[device].ID}}, gpuLayers...)
gpuLayers = append(ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}, gpuLayers...)
freeSpace = uint64(float32(gpus[device].FreeMemory) * capacity)
}
}
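A simplified, self-contained illustration of the spill-over strategy (the real
greedyFit above also honors requestedLayers and scales free memory by a
capacity factor):

```go
// greedySketch walks layer sizes from last to first, packing them onto the
// last GPU until its free space runs out, then spilling to the previous one.
func greedySketch(layers []uint64, free []uint64) [][]int {
	assigned := make([][]int, len(free))
	device := len(free) - 1
	space := free[device]
	for i := len(layers) - 1; i >= 0; i-- {
		for device >= 0 && layers[i] > space {
			device--
			if device >= 0 {
				space = free[device]
			}
		}
		if device < 0 {
			break // out of GPUs; remaining layers stay on the CPU
		}
		assigned[device] = append(assigned[device], i)
		space -= layers[i]
	}
	return assigned
}
```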
@@ -1312,6 +1331,17 @@ func (s *llmServer) Pid() int {
return -1
}
func (s *llmServer) GetPort() int {
return s.port
}
func (s *llmServer) HasExited() bool {
if s.cmd != nil && s.cmd.ProcessState != nil && s.cmd.ProcessState.ExitCode() >= 0 {
return true
}
return false
}
var grammarJSON = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
@@ -1386,7 +1416,7 @@ type CompletionResponse struct {
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
slog.Log(ctx, logutil.LevelTrace, "completion request", "prompt", req.Prompt)
logutil.Trace("completion request", "prompt", req.Prompt)
if len(req.Format) > 0 {
switch string(req.Format) {
@@ -1552,7 +1582,7 @@ type EmbeddingResponse struct {
}
func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
slog.Log(ctx, logutil.LevelTrace, "embedding request", "input", input)
logutil.Trace("embedding request", "input", input)
if err := s.sem.Acquire(ctx, 1); err != nil {
if errors.Is(err, context.Canceled) {
@@ -1704,9 +1734,9 @@ func (s *llamaServer) TotalSize() uint64 {
return s.estimate.TotalSize
}
func (s *llamaServer) VRAMByGPU(gpuID string) uint64 {
func (s *llamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
for i, gpu := range s.gpus {
if gpu.ID == gpuID {
if gpu.DeviceID == id {
if i < len(s.estimate.GPUSizes) {
return s.estimate.GPUSizes[i]
}
@@ -1715,6 +1745,11 @@ func (s *llamaServer) VRAMByGPU(gpuID string) uint64 {
return 0
}
func (s *llamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
slog.Debug("llamarunner free vram reporting not supported")
return nil
}
func (s *ollamaServer) VRAMSize() uint64 {
if s.mem == nil {
return 0
@@ -1757,16 +1792,28 @@ func (s *ollamaServer) TotalSize() uint64 {
return mem
}
func (s *ollamaServer) VRAMByGPU(gpuID string) uint64 {
func (s *ollamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
if s.mem == nil {
return 0
}
for _, g := range s.mem.GPUs {
if g.ID == gpuID {
if g.DeviceID == id {
return g.Size()
}
}
return 0
}
func (s *ollamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
devices, err := discover.GetDevicesFromRunner(ctx, s)
if err != nil {
if s.cmd != nil && s.cmd.ProcessState == nil {
// Still running but hit an error, log
slog.Debug("failure refreshing GPU information", "error", err)
}
// else no longer running so suppress logging as a failure is expected
}
return devices
}
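Taken together, a scheduler can now refresh free VRAM from live runners rather
than trusting discovery-time numbers. A hedged sketch of that usage (the
pollDevices helper is illustrative, not part of this change):

```go
// pollDevices gathers current device info from each live runner; exited
// runners are skipped since a failure from them is expected.
func pollDevices(ctx context.Context, runners []llm.LlamaServer) []ml.DeviceInfo {
	var all []ml.DeviceInfo
	for _, r := range runners {
		if r.HasExited() {
			continue
		}
		all = append(all, r.GetDeviceInfos(ctx)...)
	}
	return all
}
```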