Use runners for GPU discovery (#12090)

This revamps how we discover GPUs in the system by leveraging the Ollama
runner. This should eliminate inconsistency between our GPU discovery and the
runner's capabilities at runtime, particularly for cases where we try to
filter out unsupported GPUs. Now the runner does that implicitly based on the
actual device list. In some cases free VRAM reporting can be unreliable, which
can lead to scheduling mistakes, so this also includes a patch to leverage
more reliable VRAM reporting libraries if available.
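A minimal sketch of the resulting flow, assuming the scheduler holds a running
server and that ml.DeviceInfo embeds ml.DeviceID (the refreshGPUs helper is
illustrative; only GetDeviceInfos is part of this change):

```go
// refreshGPUs asks the live runner for its current device list instead of
// re-probing vendor libraries up front. Unsupported GPUs never appear here,
// because the runner only reports devices it can actually use.
func refreshGPUs(ctx context.Context, s llm.LlamaServer) []ml.DeviceInfo {
	devices := s.GetDeviceInfos(ctx)
	for _, d := range devices {
		slog.Debug("runner device", "id", d.ID, "library", d.Library)
	}
	return devices
}
```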

Automatic workarounds have been removed, as only one GPU relied on them; that
workaround is now documented. This GPU will soon fall off the support matrix
with the next ROCm bump.

Additional cleanup of the scheduler and discovery packages can be done in the
future, once we have switched on the new memory management code and removed
support for the llama runner.
Daniel Hiltgen
2025-10-01 15:12:32 -07:00
committed by GitHub
parent 6b50f2b9cd
commit bc8909fb38
57 changed files with 3288 additions and 3819 deletions


@@ -66,7 +66,7 @@ func (e filteredEnv) LogValue() slog.Value {
type LlamaServer interface {
ModelPath() string
Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error
Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error)
Ping(ctx context.Context) error
WaitUntilRunning(ctx context.Context) error
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
@@ -76,8 +76,11 @@ type LlamaServer interface {
Close() error
VRAMSize() uint64 // Total VRAM across all GPUs
TotalSize() uint64
VRAMByGPU(gpuID string) uint64
VRAMByGPU(id ml.DeviceID) uint64
Pid() int
GetPort() int
GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
HasExited() bool
}
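With the interface changes above, Load now reports which devices the final
allocation actually used, and per-GPU VRAM lookups are keyed by ml.DeviceID
rather than a bare string ID. A hedged caller-side sketch (the field names on
ml.DeviceID are assumed from the logging calls later in this diff):

```go
// Track VRAM usage for exactly the devices the load landed on.
gpuIDs, err := server.Load(ctx, gpus, false)
if err != nil {
	return err
}
for _, id := range gpuIDs {
	slog.Info("loaded on device", "id", id.ID, "library", id.Library,
		"vram", format.HumanBytes2(server.VRAMByGPU(id)))
}
```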
// llmServer is an instance of a runner hosting a single model
@@ -331,6 +334,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
if gpu.DependencyPath != nil {
slog.Debug("adding gpu dependency paths", "paths", gpu.DependencyPath)
libraryPaths = append(gpu.DependencyPath, libraryPaths...)
ggmlPaths = append(ggmlPaths, gpu.DependencyPath...)
}
}
@@ -361,12 +365,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))
envWorkarounds := []string{}
for _, gpu := range gpus {
envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
}
// Always filter down the set of GPUs in case there are any unsupported devices that might crash
envWorkarounds = append(envWorkarounds, gpus.GetVisibleDevicesEnv()...)
envWorkarounds := gpus.GetVisibleDevicesEnv()
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
// Update or add the path variable with our adjusted version
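The visible-devices filter works by restricting which devices the vendor
library exposes to the runner process, so unsupported devices are never
enumerated at all. A sketch of the idea for the CUDA case (the exact entries
returned by GetVisibleDevicesEnv are backend-specific; the form below is an
assumption):

```go
// Restrict the child process to an allow-list of device indices; the CUDA
// runtime inside the runner will then only ever see devices 0 and 2.
ids := []string{"0", "2"}
cmd.Env = append(cmd.Env, "CUDA_VISIBLE_DEVICES="+strings.Join(ids, ","))
```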
@@ -496,7 +496,7 @@ type LoadResponse struct {
var ErrLoadRequiredFull = errors.New("unable to load full model on GPU")
func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
systemInfo := discover.GetSystemInfo()
systemTotalMemory := systemInfo.System.TotalMemory
systemFreeMemory := systemInfo.System.FreeMemory
@@ -509,7 +509,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
} else {
slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
return ErrLoadRequiredFull
return nil, ErrLoadRequiredFull
}
}
@@ -518,13 +518,13 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
if len(gpus) > 1 || gpus[0].Library != "cpu" {
switch {
case gpus[0].Library == "metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
// disable partial offloading when model is greater than total system memory as this
// can lead to locking up the system
s.options.NumGPU = 0
case gpus[0].Library != "metal" && s.estimate.Layers == 0:
case gpus[0].Library != "Metal" && s.estimate.Layers == 0:
// Don't bother loading into the GPU if no layers can fit
gpus = discover.GetCPUInfo()
gpus = discover.GpuInfoList{discover.GetCPUInfo()}
case s.options.NumGPU < 0 && s.estimate.Layers > 0 && gpus[0].Library != "cpu":
s.options.NumGPU = s.estimate.Layers
}
@@ -537,7 +537,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
available := systemInfo.System.FreeMemory + systemInfo.System.FreeSwap
if systemMemoryRequired > available {
slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.System.TotalMemory), "free", format.HumanBytes2(systemInfo.System.FreeMemory), "swap", format.HumanBytes2(systemInfo.System.FreeSwap))
return fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
}
}
@@ -552,7 +552,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
// mmap has issues with partial offloading on metal
for _, g := range gpus {
if g.Library == "metal" &&
if g.Library == "Metal" &&
uint64(s.options.NumGPU) > 0 &&
uint64(s.options.NumGPU) < s.ggml.KV().BlockCount()+1 {
s.options.UseMMap = new(bool)
@@ -563,7 +563,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
// Windows CUDA should not use mmap for best performance
// Linux with a model larger than free space, mmap leads to thrashing
// For CPU loads we want the memory to be allocated, not FS cache
if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && s.options.UseMMap == nil) ||
if (runtime.GOOS == "windows" && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
(runtime.GOOS == "linux" && systemInfo.System.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) ||
(gpus[0].Library == "cpu" && s.options.UseMMap == nil) ||
(s.options.UseMMap != nil && !*s.options.UseMMap) {
@@ -572,12 +572,12 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
}
if err := s.waitUntilRunnerLaunched(ctx); err != nil {
return err
return nil, err
}
resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
if err != nil {
return err
return nil, err
}
// On the Ollama engine, we can print out a summary of the memory allocations.
@@ -588,16 +588,16 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
if !resp.Success {
slog.Warn("failed to allocate memory for model", "memory", resp.Memory)
return errors.New("failed to allocate memory for model")
return nil, errors.New("failed to allocate memory for model")
}
// The llama engine does its memory allocations together with model loading, so we
// need to wait until it is done to ensure that we have accurate memory data before
// loading the next model
if s.textProcessor == nil {
return s.WaitUntilRunning(ctx)
return uniqueDeviceIDs(s.loadRequest.GPULayers), s.WaitUntilRunning(ctx)
} else {
return nil
return uniqueDeviceIDs(s.loadRequest.GPULayers), nil
}
}
@@ -610,7 +610,7 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu
gpuLayers := make(ml.GPULayersList, len(gpus))
for i := range gpuLayers {
gpuLayers[i].ID = gpus[i].ID
gpuLayers[i].DeviceID = gpus[i].DeviceID
}
var sum float32
@@ -658,7 +658,9 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu
//
// This process is repeated for higher levels of loading the model (fit, allocate, commit). The earlier levels are quicker,
// allowing for faster iteration, but may return less information.
func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
//
// Returns the list of GPU IDs that were used in the final allocation on success
func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
var success bool
defer func() {
if !success {
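The fit/allocate/commit progression mentioned in the comment can be pictured
as a loop over increasingly expensive operations. A sketch, under the
assumption that sibling constants such as LoadOperationFit and
LoadOperationAlloc exist alongside the LoadOperationCommit seen in this diff:

```go
// Cheaper operations run first so the layout can be iterated quickly;
// only a successful commit actually loads the weights.
for _, op := range []LoadOperation{LoadOperationFit, LoadOperationAlloc, LoadOperationCommit} {
	s.loadRequest.GPULayers = gpuLayers
	resp, err := s.initModel(ctx, s.loadRequest, op)
	if err != nil {
		return nil, err
	}
	if !resp.Success {
		// recompute gpuLayers from resp.Memory and retry, as below
	}
}
```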
@@ -683,7 +685,7 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory {
available = 0
}
slog.Info("gpu memory", "id", gpu.ID,
slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library,
"available", format.HumanBytes2(available),
"free", format.HumanBytes2(gpu.FreeMemory),
"minimum", format.HumanBytes2(gpu.MinimumMemory),
@@ -696,11 +698,11 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
if err != nil {
return err
return nil, err
}
if err := s.waitUntilRunnerLaunched(ctx); err != nil {
return err
return nil, err
}
nextOperation:
@@ -710,7 +712,7 @@ nextOperation:
s.loadRequest.GPULayers = gpuLayers
resp, err := s.initModel(ctx, s.loadRequest, operation)
if err != nil {
return err
return nil, err
}
resp.Memory.Log(slog.LevelDebug)
@@ -722,7 +724,7 @@ nextOperation:
for {
newGPULayers, err := s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
if err != nil {
return err
return nil, err
}
slog.Debug("new layout created", "layers", newGPULayers)
@@ -756,7 +758,7 @@ nextOperation:
newGPULayers, err = s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
s.options.NumGPU = -1
if err != nil {
return err
return nil, err
}
slog.Debug("new layout created", "layers", newGPULayers)
@@ -764,7 +766,7 @@ nextOperation:
s.loadRequest.GPULayers = newGPULayers
resp, err = s.initModel(ctx, s.loadRequest, operation)
if err != nil {
return err
return nil, err
}
resp.Memory.Log(slog.LevelDebug)
@@ -773,7 +775,7 @@ nextOperation:
if resp.Success {
verifyGPULayers, err := s.createLayout(systemInfo, gpus, &resp.Memory, requireFull, backoff)
if err != nil {
return err
return nil, err
}
slog.Debug("verifying layout", "layers", verifyGPULayers)
@@ -798,7 +800,7 @@ nextOperation:
}
if s.options.NumGPU >= 0 {
return fmt.Errorf("memory layout cannot be allocated with num_gpu = %v", s.options.NumGPU)
return nil, fmt.Errorf("memory layout cannot be allocated with num_gpu = %v", s.options.NumGPU)
}
// Memory allocation failed even though we created a layout that we thought should
@@ -808,7 +810,7 @@ nextOperation:
// space.
if backoff > 1 {
slog.Warn("memory layout cannot be allocated", "memory", resp.Memory)
return errors.New("memory layout cannot be allocated")
return nil, errors.New("memory layout cannot be allocated")
} else if backoff == 0 {
backoff = 0.01
} else {
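The resulting retry schedule, sketched below (the growth branch is cut off by
the hunk above, so the doubling step here is an assumption): start with no
reserve, jump to 1%, grow geometrically, and give up once more than 100% of
free memory would be held back.

```go
// Illustrative backoff progression: 0 -> 0.01 -> 0.02 -> ... -> >1 (fail).
backoff := float32(0)
for {
	if backoff > 1 {
		return nil, errors.New("memory layout cannot be allocated")
	}
	// ... retry createLayout with each GPU's free memory reduced by backoff ...
	if backoff == 0 {
		backoff = 0.01
	} else {
		backoff *= 2 // assumed growth step
	}
}
```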
@@ -823,7 +825,7 @@ nextOperation:
s.loadRequest.GPULayers = gpuLayers
resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
if err != nil {
return err
return nil, err
}
success = resp.Success
@@ -831,10 +833,27 @@ nextOperation:
if !success {
slog.Warn("failed to commit memory for model", "memory", resp.Memory)
return errors.New("failed to commit memory for model")
return nil, errors.New("failed to commit memory for model")
}
return nil
return uniqueDeviceIDs(gpuLayers), nil
}
func uniqueDeviceIDs(gpuLayers ml.GPULayersList) []ml.DeviceID {
devices := []ml.DeviceID{}
for _, layer := range gpuLayers {
new := true
for _, ID := range devices {
if layer.DeviceID == ID {
new = false
break
}
}
if new {
devices = append(devices, layer.DeviceID)
}
}
return devices
}
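uniqueDeviceIDs preserves first-seen order using a nested scan. Since
ml.DeviceID is compared with == above, it is a comparable struct, so an
equivalent set-based version is also possible, sketched here:

```go
// Equivalent dedup with a map; DeviceID must be comparable, which the
// == comparisons in this file already require.
func uniqueDeviceIDsSet(gpuLayers ml.GPULayersList) []ml.DeviceID {
	seen := make(map[ml.DeviceID]struct{}, len(gpuLayers))
	devices := make([]ml.DeviceID, 0, len(gpuLayers))
	for _, layer := range gpuLayers {
		if _, ok := seen[layer.DeviceID]; !ok {
			seen[layer.DeviceID] = struct{}{}
			devices = append(devices, layer.DeviceID)
		}
	}
	return devices
}
```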
// createLayout uses the current best view of memory requirements and creates a layout of model layers on GPUs.
@@ -879,7 +898,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
for i := range gl {
found := false
for j := range memory.GPUs {
if gl[i].ID == memory.GPUs[j].ID {
if gl[i].DeviceID == memory.GPUs[j].DeviceID {
if memory.GPUs[j].Graph != 0 {
lastUsedGPU = i
}
@@ -891,7 +910,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
gl[i].FreeMemory = 0
}
slog.Debug("available gpu", "id", gl[i].ID,
slog.Debug("available gpu", "id", gl[i].ID, "library", gl[i].Library,
"available layer vram", format.HumanBytes2(gl[i].FreeMemory),
"backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory),
"overhead", format.HumanBytes2(envconfig.GpuOverhead()),
@@ -918,7 +937,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
var vramSize uint64
for _, gl := range gpuLayers {
for _, gpu := range memory.GPUs {
if gl.ID == gpu.ID {
if gl.DeviceID == gpu.DeviceID {
vramSize += gpu.Graph
break
}
@@ -1039,7 +1058,7 @@ func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int
// greedyFit assigns layers incrementally to GPUs, spilling over as each runs out of free space
func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) {
device := len(gpus) - 1
gpuLayers = ml.GPULayersList{{ID: gpus[device].ID}}
gpuLayers = ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}
freeSpace := uint64(float32(gpus[device].FreeMemory) * capacity)
for i := len(layers) - 1; i >= 0; i-- {
if requestedLayers >= 0 && len(layers)-1-i >= requestedLayers {
@@ -1057,7 +1076,7 @@ func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, req
if device < 0 {
return gpuLayers
}
gpuLayers = append(ml.GPULayersList{{ID: gpus[device].ID}}, gpuLayers...)
gpuLayers = append(ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}, gpuLayers...)
freeSpace = uint64(float32(gpus[device].FreeMemory) * capacity)
}
}
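A simplified, self-contained illustration of the spill-over strategy (the real
greedyFit above also honors requestedLayers and scales free memory by a
capacity factor):

```go
// greedySketch walks layer sizes from last to first, packing them onto the
// last GPU until its free space runs out, then spilling to the previous one.
func greedySketch(layers []uint64, free []uint64) [][]int {
	assigned := make([][]int, len(free))
	device := len(free) - 1
	space := free[device]
	for i := len(layers) - 1; i >= 0; i-- {
		for device >= 0 && layers[i] > space {
			device--
			if device >= 0 {
				space = free[device]
			}
		}
		if device < 0 {
			break // out of GPUs; remaining layers stay on the CPU
		}
		assigned[device] = append(assigned[device], i)
		space -= layers[i]
	}
	return assigned
}
```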
@@ -1312,6 +1331,17 @@ func (s *llmServer) Pid() int {
return -1
}
func (s *llmServer) GetPort() int {
return s.port
}
func (s *llmServer) HasExited() bool {
if s.cmd != nil && s.cmd.ProcessState != nil && s.cmd.ProcessState.ExitCode() >= 0 {
return true
}
return false
}
var grammarJSON = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
@@ -1386,7 +1416,7 @@ type CompletionResponse struct {
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
slog.Log(ctx, logutil.LevelTrace, "completion request", "prompt", req.Prompt)
logutil.Trace("completion request", "prompt", req.Prompt)
if len(req.Format) > 0 {
switch string(req.Format) {
@@ -1552,7 +1582,7 @@ type EmbeddingResponse struct {
}
func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
slog.Log(ctx, logutil.LevelTrace, "embedding request", "input", input)
logutil.Trace("embedding request", "input", input)
if err := s.sem.Acquire(ctx, 1); err != nil {
if errors.Is(err, context.Canceled) {
@@ -1704,9 +1734,9 @@ func (s *llamaServer) TotalSize() uint64 {
return s.estimate.TotalSize
}
func (s *llamaServer) VRAMByGPU(gpuID string) uint64 {
func (s *llamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
for i, gpu := range s.gpus {
if gpu.ID == gpuID {
if gpu.DeviceID == id {
if i < len(s.estimate.GPUSizes) {
return s.estimate.GPUSizes[i]
}
@@ -1715,6 +1745,11 @@ func (s *llamaServer) VRAMByGPU(gpuID string) uint64 {
return 0
}
func (s *llamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
slog.Debug("llamarunner free vram reporting not supported")
return nil
}
func (s *ollamaServer) VRAMSize() uint64 {
if s.mem == nil {
return 0
@@ -1757,16 +1792,28 @@ func (s *ollamaServer) TotalSize() uint64 {
return mem
}
func (s *ollamaServer) VRAMByGPU(gpuID string) uint64 {
func (s *ollamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
if s.mem == nil {
return 0
}
for _, g := range s.mem.GPUs {
if g.ID == gpuID {
if g.DeviceID == id {
return g.Size()
}
}
return 0
}
func (s *ollamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
devices, err := discover.GetDevicesFromRunner(ctx, s)
if err != nil {
if s.cmd != nil && s.cmd.ProcessState == nil {
// Still running but hit an error, log
slog.Debug("failure refreshing GPU information", "error", err)
}
// else no longer running so suppress logging as a failure is expected
}
return devices
}
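Taken together, a scheduler can now refresh free VRAM from live runners rather
than trusting discovery-time numbers. A hedged sketch of that usage (the
pollDevices helper is illustrative, not part of this change):

```go
// pollDevices gathers current device info from each live runner; exited
// runners are skipped since a failure from them is expected.
func pollDevices(ctx context.Context, runners []llm.LlamaServer) []ml.DeviceInfo {
	var all []ml.DeviceInfo
	for _, r := range runners {
		if r.HasExited() {
			continue
		}
		all = append(all, r.GetDeviceInfos(ctx)...)
	}
	return all
}
```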