From 791650ddef9eb11e011506dbd5d22ed6bfcb6a10 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Thu, 11 Jul 2024 00:53:12 -0700 Subject: [PATCH] sched: only error when over-allocating system memory (#5626) --- llm/server.go | 9 +++++++++ server/sched.go | 37 ------------------------------------- 2 files changed, 9 insertions(+), 37 deletions(-) diff --git a/llm/server.go b/llm/server.go index aa504d193..07c58cfff 100644 --- a/llm/server.go +++ b/llm/server.go @@ -122,6 +122,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } } + // On linux, over-allocating CPU memory will almost always result in an error + if runtime.GOOS == "linux" { + systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize + if systemMemoryRequired > systemTotalMemory { + slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "system", format.HumanBytes2(systemTotalMemory)) + return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(systemTotalMemory)) + } + } + estimate.log() // Loop through potential servers diff --git a/server/sched.go b/server/sched.go index 48047bfec..2daed3abb 100644 --- a/server/sched.go +++ b/server/sched.go @@ -135,11 +135,6 @@ func (s *Scheduler) processPending(ctx context.Context) { } for { - cpus := s.getCpuFn() - var systemMem gpu.GpuInfo - if len(cpus) > 0 { - systemMem = cpus[0] - } var runnerToExpire *runnerRef s.loadedMu.Lock() runner := s.loaded[pending.model.ModelPath] @@ -193,38 +188,6 @@ func (s *Scheduler) processPending(ctx context.Context) { break } - estimate := llm.EstimateGPULayers(gpus, ggml, pending.model.ProjectorPaths, pending.opts) - maxSize := systemMem.FreeMemory - - // Add available GPU memory to the total pool - // macOS hardware has unified memory so don't double count - if runtime.GOOS != "darwin" { - for _, gpu := range gpus { - if gpu.Library == "cpu" { - continue - } - if loadedCount == 0 { - // If no other models are loaded, set the limit based on what's available - maxSize += gpu.FreeMemory - } else { - // Other models could be unloaded, favor total memory for limit - maxSize += gpu.TotalMemory - } - } - } - - // Block attempting to load a model larger than system memory + GPU memory - if estimate.TotalSize > maxSize { - slog.Warn("model request too large for system", "requested", format.HumanBytes2(estimate.TotalSize), "system", format.HumanBytes2(maxSize)) - - // Linux will crash if over-allocating memory - return an error to the user. - // TODO (jmorganca): add reasonable upper limits for darwin and windows as well - if runtime.GOOS == "linux" { - pending.errCh <- fmt.Errorf("requested model (%s) is too large for this system (%s)", format.HumanBytes2(estimate.TotalSize), format.HumanBytes2(maxSize)) - break - } - } - // Evaluate if the model will fit in the available system memory, or if we should unload a model first if len(gpus) == 1 && gpus[0].Library == "cpu" { // simplifying assumption of defaultParallel when in CPU mode