diff --git a/server/sched.go b/server/sched.go index 64ed20dd0..fa034d28d 100644 --- a/server/sched.go +++ b/server/sched.go @@ -123,36 +123,35 @@ func (s *Scheduler) processPending(ctx context.Context) { pending.useLoadedRunner(runner, s.finishedReqCh) break } - } else if loadedCount == 0 { - slog.Debug("loading first model", "model", pending.model.ModelPath) - gpus := s.getGpuFn() - - ggml, err := llm.LoadModel(pending.model.ModelPath) - if err != nil { - pending.errCh <- err - break - } - g := pickBestFitGPUs(pending, ggml, gpus) - if g != nil { - gpus = g - } - s.loadFn(pending, ggml, gpus) - break } else if loadedMax > 0 && loadedCount >= loadedMax { slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount) runnerToExpire = s.findRunnerToUnload(pending) } else { - // More than one loaded model, so we have to see if the new one fits + // Either no models are loaded or below loadedMax // Get a refreshed GPU list gpus := s.getGpuFn() - // Update free memory from currently loaded models - s.updateFreeSpace(gpus) + // Load model for fitting ggml, err := llm.LoadModel(pending.model.ModelPath) if err != nil { pending.errCh <- err break } + + // No models loaded. Load the model but prefer the best fit. + if loadedCount == 0 { + slog.Debug("loading first model", "model", pending.model.ModelPath) + g := pickBestFitGPUs(pending, ggml, gpus) + if g != nil { + gpus = g + } + s.loadFn(pending, ggml, gpus) + break + } + + // More than one loaded model, so we have to see if the new one fits + // Update free memory from currently loaded models + s.updateFreeSpace(gpus) gpus = pickBestFitGPUs(pending, ggml, gpus) if gpus != nil { slog.Debug("new model fits with existing models, loading") diff --git a/server/sched_test.go b/server/sched_test.go index e451d84ad..3b06e2bab 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -47,7 +47,7 @@ func TestLoad(t *testing.T) { ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond) defer done() s := InitScheduler(ctx) - ggml := nil // value not used in tests + var ggml *llm.GGML // value not used in tests req := &LlmRequest{ ctx: ctx, model: &Model{ModelPath: "foo"},