From 12a8b00b34dfd64b3f59234b3ae4d0ba4a0e0937 Mon Sep 17 00:00:00 2001
From: Bruce MacDonald
Date: Mon, 10 Mar 2025 13:34:02 -0700
Subject: [PATCH] server: allow running embed models in parallel

The ability to run embedding models in parallel with other model types
was removed in an earlier version of the server because of limitations
in its slot loading system. That slot loading system is no longer used,
so embedding models can once again run in parallel with chat models.
---
 server/sched.go | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/server/sched.go b/server/sched.go
index b4600dbf7..091833c7c 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -194,11 +194,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					break
 				}
 
-				// Embedding models should always be loaded with parallel=1
-				if pending.model.CheckCapabilities(CapabilityCompletion) != nil {
-					numParallel = 1
-				}
-
 				// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 				if len(gpus) == 1 && gpus[0].Library == "cpu" {
 					// simplifying assumption of defaultParallel when in CPU mode
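
Below is a minimal client-side sketch of what this change enables: an
embedding request and a chat request issued concurrently against the
same server. It uses the public github.com/ollama/ollama/api Go client;
the model names ("nomic-embed-text", "llama3.2") are placeholders for
whichever embedding and chat models are pulled locally, not anything
this patch prescribes.

package main

import (
	"context"
	"fmt"
	"log"
	"sync"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	ctx := context.Background()
	var wg sync.WaitGroup

	// Embedding request. Before this patch the scheduler pinned any
	// model lacking the completion capability to numParallel=1.
	wg.Add(1)
	go func() {
		defer wg.Done()
		resp, err := client.Embeddings(ctx, &api.EmbeddingRequest{
			Model:  "nomic-embed-text", // placeholder embedding model
			Prompt: "The quick brown fox",
		})
		if err != nil {
			log.Println("embed:", err)
			return
		}
		fmt.Println("embedding length:", len(resp.Embedding))
	}()

	// Chat request running at the same time as the embedding request.
	wg.Add(1)
	go func() {
		defer wg.Done()
		req := &api.ChatRequest{
			Model:    "llama3.2", // placeholder chat model
			Messages: []api.Message{{Role: "user", Content: "Say hi."}},
		}
		err := client.Chat(ctx, req, func(r api.ChatResponse) error {
			fmt.Print(r.Message.Content)
			return nil
		})
		if err != nil {
			log.Println("chat:", err)
		}
	}()

	wg.Wait()
}

With the removed guard in place, the embedding model would have been
forced to load with a single slot; with it gone, the scheduler treats
embedding models like any other model, so the two requests above can be
served by concurrently loaded runners.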