From 12a8b00b34dfd64b3f59234b3ae4d0ba4a0e0937 Mon Sep 17 00:00:00 2001
From: Bruce MacDonald
Date: Mon, 10 Mar 2025 13:34:02 -0700
Subject: [PATCH] server: allow running embed models in parallel

The ability to run embedding models in parallel with other model types
was removed in an earlier version of the server because of limitations
in its slot loading system. That slot loading system is no longer used,
so embedding models can once again run in parallel with chat models.
---
 server/sched.go | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/server/sched.go b/server/sched.go
index b4600dbf7..091833c7c 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -194,11 +194,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					break
 				}
 
-				// Embedding models should always be loaded with parallel=1
-				if pending.model.CheckCapabilities(CapabilityCompletion) != nil {
-					numParallel = 1
-				}
-
 				// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 				if len(gpus) == 1 && gpus[0].Library == "cpu" {
 					// simplifying assumption of defaultParallel when in CPU mode
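
Below is a minimal client-side sketch of what this change enables: an
embedding request and a chat request issued concurrently against the
same server. It uses the public github.com/ollama/ollama/api Go client;
the model names ("nomic-embed-text", "llama3.2") are placeholders for
whichever embedding and chat models are pulled locally, not anything
this patch prescribes.

package main

import (
	"context"
	"fmt"
	"log"
	"sync"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	ctx := context.Background()
	var wg sync.WaitGroup

	// Embedding request. Before this patch the scheduler pinned any
	// model lacking the completion capability to numParallel=1.
	wg.Add(1)
	go func() {
		defer wg.Done()
		resp, err := client.Embeddings(ctx, &api.EmbeddingRequest{
			Model:  "nomic-embed-text", // placeholder embedding model
			Prompt: "The quick brown fox",
		})
		if err != nil {
			log.Println("embed:", err)
			return
		}
		fmt.Println("embedding length:", len(resp.Embedding))
	}()

	// Chat request running at the same time as the embedding request.
	wg.Add(1)
	go func() {
		defer wg.Done()
		req := &api.ChatRequest{
			Model:    "llama3.2", // placeholder chat model
			Messages: []api.Message{{Role: "user", Content: "Say hi."}},
		}
		err := client.Chat(ctx, req, func(r api.ChatResponse) error {
			fmt.Print(r.Message.Content)
			return nil
		})
		if err != nil {
			log.Println("chat:", err)
		}
	}()

	wg.Wait()
}

With the removed guard in place, the embedding model would have been
forced to load with a single slot; with it gone, the scheduler treats
embedding models like any other model, so the two requests above can be
served by concurrently loaded runners.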