server: allow running embed models in parallel

The ability to run embedding models in parallel with other model types was
removed in a past version of the server because of limitations in its slot
loading system. That slot loading system is no longer used, so embedding
models can once again run in parallel with chat models.
Bruce MacDonald 2025-03-10 13:34:02 -07:00
parent d8a5d96b98
commit 12a8b00b34
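
As a rough sketch of the user-visible effect, and not part of this commit: with the scheduler no longer pinning embedding-only models to parallel=1, an embedding request and a chat request can be in flight at the same time. The server address and the model names (all-minilm, llama3.2) below are assumptions; any embedding and chat models already pulled locally would do. The /api/embed and /api/generate endpoints are Ollama's standard HTTP API.

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"sync"
)

// post sends a JSON body to a local Ollama server and returns the raw response.
func post(path, body string) (string, error) {
	resp, err := http.Post("http://localhost:11434"+path, "application/json",
		bytes.NewBufferString(body))
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	b, err := io.ReadAll(resp.Body)
	return string(b), err
}

func main() {
	var wg sync.WaitGroup
	wg.Add(2)

	// Embedding request: after this change it can be served while a chat
	// model stays loaded, rather than forcing the runner to swap models.
	go func() {
		defer wg.Done()
		out, err := post("/api/embed", `{"model": "all-minilm", "input": "hello world"}`)
		fmt.Println("embed:", out, err)
	}()

	// Chat/completion request issued concurrently with the embedding request.
	go func() {
		defer wg.Done()
		out, err := post("/api/generate", `{"model": "llama3.2", "prompt": "hi", "stream": false}`)
		fmt.Println("generate:", out, err)
	}()

	wg.Wait()
}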

@@ -194,11 +194,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				break
 			}
-			// Embedding models should always be loaded with parallel=1
-			if pending.model.CheckCapabilities(CapabilityCompletion) != nil {
-				numParallel = 1
-			}
-
 			// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 			if len(gpus) == 1 && gpus[0].Library == "cpu" {
 				// simplifying assumption of defaultParallel when in CPU mode
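
For context on the removed check: CheckCapabilities returns a non-nil error when the model lacks a requested capability, so a model without CapabilityCompletion (an embedding-only model) was forced to numParallel = 1. A minimal sketch of that pattern follows; the Capability and Model types here are simplified stand-ins for illustration, not ollama's actual definitions.

package main

import (
	"errors"
	"fmt"
)

// Capability and Model are simplified stand-ins for the server's types.
type Capability string

const CapabilityCompletion Capability = "completion"

type Model struct {
	capabilities map[Capability]bool
}

// CheckCapabilities returns a non-nil error when a requested capability is
// missing, mirroring the nil/non-nil contract the removed check relied on.
func (m *Model) CheckCapabilities(want ...Capability) error {
	for _, c := range want {
		if !m.capabilities[c] {
			return errors.New("model missing capability: " + string(c))
		}
	}
	return nil
}

func main() {
	embedOnly := &Model{capabilities: map[Capability]bool{}}
	chat := &Model{capabilities: map[Capability]bool{CapabilityCompletion: true}}

	for _, m := range []*Model{embedOnly, chat} {
		numParallel := 4
		// The removed scheduler logic: models without the completion
		// capability (embedding-only models) were pinned to parallel=1.
		if m.CheckCapabilities(CapabilityCompletion) != nil {
			numParallel = 1
		}
		fmt.Println("numParallel:", numParallel) // 1 for embedOnly, 4 for chat
	}
}

With this commit the scheduler drops that special case, so numParallel for an embedding model is computed the same way as for any other model.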