Mirror of https://github.com/ollama/ollama.git (synced 2025-04-02 09:00:28 +02:00)
server: allow running embed models in parallel
The ability to run embedding models in parallel with other types of models was removed in a past version of the server due to limitations in its slot loading system. That slot loading system is no longer used, so embedding models can once again run in parallel with chat models.
commit 12a8b00b34
parent d8a5d96b98
@@ -194,11 +194,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					break
 				}
 
-				// Embedding models should always be loaded with parallel=1
-				if pending.model.CheckCapabilities(CapabilityCompletion) != nil {
-					numParallel = 1
-				}
-
 				// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 				if len(gpus) == 1 && gpus[0].Library == "cpu" {
 					// simplifying assumption of defaultParallel when in CPU mode
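For illustration, here is a minimal client-side sketch of what this change enables: an embedding request and a generation request issued concurrently against a local Ollama server, which the scheduler can now serve with both models loaded in parallel. It assumes the default server address localhost:11434 and the /api/embed and /api/generate endpoints; the model names are placeholders, not part of this commit.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"sync"
)

// post sends a JSON body to an Ollama endpoint and returns the HTTP status.
func post(path string, body map[string]any) (string, error) {
	buf, err := json.Marshal(body)
	if err != nil {
		return "", err
	}
	resp, err := http.Post("http://localhost:11434"+path, "application/json", bytes.NewReader(buf))
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	return resp.Status, nil
}

func main() {
	var wg sync.WaitGroup
	wg.Add(2)

	// Embedding request; "nomic-embed-text" is a placeholder model name.
	go func() {
		defer wg.Done()
		status, err := post("/api/embed", map[string]any{
			"model": "nomic-embed-text",
			"input": "the quick brown fox",
		})
		fmt.Println("embed:", status, err)
	}()

	// Generation request running at the same time; "llama3" is a placeholder.
	go func() {
		defer wg.Done()
		status, err := post("/api/generate", map[string]any{
			"model":  "llama3",
			"prompt": "Why is the sky blue?",
			"stream": false,
		})
		fmt.Println("generate:", status, err)
	}()

	wg.Wait()
}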