mirror of
https://github.com/ollama/ollama.git
synced 2025-08-25 09:51:25 +02:00
ggml: Disable unused pipeline parallelism
Pipeline parallelism is not currently used, even in cases where it could be. Disabling it improves generation performance by 10-30% with multiple GPUs.
This commit is contained in:
@@ -418,7 +418,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
|
||||
C.int(len(schedBackends)),
|
||||
C.size_t(maxGraphNodes),
|
||||
C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
|
||||
C._Bool(false),
|
||||
C._Bool(false),
|
||||
),
|
||||
schedBackends: schedBackends,
|
||||
|
Reference in New Issue
Block a user