From 9a43994c45f8da1b21fd302d5ef000cee36c4e16 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 10 Jul 2025 16:55:34 -0700 Subject: [PATCH] ggml: Disable unused pipeline parallelism We're not currently using it, even in cases where we could. Disabling it improves generation performance by 10-30% with multiple GPUs. --- ml/backend/ggml/ggml.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 680910f8dd..7d6831eed0 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -418,7 +418,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { (*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])), C.int(len(schedBackends)), C.size_t(maxGraphNodes), - C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)), + C._Bool(false), C._Bool(false), ), schedBackends: schedBackends,