ggml: Disable unused pipeline parallelism

We don't currently use pipeline parallelism, even in cases where we could.
Disabling it improves generation performance by 10-30% with multiple GPUs.
This commit is contained in:
Jesse Gross
2025-07-10 16:55:34 -07:00
committed by Jesse Gross
parent f8a6e88819
commit 9a43994c45

View File

@@ -418,7 +418,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
C.int(len(schedBackends)),
C.size_t(maxGraphNodes),
C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
C._Bool(false),
C._Bool(false),
),
schedBackends: schedBackends,