From 26c2e0bd35feb7f958924269ccfba6331a1dadbc Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 26 Feb 2025 14:17:08 -0800 Subject: [PATCH] ml/backend/ggml: handle user specified cpu offloading --- ml/backend/ggml/ggml.go | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index f60619454..5c1e55b5f 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -67,7 +67,7 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) { "num_key_values", len(meta.KV()), ) - type dbt struct { + type deviceBufferType struct { d *C.struct_ggml_backend_device bts []*C.struct_ggml_backend_buffer_type } @@ -96,7 +96,7 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) { var sum uint64 var cumsum []uint64 - var gpuBufferTypes []dbt + var gpuDeviceBufferTypes []deviceBufferType for _, d := range gpus { var free, total C.size_t C.ggml_backend_dev_memory(d, &free, &total) @@ -104,7 +104,7 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) { cumsum = append(cumsum, sum) bt := C.ggml_backend_dev_buffer_type(d) - gpuBufferTypes = append(gpuBufferTypes, dbt{ + gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{ d: d, bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuBufferTypes...), }) @@ -115,7 +115,8 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) { splits[i] = float64(cumsum[i]) / float64(sum) } - input := dbt{C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU), cpuBufferTypes} + cpuDeviceBufferTypes := deviceBufferType{C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU), cpuBufferTypes} + input := cpuDeviceBufferTypes var blocks int for key, value := range meta.KV() { @@ -124,18 +125,22 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) { } } - indexFunc := func(i int) func(float64) bool { - return func(f float64) bool { - return float64(i)/float64(blocks+1) < f + assignLayer := func(i int) (temp deviceBufferType) { + if i >= params.NumGPULayers { + return cpuDeviceBufferTypes } + + return gpuDeviceBufferTypes[slices.IndexFunc(splits, func(f float64) bool { + return float64(i)/float64(blocks+1) < f + })] } - layers := make([]dbt, blocks) + layers := make([]deviceBufferType, blocks) for i := range layers { - layers[i] = gpuBufferTypes[slices.IndexFunc(splits, indexFunc(i))] + layers[i] = assignLayer(i) } - output := gpuBufferTypes[slices.IndexFunc(splits, indexFunc(blocks))] + output := assignLayer(blocks) maxTensors := len(meta.Tensors().Items()) maxTensors += 1