ml/backend/ggml: handle user specified cpu offloading

Repository: https://github.com/ollama/ollama.git
commit 26c2e0bd35 (parent bf920883d5)

The change renames the dbt struct to deviceBufferType and replaces indexFunc with assignLayer: any layer at or beyond the user-specified params.NumGPULayers is placed on the CPU buffer types instead of a GPU.
@@ -67,7 +67,7 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 		"num_key_values", len(meta.KV()),
 	)

-	type dbt struct {
+	type deviceBufferType struct {
 		d   *C.struct_ggml_backend_device
 		bts []*C.struct_ggml_backend_buffer_type
 	}
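The struct pairs a backend device with its buffer types in preference order; as the -104 hunk below shows, each GPU entry appends the CPU buffer types after the device's own, giving every device a host-memory fallback. A minimal sketch of that ordering with plain Go stand-ins for the cgo types (the tryAlloc helper and the free field are hypothetical, not ollama API):

```go
package main

import "fmt"

// Plain Go stand-ins for the cgo types in the diff
// (*C.struct_ggml_backend_device, *C.struct_ggml_backend_buffer_type).
type device struct{ name string }

type bufferType struct {
	name string
	free uint64 // hypothetical capacity field, for this sketch only
}

// deviceBufferType mirrors the renamed struct: a device plus its
// buffer types in preference order (own type first, CPU types last).
type deviceBufferType struct {
	d   *device
	bts []*bufferType
}

// tryAlloc is a hypothetical helper: walk bts in order and take the
// first buffer type with enough room, so host memory acts as a fallback.
func tryAlloc(dbt deviceBufferType, size uint64) (*bufferType, bool) {
	for _, bt := range dbt.bts {
		if bt.free >= size {
			return bt, true
		}
	}
	return nil, false
}

func main() {
	gpu := &bufferType{name: "CUDA0", free: 8 << 30} // 8 GiB
	cpu := &bufferType{name: "CPU", free: 64 << 30}  // 64 GiB

	dbt := deviceBufferType{
		d:   &device{name: "CUDA0"},
		bts: []*bufferType{gpu, cpu}, // GPU's own type first, CPU after
	}

	if bt, ok := tryAlloc(dbt, 16<<30); ok {
		fmt.Println("allocated on", bt.name) // prints "allocated on CPU"
	}
}
```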
@@ -96,7 +96,7 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 	var sum uint64
 	var cumsum []uint64

-	var gpuBufferTypes []dbt
+	var gpuDeviceBufferTypes []deviceBufferType
 	for _, d := range gpus {
 		var free, total C.size_t
 		C.ggml_backend_dev_memory(d, &free, &total)
@@ -104,7 +104,7 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 		cumsum = append(cumsum, sum)

 		bt := C.ggml_backend_dev_buffer_type(d)
-		gpuBufferTypes = append(gpuBufferTypes, dbt{
+		gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
 			d:   d,
 			bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuBufferTypes...),
 		})
@@ -115,7 +115,8 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 		splits[i] = float64(cumsum[i]) / float64(sum)
 	}

-	input := dbt{C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU), cpuBufferTypes}
+	cpuDeviceBufferTypes := deviceBufferType{C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU), cpuBufferTypes}
+	input := cpuDeviceBufferTypes

 	var blocks int
 	for key, value := range meta.KV() {
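The splits are cumulative per-GPU free memory normalized by the total, so each GPU owns a contiguous span of [0, 1) proportional to its free memory, and a layer's normalized index i/(blocks+1) is matched against those boundaries. A worked sketch with assumed free-memory figures (8 GiB and 24 GiB), showing how slices.IndexFunc picks the device:

```go
package main

import (
	"fmt"
	"slices"
)

func main() {
	// Assumed free memory per GPU, standing in for the values the
	// diff reads with C.ggml_backend_dev_memory.
	free := []uint64{8 << 30, 24 << 30} // 8 GiB, 24 GiB

	// Cumulative sum, as in the diff's loop over gpus.
	var sum uint64
	var cumsum []uint64
	for _, f := range free {
		sum += f
		cumsum = append(cumsum, sum)
	}

	// splits[i] = fraction of total free memory covered by GPUs 0..i,
	// here [0.25, 1.0].
	splits := make([]float64, len(cumsum))
	for i := range splits {
		splits[i] = float64(cumsum[i]) / float64(sum)
	}

	// A layer's normalized position selects the first GPU whose split
	// boundary lies past it, exactly the diff's IndexFunc predicate.
	blocks := 32
	for _, i := range []int{0, 8, 9, 31} {
		gpu := slices.IndexFunc(splits, func(f float64) bool {
			return float64(i)/float64(blocks+1) < f
		})
		fmt.Printf("layer %2d -> gpu %d\n", i, gpu)
	}
}
```

With these numbers, layers 0 through 8 land on gpu 0 and layers 9 through 31 on gpu 1, roughly tracking the 1:3 ratio of free memory.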
@@ -124,18 +125,22 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
 		}
 	}

-	indexFunc := func(i int) func(float64) bool {
-		return func(f float64) bool {
+	assignLayer := func(i int) (temp deviceBufferType) {
+		if i >= params.NumGPULayers {
+			return cpuDeviceBufferTypes
+		}
+
+		return gpuDeviceBufferTypes[slices.IndexFunc(splits, func(f float64) bool {
 			return float64(i)/float64(blocks+1) < f
-		}
+		})]
 	}

-	layers := make([]dbt, blocks)
+	layers := make([]deviceBufferType, blocks)
 	for i := range layers {
-		layers[i] = gpuBufferTypes[slices.IndexFunc(splits, indexFunc(i))]
+		layers[i] = assignLayer(i)
 	}

-	output := gpuBufferTypes[slices.IndexFunc(splits, indexFunc(blocks))]
+	output := assignLayer(blocks)

 	maxTensors := len(meta.Tensors().Items())
 	maxTensors += 1
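The substantive change is assignLayer: the old indexFunc always placed a layer on some GPU by memory split, while the new closure first honors the user-specified params.NumGPULayers and routes any layer at or beyond that count to the CPU buffer types; the output layer gets the same treatment via assignLayer(blocks). A self-contained sketch of the control flow (device labels and counts are assumed for illustration):

```go
package main

import (
	"fmt"
	"slices"
)

func main() {
	const blocks = 4  // repeating layers; assumed count for illustration
	numGPULayers := 3 // user-specified cap, like params.NumGPULayers

	splits := []float64{0.5, 1.0}        // two GPUs with equal free memory
	gpuNames := []string{"gpu0", "gpu1"} // assumed device labels

	// assignLayer mirrors the diff: layers at or past the user cap go
	// to the CPU; the rest are placed by the memory-split search.
	assignLayer := func(i int) string {
		if i >= numGPULayers {
			return "cpu"
		}
		return gpuNames[slices.IndexFunc(splits, func(f float64) bool {
			return float64(i)/float64(blocks+1) < f
		})]
	}

	for i := 0; i < blocks; i++ {
		fmt.Printf("layer %d -> %s\n", i, assignLayer(i))
	}
	// The output layer uses the same rule with i = blocks, so it is
	// also offloaded to the CPU whenever NumGPULayers <= blocks.
	fmt.Printf("output  -> %s\n", assignLayer(blocks))
}
```

With blocks = 4 and a cap of 3, layers 0 through 2 go to gpu0 while layer 3 and the output both fall back to the CPU.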