ml/backend/ggml: use backend buffer type
This ensures tensors are created on the right buffer type for backends such as the CPU.
parent 45df786f09
commit b27e8f3f10
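
The gist of the change, before the hunks: tensor placement used to be expressed as a backend handle (*C.struct_ggml_backend) and buffers were requested with ggml_backend_alloc_buffer; it is now expressed as a buffer type (*C.struct_ggml_backend_buffer_type) and buffers come from ggml_backend_buft_alloc_buffer. A minimal sketch of the new allocation path, assuming the same cgo preamble as this file; allocTensor is a hypothetical helper distilled from the newTensor change in the last hunk, not part of the patch:

    // allocTensor places tensor t on the given buffer type: size the buffer
    // from the tensor's byte count, then bind the tensor to the buffer's base.
    func allocTensor(buft *C.struct_ggml_backend_buffer_type, t *C.struct_ggml_tensor) {
        b := C.ggml_backend_buft_alloc_buffer(buft, C.ggml_nbytes(t))
        C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
    }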
@@ -44,13 +44,13 @@ type Backend struct {
     tensors map[string]*C.struct_ggml_tensor

     // input is the backend used for inputs
-    input *C.struct_ggml_backend
+    input *C.struct_ggml_backend_buffer_type

     // output is the backend used for outputs
-    output *C.struct_ggml_backend
+    output *C.struct_ggml_backend_buffer_type

     // layers is the backend used for repeating layers
-    layers map[int]*C.struct_ggml_backend
+    layers map[int]*C.struct_ggml_backend_buffer_type

     flashAttention bool
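
Worth noting why the field types change: a backend is an execution handle, while a buffer type only describes where memory lives, so storing buffer types decouples tensor placement from any particular backend instance. A hedged sketch for inspecting a placement, assuming the same cgo preamble plus fmt; buftInfo is hypothetical, and ggml_backend_buft_get_alignment is the upstream accessor as I understand ggml-backend.h:

    // buftInfo reports which buffer type a placement resolves to; the name
    // accessor is the same one the compute-graph log line below uses.
    func buftInfo(buft *C.struct_ggml_backend_buffer_type) string {
        return fmt.Sprintf("%s (alignment %d)",
            C.GoString(C.ggml_backend_buft_name(buft)),
            int(C.ggml_backend_buft_get_alignment(buft)))
    }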
@@ -83,7 +83,10 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
     for _, d := range devices() {
         switch C.ggml_backend_dev_type(d) {
         case C.GGML_BACKEND_DEVICE_TYPE_CPU:
-            cpus = append(cpus, d)
+            if len(cpus) == 0 {
+                // only the first cpu device should be used
+                cpus = append(cpus, d)
+            }
         case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
             accels = append(accels, d)
         case C.GGML_BACKEND_DEVICE_TYPE_GPU:
@@ -324,25 +327,25 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
         return nil, err
     }

-    // map devices to backends so tensors created post initialization can be assigned to the correct device
-    deviceBackends := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend)
+    // map devices to backend buffer types so new tensors can be assigned to the correct device
+    deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)

     // create backends and buffer types used for the compute graph scheduler
     var schedBackends []*C.struct_ggml_backend
     var schedBufts []*C.struct_ggml_backend_buffer_type
     for _, d := range append(gpus, append(accels, cpus...)...) {
         b := C.ggml_backend_dev_init(d, nil)
-        schedBackends = append(schedBackends, b)
-        deviceBackends[d] = b
-
         bt := C.ggml_backend_get_default_buffer_type(b)
-        // use the first gpu host buffer type for gpu if possible
         if d := C.ggml_backend_get_device(b); C.ggml_backend_dev_type(d) == C.GGML_BACKEND_DEVICE_TYPE_CPU && len(gpus) > 0 {
-            if hbt := C.ggml_backend_dev_host_buffer_type(d); hbt != nil {
+            // use the first gpu host buffer type for gpu if possible
+            if hbt := C.ggml_backend_dev_host_buffer_type(gpus[0]); hbt != nil {
                 bt = hbt
             }
         }
+
+        deviceBufferTypes[d] = bt
+
+        schedBackends = append(schedBackends, b)
         schedBufts = append(schedBufts, bt)

         slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(b)), "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
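
Two fixes land in this hunk: the device map now stores buffer types rather than backends, and the host-buffer lookup queries the first GPU device instead of the shadowed CPU device d, since it is the GPU backend that provides a host (pinned) buffer type for faster host-to-device transfers. The selection logic, factored into a hypothetical helper under the same cgo assumptions:

    // pickBufferType mirrors the loop body above: default to the backend's own
    // buffer type, but let a CPU backend borrow the first GPU's host (pinned)
    // buffer type when one exists.
    func pickBufferType(b *C.struct_ggml_backend, gpus []*C.struct_ggml_backend_device) *C.struct_ggml_backend_buffer_type {
        bt := C.ggml_backend_get_default_buffer_type(b)
        d := C.ggml_backend_get_device(b)
        if C.ggml_backend_dev_type(d) == C.GGML_BACKEND_DEVICE_TYPE_CPU && len(gpus) > 0 {
            if hbt := C.ggml_backend_dev_host_buffer_type(gpus[0]); hbt != nil {
                bt = hbt
            }
        }
        return bt
    }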
@@ -365,12 +368,12 @@ func New(r *os.File, params ml.BackendParams) (ml.Backend, error) {
             C.size_t(maxGraphNodes),
             true,
         ),
-        input:  deviceBackends[input.d],
-        output: deviceBackends[output.d],
-        layers: func() map[int]*C.struct_ggml_backend {
-            m := make(map[int]*C.struct_ggml_backend)
+        input:  deviceBufferTypes[input.d],
+        output: deviceBufferTypes[output.d],
+        layers: func() map[int]*C.struct_ggml_backend_buffer_type {
+            m := make(map[int]*C.struct_ggml_backend_buffer_type)
             for i, layer := range layers {
-                m[i] = deviceBackends[layer.d]
+                m[i] = deviceBufferTypes[layer.d]
             }
             return m
         }(),
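
The layers map is built with an immediately-invoked function literal so the whole Backend value stays a single composite literal. A self-contained Go illustration of the idiom, with a plain string standing in for the cgo buffer-type pointer:

    package main

    import "fmt"

    type placement struct {
        layers map[int]string
    }

    func main() {
        layerDevices := []string{"gpu0", "gpu0", "cpu"}
        p := placement{
            // build the map inline so the struct literal needs no follow-up
            // assignment statements
            layers: func() map[int]string {
                m := make(map[int]string)
                for i, d := range layerDevices {
                    m[i] = d
                }
                return m
            }(),
        }
        fmt.Println(p.layers) // map[0:gpu0 1:gpu0 2:cpu]
    }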
@@ -401,13 +404,12 @@ func (b *Backend) NewContext() ml.Context {
 func (b *Backend) NewContextSize(n int) ml.Context {
     n = min(n, b.maxGraphNodes)
     return &Context{
-        b: b,
+        b:             b,
+        maxGraphNodes: n,
         ctx: C.ggml_init(C.struct_ggml_init_params{
             mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
             no_alloc: true,
         }),
-        backend:       C.ggml_backend_sched_get_backend(b.sched, 0),
-        maxGraphNodes: n,
     }
 }
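
With no_alloc set, ggml_init reserves space for metadata only, so mem_size is the worst case of n tensor headers plus the bookkeeping for an n-node graph (the false argument means no gradient storage). The sizing rule, restated as a hypothetical helper under the same cgo assumptions:

    // metaContextSize returns the bytes needed by a metadata-only context
    // holding at most n tensors and an n-node graph without gradients.
    func metaContextSize(n int) C.size_t {
        return C.size_t(n)*C.ggml_tensor_overhead() +
            C.ggml_graph_overhead_custom(C.size_t(n), false)
    }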
@@ -425,8 +427,8 @@ type Context struct {
     ctx   *C.struct_ggml_context
     graph *C.struct_ggml_cgraph

-    // backend is the backend used for new tensors
-    backend *C.struct_ggml_backend
+    // buft is the buffer type used for new tensors
+    buft *C.struct_ggml_backend_buffer_type

     // maxGraphNodes is the maximum allowed number of graph nodes in this context
     maxGraphNodes int
@@ -437,7 +439,7 @@ func (c Context) Input() ml.Context {
     return &Context{
         b:             c.b,
         ctx:           c.ctx,
-        backend:       c.b.input,
+        buft:          c.b.input,
         maxGraphNodes: c.maxGraphNodes,
     }
 }
@@ -450,7 +452,7 @@ func (c Context) Output() ml.Context {
     return &Context{
         b:             c.b,
         ctx:           c.ctx,
-        backend:       c.b.output,
+        buft:          c.b.output,
         maxGraphNodes: c.maxGraphNodes,
     }
 }
@@ -459,11 +461,11 @@ func (c Context) Output() ml.Context {
 }

 func (c Context) Layer(i int) ml.Context {
-    if backend, ok := c.b.layers[i]; ok {
+    if buft, ok := c.b.layers[i]; ok {
         return &Context{
             b:             c.b,
             ctx:           c.ctx,
-            backend:       backend,
+            buft:          buft,
             maxGraphNodes: c.maxGraphNodes,
         }
     }
@@ -516,6 +518,10 @@ func shapeToGGML(shape []int) *C.int64_t {
 }

 func (c Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
+    if c.buft == nil {
+        panic("set Input, Output, or Layer before creating tensors")
+    }
+
     var cdtype uint32
     switch dtype {
     case ml.DTypeF32:
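
Since a fresh context no longer carries a default placement (the old code fell back to scheduler backend 0), callers must go through Input, Output, or Layer before creating tensors; the guard turns a silently misplaced tensor into an immediate panic. A hypothetical caller, assuming ml.Context's tensor constructors such as Zeros bottom out in newTensor:

    // makeCache allocates on layer i's buffer type; calling c.Zeros on the
    // bare context would now hit the panic above instead of landing on an
    // arbitrary backend.
    func makeCache(c ml.Context, i int) ml.Tensor {
        return c.Layer(i).Zeros(ml.DTypeF32, 4096)
    }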
@@ -542,7 +548,7 @@ func (c Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
     }

     t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
-    b := C.ggml_backend_alloc_buffer(c.backend, C.ggml_nbytes(t))
+    b := C.ggml_backend_buft_alloc_buffer(c.buft, C.ggml_nbytes(t))
     C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
     return &Tensor{b: c.b, t: t}
 }