backend: Support graph computation that does not return an output

There are two cases where we may not have an output after computing:
 - Prompt processing where the length of the input exceeds the batch
   size
 - Internal memory management operations such as cache defrag and shift
This commit is contained in:
Jesse Gross 2025-02-03 19:35:12 -08:00 committed by Jesse Gross
parent 0e38297f87
commit 4d4463b2bd
3 changed files with 22 additions and 14 deletions

View File

@@ -49,7 +49,7 @@ type Context interface {
FromIntSlice(s []int32, shape ...int) (Tensor, error)
Forward(Tensor)
Compute(Tensor) Tensor
Compute(...Tensor)
Close()
}

View File

@@ -23,7 +23,7 @@ import (
"github.com/ollama/ollama/ml"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/ml/backend/ggml/ggml/src"
ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
)
type device struct {
@@ -243,15 +243,17 @@ func (c *Context) Forward(t ml.Tensor) {
C.ggml_build_forward_expand(c.graph, t.(*Tensor).t)
}
func (c *Context) Compute(t ml.Tensor) ml.Tensor {
c.Forward(t)
func (c *Context) Compute(tensors ...ml.Tensor) {
C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)
backend := C.ggml_backend_sched_get_tensor_backend(c.sched, t.(*Tensor).t)
for _, t := range tensors {
if C.ggml_nbytes(t.(*Tensor).t) != 0 {
backend := C.ggml_backend_sched_get_tensor_backend(c.sched, t.(*Tensor).t)
t.(*Tensor).data = make([]byte, C.ggml_nbytes(t.(*Tensor).t))
C.ggml_backend_tensor_get_async(backend, t.(*Tensor).t, unsafe.Pointer(&t.(*Tensor).data[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
return t
t.(*Tensor).data = make([]byte, C.ggml_nbytes(t.(*Tensor).t))
C.ggml_backend_tensor_get_async(backend, t.(*Tensor).t, unsafe.Pointer(&t.(*Tensor).data[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
}
}
}
func shapeToGGML(shape []int) *C.int64_t {
@@ -292,6 +294,13 @@ func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype uint32) (ml.Tensor, error) {
n := len(s)
if n == 0 {
var shape C.int64_t = 0
t := C.ggml_new_tensor(ctx.ctx, dtype, 1, &shape)
return &Tensor{t: t}, nil
}
for _, v := range shape {
n /= v
}
@@ -351,11 +360,7 @@ func (t *Tensor) Shape() []int {
}
func (t *Tensor) Bytes() []byte {
if bts := C.ggml_get_data(t.t); bts != nil {
return C.GoBytes(bts, C.int(C.ggml_nbytes(t.t)))
}
return nil
return t.data
}
func (t *Tensor) Floats() (f32s []float32) {

View File

@@ -275,5 +275,8 @@ func Forward(m Model, optsFuncs ...OptionsFunc) (ml.Tensor, error) {
}
defer ctx.Close()
return ctx.Compute(t), nil
ctx.Forward(t)
ctx.Compute(t)
return t, nil
}