From 01aa7887221e7bd286ebcb14a088c94ba1c22a99 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Thu, 27 Mar 2025 11:52:09 -0700
Subject: [PATCH] ml: Remove Output from Context interface

Model implementations should use Input for all of their tensors
supplied to the model. This includes tensors that relate to the
outputs, which is confusing since there is also an Output function.

Since Output is only used internally in GGML and not used by any
model implementations, we can remove it from the interface to
reduce confusion.
---
 kvcache/causal_test.go  |  1 -
 ml/backend.go           |  6 ++----
 ml/backend/ggml/ggml.go | 19 +------------------
 3 files changed, 3 insertions(+), 23 deletions(-)

diff --git a/kvcache/causal_test.go b/kvcache/causal_test.go
index 617f53635..b1dc7d779 100644
--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@@ -362,7 +362,6 @@ func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
 }
 
 func (c *testContext) Input() ml.Context    { return c }
-func (c *testContext) Output() ml.Context   { return c }
 func (c *testContext) Layer(int) ml.Context { return c }
 
 func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }
diff --git a/ml/backend.go b/ml/backend.go
index 354faf432..cfb18d6a9 100644
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -110,12 +110,10 @@ type Context interface {
 	MaxGraphNodes() int
 	Close()
 
-	// Input returns a context appropriate for creating input tensors
+	// Input returns a context appropriate for creating tensors that are
+	// inputs to the model (which includes things like output locations)
 	Input() Context
 
-	// Output returns a context appropriate for creating output tensors
-	Output() Context
-
 	// Layer returns a context appropriate for creating intermediate tensors
 	Layer(int) Context
 }
diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go
index f6b017748..b6f59ae0e 100644
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -48,9 +48,6 @@ type Backend struct {
 	// input is the backend used for inputs
 	input *C.struct_ggml_backend_buffer_type
 
-	// output is the backend used for outputs
-	output *C.struct_ggml_backend_buffer_type
-
 	// layers is the backend used for repeating layers
 	layers map[int]*C.struct_ggml_backend_buffer_type
 
@@ -400,8 +397,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 			C.size_t(maxGraphNodes),
 			C._Bool(len(gpus) > 1 && slices.Contains(gpus, output.d)),
 		),
-		input:  deviceBufferTypes[input.d],
-		output: deviceBufferTypes[output.d],
+		input: deviceBufferTypes[input.d],
 		layers: func() map[int]*C.struct_ggml_backend_buffer_type {
 			m := make(map[int]*C.struct_ggml_backend_buffer_type)
 			for i, layer := range layers {
@@ -482,19 +478,6 @@ func (c Context) Input() ml.Context {
 	return &c
 }
 
-func (c Context) Output() ml.Context {
-	if c.b.output != nil {
-		return &Context{
-			b:             c.b,
-			ctx:           c.ctx,
-			buft:          c.b.output,
-			maxGraphNodes: c.maxGraphNodes,
-		}
-	}
-
-	return &c
-}
-
 func (c Context) Layer(i int) ml.Context {
 	if buft, ok := c.b.layers[i]; ok {
 		return &Context{