perf: build graph for next batch async to keep GPU busy (#11863)

* perf: build graph for next batch in parallel to keep GPU busy

  This refactors the main run loop of the ollama runner to perform the main GPU-intensive tasks (Compute+Floats) in a goroutine, so we can prepare the next batch in parallel and reduce the amount of time the GPU stalls waiting for the next batch of work.

* tests: tune integration tests for ollama engine

  This tunes the integration tests to focus more on models supported by the new engine.
2025-11-10 20:17:59 +01:00 · 2025-08-29 14:20:28 -07:00
parent ead4a9a1d0
commit 517807cdf2
20 changed files with 591 additions and 235 deletions
--- a/integration/llm_image_test.go
+++ b/integration/llm_image_test.go
@@ -9,7 +9,6 @@ import (
 	"time"

 	"github.com/ollama/ollama/api"
-	"github.com/stretchr/testify/require"
 )

 func TestVisionModels(t *testing.T) {
@@ -32,7 +31,9 @@ func TestVisionModels(t *testing.T) {
 	for _, v := range testCases {
 		t.Run(v.model, func(t *testing.T) {
 			image, err := base64.StdEncoding.DecodeString(imageEncoding)
-			require.NoError(t, err)
+			if err != nil {
+				t.Fatal(err)
+			}
 			req := api.GenerateRequest{
 				Model:  v.model,
 				Prompt: "what does the text in this image say?",
@@ -52,7 +53,9 @@ func TestVisionModels(t *testing.T) {
 			// Note: sometimes it returns "the ollamas" sometimes "the ollams"
 			resp := "the ollam"
 			defer cleanup()
-			require.NoError(t, PullIfMissing(ctx, client, req.Model))
+			if err := PullIfMissing(ctx, client, req.Model); err != nil {
+				t.Fatal(err)
+			}
 			// llava models on CPU can be quite slow to start
 			DoGenerate(ctx, t, client, req, []string{resp}, 240*time.Second, 30*time.Second)
 		})
@@ -62,7 +65,9 @@ func TestVisionModels(t *testing.T) {
 func TestIntegrationSplitBatch(t *testing.T) {
 	skipUnderMinVRAM(t, 6)
 	image, err := base64.StdEncoding.DecodeString(imageEncoding)
-	require.NoError(t, err)
+	if err != nil {
+		t.Fatal(err)
+	}
 	req := api.GenerateRequest{
 		Model: "gemma3:4b",
 		// Fill up a chunk of the batch so the image will partially spill over into the next one
@@ -84,7 +89,9 @@ func TestIntegrationSplitBatch(t *testing.T) {
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
-	require.NoError(t, PullIfMissing(ctx, client, req.Model))
+	if err := PullIfMissing(ctx, client, req.Model); err != nil {
+		t.Fatal(err)
+	}
 	// llava models on CPU can be quite slow to start,
 	DoGenerate(ctx, t, client, req, []string{resp}, 120*time.Second, 30*time.Second)
 }