perf: build graph for next batch async to keep GPU busy (#11863)

* perf: build graph for next batch in parallel to keep GPU busy

This refactors the main run loop of the ollama runner to perform the GPU-intensive
work (Compute+Floats) in a goroutine, so the next batch can be prepared in
parallel and the GPU spends less time stalled waiting for its next batch of work.
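
A minimal sketch of the pipelining idea (the helpers `prepareBatch` and
`computeBatch` are illustrative stand-ins, not the runner's actual API):
launch the GPU-heavy pass for the current batch in a goroutine, then build
the next batch while it runs.

```go
package main

import (
	"fmt"
	"time"
)

// prepareBatch stands in for the CPU-side work of building the next
// batch and its compute graph (hypothetical helper, not ollama's API).
func prepareBatch(id int) string {
	time.Sleep(10 * time.Millisecond)
	return fmt.Sprintf("batch-%d", id)
}

// computeBatch stands in for the GPU-intensive Compute+Floats step
// (hypothetical helper, not ollama's API).
func computeBatch(b string) {
	time.Sleep(30 * time.Millisecond)
	fmt.Println("computed", b)
}

func main() {
	done := make(chan struct{})
	close(done) // nothing is in flight before the first batch

	for i := 0; i < 4; i++ {
		b := prepareBatch(i) // overlaps with the previous batch's compute
		<-done               // wait for the previous GPU pass to finish
		done = make(chan struct{})
		go func(b string) {
			defer close(done)
			computeBatch(b)
		}(b)
	}
	<-done // drain the final in-flight batch
}
```

With serial execution each iteration costs prepare + compute; with this
overlap the steady-state cost is roughly max(prepare, compute), which is
where the reduction in GPU stalls comes from.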

* tests: tune integration tests for ollama engine

This tunes the integration tests to focus on models that are supported
by the new engine.
Author:    Daniel Hiltgen
Date:      2025-08-29 14:20:28 -07:00 (committed by GitHub)
Parent:    ead4a9a1d0
Commit:    517807cdf2

20 changed files with 591 additions and 235 deletions


@@ -14,8 +14,6 @@ import (
 	"testing"
 	"time"
 
-	"github.com/stretchr/testify/require"
-
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
@@ -79,21 +77,21 @@ func TestMultiModelStress(t *testing.T) {
 		t.Fatal(err)
 	}
 
+	// All models compatible with ollama-engine
 	smallModels := []string{
 		"llama3.2:1b",
 		"qwen3:0.6b",
-		"gemma:2b",
-		"deepseek-r1:1.5b",
-		"starcoder2:3b",
+		"gemma2:2b",
+		"deepseek-r1:1.5b", // qwen2 arch
+		"gemma3:270m",
 	}
 	mediumModels := []string{
-		"qwen3:8b",
-		"llama2",
-		"deepseek-r1:7b",
-		"mistral",
-		"dolphin-mistral",
-		"gemma:7b",
-		"codellama:7b",
+		"llama3.2:3b",    // ~3.4G
+		"qwen3:8b",       // ~6.6G
+		"gpt-oss:20b",    // ~15G
+		"deepseek-r1:7b", // ~5.6G
+		"gemma3:4b",      // ~5.8G
+		"gemma2:9b",      // ~8.1G
 	}
 	var chosenModels []string
@@ -114,7 +112,9 @@ func TestMultiModelStress(t *testing.T) {
 
 	// Make sure all the models are pulled before we get started
 	for _, model := range chosenModels {
-		require.NoError(t, PullIfMissing(ctx, client, model))
+		if err := PullIfMissing(ctx, client, model); err != nil {
+			t.Fatal(err)
+		}
 	}
 
 	// Determine how many models we can load in parallel before we exceed VRAM