perf: build graph for next batch async to keep GPU busy (#11863)

* perf: build graph for next batch in parallel to keep GPU busy

This refactors the main run loop of the ollama runner to perform the main
GPU-intensive tasks (Compute+Floats) in a goroutine so we can prepare the next
batch in parallel, reducing the amount of time the GPU stalls waiting for the
next batch of work.

* tests: tune integration tests for ollama engine

This tunes the integration tests to focus more on models supported
by the new engine.
This commit is contained in:
Daniel Hiltgen
2025-08-29 14:20:28 -07:00
committed by GitHub
parent ead4a9a1d0
commit 517807cdf2
20 changed files with 591 additions and 235 deletions

View File

@@ -11,7 +11,6 @@ import (
"time"
"github.com/ollama/ollama/api"
"github.com/stretchr/testify/require"
)
func TestBlueSky(t *testing.T) {
@@ -37,8 +36,8 @@ func TestUnicode(t *testing.T) {
// Set up the test data
req := api.GenerateRequest{
// DeepSeek has a Unicode tokenizer regex, making it a unicode torture test
Model: "deepseek-coder-v2:16b-lite-instruct-q2_K",
Prompt: "天空为什么是蓝色的?",
Model: "deepseek-coder-v2:16b-lite-instruct-q2_K", // TODO is there an ollama-engine model we can switch to and keep the coverage?
Prompt: "天空为什么是蓝色的?", // Why is the sky blue?
Stream: &stream,
Options: map[string]any{
"temperature": 0,
@@ -50,8 +49,20 @@ func TestUnicode(t *testing.T) {
}
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
require.NoError(t, PullIfMissing(ctx, client, req.Model))
DoGenerate(ctx, t, client, req, []string{"散射", "频率"}, 120*time.Second, 120*time.Second)
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatal(err)
}
slog.Info("loading", "model", req.Model)
err := client.Generate(ctx, &api.GenerateRequest{Model: req.Model}, func(response api.GenerateResponse) error { return nil })
if err != nil {
t.Fatalf("failed to load model %s: %s", req.Model, err)
}
skipIfNotGPULoaded(ctx, t, client, req.Model, 100)
DoGenerate(ctx, t, client, req, []string{
"散射", // scattering
"频率", // frequency
}, 120*time.Second, 120*time.Second)
}
func TestExtendedUnicodeOutput(t *testing.T) {
@@ -69,7 +80,9 @@ func TestExtendedUnicodeOutput(t *testing.T) {
}
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
require.NoError(t, PullIfMissing(ctx, client, req.Model))
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatal(err)
}
DoGenerate(ctx, t, client, req, []string{"😀", "😊", "😁", "😂", "😄", "😃"}, 120*time.Second, 120*time.Second)
}
@@ -84,7 +97,9 @@ func TestUnicodeModelDir(t *testing.T) {
}
modelDir, err := os.MkdirTemp("", "ollama_埃")
require.NoError(t, err)
if err != nil {
t.Fatal(err)
}
defer os.RemoveAll(modelDir)
slog.Info("unicode", "OLLAMA_MODELS", modelDir)