mirror of
https://github.com/ollama/ollama.git
synced 2025-12-09 18:02:43 +01:00
perf: build graph for next batch async to keep GPU busy (#11863)
* perf: build graph for next batch in parallel to keep GPU busy. This refactors the main run loop of the ollama runner to perform the main GPU-intensive tasks (Compute+Floats) in a goroutine so we can prepare the next batch in parallel, reducing the amount of time the GPU stalls waiting for the next batch of work. * tests: tune integration tests for ollama engine. This tunes the integration tests to focus more on models supported by the new engine.
This commit is contained in:
@@ -14,8 +14,6 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/format"
|
||||
@@ -79,21 +77,21 @@ func TestMultiModelStress(t *testing.T) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// All models compatible with ollama-engine
|
||||
smallModels := []string{
|
||||
"llama3.2:1b",
|
||||
"qwen3:0.6b",
|
||||
"gemma:2b",
|
||||
"deepseek-r1:1.5b",
|
||||
"starcoder2:3b",
|
||||
"gemma2:2b",
|
||||
"deepseek-r1:1.5b", // qwen2 arch
|
||||
"gemma3:270m",
|
||||
}
|
||||
mediumModels := []string{
|
||||
"qwen3:8b",
|
||||
"llama2",
|
||||
"deepseek-r1:7b",
|
||||
"mistral",
|
||||
"dolphin-mistral",
|
||||
"gemma:7b",
|
||||
"codellama:7b",
|
||||
"llama3.2:3b", // ~3.4G
|
||||
"qwen3:8b", // ~6.6G
|
||||
"gpt-oss:20b", // ~15G
|
||||
"deepseek-r1:7b", // ~5.6G
|
||||
"gemma3:4b", // ~5.8G
|
||||
"gemma2:9b", // ~8.1G
|
||||
}
|
||||
|
||||
var chosenModels []string
|
||||
@@ -114,7 +112,9 @@ func TestMultiModelStress(t *testing.T) {
|
||||
|
||||
// Make sure all the models are pulled before we get started
|
||||
for _, model := range chosenModels {
|
||||
require.NoError(t, PullIfMissing(ctx, client, model))
|
||||
if err := PullIfMissing(ctx, client, model); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
// Determine how many models we can load in parallel before we exceed VRAM
|
||||
|
||||
Reference in New Issue
Block a user