tests: reduce stress on CPU to 2 models (#12161)

* tests: reduce stress on CPU to 2 models

This should avoid flakes due to systems getting overloaded with 3 (or more) models running concurrently

* tests: allow slow systems to pass on timeout

If a slow system is still streaming a response, and the response
will pass validation, don't fail just because the system is slow.

* test: unload embedding models more quickly
This commit is contained in:
Daniel Hiltgen
2025-09-09 09:32:15 -07:00
committed by GitHub
parent f810ec741c
commit 6745182885
3 changed files with 55 additions and 30 deletions

View File

@@ -121,6 +121,7 @@ func TestMultiModelStress(t *testing.T) {
// The intent is to go 1 over what can fit so we force the scheduler to thrash
targetLoadCount := 0
slog.Info("Loading models to find how many can fit in VRAM before overflowing")
chooseModels:
for i, model := range chosenModels {
req := &api.GenerateRequest{Model: model}
slog.Info("loading", "model", model)
@@ -142,6 +143,13 @@ func TestMultiModelStress(t *testing.T) {
slog.Info("found model load capacity", "target", targetLoadCount, "current", loaded, "chosen", chosenModels[:targetLoadCount])
break
}
// Effectively limit model count to 2 on CPU only systems to avoid thrashing and timeouts
for _, m := range models.Models {
if m.SizeVRAM == 0 {
slog.Info("model running on CPU", "name", m.Name, "target", targetLoadCount, "chosen", chosenModels[:targetLoadCount])
break chooseModels
}
}
}
}
if targetLoadCount == len(chosenModels) {