test: harden scheduler tests (#12662)

* test: harden scheduler tests

This removes reschedDelay which was stale code, and adds
a new configurable timeout for the waitForVRAMRecovery so
tests can now set the timeout to be very short to avoid the
scheduler getting stuck and hitting a test timeout.

* test: tune tests for partial loads

Give stress tests more time when the model is split between CPU/GPU
This commit is contained in:
Daniel Hiltgen
2025-10-17 08:56:44 -07:00
committed by GitHub
parent 270679932f
commit 68e04c7ff8
10 changed files with 195 additions and 143 deletions

View File

@@ -109,6 +109,8 @@ func TestMultiModelStress(t *testing.T) {
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
initialTimeout := 120 * time.Second
streamTimeout := 20 * time.Second
// Make sure all the models are pulled before we get started
for _, model := range chosenModels {
@@ -147,6 +149,8 @@ chooseModels:
for _, m := range models.Models {
if m.SizeVRAM == 0 {
slog.Info("model running on CPU", "name", m.Name, "target", targetLoadCount, "chosen", chosenModels[:targetLoadCount])
initialTimeout = 240 * time.Second
streamTimeout = 30 * time.Second
break chooseModels
}
}
@@ -172,10 +176,7 @@ chooseModels:
k := r.Int() % len(reqs)
reqs[k].Model = chosenModels[i]
slog.Info("Starting", "model", reqs[k].Model, "iteration", j, "request", reqs[k].Messages[0].Content)
DoChat(ctx, t, client, reqs[k], resps[k],
120*time.Second, // Be extra patient for the model to load initially
10*time.Second, // Once results start streaming, fail if they stall
)
DoChat(ctx, t, client, reqs[k], resps[k], initialTimeout, streamTimeout)
}
}(i)
}