test: harden scheduler tests (#12662)

* test: harden scheduler tests. This removes reschedDelay, which was stale code, and adds a new configurable timeout for waitForVRAMRecovery so that tests can set the timeout to be very short, which avoids the scheduler getting stuck and hitting a test timeout. * test: tune tests for partial loads. Give stress tests more time when the model is split between CPU and GPU.
2025-11-11 05:07:57 +01:00 · 2025-10-17 08:56:44 -07:00
parent 270679932f
commit 68e04c7ff8
10 changed files with 195 additions and 143 deletions
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@@ -109,6 +109,8 @@ func TestMultiModelStress(t *testing.T) {
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
+	initialTimeout := 120 * time.Second
+	streamTimeout := 20 * time.Second

 	// Make sure all the models are pulled before we get started
 	for _, model := range chosenModels {
@@ -147,6 +149,8 @@ chooseModels:
 			for _, m := range models.Models {
 				if m.SizeVRAM == 0 {
 					slog.Info("model running on CPU", "name", m.Name, "target", targetLoadCount, "chosen", chosenModels[:targetLoadCount])
+					initialTimeout = 240 * time.Second
+					streamTimeout = 30 * time.Second
 					break chooseModels
 				}
 			}
@@ -172,10 +176,7 @@ chooseModels:
 				k := r.Int() % len(reqs)
 				reqs[k].Model = chosenModels[i]
 				slog.Info("Starting", "model", reqs[k].Model, "iteration", j, "request", reqs[k].Messages[0].Content)
-				DoChat(ctx, t, client, reqs[k], resps[k],
-					120*time.Second, // Be extra patient for the model to load initially
-					10*time.Second,  // Once results start streaming, fail if they stall
-				)
+				DoChat(ctx, t, client, reqs[k], resps[k], initialTimeout, streamTimeout)
 			}
 		}(i)
 	}