sched: logging improvements (#10550)

This enhances our logging in the scheduler. The initial "waiting for server" log no longer claims an initial error state (now "not responding" which better reflects the actual state). Runners now have slog wiring to report more details about the runner, including PID.
2025-12-10 02:42:26 +01:00 · 2025-05-03 12:01:56 -07:00
parent dd1d4e99e7
commit 76ea735aaf
3 changed files with 57 additions and 14 deletions
--- a/llm/server.go
+++ b/llm/server.go
@@ -44,6 +44,7 @@ type LlamaServer interface {
 	EstimatedVRAM() uint64 // Total VRAM across all GPUs
 	EstimatedTotal() uint64
 	EstimatedVRAMByGPU(gpuID string) uint64
+	Pid() int
 }

 // llmServer is an instance of the llama.cpp server
@@ -520,6 +521,9 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
 		if errors.Is(err, context.DeadlineExceeded) {
 			return ServerStatusNotResponding, errors.New("server not responding")
 		}
+		if strings.Contains(err.Error(), "connection refused") {
+			return ServerStatusNotResponding, errors.New("connection refused")
+		}
 		return ServerStatusError, fmt.Errorf("health resp: %w", err)
 	}
 	defer resp.Body.Close()
@@ -640,6 +644,13 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 	}
 }

+func (s *llmServer) Pid() int {
+	if s.cmd != nil && s.cmd.Process != nil {
+		return s.cmd.Process.Pid
+	}
+	return -1
+}
+
 var grammarJSON = `
 root   ::= object
 value  ::= object | array | string | number | ("true" | "false" | "null") ws