sched: logging improvements (#10550)

This enhances our logging in the scheduler. The initial "waiting for server" log
no longer claims an initial error state; it now reports "not responding", which
better reflects the actual state. Runners now have slog wiring to report more
details about the runner, including its PID.
This commit is contained in:
Daniel Hiltgen
2025-05-03 12:01:56 -07:00
committed by GitHub
parent dd1d4e99e7
commit 76ea735aaf
3 changed files with 57 additions and 14 deletions

View File

@@ -44,6 +44,7 @@ type LlamaServer interface {
EstimatedVRAM() uint64 // Total VRAM across all GPUs
EstimatedTotal() uint64
EstimatedVRAMByGPU(gpuID string) uint64
Pid() int
}
// llmServer is an instance of the llama.cpp server
@@ -520,6 +521,9 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
if errors.Is(err, context.DeadlineExceeded) {
return ServerStatusNotResponding, errors.New("server not responding")
}
if strings.Contains(err.Error(), "connection refused") {
return ServerStatusNotResponding, errors.New("connection refused")
}
return ServerStatusError, fmt.Errorf("health resp: %w", err)
}
defer resp.Body.Close()
@@ -640,6 +644,13 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
}
}
// Pid returns the process ID of the process attached to s.cmd.
// It returns -1 when s.cmd is nil or its Process field is nil, so callers
// can distinguish a missing process from a real PID.
func (s *llmServer) Pid() int {
	// Guard first: without both the command and its process there is no PID.
	if s.cmd == nil || s.cmd.Process == nil {
		return -1
	}
	return s.cmd.Process.Pid
}
var grammarJSON = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws