DRY out the runner lifecycle code (#12540)
* DRY out the runner lifecycle code

  Now that discovery uses the runners as well, this unifies the runner spawning code into a single place. It also unifies the GPU discovery types with the newer ml.DeviceInfo.

* win: make incremental builds better

  Place build artifacts in discrete directories so incremental builds don't have to start fresh.

* Adjust sort order to consider iGPUs

* Handle CPU inference OOM scenarios

* Review comments
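At the heart of the type unification, the scheduler's hooks trade discover.GpuInfoList for []ml.DeviceInfo, and the old getCpuFn pseudo-GPU for an ml.SystemInfo describing host memory. Below is a minimal, self-contained Go sketch of the shape of those types, using local stand-ins for illustration only (the real definitions live in the ml package and carry more fields); the memory values are taken from the test fixtures in the diff that follows.

package main

import "fmt"

// Stand-in mirroring ml.DeviceID: a device is identified by its backend
// library plus an ID string.
type DeviceID struct {
	Library string // e.g. "Metal"
	ID      string
}

// Stand-in mirroring ml.DeviceInfo: one entry per discovered GPU.
// []DeviceInfo replaces the old discover.GpuInfoList, which also carried
// a pseudo-GPU entry for the CPU.
type DeviceInfo struct {
	DeviceID
	TotalMemory uint64
	FreeMemory  uint64
}

// Stand-in mirroring ml.SystemInfo: host (CPU) memory is now reported
// separately instead of through a "cpu" pseudo-GPU.
type SystemInfo struct {
	TotalMemory uint64
	FreeMemory  uint64
}

func main() {
	const GiB = 1 << 30
	gpus := []DeviceInfo{{
		DeviceID:    DeviceID{Library: "Metal"},
		TotalMemory: 24 * GiB,
		FreeMemory:  12 * GiB,
	}}
	sys := SystemInfo{TotalMemory: 32 * GiB, FreeMemory: 26 * GiB}
	fmt.Printf("system: %+v\ngpus: %+v\n", sys, gpus)
}

The diff below shows the corresponding scheduler test updates.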
@@ -13,7 +13,6 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/app/lifecycle"
-	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
@@ -50,11 +49,12 @@ func TestSchedLoad(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Second},
 	}
 	// Fail to load model first
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return nil, errors.New("something failed to load model blah")
 	}
-	gpus := discover.GpuInfoList{}
-	s.load(req, f, gpus, false)
+	gpus := []ml.DeviceInfo{}
+	systemInfo := ml.SystemInfo{}
+	s.load(req, f, systemInfo, gpus, false)
 	require.Empty(t, req.successCh)
 	require.Len(t, req.errCh, 1)
 	s.loadedMu.Lock()
@@ -64,11 +64,11 @@ func TestSchedLoad(t *testing.T) {
 	require.Contains(t, err.Error(), "this model may be incompatible")
 
 	server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		server.modelPath = model
 		return server, nil
 	}
-	s.load(req, f, gpus, false)
+	s.load(req, f, systemInfo, gpus, false)
 	select {
 	case err := <-req.errCh:
 		require.NoError(t, err)
@@ -82,7 +82,7 @@ func TestSchedLoad(t *testing.T) {
 
 	req.model.ModelPath = "dummy_model_path"
 	server.waitResp = errors.New("wait failure")
-	s.load(req, f, gpus, false)
+	s.load(req, f, systemInfo, gpus, false)
 	select {
 	case err := <-req.errCh:
 		require.Contains(t, err.Error(), "wait failure")
@@ -106,7 +106,7 @@ type reqBundle struct {
 	f *ggml.GGML
 }
 
-func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 	scenario.srv.modelPath = model
 	return scenario.srv, nil
 }
@@ -152,20 +152,20 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vra
 	return b
 }
 
-func getGpuFn(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
+func getGpuFn(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
 	slog.Info("test getGpuFn called", "runners", runners)
-	g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
+	g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
 	g.TotalMemory = 24 * format.GigaByte
 	g.FreeMemory = 12 * format.GigaByte
-	return []discover.GpuInfo{g}
+	return []ml.DeviceInfo{g}
 }
 
-func getCpuFn() discover.GpuInfo {
-	slog.Info("test getCpuFn called")
-	g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "cpu"}}
-	g.TotalMemory = 32 * format.GigaByte
-	g.FreeMemory = 26 * format.GigaByte
-	return g
+func getSystemInfoFn() ml.SystemInfo {
+	slog.Info("test getSystemInfoFn called")
+	return ml.SystemInfo{
+		TotalMemory: 32 * format.GigaByte,
+		FreeMemory:  26 * format.GigaByte,
+	}
 }
 
 func TestSchedRequestsSameModelSameRequest(t *testing.T) {
@@ -174,7 +174,7 @@ func TestSchedRequestsSameModelSameRequest(t *testing.T) {
 	s := InitScheduler(ctx)
 	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = getGpuFn
-	s.getCpuFn = getCpuFn
+	s.getSystemInfoFn = getSystemInfoFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
 	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0}, nil)
 	b.req.model = a.req.model
@@ -218,7 +218,7 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
 	s := InitScheduler(ctx)
 	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = getGpuFn
-	s.getCpuFn = getCpuFn
+	s.getSystemInfoFn = getSystemInfoFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
 	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond}, nil)
 	tmpModel := *a.req.model
@@ -251,12 +251,12 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
 	a.ctxDone()
 	// Report recovered VRAM usage
 	time.Sleep(1 * time.Millisecond)
-	s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
-		slog.Info("XXX altered getGpuFn called")
-		g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
+	s.getGpuFn = func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
+		slog.Info("altered getGpuFn called")
+		g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
 		g.TotalMemory = 24 * format.GigaByte
 		g.FreeMemory = 24 * format.GigaByte
-		return []discover.GpuInfo{g}
+		return []ml.DeviceInfo{g}
 	}
 	select {
 	case resp := <-b.req.successCh:
@@ -271,26 +271,26 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
 }
 
 func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
-	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
+	slog.Info("TestRequestsMultipleLoadedModels")
+	ctx, done := context.WithTimeout(t.Context(), 1000*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
 	s.waitForRecovery = 10 * time.Millisecond
-	s.getGpuFn = getGpuFn // 1 metal GPU
-	s.getCpuFn = getCpuFn // 1 CPU
+	s.getGpuFn = getGpuFn // 1 Metal GPU
+	s.getSystemInfoFn = getSystemInfoFn
 
 	// Multiple loaded models
-	a := newScenarioRequest(t, ctx, "model-a-1g-gpu", 1*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 1 * format.GigaByte})
+	a := newScenarioRequest(t, ctx, "model-a-1g-gpu", 1*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 1 * format.GigaByte})
 	a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
-	b := newScenarioRequest(t, ctx, "model-b-10g-gpu", 10*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 10 * format.GigaByte})
+	b := newScenarioRequest(t, ctx, "model-b-10g-gpu", 10*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 10 * format.GigaByte})
 	b.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
 	c := newScenarioRequest(t, ctx, "model-c-10g-cpu", 10*format.GigaByte, nil, nil /* No GPU load */)
 	c.req.opts.NumGPU = 0 // CPU load, will be allowed
 	b.req.sessionDuration = &api.Duration{Duration: 10 * time.Millisecond} // longer than b to cause the scheduler to favor unloading b over c
-	d := newScenarioRequest(t, ctx, "model-d-10g-gpu", 13*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 13 * format.GigaByte}) // Needs prior unloaded
+	d := newScenarioRequest(t, ctx, "model-d-10g-gpu", 13*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 13 * format.GigaByte}) // Needs prior unloaded
 
 	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
 	s.newServerFn = a.newServer
-	slog.Info("a")
+	slog.Info("Loading A")
 	s.pendingReqCh <- a.req
 	s.Run(ctx)
 	select {
@@ -309,7 +309,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
 
 	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
 	s.newServerFn = b.newServer
-	slog.Info("b")
+	slog.Info("Loading B")
 	s.pendingReqCh <- b.req
 	select {
 	case resp := <-b.req.successCh:
@@ -327,7 +327,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
 
 	// This is a CPU load with NumGPU = 0 so it should load
 	s.newServerFn = c.newServer
-	slog.Info("c")
+	slog.Info("Loading C")
 	s.pendingReqCh <- c.req
 	select {
 	case resp := <-c.req.successCh:
@@ -337,6 +337,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
 	case err := <-c.req.errCh:
 		t.Fatal(err.Error())
 	case <-ctx.Done():
+		slog.Info("FAIL: scheduler state", "s.loaded", s.loaded)
 		t.Fatal("timeout")
 	}
 	s.loadedMu.Lock()
@@ -361,11 +362,11 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
 	b.ctxDone()
 	// Report recovered VRAM usage so scheduler will finish waiting and unload
 	time.Sleep(1 * time.Millisecond)
-	s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
-		g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
+	s.getGpuFn = func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
+		g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
 		g.TotalMemory = 24 * format.GigaByte
 		g.FreeMemory = 24 * format.GigaByte
-		return []discover.GpuInfo{g}
+		return []ml.DeviceInfo{g}
 	}
 	select {
 	case resp := <-d.req.successCh:
@@ -404,7 +405,7 @@ func TestSchedGetRunner(t *testing.T) {
 	s := InitScheduler(ctx)
 	s.waitForRecovery = 10 * time.Millisecond
 	s.getGpuFn = getGpuFn
-	s.getCpuFn = getCpuFn
+	s.getSystemInfoFn = getSystemInfoFn
 	s.newServerFn = a.newServer
 	slog.Info("a")
 	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
@@ -462,13 +463,14 @@ func TestSchedExpireRunner(t *testing.T) {
 	}
 
 	var f *ggml.GGML
-	gpus := discover.GpuInfoList{}
+	gpus := []ml.DeviceInfo{}
+	systemInfo := ml.SystemInfo{}
 	server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		server.modelPath = model
 		return server, nil
 	}
-	s.load(req, f, gpus, false)
+	s.load(req, f, systemInfo, gpus, false)
 
 	select {
 	case err := <-req.errCh:
@@ -497,19 +499,15 @@ func TestSchedExpireRunner(t *testing.T) {
 
 // TODO - add one scenario that triggers the bogus finished event with positive ref count
 func TestSchedPrematureExpired(t *testing.T) {
-	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
+	ctx, done := context.WithTimeout(t.Context(), 1000*time.Millisecond)
 	defer done()
 
 	// Same model, same request
-	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil, nil)
+	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 100 * time.Millisecond}, nil)
 	s := InitScheduler(ctx)
 	s.waitForRecovery = 10 * time.Millisecond
-	s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
-		g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
-		g.TotalMemory = 24 * format.GigaByte
-		g.FreeMemory = 12 * format.GigaByte
-		return []discover.GpuInfo{g}
-	}
+	s.getGpuFn = getGpuFn
+	s.getSystemInfoFn = getSystemInfoFn
 	s.newServerFn = scenario1a.newServer
 	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
 	require.Len(t, s.pendingReqCh, 1)
@@ -574,7 +572,7 @@ func TestSchedUseLoadedRunner(t *testing.T) {
 func TestSchedUpdateFreeSpace(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()
-	gpus := discover.GpuInfoList{
+	gpus := []ml.DeviceInfo{
 		{
 			DeviceID: ml.DeviceID{
 				ID: "1",
@@ -756,8 +754,12 @@ func (s *mockLlm) ModelPath() string {
 	return s.modelPath
 }
 
-func (s *mockLlm) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
+func (s *mockLlm) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) {
 	if requireFull {
+		if len(gpus) == 0 {
+			slog.Info("mockLlm.Load CPU based load")
+			return nil, nil
+		}
 		for _, g := range gpus {
 			if g.FreeMemory >= s.vramSize {
 				return []ml.DeviceID{g.DeviceID}, nil
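The capture cuts off the last hunk inside mockLlm.Load. For orientation only, here is a self-contained Go sketch of the selection pattern the visible lines implement, with stand-in types; the error tail after the loop is an assumption, since the remainder of the function is not shown above.

package main

import (
	"errors"
	"fmt"
)

type DeviceID struct{ Library, ID string }

type DeviceInfo struct {
	DeviceID
	FreeMemory uint64
}

// pickDevices mirrors the logic visible in mockLlm.Load: with requireFull,
// an empty GPU list means a CPU-based load (no device IDs), otherwise the
// first GPU with enough free VRAM is chosen.
func pickDevices(gpus []DeviceInfo, vramSize uint64, requireFull bool) ([]DeviceID, error) {
	if requireFull {
		if len(gpus) == 0 {
			return nil, nil // CPU-based load: no device IDs
		}
		for _, g := range gpus {
			if g.FreeMemory >= vramSize {
				return []DeviceID{g.DeviceID}, nil
			}
		}
		// Assumed tail: the hunk above ends before the function does.
		return nil, errors.New("insufficient free VRAM for a full load")
	}
	// Assumed: a partial placement is acceptable when a full load is not required.
	return nil, nil
}

func main() {
	gpus := []DeviceInfo{{DeviceID: DeviceID{Library: "Metal"}, FreeMemory: 12 << 30}}
	ids, err := pickDevices(gpus, 10<<30, true)
	fmt.Println(ids, err)
}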