Rename gpu package to discover (#7143)

Cleaning up Go package naming
2025-07-01 17:00:59 +02:00 · 2024-10-16 17:45:00 -07:00
parent 7d6eb0d4c3
commit 05cd82ef94
33 changed files with 94 additions and 94 deletions
--- a/server/routes.go
+++ b/server/routes.go
@ -27,8 +27,8 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/build"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/parser"
@ -1235,7 +1235,7 @@ func Serve(ln net.Listener) error {

 	// At startup we retrieve GPU information so we can get log messages before loading a model
 	// This will log warnings to the log in case we have problems with detected GPUs
-	gpus := gpu.GetGPUInfo()
+	gpus := discover.GetGPUInfo()
 	gpus.LogDetails()

 	err = srvr.Serve(ln)
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@ -15,7 +15,7 @@ import (
 	"github.com/google/go-cmp/cmp"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/llm"
 )

@ -41,8 +41,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
 	return
 }

-func newMockServer(mock *mockRunner) func(gpu.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
-	return func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
+	return func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return mock, nil
 	}
 }
@ -69,10 +69,10 @@ func TestGenerateChat(t *testing.T) {
 			unloadedCh:    make(chan any, 1),
 			loaded:        make(map[string]*runnerRef),
 			newServerFn:   newMockServer(&mock),
-			getGpuFn:      gpu.GetGPUInfo,
-			getCpuFn:      gpu.GetCPUInfo,
+			getGpuFn:      discover.GetGPUInfo,
+			getCpuFn:      discover.GetCPUInfo,
 			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
@ -367,10 +367,10 @@ func TestGenerate(t *testing.T) {
 			unloadedCh:    make(chan any, 1),
 			loaded:        make(map[string]*runnerRef),
 			newServerFn:   newMockServer(&mock),
-			getGpuFn:      gpu.GetGPUInfo,
-			getCpuFn:      gpu.GetCPUInfo,
+			getGpuFn:      discover.GetGPUInfo,
+			getCpuFn:      discover.GetCPUInfo,
 			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
--- a/server/sched.go
+++ b/server/sched.go
@ -15,9 +15,9 @@ import (
 	"time"

 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 )

@ -41,10 +41,10 @@ type Scheduler struct {
 	loaded   map[string]*runnerRef
 	loadedMu sync.Mutex

-	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int)
-	newServerFn  func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
-	getGpuFn     func() gpu.GpuInfoList
-	getCpuFn     func() gpu.GpuInfoList
+	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
+	newServerFn  func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+	getGpuFn     func() discover.GpuInfoList
+	getCpuFn     func() discover.GpuInfoList
 	reschedDelay time.Duration
 }

@ -69,8 +69,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
 		unloadedCh:    make(chan interface{}, maxQueue),
 		loaded:        make(map[string]*runnerRef),
 		newServerFn:   llm.NewLlamaServer,
-		getGpuFn:      gpu.GetGPUInfo,
-		getCpuFn:      gpu.GetCPUInfo,
+		getGpuFn:      discover.GetGPUInfo,
+		getCpuFn:      discover.GetCPUInfo,
 		reschedDelay:  250 * time.Millisecond,
 	}
 	sched.loadFn = sched.load
@ -157,7 +157,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				} else {
 					// Either no models are loaded or below envconfig.MaxRunners
 					// Get a refreshed GPU list
-					var gpus gpu.GpuInfoList
+					var gpus discover.GpuInfoList
 					if pending.opts.NumGPU == 0 {
 						gpus = s.getCpuFn()
 					} else {
@ -409,7 +409,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 	}()
 }

-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
 	if numParallel < 1 {
 		numParallel = 1
 	}
@ -470,7 +470,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
 	}()
 }

-func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
+func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
 	type predKey struct {
 		Library string
 		ID      string
@ -513,8 +513,8 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 // to avoid scheduling another model on the same GPU(s) that haven't stabilized.
 // This routine returns the set of GPUs that do not have an active loading model.
 // If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
-func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
-	ret := append(gpu.GpuInfoList{}, allGpus...)
+func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) discover.GpuInfoList {
+	ret := append(discover.GpuInfoList{}, allGpus...)
 	s.loadedMu.Lock()
 	defer s.loadedMu.Unlock()
 	for _, runner := range s.loaded {
@ -541,8 +541,8 @@ type runnerRef struct {
 	// unloading bool      // set to true when we are trying to unload the runner

 	llama          llm.LlamaServer
-	loading        bool            // True only during initial load, then false forever
-	gpus           gpu.GpuInfoList // Recorded at time of provisioning
+	loading        bool                 // True only during initial load, then false forever
+	gpus           discover.GpuInfoList // Recorded at time of provisioning
 	estimatedVRAM  uint64
 	estimatedTotal uint64

@ -630,7 +630,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 	start := time.Now()

 	// Establish a baseline before we unload
-	gpusBefore := gpu.GetGPUInfo()
+	gpusBefore := discover.GetGPUInfo()
 	var totalMemoryBefore, freeMemoryBefore uint64
 	for _, gpu := range gpusBefore {
 		totalMemoryBefore += gpu.TotalMemory
@ -648,7 +648,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 			}

 			// Query GPUs, look for free to go back up
-			gpusNow := gpu.GetGPUInfo()
+			gpusNow := discover.GetGPUInfo()
 			var totalMemoryNow, freeMemoryNow uint64
 			for _, gpu := range gpusNow {
 				totalMemoryNow += gpu.TotalMemory
@ -685,7 +685,7 @@ func (a ByDuration) Less(i, j int) bool {
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt to optimize parallelism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	var estimatedVRAM uint64

 	var numParallelToTry []int
@ -698,22 +698,22 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL

 	for _, gl := range gpus.ByLibrary() {
 		var ok bool
-		sgl := append(make(gpu.GpuInfoList, 0, len(gl)), gl...)
+		sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)

 		// TODO - potentially sort by performance capability, existing models loaded, etc.
 		// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
 		// Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
-		sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
+		sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))

 		// First attempt to fit the model into a single GPU
 		for _, p := range numParallelToTry {
 			req.opts.NumCtx = req.origNumCtx * p
 			if !envconfig.SchedSpread() {
 				for _, g := range sgl {
-					if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+					if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
 						*numParallel = p
-						return []gpu.GpuInfo{g}
+						return []discover.GpuInfo{g}
 					}
 				}
 			}
@ -737,7 +737,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 }

 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	if *numParallel <= 0 {
 		*numParallel = 1
 		req.opts.NumCtx = req.origNumCtx
@ -822,7 +822,7 @@ func (s *Scheduler) expireRunner(model *Model) {

 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
 	slog.Debug("evaluating if CPU model load will fit in available system memory")
 	estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
 	if estimate.TotalSize <= gpus[0].FreeMemory {
--- a/server/sched_test.go
+++ b/server/sched_test.go
@ -13,8 +13,8 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/app/lifecycle"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 )

@ -47,10 +47,10 @@ func TestLoad(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Second},
 	}
 	// Fail to load model first
-	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return nil, errors.New("something failed to load model blah")
 	}
-	gpus := gpu.GpuInfoList{}
+	gpus := discover.GpuInfoList{}
 	s.load(req, ggml, gpus, 0)
 	require.Empty(t, req.successCh)
 	require.Len(t, req.errCh, 1)
@ -61,7 +61,7 @@ func TestLoad(t *testing.T) {
 	require.Contains(t, err.Error(), "this model may be incompatible")

 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return server, nil
 	}
 	s.load(req, ggml, gpus, 0)
@ -102,7 +102,7 @@ type reqBundle struct {
 	ggml    *llm.GGML
 }

-func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 	return scenario.srv, nil
 }

@ -151,18 +151,18 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 	return b
 }

-func getGpuFn() gpu.GpuInfoList {
-	g := gpu.GpuInfo{Library: "metal"}
+func getGpuFn() discover.GpuInfoList {
+	g := discover.GpuInfo{Library: "metal"}
 	g.TotalMemory = 24 * format.GigaByte
 	g.FreeMemory = 12 * format.GigaByte
-	return []gpu.GpuInfo{g}
+	return []discover.GpuInfo{g}
 }

-func getCpuFn() gpu.GpuInfoList {
-	g := gpu.GpuInfo{Library: "cpu"}
+func getCpuFn() discover.GpuInfoList {
+	g := discover.GpuInfo{Library: "cpu"}
 	g.TotalMemory = 32 * format.GigaByte
 	g.FreeMemory = 26 * format.GigaByte
-	return []gpu.GpuInfo{g}
+	return []discover.GpuInfo{g}
 }

 func TestRequestsSameModelSameRequest(t *testing.T) {
@ -420,9 +420,9 @@ func TestExpireRunner(t *testing.T) {
 	}

 	var ggml *llm.GGML
-	gpus := gpu.GpuInfoList{}
+	gpus := discover.GpuInfoList{}
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return server, nil
 	}
 	s.load(req, ggml, gpus, 0)
@ -460,11 +460,11 @@ func TestPrematureExpired(t *testing.T) {
 	// Same model, same request
 	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
 	s := InitScheduler(ctx)
-	s.getGpuFn = func() gpu.GpuInfoList {
-		g := gpu.GpuInfo{Library: "metal"}
+	s.getGpuFn = func() discover.GpuInfoList {
+		g := discover.GpuInfo{Library: "metal"}
 		g.TotalMemory = 24 * format.GigaByte
 		g.FreeMemory = 12 * format.GigaByte
-		return []gpu.GpuInfo{g}
+		return []discover.GpuInfo{g}
 	}
 	s.newServerFn = scenario1a.newServer
 	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
@ -530,7 +530,7 @@ func TestUseLoadedRunner(t *testing.T) {
 func TestUpdateFreeSpace(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
-	gpus := gpu.GpuInfoList{
+	gpus := discover.GpuInfoList{
 		{
 			Library: "a",
 			ID:      "1",
@ -563,7 +563,7 @@ func TestUpdateFreeSpace(t *testing.T) {
 func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
-	gpus := gpu.GpuInfoList{
+	gpus := discover.GpuInfoList{
 		{
 			Library: "cuda",
 			ID:      "0",
@ -573,7 +573,7 @@ func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
 			ID:      "1",
 		},
 	}
-	r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}
+	r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}

 	s := InitScheduler(ctx)
 	s.loadedMu.Lock()
@ -584,12 +584,12 @@ func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
 	require.Len(t, tmp, 1)
 	require.Equal(t, "1", tmp[0].ID)

-	r1.gpus = gpu.GpuInfoList{gpus[1]}
+	r1.gpus = discover.GpuInfoList{gpus[1]}
 	tmp = s.filterGPUsWithoutLoadingModels(gpus)
 	require.Len(t, tmp, 1)
 	require.Equal(t, "0", tmp[0].ID)

-	r1.gpus = gpu.GpuInfoList{}
+	r1.gpus = discover.GpuInfoList{}
 	tmp = s.filterGPUsWithoutLoadingModels(gpus)
 	require.Len(t, tmp, 2)
 }
@ -715,9 +715,9 @@ func TestHomogeneousGPUs(t *testing.T) {
 	defer done()
 	s := InitScheduler(ctx)

-	s.getGpuFn = func() gpu.GpuInfoList {
+	s.getGpuFn = func() discover.GpuInfoList {
 		// Set memory values to require the model to be spread
-		gpus := []gpu.GpuInfo{
+		gpus := []discover.GpuInfo{
 			{Library: "cuda"},
 			{Library: "rocm"},
 		}
@ -729,7 +729,7 @@ func TestHomogeneousGPUs(t *testing.T) {
 	}
 	s.getCpuFn = getCpuFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		require.Len(t, gpus, 1)
 		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
 	}