diff --git a/discover/cpu_linux_test.go b/discover/cpu_linux_test.go index 3a5144780c..7ff34df0d8 100644 --- a/discover/cpu_linux_test.go +++ b/discover/cpu_linux_test.go @@ -2065,12 +2065,6 @@ power management: cpus := linuxCPUDetails(buf) slog.Info("example", "scenario", k, "cpus", cpus) - si := SystemInfo{ - System: CPUInfo{ - CPUs: cpus, - }, - } - threadCount := si.GetOptimalThreadCount() if len(v.expCPUs) != len(cpus) { t.Fatalf("incorrect number of sockets: expected:%v got:%v", v.expCPUs, cpus) } @@ -2085,10 +2079,6 @@ power management: t.Fatalf("incorrect number of threads: expected:%v got:%v", v.expCPUs[i], c) } } - - if threadCount != v.expThreadCount { - t.Fatalf("incorrect thread count expected:%d got:%d", v.expThreadCount, threadCount) - } }) } } diff --git a/discover/gpu.go b/discover/gpu.go index 2f394fdf86..927aed2a23 100644 --- a/discover/gpu.go +++ b/discover/gpu.go @@ -1,16 +1,13 @@ package discover import ( - "context" "log/slog" "os" - "path/filepath" "regexp" "runtime" "strconv" "strings" - "github.com/ollama/ollama/format" "github.com/ollama/ollama/ml" ) @@ -18,159 +15,28 @@ import ( // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. var CudaTegra string = os.Getenv("JETSON_JETPACK") -func GetCPUInfo() GpuInfo { - mem, err := GetCPUMem() +// GetSystemInfo returns the last cached state of the GPUs on the system +func GetSystemInfo() ml.SystemInfo { + memInfo, err := GetCPUMem() if err != nil { slog.Warn("error looking up system memory", "error", err) } - - return GpuInfo{ - memInfo: mem, - DeviceID: ml.DeviceID{ - Library: "cpu", - ID: "0", - }, - } -} - -func GetGPUInfo(ctx context.Context, runners []FilteredRunnerDiscovery) GpuInfoList { - devs := GPUDevices(ctx, runners) - return devInfoToInfoList(devs) -} - -func devInfoToInfoList(devs []ml.DeviceInfo) GpuInfoList { - resp := []GpuInfo{} - // Our current packaging model places ggml-hip in the main directory - // but keeps rocm in an isolated directory. We have to add it to - // the [LD_LIBRARY_]PATH so ggml-hip will load properly - rocmDir := filepath.Join(LibOllamaPath, "rocm") - if _, err := os.Stat(rocmDir); err != nil { - rocmDir = "" + var threadCount int + cpus := GetCPUDetails() + for _, c := range cpus { + threadCount += c.CoreCount - c.EfficiencyCoreCount } - for _, dev := range devs { - info := GpuInfo{ - DeviceID: dev.DeviceID, - filterID: dev.FilteredID, - Name: dev.Description, - memInfo: memInfo{ - TotalMemory: dev.TotalMemory, - FreeMemory: dev.FreeMemory, - }, - // TODO can we avoid variant - DependencyPath: dev.LibraryPath, - DriverMajor: dev.DriverMajor, - DriverMinor: dev.DriverMinor, - ComputeMajor: dev.ComputeMajor, - ComputeMinor: dev.ComputeMinor, - } - if dev.Library == "CUDA" || dev.Library == "ROCm" { - info.MinimumMemory = 457 * format.MebiByte - } - if dev.Library == "ROCm" && rocmDir != "" { - info.DependencyPath = append(info.DependencyPath, rocmDir) - } - // TODO any special processing of Vulkan devices? 
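Reviewer note (not part of the patch): the new GetSystemInfo above derives the inference thread count from physical performance cores per CPU package (CoreCount minus EfficiencyCoreCount), falling back to runtime.NumCPU() when detection yields nothing. A minimal standalone sketch of that heuristic, using a hypothetical cpuPackage type as a stand-in for discover.CPU:

```go
package main

import (
	"fmt"
	"runtime"
)

// cpuPackage is a hypothetical stand-in for discover.CPU.
type cpuPackage struct {
	CoreCount           int
	EfficiencyCoreCount int
}

// optimalThreadCount counts only performance cores across packages and falls
// back to the logical CPU count when no packages were detected.
func optimalThreadCount(cpus []cpuPackage) int {
	threads := 0
	for _, c := range cpus {
		threads += c.CoreCount - c.EfficiencyCoreCount // performance cores only
	}
	if threads == 0 {
		threads = runtime.NumCPU() // detection failed; use logical CPUs
	}
	return threads
}

func main() {
	// e.g. a hybrid 8P+4E part: 12 cores total, 4 efficiency cores -> 8 threads
	fmt.Println(optimalThreadCount([]cpuPackage{{CoreCount: 12, EfficiencyCoreCount: 4}}))
}
```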
- resp = append(resp, info) - } - if len(resp) == 0 { - mem, err := GetCPUMem() - if err != nil { - slog.Warn("error looking up system memory", "error", err) - } - - resp = append(resp, GpuInfo{ - memInfo: mem, - DeviceID: ml.DeviceID{ - Library: "cpu", - ID: "0", - }, - }) - } - return resp -} - -// Given the list of GPUs this instantiation is targeted for, -// figure out the visible devices environment variable -// -// If different libraries are detected, the first one is what we use -func (l GpuInfoList) GetVisibleDevicesEnv() []string { - if len(l) == 0 { - return nil - } - res := []string{} - envVar := rocmGetVisibleDevicesEnv(l) - if envVar != "" { - res = append(res, envVar) - } - envVar = vkGetVisibleDevicesEnv(l) - if envVar != "" { - res = append(res, envVar) - } - return res -} - -func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) string { - ids := []string{} - for _, info := range gpuInfo { - if info.Library != "ROCm" { - continue - } - // If the devices requires a numeric ID, for filtering purposes, we use the unfiltered ID number - if info.filterID != "" { - ids = append(ids, info.filterID) - } else { - ids = append(ids, info.ID) - } - } - if len(ids) == 0 { - return "" - } - envVar := "ROCR_VISIBLE_DEVICES=" - if runtime.GOOS != "linux" { - envVar = "HIP_VISIBLE_DEVICES=" - } - // There are 3 potential env vars to use to select GPUs. - // ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows - // HIP_VISIBLE_DEVICES supports numeric IDs only - // GPU_DEVICE_ORDINAL supports numeric IDs only - return envVar + strings.Join(ids, ",") -} - -func vkGetVisibleDevicesEnv(gpuInfo []GpuInfo) string { - ids := []string{} - for _, info := range gpuInfo { - if info.Library != "Vulkan" { - continue - } - if info.filterID != "" { - ids = append(ids, info.filterID) - } else { - ids = append(ids, info.ID) - } - } - if len(ids) == 0 { - return "" - } - envVar := "GGML_VK_VISIBLE_DEVICES=" - return envVar + strings.Join(ids, ",") -} - -// GetSystemInfo returns the last cached state of the GPUs on the system -func GetSystemInfo() SystemInfo { - deviceMu.Lock() - defer deviceMu.Unlock() - gpus := devInfoToInfoList(devices) - if len(gpus) == 1 && gpus[0].Library == "cpu" { - gpus = []GpuInfo{} + if threadCount == 0 { + // Fall back to Go's num CPU + threadCount = runtime.NumCPU() } - return SystemInfo{ - System: CPUInfo{ - CPUs: GetCPUDetails(), - GpuInfo: GetCPUInfo(), - }, - GPUs: gpus, + return ml.SystemInfo{ + ThreadCount: threadCount, + TotalMemory: memInfo.TotalMemory, + FreeMemory: memInfo.FreeMemory, + FreeSwap: memInfo.FreeSwap, } } diff --git a/discover/runner.go b/discover/runner.go index 66c3e3e624..cbaba3c602 100644 --- a/discover/runner.go +++ b/discover/runner.go @@ -4,13 +4,8 @@ package discover import ( "context" - "encoding/json" - "fmt" "io" "log/slog" - "math/rand" - "net" - "net/http" "os" "os/exec" "path/filepath" @@ -23,6 +18,7 @@ import ( "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" + "github.com/ollama/ollama/llm" "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/ml" ) @@ -36,7 +32,7 @@ var ( bootstrapped bool ) -func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.DeviceInfo { +func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo { deviceMu.Lock() defer deviceMu.Unlock() startDiscovery := time.Now() @@ -154,9 +150,9 @@ func GPUDevices(ctx context.Context, runners []FilteredRunnerDiscovery) []ml.Dev slog.Error("Unknown Library:" + devices[i].Library) } - 
extraEnvs := []string{ - "GGML_CUDA_INIT=1", // force deep initialization to trigger crash on unsupported GPUs - envVar + "=" + id, // Filter to just this one GPU + extraEnvs := map[string]string{ + "GGML_CUDA_INIT": "1", // force deep initialization to trigger crash on unsupported GPUs + envVar: id, // Filter to just this one GPU } if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 { needsDelete[i] = true @@ -449,100 +445,35 @@ func (r *bootstrapRunner) HasExited() bool { return false } -func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs []string) []ml.DeviceInfo { - // TODO DRY out with llm/server.go - slog.Debug("spawning runner with", "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs) +func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs map[string]string) []ml.DeviceInfo { + var out io.Writer + if envconfig.LogLevel() == logutil.LevelTrace { + out = os.Stderr + } start := time.Now() defer func() { slog.Debug("bootstrap discovery took", "duration", time.Since(start), "OLLAMA_LIBRARY_PATH", ollamaLibDirs, "extra_envs", extraEnvs) }() - port := 0 - if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil { - var l *net.TCPListener - if l, err = net.ListenTCP("tcp", a); err == nil { - port = l.Addr().(*net.TCPAddr).Port - l.Close() - } - } - if port == 0 { - slog.Debug("ResolveTCPAddr failed, using random port") - port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range - } - params := []string{"runner", "--ollama-engine", "--port", strconv.Itoa(port)} - var pathEnv string - switch runtime.GOOS { - case "windows": - pathEnv = "PATH" - case "darwin": - pathEnv = "DYLD_LIBRARY_PATH" - default: - pathEnv = "LD_LIBRARY_PATH" - } - libraryPaths := append([]string{LibOllamaPath}, ollamaLibDirs...) - if rocmDir != "" { - libraryPaths = append(libraryPaths, rocmDir) - } - // Note: we always put our dependency paths first - // since these are the exact version we compiled/linked against - if libraryPath, ok := os.LookupEnv(pathEnv); ok { - libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...) - } - cmd := exec.Command(exe, params...) 
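Reviewer note (not part of the patch): the bootstrap env overrides above change from "KEY=VALUE" strings to a map[string]string, and the new StartRunner later in this diff folds such a map into the process environment, replacing existing keys and appending the rest. A simplified standalone sketch of that merge, assuming case-insensitive key matching as in the patch:

```go
package main

import (
	"fmt"
	"strings"
)

// mergeEnv applies map-form overrides to a "KEY=VALUE" environment slice,
// replacing keys that are already present (case-insensitively) and appending
// any overrides that were not found.
func mergeEnv(env []string, extra map[string]string) []string {
	done := map[string]bool{}
	for i, kv := range env {
		key := strings.SplitN(kv, "=", 2)[0]
		for k, v := range extra {
			if strings.EqualFold(key, k) {
				env[i] = k + "=" + v
				done[k] = true
			}
		}
	}
	for k, v := range extra {
		if !done[k] {
			env = append(env, k+"="+v)
		}
	}
	return env
}

func main() {
	env := []string{"PATH=/usr/bin", "GGML_CUDA_INIT=0"}
	fmt.Println(mergeEnv(env, map[string]string{"GGML_CUDA_INIT": "1", "CUDA_VISIBLE_DEVICES": "0"}))
}
```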
- cmd.Env = os.Environ() - if envconfig.LogLevel() == logutil.LevelTrace { - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - } - - // cmd.SysProcAttr = llm.LlamaServerSysProcAttr // circular dependency - bring back once refactored - pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator)) - pathNeeded := true - ollamaPathNeeded := true - extraDone := make([]bool, len(extraEnvs)) - for i := range cmd.Env { - cmp := strings.SplitN(cmd.Env[i], "=", 2) - if strings.EqualFold(cmp[0], pathEnv) { - cmd.Env[i] = pathEnv + "=" + pathEnvVal - pathNeeded = false - } else if strings.EqualFold(cmp[0], "OLLAMA_LIBRARY_PATH") { - cmd.Env[i] = "OLLAMA_LIBRARY_PATH=" + strings.Join(ollamaLibDirs, string(filepath.ListSeparator)) - ollamaPathNeeded = false - } else { - for j := range extraEnvs { - if extraDone[j] { - continue - } - extra := strings.SplitN(extraEnvs[j], "=", 2) - if cmp[0] == extra[0] { - cmd.Env[i] = extraEnvs[j] - extraDone[j] = true - } - } - } - } - if pathNeeded { - cmd.Env = append(cmd.Env, pathEnv+"="+pathEnvVal) - } - if ollamaPathNeeded { - cmd.Env = append(cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ollamaLibDirs, string(filepath.ListSeparator))) - } - for i := range extraDone { - if !extraDone[i] { - cmd.Env = append(cmd.Env, extraEnvs[i]) - } - } - logutil.Trace("starting runner for device discovery", "env", cmd.Env, "cmd", cmd) - if err := cmd.Start(); err != nil { - slog.Warn("unable to start discovery subprocess", "cmd", cmd, "error", err) + logutil.Trace("starting runner for device discovery", "libDirs", ollamaLibDirs, "extraEnvs", extraEnvs) + cmd, port, err := llm.StartRunner( + true, // ollama engine + "", // no model + ollamaLibDirs, + out, + extraEnvs, + ) + if err != nil { + slog.Debug("failed to start runner to discovery GPUs", "error", err) return nil } + go func() { cmd.Wait() // exit status ignored }() defer cmd.Process.Kill() - devices, err := GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd}) + devices, err := ml.GetDevicesFromRunner(ctx, &bootstrapRunner{port: port, cmd: cmd}) if err != nil { if cmd.ProcessState != nil && cmd.ProcessState.ExitCode() >= 0 { // Expected during bootstrapping while we filter out unsupported AMD GPUs @@ -555,52 +486,3 @@ func bootstrapDevices(ctx context.Context, ollamaLibDirs []string, extraEnvs []s return devices } - -func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]ml.DeviceInfo, error) { - var moreDevices []ml.DeviceInfo - port := runner.GetPort() - tick := time.Tick(10 * time.Millisecond) - for { - select { - case <-ctx.Done(): - return nil, fmt.Errorf("failed to finish discovery before timeout") - case <-tick: - r, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/info", port), nil) - if err != nil { - return nil, fmt.Errorf("failed to create request: %w", err) - } - r.Header.Set("Content-Type", "application/json") - - resp, err := http.DefaultClient.Do(r) - if err != nil { - // slog.Warn("failed to send request", "error", err) - if runner.HasExited() { - return nil, fmt.Errorf("runner crashed") - } - continue - } - defer resp.Body.Close() - - if resp.StatusCode == http.StatusNotFound { - // old runner, fall back to bootstrapping model - return nil, fmt.Errorf("llamarunner free vram reporting not supported") - } - - body, err := io.ReadAll(resp.Body) - if err != nil { - slog.Warn("failed to read response", "error", err) - continue - } - if resp.StatusCode != 200 { - logutil.Trace("runner failed to discover free VRAM", "status", 
resp.StatusCode, "response", body) - return nil, fmt.Errorf("runner error: %s", string(body)) - } - - if err := json.Unmarshal(body, &moreDevices); err != nil { - slog.Warn("unmarshal encode response", "error", err) - continue - } - return moreDevices, nil - } - } -} diff --git a/discover/types.go b/discover/types.go index adb2f43a74..b34bafd2de 100644 --- a/discover/types.go +++ b/discover/types.go @@ -1,10 +1,8 @@ package discover import ( - "context" "log/slog" "path/filepath" - "runtime" "strings" "github.com/ollama/ollama/format" @@ -17,50 +15,6 @@ type memInfo struct { FreeSwap uint64 `json:"free_swap,omitempty"` // TODO split this out for system only } -// Beginning of an `ollama info` command -type GpuInfo struct { // TODO better name maybe "InferenceProcessor"? - ml.DeviceID - memInfo - - // Optional variant to select (e.g. versions, cpu feature flags) - Variant string `json:"variant"` - - // MinimumMemory represents the minimum memory required to use the GPU - MinimumMemory uint64 `json:"-"` - - // Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly - DependencyPath []string `json:"lib_path,omitempty"` - - // Set to true if we can NOT reliably discover FreeMemory. A value of true indicates - // the FreeMemory is best effort, and may over or under report actual memory usage - // False indicates FreeMemory can generally be trusted on this GPU - UnreliableFreeMemory bool - - // GPU information - filterID string // AMD/Vulkan Workaround: The numeric ID of the device used to filter out other devices - Name string `json:"name"` // user friendly name if available - ComputeMajor int `json:"compute_major"` // Compute Capability or gfx - ComputeMinor int `json:"compute_minor"` - - // Driver Information - TODO no need to put this on each GPU - DriverMajor int `json:"driver_major,omitempty"` - DriverMinor int `json:"driver_minor,omitempty"` - - // TODO other performance capability info to help in scheduling decisions -} - -func (gpu GpuInfo) RunnerName() string { - if gpu.Variant != "" { - return gpu.Library + "_" + gpu.Variant - } - return gpu.Library -} - -type CPUInfo struct { - GpuInfo - CPUs []CPU -} - // CPU type represents a CPU Package occupying a socket type CPU struct { ID string `cpuinfo:"processor"` @@ -71,32 +25,6 @@ type CPU struct { ThreadCount int } -type GpuInfoList []GpuInfo - -func (l GpuInfoList) ByLibrary() []GpuInfoList { - resp := []GpuInfoList{} - libs := []string{} - for _, info := range l { - found := false - requested := info.Library - if info.Variant != "" { - requested += "_" + info.Variant - } - for i, lib := range libs { - if lib == requested { - resp[i] = append(resp[i], info) - found = true - break - } - } - if !found { - libs = append(libs, requested) - resp = append(resp, []GpuInfo{info}) - } - } - return resp -} - func LogDetails(devices []ml.DeviceInfo) { for _, dev := range devices { var libs []string @@ -141,74 +69,3 @@ func LogDetails(devices []ml.DeviceInfo) { ) } } - -// Sort by Free Space -type ByFreeMemory []GpuInfo - -func (a ByFreeMemory) Len() int { return len(a) } -func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] } -func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory } - -type SystemInfo struct { - System CPUInfo `json:"system"` - GPUs []GpuInfo `json:"gpus"` -} - -// Return the optimal number of threads to use for inference -func (si SystemInfo) GetOptimalThreadCount() int { - if len(si.System.CPUs) == 0 { - // Fall back to Go's num CPU - return 
runtime.NumCPU() - } - - coreCount := 0 - for _, c := range si.System.CPUs { - coreCount += c.CoreCount - c.EfficiencyCoreCount - } - - return coreCount -} - -// For each GPU, check if it does NOT support flash attention -func (l GpuInfoList) FlashAttentionSupported() bool { - for _, gpu := range l { - supportsFA := gpu.Library == "cpu" || - gpu.Name == "Metal" || gpu.Library == "Metal" || - (gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) || // We don't have kernels for Jetson Xavier - gpu.Library == "ROCm" || - gpu.Library == "Vulkan" - - if !supportsFA { - return false - } - } - return true -} - -type BaseRunner interface { - // GetPort returns the localhost port number the runner is running on - GetPort() int - - // HasExited indicates if the runner is no longer running. This can be used during - // bootstrap to detect if a given filtered device is incompatible and triggered an assert - HasExited() bool -} - -type RunnerDiscovery interface { - BaseRunner - - // GetDeviceInfos will perform a query of the underlying device libraries - // for device identification and free VRAM information - // During bootstrap scenarios, this routine may take seconds to complete - GetDeviceInfos(ctx context.Context) []ml.DeviceInfo -} - -type FilteredRunnerDiscovery interface { - RunnerDiscovery - - // GetActiveDeviceIDs returns the filtered set of devices actively in - // use by this runner for running models. If the runner is a bootstrap runner, no devices - // will be active yet so no device IDs are returned. - // This routine will not query the underlying device and will return immediately - GetActiveDeviceIDs() []ml.DeviceID -} diff --git a/llm/memory.go b/llm/memory.go index aa4927f163..15558109f6 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -4,27 +4,28 @@ import ( "fmt" "log/slog" "os" + "slices" "sort" "strings" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/discover" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" "github.com/ollama/ollama/fs/ggml" + "github.com/ollama/ollama/ml" ) // pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits // The list of GPUs returned will always be the same brand (library) // If the model can not be fit fully within the available GPU(s) nil is returned -func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList { - for _, gl := range gpus.ByLibrary() { - sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...) +func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus []ml.DeviceInfo, numParallel int) []ml.DeviceInfo { + for _, gl := range ml.ByLibrary(gpus) { + sgl := append(make([]ml.DeviceInfo, 0, len(gl)), gl...) // TODO - potentially sort by performance capability, existing models loaded, etc. 
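Reviewer note (not part of the patch): pickBestFullFitByLibrary now groups devices with ml.ByLibrary, whose implementation is outside this hunk; the removed discover.GpuInfoList.ByLibrary above shows the intent. An illustrative sketch of that grouping over device infos (an assumption about the ml helper, not its actual code), using a local deviceInfo stand-in:

```go
package main

import "fmt"

// deviceInfo is a stand-in for ml.DeviceInfo.
type deviceInfo struct {
	Library string
	ID      string
}

// byLibrary splits devices into buckets that share the same backend library,
// preserving the first-seen order of the libraries.
func byLibrary(devs []deviceInfo) [][]deviceInfo {
	order := []string{}
	buckets := map[string][]deviceInfo{}
	for _, d := range devs {
		if _, ok := buckets[d.Library]; !ok {
			order = append(order, d.Library)
		}
		buckets[d.Library] = append(buckets[d.Library], d)
	}
	out := make([][]deviceInfo, 0, len(order))
	for _, lib := range order {
		out = append(out, buckets[lib])
	}
	return out
}

func main() {
	devs := []deviceInfo{{"CUDA", "0"}, {"ROCm", "0"}, {"CUDA", "1"}}
	fmt.Println(byLibrary(devs)) // CUDA devices grouped together, ROCm in its own bucket
}
```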
// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them // Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups - sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl))) + sort.Sort(sort.Reverse(ml.ByFreeMemory(sgl))) if !envconfig.SchedSpread() { // Try to pack into as few GPUs as possible, starting from 1 GPU @@ -63,8 +64,8 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin } // If multiple Libraries are detected, pick the Library which loads the most layers for the model -func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus discover.GpuInfoList, numParallel int) discover.GpuInfoList { - byLibrary := gpus.ByLibrary() +func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus []ml.DeviceInfo, numParallel int) []ml.DeviceInfo { + byLibrary := ml.ByLibrary(gpus) if len(byLibrary) <= 1 { return gpus } @@ -81,10 +82,10 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s } // This algorithm looks for a complete fit to determine if we need to unload other models -func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) { +func predictServerFit(allGpus []ml.DeviceInfo, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) { // Split up the GPUs by type and try them var estimatedVRAM uint64 - for _, gpus := range allGpus.ByLibrary() { + for _, gpus := range ml.ByLibrary(allGpus) { var layerCount int estimate := estimateGPULayers(gpus, f, projectors, opts, numParallel) layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize @@ -97,14 +98,23 @@ func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, proj return true, estimatedVRAM } } - - if len(gpus) == 1 && gpus[0].Library == "cpu" && estimate.TotalSize <= gpus[0].FreeMemory { - return true, estimatedVRAM - } } return false, estimatedVRAM } +func verifyCPUFit(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, systemInfo ml.SystemInfo, numParallel int) bool { + estimate := estimateGPULayers(nil, f, projectors, opts, numParallel) + if estimate.TotalSize > systemInfo.FreeMemory { + return false + } + slog.Info("new model will fit in available system memory for CPU inference, loading", + "model", modelPath, + "parallel", numParallel, + "required", format.HumanBytes2(estimate.TotalSize), + ) + return true +} + type MemoryEstimate struct { // How many layers we predict we can load Layers int @@ -141,7 +151,7 @@ type MemoryEstimate struct { // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size // The GPUs provided must all be the same Library -func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate { +func estimateGPULayers(gpus []ml.DeviceInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate { // Graph size for a partial offload, applies to all GPUs var graphPartialOffload uint64 @@ -175,10 +185,17 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin overhead := envconfig.GpuOverhead() availableList := make([]string, len(gpus)) + libraries := []string{} for i, gpu := range gpus { availableList[i] = 
format.HumanBytes2(gpu.FreeMemory) + if !slices.Contains(libraries, gpu.Library) { + libraries = append(libraries, gpu.Library) + } } - slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList) + if len(libraries) == 0 { + libraries = []string{"cpu"} + } + slog.Debug("evaluating", "library", strings.Join(libraries, ","), "gpu_count", len(gpus), "available", availableList) for _, projector := range projectors { llamaEngineProjectorWeights += projectorMemoryRequirements(projector) @@ -196,7 +213,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin } useFlashAttention := envconfig.FlashAttention(f.FlashAttention()) && - (discover.GpuInfoList)(gpus).FlashAttentionSupported() && + ml.FlashAttentionSupported(gpus) && f.SupportsFlashAttention() var kvct string @@ -231,7 +248,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin } // on metal there's no partial offload overhead - if gpus[0].Library == "Metal" { + if len(gpus) > 0 && gpus[0].Library == "Metal" { graphPartialOffload = graphFullOffload } else if len(gpus) > 1 { // multigpu should always use the partial graph size @@ -256,7 +273,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin gpuAllocations := make([]uint64, len(gpus)) type gs struct { i int - g *discover.GpuInfo + g *ml.DeviceInfo } gpusWithSpace := []gs{} for i := range gpus { @@ -265,19 +282,11 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin gzo = gpuZeroOverhead } // Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer - if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize { - var compute string - if gpus[i].Library == "ROCm" { - compute = fmt.Sprintf("gfx%x%02x", gpus[i].ComputeMajor, gpus[i].ComputeMinor) - } else { - compute = fmt.Sprintf("%d.%d", gpus[i].ComputeMajor, gpus[i].ComputeMinor) - } - + if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory()+2*layerSize { slog.Debug("gpu has too little memory to allocate any layers", "id", gpus[i].ID, "library", gpus[i].Library, - "variant", gpus[i].Variant, - "compute", compute, + "compute", gpus[i].Compute(), "driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor), "name", gpus[i].Name, "total", format.HumanBytes2(gpus[i].TotalMemory), @@ -291,7 +300,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin continue } gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]}) - gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full + gpuAllocations[i] += gpus[i].MinimumMemory() + layerSize // We hold off on graph until we know partial vs. 
full } var gpuZeroID int @@ -397,7 +406,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin VRAMSize: 0, GPUSizes: []uint64{}, - inferenceLibrary: gpus[0].Library, + inferenceLibrary: strings.Join(libraries, ","), layersRequested: opts.NumGPU, layersModel: int(f.KV().BlockCount()) + 1, availableList: availableList, @@ -411,7 +420,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin projectorGraph: ollamaEngineProjectorGraph, } - if gpus[0].Library == "cpu" { + if len(gpus) == 0 { return estimate } if layerCount == 0 { diff --git a/llm/memory_test.go b/llm/memory_test.go index 553214b9e5..fce17b9c25 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -10,7 +10,7 @@ import ( "github.com/stretchr/testify/require" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/discover" + "github.com/ollama/ollama/format" "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/ml" ) @@ -54,13 +54,7 @@ func TestEstimateGPULayers(t *testing.T) { } // Simple CPU scenario - gpus := []discover.GpuInfo{ - { - DeviceID: ml.DeviceID{ - Library: "cpu", - }, - }, - } + gpus := []ml.DeviceInfo{} projectors := []string{} opts := api.DefaultOptions() t.Run("cpu", func(t *testing.T) { @@ -77,19 +71,17 @@ func TestEstimateGPULayers(t *testing.T) { memoryLayerOutput := uint64(4) // Dual CUDA scenario with asymmetry - gpuMinimumMemory := uint64(2048) - gpus = []discover.GpuInfo{ + gpuMinimumMemory := uint64(457 * format.MebiByte) + gpus = []ml.DeviceInfo{ { DeviceID: ml.DeviceID{ - Library: "cuda", + Library: "CUDA", }, - MinimumMemory: gpuMinimumMemory, }, { DeviceID: ml.DeviceID{ - Library: "cuda", + Library: "CUDA", }, - MinimumMemory: gpuMinimumMemory, }, } // Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1 diff --git a/llm/server.go b/llm/server.go index 6ba8f8d2f7..8e91e411ef 100644 --- a/llm/server.go +++ b/llm/server.go @@ -27,7 +27,6 @@ import ( "golang.org/x/sync/semaphore" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/discover" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" "github.com/ollama/ollama/fs/ggml" @@ -66,7 +65,7 @@ func (e filteredEnv) LogValue() slog.Value { type LlamaServer interface { ModelPath() string - Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) + Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) Ping(ctx context.Context) error WaitUntilRunning(ctx context.Context) error Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error @@ -115,7 +114,7 @@ type llamaServer struct { llmServer ggml *ggml.GGML - gpus discover.GpuInfoList // The set of GPUs covered by the memory estimate + gpus []ml.DeviceInfo // The set of GPUs covered by the memory estimate estimate MemoryEstimate } @@ -146,7 +145,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) { } // NewLlamaServer will run a server for the given GPUs -func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) { +func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) { var llamaModel *llama.Model var textProcessor model.TextProcessor var err error @@ -179,7 +178,7 @@ func NewLlamaServer(gpus 
discover.GpuInfoList, modelPath string, f *ggml.GGML, a loadRequest := LoadRequest{LoraPath: adapters, KvSize: opts.NumCtx * numParallel, BatchSize: opts.NumBatch, Parallel: numParallel, MultiUserCache: envconfig.MultiUserCache()} - defaultThreads := discover.GetSystemInfo().GetOptimalThreadCount() + defaultThreads := systemInfo.ThreadCount if opts.NumThread > 0 { loadRequest.NumThreads = opts.NumThread } else if defaultThreads > 0 { @@ -200,7 +199,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a // This will disable flash attention unless all GPUs on the system support it, even if we end up selecting a subset // that can handle it. - if fa && !gpus.FlashAttentionSupported() { + if fa && !ml.FlashAttentionSupported(gpus) { slog.Warn("flash attention enabled but not supported by gpu") fa = false } @@ -227,218 +226,170 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a slog.Warn("quantized kv cache requested but flash attention disabled", "type", kvct) } - availableLibs := make(map[string]string) - if entries, err := os.ReadDir(discover.LibOllamaPath); err == nil { - for _, entry := range entries { - availableLibs[entry.Name()] = filepath.Join(discover.LibOllamaPath, entry.Name()) - } + gpuLibs := ml.LibraryPaths(gpus) + status := NewStatusWriter(os.Stderr) + cmd, port, err := StartRunner( + textProcessor != nil, + modelPath, + gpuLibs, + status, + ml.GetVisibleDevicesEnv(gpus), + ) + + s := llmServer{ + port: port, + cmd: cmd, + status: status, + options: opts, + modelPath: modelPath, + loadRequest: loadRequest, + llamaModel: llamaModel, + llamaModelLock: &sync.Mutex{}, + textProcessor: textProcessor, + numParallel: numParallel, + sem: semaphore.NewWeighted(int64(numParallel)), + totalLayers: f.KV().BlockCount() + 1, + loadStart: time.Now(), + done: make(chan error, 1), } - var gpuLibs []string - for _, gpu := range gpus { - gpuLibs = append(gpuLibs, gpu.RunnerName()) - } - - requested := envconfig.LLMLibrary() - if availableLibs[requested] != "" { - slog.Info("using requested gpu library", "requested", requested) - gpuLibs = []string{requested} - } - - var compatible []string - for _, gpuLib := range gpuLibs { - var matchingLibs []string - for k := range availableLibs { - // exact match first - if k == gpuLib { - matchingLibs = append([]string{k}, matchingLibs...) - continue - } - - // then match the family (e.g. 'cuda') - if strings.Split(k, "_")[0] == strings.Split(gpuLib, "_")[0] { - matchingLibs = append(matchingLibs, k) - } - } - - if len(matchingLibs) > 0 { - compatible = append(compatible, matchingLibs[0]) - } - } - - exe, err := os.Executable() if err != nil { - return nil, fmt.Errorf("unable to lookup executable path: %w", err) + var msg string + if s.status != nil && s.status.LastErrMsg != "" { + msg = s.status.LastErrMsg + } + err := fmt.Errorf("error starting runner: %v %s", err, msg) + if llamaModel != nil { + llama.FreeModel(llamaModel) + } + return nil, err + } + + // reap subprocess when it exits + go func() { + err := s.cmd.Wait() + // Favor a more detailed message over the process exit status + if err != nil && s.status != nil && s.status.LastErrMsg != "" { + slog.Error("llama runner terminated", "error", err) + if strings.Contains(s.status.LastErrMsg, "unknown model") { + s.status.LastErrMsg = "this model is not supported by your version of Ollama. 
You may need to upgrade" + } + s.done <- errors.New(s.status.LastErrMsg) + } else { + s.done <- err + } + }() + + if textProcessor != nil { + return &ollamaServer{llmServer: s}, nil + } else { + return &llamaServer{llmServer: s, ggml: f}, nil + } +} + +func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.Writer, extraEnvs map[string]string) (cmd *exec.Cmd, port int, err error) { + var exe string + exe, err = os.Executable() + if err != nil { + return nil, 0, fmt.Errorf("unable to lookup executable path: %w", err) } if eval, err := filepath.EvalSymlinks(exe); err == nil { exe = eval } - // iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc. - // adding each library's respective path to the LD_LIBRARY_PATH, until finally running - // without any LD_LIBRARY_PATH flags - for { - port := 0 - if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil { - var l *net.TCPListener - if l, err = net.ListenTCP("tcp", a); err == nil { - port = l.Addr().(*net.TCPAddr).Port - l.Close() - } - } - if port == 0 { - slog.Debug("ResolveTCPAddr failed, using random port") - port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range - } - params := []string{"runner"} - if textProcessor != nil { - // New engine - // TODO - if we have failure to load scenarios, add logic to retry with the old runner - params = append(params, "--ollama-engine") - } - params = append(params, "--model", modelPath) - params = append(params, "--port", strconv.Itoa(port)) - - var pathEnv string - switch runtime.GOOS { - case "windows": - pathEnv = "PATH" - case "darwin": - pathEnv = "DYLD_LIBRARY_PATH" - default: - pathEnv = "LD_LIBRARY_PATH" - } - - // Note: we always put our dependency paths first - // since these are the exact version we compiled/linked against - libraryPaths := []string{discover.LibOllamaPath} - if libraryPath, ok := os.LookupEnv(pathEnv); ok { - libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...) - } - - ggmlPaths := []string{discover.LibOllamaPath} - for _, c := range compatible { - if libpath, ok := availableLibs[c]; ok { - slog.Debug("adding gpu library", "path", libpath) - libraryPaths = append([]string{libpath}, libraryPaths...) - ggmlPaths = append(ggmlPaths, libpath) - } - } - - for _, gpu := range gpus { - if gpu.DependencyPath != nil { - slog.Debug("adding gpu dependency paths", "paths", gpu.DependencyPath) - libraryPaths = append(gpu.DependencyPath, libraryPaths...) - ggmlPaths = append(ggmlPaths, gpu.DependencyPath...) 
- } - } - - // finally, add the root library path - libraryPaths = append(libraryPaths, discover.LibOllamaPath) - - s := llmServer{ - port: port, - cmd: exec.Command(exe, params...), - status: NewStatusWriter(os.Stderr), - options: opts, - modelPath: modelPath, - loadRequest: loadRequest, - llamaModel: llamaModel, - llamaModelLock: &sync.Mutex{}, - textProcessor: textProcessor, - numParallel: numParallel, - sem: semaphore.NewWeighted(int64(numParallel)), - totalLayers: f.KV().BlockCount() + 1, - loadStart: time.Now(), - done: make(chan error, 1), - } - - s.cmd.Env = os.Environ() - s.cmd.Stdout = os.Stdout - s.cmd.Stderr = s.status - s.cmd.SysProcAttr = LlamaServerSysProcAttr - - // Always filter down the set of GPUs in case there are any unsupported devices that might crash - envWorkarounds := gpus.GetVisibleDevicesEnv() - pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator)) - - // Update or add the path variable with our adjusted version - pathNeeded := true - ollamaPathNeeded := true - envWorkaroundDone := make([]bool, len(envWorkarounds)) - for i := range s.cmd.Env { - cmp := strings.SplitN(s.cmd.Env[i], "=", 2) - if strings.EqualFold(cmp[0], pathEnv) { - s.cmd.Env[i] = pathEnv + "=" + pathEnvVal - pathNeeded = false - } else if strings.EqualFold(cmp[0], "OLLAMA_LIBRARY_PATH") { - s.cmd.Env[i] = "OLLAMA_LIBRARY_PATH=" + strings.Join(ggmlPaths, string(filepath.ListSeparator)) - ollamaPathNeeded = false - } else if len(envWorkarounds) != 0 { - for j, kv := range envWorkarounds { - tmp := strings.SplitN(kv, "=", 2) - if strings.EqualFold(cmp[0], tmp[0]) { - s.cmd.Env[i] = kv - envWorkaroundDone[j] = true - } - } - } - } - if pathNeeded { - s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal) - } - if ollamaPathNeeded { - s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator))) - } - for i, done := range envWorkaroundDone { - if !done { - s.cmd.Env = append(s.cmd.Env, envWorkarounds[i]) - } - } - - slog.Info("starting runner", "cmd", s.cmd) - slog.Debug("subprocess", "", filteredEnv(s.cmd.Env)) - - if err = s.cmd.Start(); err != nil { - var msg string - if s.status != nil && s.status.LastErrMsg != "" { - msg = s.status.LastErrMsg - } - err := fmt.Errorf("error starting runner: %v %s", err, msg) - if len(compatible) == 0 { - if llamaModel != nil { - llama.FreeModel(llamaModel) - } - return nil, err - } - - slog.Warn("unable to start runner with compatible gpu", "error", err, "compatible", compatible) - compatible = compatible[1:] - continue - } - - // reap subprocess when it exits - go func() { - err := s.cmd.Wait() - // Favor a more detailed message over the process exit status - if err != nil && s.status != nil && s.status.LastErrMsg != "" { - slog.Error("llama runner terminated", "error", err) - if strings.Contains(s.status.LastErrMsg, "unknown model") { - s.status.LastErrMsg = "this model is not supported by your version of Ollama. 
You may need to upgrade" - } - s.done <- errors.New(s.status.LastErrMsg) - } else { - s.done <- err - } - }() - - if textProcessor != nil { - return &ollamaServer{llmServer: s}, nil - } else { - return &llamaServer{llmServer: s, ggml: f}, nil + port = 0 + if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil { + var l *net.TCPListener + if l, err = net.ListenTCP("tcp", a); err == nil { + port = l.Addr().(*net.TCPAddr).Port + l.Close() } } + if port == 0 { + slog.Debug("ResolveTCPAddr failed, using random port") + port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range + } + params := []string{"runner"} + if ollamaEngine { + params = append(params, "--ollama-engine") + } + if modelPath != "" { + params = append(params, "--model", modelPath) + } + params = append(params, "--port", strconv.Itoa(port)) + + var pathEnv string + switch runtime.GOOS { + case "windows": + pathEnv = "PATH" + case "darwin": + pathEnv = "DYLD_LIBRARY_PATH" + default: + pathEnv = "LD_LIBRARY_PATH" + } + + // Note: we always put our dependency paths first + // since these are the exact version we compiled/linked against + libraryPaths := append([]string{}, gpuLibs...) + if libraryPath, ok := os.LookupEnv(pathEnv); ok { + libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...) + } + + cmd = exec.Command(exe, params...) + + cmd.Env = os.Environ() + cmd.Stdout = out + cmd.Stderr = out + cmd.SysProcAttr = LlamaServerSysProcAttr + + // Always filter down the set of GPUs in case there are any unsupported devices that might crash + pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator)) + + // Update or add the path variable with our adjusted version + pathNeeded := true + ollamaPathNeeded := true + extraEnvsDone := map[string]bool{} + for k := range extraEnvs { + extraEnvsDone[k] = false + } + for i := range cmd.Env { + cmp := strings.SplitN(cmd.Env[i], "=", 2) + if strings.EqualFold(cmp[0], pathEnv) { + cmd.Env[i] = pathEnv + "=" + pathEnvVal + pathNeeded = false + } else if strings.EqualFold(cmp[0], "OLLAMA_LIBRARY_PATH") { + cmd.Env[i] = "OLLAMA_LIBRARY_PATH=" + strings.Join(gpuLibs, string(filepath.ListSeparator)) + ollamaPathNeeded = false + } else if len(extraEnvs) != 0 { + for k, v := range extraEnvs { + if strings.EqualFold(cmp[0], k) { + cmd.Env[i] = k + "=" + v + extraEnvsDone[k] = true + } + } + } + } + if pathNeeded { + cmd.Env = append(cmd.Env, pathEnv+"="+pathEnvVal) + } + if ollamaPathNeeded { + cmd.Env = append(cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(gpuLibs, string(filepath.ListSeparator))) + } + for k, done := range extraEnvsDone { + if !done { + cmd.Env = append(cmd.Env, k+"="+extraEnvs[k]) + } + } + + slog.Info("starting runner", "cmd", cmd) + slog.Debug("subprocess", "", filteredEnv(cmd.Env)) + + if err = cmd.Start(); err != nil { + return nil, 0, err + } + err = nil + return } func (s *llmServer) ModelPath() string { @@ -497,47 +448,58 @@ type LoadResponse struct { var ErrLoadRequiredFull = errors.New("unable to load full model on GPU") -func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) { - systemInfo := discover.GetSystemInfo() - systemTotalMemory := systemInfo.System.TotalMemory - systemFreeMemory := systemInfo.System.FreeMemory - systemSwapFreeMemory := systemInfo.System.FreeSwap +func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) { + systemTotalMemory := 
systemInfo.TotalMemory + systemFreeMemory := systemInfo.FreeMemory + systemSwapFreeMemory := systemInfo.FreeSwap slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory)) - g := pickBestFullFitByLibrary(s.ggml, s.modelPath, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel) - if g == nil { - if !requireFull { - g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel) - } else { + if len(gpus) == 0 || s.options.NumGPU == 0 { + if !verifyCPUFit(s.ggml, s.modelPath, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, systemInfo, s.numParallel) { slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate) - return nil, ErrLoadRequiredFull + return nil, fmt.Errorf("model requires more system memory than is currently available %w", ErrLoadRequiredFull) } + } else { + g := pickBestFullFitByLibrary(s.ggml, s.modelPath, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel) + if g == nil { + if !requireFull { + g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel) + } else { + slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate) + return nil, ErrLoadRequiredFull + } + } + gpus = g } - gpus = g s.estimate = estimateGPULayers(gpus, s.ggml, []string{s.loadRequest.ProjectorPath}, s.options, s.numParallel) - if len(gpus) > 1 || gpus[0].Library != "cpu" { + if len(gpus) >= 1 { switch { - case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory: + case s.options.NumGPU == 0: + gpus = []ml.DeviceInfo{} + case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.TotalMemory: // disable partial offloading when model is greater than total system memory as this // can lead to locking up the system s.options.NumGPU = 0 + gpus = []ml.DeviceInfo{} case gpus[0].Library != "Metal" && s.estimate.Layers == 0: // Don't bother loading into the GPU if no layers can fit - gpus = discover.GpuInfoList{discover.GetCPUInfo()} - case s.options.NumGPU < 0 && s.estimate.Layers > 0 && gpus[0].Library != "cpu": + gpus = []ml.DeviceInfo{} + case s.options.NumGPU < 0 && s.estimate.Layers > 0: s.options.NumGPU = s.estimate.Layers } + } else { + s.options.NumGPU = 0 } // On linux and windows, over-allocating CPU memory will almost always result in an error // Darwin has fully dynamic swap so has no direct concept of free swap space if runtime.GOOS != "darwin" { systemMemoryRequired := s.estimate.TotalSize - s.estimate.VRAMSize - available := systemInfo.System.FreeMemory + systemInfo.System.FreeSwap + available := systemInfo.FreeMemory + systemInfo.FreeSwap if systemMemoryRequired > available { - slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.System.TotalMemory), "free", format.HumanBytes2(systemInfo.System.FreeMemory), "swap", format.HumanBytes2(systemInfo.System.FreeSwap)) + slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", 
format.HumanBytes2(systemInfo.TotalMemory), "free", format.HumanBytes2(systemInfo.FreeMemory), "swap", format.HumanBytes2(systemInfo.FreeSwap)) return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available)) } } @@ -564,10 +526,10 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi // Windows CUDA should not use mmap for best performance // Linux with a model larger than free space, mmap leads to thrashing // For CPU loads we want the memory to be allocated, not FS cache - if (runtime.GOOS == "windows" && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) || - (runtime.GOOS == "linux" && systemInfo.System.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) || - (gpus[0].Library == "cpu" && s.options.UseMMap == nil) || - (gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) || + if (runtime.GOOS == "windows" && len(gpus) > 0 && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) || + (runtime.GOOS == "linux" && systemInfo.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) || + (len(gpus) == 0 && s.options.UseMMap == nil) || + (len(gpus) > 0 && gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) || (s.options.UseMMap != nil && !*s.options.UseMMap) { s.loadRequest.UseMmap = false } @@ -605,8 +567,8 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi // createGPULayers maps from the tensor splits assigned by the memory estimates to explicit assignment // of particular layers onto GPUs -func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.GpuInfoList, numGPU int) ml.GPULayersList { - if numGPU <= 0 { +func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus []ml.DeviceInfo, numGPU int) ml.GPULayersList { + if numGPU <= 0 || len(gpus) == 0 { return nil } @@ -662,7 +624,7 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu // allowing for faster iteration, but may return less information. 
// // Returns the list of GPU IDs that were used in the final allocation on success -func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) { +func (s *ollamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) { var success bool defer func() { if !success { @@ -675,24 +637,21 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ slog.Info("loading model", "model layers", s.totalLayers, "requested", s.options.NumGPU) - systemInfo := discover.GetSystemInfo() - systemTotalMemory := systemInfo.System.TotalMemory - systemFreeMemory := systemInfo.System.FreeMemory - systemSwapFreeMemory := systemInfo.System.FreeSwap + systemTotalMemory := systemInfo.TotalMemory + systemFreeMemory := systemInfo.FreeMemory + systemSwapFreeMemory := systemInfo.FreeSwap slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory)) - if !(len(gpus) == 1 && gpus[0].Library == "cpu") { - for _, gpu := range gpus { - available := gpu.FreeMemory - envconfig.GpuOverhead() - gpu.MinimumMemory - if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory { - available = 0 - } - slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library, - "available", format.HumanBytes2(available), - "free", format.HumanBytes2(gpu.FreeMemory), - "minimum", format.HumanBytes2(gpu.MinimumMemory), - "overhead", format.HumanBytes2(envconfig.GpuOverhead())) + for _, gpu := range gpus { + available := gpu.FreeMemory - envconfig.GpuOverhead() - gpu.MinimumMemory() + if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory() { + available = 0 } + slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library, + "available", format.HumanBytes2(available), + "free", format.HumanBytes2(gpu.FreeMemory), + "minimum", format.HumanBytes2(gpu.MinimumMemory()), + "overhead", format.HumanBytes2(envconfig.GpuOverhead())) } pastAllocations := make(map[uint64]struct{}) @@ -762,7 +721,6 @@ nextOperation: if err != nil { return nil, err } - slog.Debug("new layout created", "layers", newGPULayers) s.loadRequest.GPULayers = newGPULayers @@ -864,20 +822,27 @@ func uniqueDeviceIDs(gpuLayers ml.GPULayersList) []ml.DeviceID { // - Calculating how much space each GPU has available for layers, based on free memory and space occupied by the graph // - Assigning layers // - Ensuring that we don't exceed limits, such as requirements about partial offloading or system memory -func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs discover.GpuInfoList, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) { - if s.totalLayers == 0 || s.options.NumGPU == 0 || len(systemGPUs) == 0 || (len(systemGPUs) == 1 && systemGPUs[0].Library == "cpu") { - return ml.GPULayersList{}, nil - } - - gpus := append(make(discover.GpuInfoList, 0, len(systemGPUs)), systemGPUs...) 
- sort.Sort(sort.Reverse(discover.ByFreeMemory(gpus))) - +func (s *ollamaServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) { if memory == nil { memory = &ml.BackendMemory{CPU: ml.DeviceMemory{ Weights: make([]uint64, s.totalLayers), Cache: make([]uint64, s.totalLayers), }} } + gpuLayers, layers, err := s.buildLayout(systemGPUs, memory, requireFull, backoff) + if err != nil { + return nil, err + } + err = s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers) + if err != nil { + return nil, err + } + return gpuLayers, nil +} + +func (s *ollamaServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, []uint64, error) { + gpus := append(make([]ml.DeviceInfo, 0, len(systemGPUs)), systemGPUs...) + sort.Sort(sort.Reverse(ml.ByFreeMemory(gpus))) layers := make([]uint64, len(memory.CPU.Weights)) for i := range layers { @@ -891,7 +856,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d } gpuLayers := ml.GPULayersList{} - for _, gl := range gpus.ByLibrary() { + for _, gl := range ml.ByLibrary(gpus) { // If a GPU already has a graph allocated on it, then we should continue to use it. // Otherwise, we lose information that we got from previous allocations, which can // cause cycling. Plus, we get more information about required allocation from each @@ -905,7 +870,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d lastUsedGPU = i } - reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory + envconfig.GpuOverhead() + memory.GPUs[j].Graph + reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory() + envconfig.GpuOverhead() + memory.GPUs[j].Graph if gl[i].FreeMemory > reserved { gl[i].FreeMemory -= reserved } else { @@ -914,7 +879,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d slog.Debug("available gpu", "id", gl[i].ID, "library", gl[i].Library, "available layer vram", format.HumanBytes2(gl[i].FreeMemory), - "backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory), + "backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory()), "overhead", format.HumanBytes2(envconfig.GpuOverhead()), "graph", format.HumanBytes2(memory.GPUs[j].Graph)) @@ -933,7 +898,11 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d gpuLayers = libraryGpuLayers } } + return gpuLayers, layers, nil +} +// verifyLayout ensures that we don't exceed limits, such as requirements about partial offloading or system memory +func (s *ollamaServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error { // These sizes will only increase as we go through additional iterations and get additional information. 
cpuSize := memory.InputWeights + memory.CPU.Graph var vramSize uint64 @@ -961,24 +930,24 @@ nextLayer: if requireFull { if gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) { - return nil, ErrLoadRequiredFull + return ErrLoadRequiredFull } - if cpuSize > systemInfo.System.FreeMemory { - return nil, ErrLoadRequiredFull + if cpuSize > systemInfo.FreeMemory { + return ErrLoadRequiredFull } } // On linux and windows, over-allocating CPU memory will almost always result in an error // Darwin has fully dynamic swap so has no direct concept of free swap space if runtime.GOOS != "darwin" { - available := systemInfo.System.FreeMemory + systemInfo.System.FreeSwap + available := systemInfo.FreeMemory + systemInfo.FreeSwap if cpuSize > available { - slog.Warn("model request too large for system", "requested", format.HumanBytes2(cpuSize), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.System.TotalMemory), "free", format.HumanBytes2(systemInfo.System.FreeMemory), "swap", format.HumanBytes2(systemInfo.System.FreeSwap)) - return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(cpuSize), format.HumanBytes2(available)) + slog.Warn("model request too large for system", "requested", format.HumanBytes2(cpuSize), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.TotalMemory), "free", format.HumanBytes2(systemInfo.FreeMemory), "swap", format.HumanBytes2(systemInfo.FreeSwap)) + return fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(cpuSize), format.HumanBytes2(available)) } } else { - if vramSize > systemInfo.System.TotalMemory { + if vramSize > systemInfo.TotalMemory { // disable partial offloading when model is greater than total system memory as this // can lead to locking up the system s.options.NumGPU = 0 @@ -990,11 +959,11 @@ nextLayer: slog.Debug("insufficient VRAM to load any model layers") } - return gpuLayers, nil + return nil } // assignLayers packs the maximum number of layers onto the smallest set of GPUs and comes up with a layer assignment -func assignLayers(layers []uint64, gpus discover.GpuInfoList, requireFull bool, requestedLayers int, lastUsedGPU int) (gpuLayers ml.GPULayersList) { +func assignLayers(layers []uint64, gpus []ml.DeviceInfo, requireFull bool, requestedLayers int, lastUsedGPU int) (gpuLayers ml.GPULayersList) { // If we can't fit everything then prefer offloading layers other than the output layer for range 2 { // requestedLayers may be -1 if nothing was requested @@ -1028,7 +997,7 @@ func assignLayers(layers []uint64, gpus discover.GpuInfoList, requireFull bool, // findBestFit binary searches to find the smallest capacity factor that can fit // the max number of layers. The capacity factor is multiplied by the free space on // each GPU and a small one will force even balancing. 
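Reviewer note (not part of the patch): as the comment above describes and findBestFit just below implements, layer placement binary-searches for the smallest per-GPU capacity factor that still fits the maximum number of layers. A simplified sketch of that search pattern under the assumption that the fit function is monotonically non-decreasing in the capacity factor (the real findBestFit also handles requested layer counts and requireFull):

```go
package main

import "fmt"

// bestCapacity finds the smallest capacity factor in (0,1] that still places
// the maximum number of layers. fit reports how many layers a factor can place.
func bestCapacity(fit func(capacity float32) int) (float32, int) {
	var low, high float32 = 0, 1
	best := high
	maxLayers := fit(high)
	for high-low > 0.001 {
		mid := (low + high) / 2
		if fit(mid) == maxLayers {
			best = mid
			high = mid // still fits everything; try squeezing harder
		} else {
			low = mid // too tight; relax
		}
	}
	return best, maxLayers
}

func main() {
	// Hypothetical: 10 layers of 1 GiB each across two GPUs with 8 GiB free apiece.
	layers, free := 10, []float32{8, 8}
	fit := func(c float32) int {
		placed := 0
		for _, f := range free {
			placed += int(f * c) // whole 1 GiB layers that fit at this capacity
		}
		if placed > layers {
			placed = layers
		}
		return placed
	}
	fmt.Println(bestCapacity(fit))
}
```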
-func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int, forceRequest bool) (gpuLayers ml.GPULayersList) { +func findBestFit(layers []uint64, gpus []ml.DeviceInfo, requestedLayers int, forceRequest bool) (gpuLayers ml.GPULayersList) { var high float32 = 1 var low float32 = 0 @@ -1053,12 +1022,11 @@ func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int low = mid } } - return bestAssignments } // greedyFit assigns layers incrementally to GPUs, spilling over as each runs out of free space -func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) { +func greedyFit(layers []uint64, gpus []ml.DeviceInfo, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) { device := len(gpus) - 1 gpuLayers = ml.GPULayersList{{DeviceID: gpus[device].DeviceID}} freeSpace := uint64(float32(gpus[device].FreeMemory) * capacity) @@ -1082,7 +1050,6 @@ func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, req freeSpace = uint64(float32(gpus[device].FreeMemory) * capacity) } } - return gpuLayers } @@ -1814,7 +1781,7 @@ func (s *ollamaServer) VRAMByGPU(id ml.DeviceID) uint64 { } func (s *ollamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo { - devices, err := discover.GetDevicesFromRunner(ctx, s) + devices, err := ml.GetDevicesFromRunner(ctx, s) if err != nil { if s.cmd != nil && s.cmd.ProcessState == nil { // Still running but hit an error, log diff --git a/llm/server_test.go b/llm/server_test.go index bdedc960ef..2d3bf6bec3 100644 --- a/llm/server_test.go +++ b/llm/server_test.go @@ -8,7 +8,6 @@ import ( "testing" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/discover" "github.com/ollama/ollama/format" "github.com/ollama/ollama/ml" "golang.org/x/sync/semaphore" @@ -20,6 +19,8 @@ func TestLLMServerFitGPU(t *testing.T) { free int } + minMemory := 457 * format.MebiByte + tests := []struct { name string gpus []gpu @@ -37,91 +38,91 @@ func TestLLMServerFitGPU(t *testing.T) { }, { name: "Full single GPU", - gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}}, + gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}}, layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, numGPU: -1, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2}}}, }, { name: "Partial single GPU", - gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}}, + gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}}, layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte}, numGPU: -1, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}}, }, { name: "Single GPU with numGPU 1", - gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}}, + gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}}, layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, numGPU: 1, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}}, }, { name: "Single GPU with numGPU 0", - gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}}, + gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}}, layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, numGPU: 0, expected: ml.GPULayersList{}, }, { name: "Single GPU with 
numGPU 999", - gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}}, + gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}}, layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte}, numGPU: 999, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{0, 1, 2, 3}}}, }, { name: "Multi GPU fits on one", - gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}}, + gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}}, layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, numGPU: -1, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1, 2}}}, }, { name: "Multi GPU split", - gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}}, + gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}}, layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, numGPU: -1, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1, 2}}}, }, { name: "Multi GPU partial", - gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}}, + gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}}, layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte}, numGPU: -1, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}}, }, { name: "Multi GPU numGPU 1", - gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}}, + gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}}, layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, numGPU: 1, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{1}}}, }, { name: "Multi GPU numGPU 2", - gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}}, + gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}}, layers: []int{256 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte}, numGPU: 2, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: []int{1}}}, }, { name: "Multi GPU numGPU 999", - gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{ID: "gpu1"}, free: 256 * format.MebiByte}}, + gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{ID: "gpu1"}, free: 256*format.MebiByte + minMemory}}, layers: []int{256 * format.MebiByte, 256 * format.MebiByte, 50 * format.MebiByte}, numGPU: 999, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1"}, Layers: []int{0, 1}}, {DeviceID: ml.DeviceID{ID: "gpu0"}, Layers: 
[]int{2}}}, }, { name: "Multi GPU different libraries", - gpus: []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128 * format.MebiByte}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256 * format.MebiByte}}, + gpus: []gpu{{id: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, free: 128*format.MebiByte + minMemory}, {id: ml.DeviceID{Library: "ROCm", ID: "gpu1"}, free: 256*format.MebiByte + minMemory}}, layers: []int{128 * format.MebiByte, 128 * format.MebiByte, 50 * format.MebiByte}, numGPU: -1, expected: ml.GPULayersList{{DeviceID: ml.DeviceID{ID: "gpu1", Library: "ROCm"}, Layers: []int{0, 1}}}, }, { name: "requireFull", - gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}}, + gpus: []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256*format.MebiByte + minMemory}}, layers: []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte}, numGPU: -1, requireFull: true, @@ -139,12 +140,12 @@ func TestLLMServerFitGPU(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - var systemInfo discover.SystemInfo - systemInfo.System.TotalMemory = format.GibiByte - systemInfo.System.FreeMemory = 512 * format.MebiByte - systemInfo.System.FreeSwap = 256 * format.MebiByte + var systemInfo ml.SystemInfo + systemInfo.TotalMemory = format.GibiByte + systemInfo.FreeMemory = 512 * format.MebiByte + systemInfo.FreeSwap = 256 * format.MebiByte - gpus := make(discover.GpuInfoList, len(tt.gpus)) + gpus := make([]ml.DeviceInfo, len(tt.gpus)) for i := range tt.gpus { gpus[i].DeviceID = tt.gpus[i].id gpus[i].FreeMemory = uint64(tt.gpus[i].free) diff --git a/ml/device.go b/ml/device.go index 6569d87bb4..39fba7d17d 100644 --- a/ml/device.go +++ b/ml/device.go @@ -3,15 +3,21 @@ package ml import ( "context" "encoding/binary" + "encoding/json" "fmt" "hash/maphash" + "io" "log/slog" + "net/http" + "runtime" "slices" "sort" "strconv" "strings" + "time" "github.com/ollama/ollama/format" + "github.com/ollama/ollama/logutil" ) // GPULayers is a set of layers to be allocated on a single GPU @@ -282,6 +288,20 @@ type DeviceInfo struct { LibraryPath []string } +type SystemInfo struct { + // ThreadCount is the optimal number of threads to use for inference + ThreadCount int `json:"threads,omitempty"` + + // TotalMemory is the total amount of system memory + TotalMemory uint64 `json:"total_memory,omitempty"` + + // FreeMemory is the amount of memory currently available on the system for loading models + FreeMemory uint64 `json:"free_memory,omitempty"` + + // FreeSwap is the amount of system swap space reported as available + FreeSwap uint64 `json:"free_swap,omitempty"` +} + func (d DeviceInfo) Compute() string { // AMD gfx is encoded into the major minor in hex form if strings.EqualFold(d.Library, "ROCm") { @@ -294,6 +314,71 @@ func (d DeviceInfo) Driver() string { return strconv.Itoa(d.DriverMajor) + "." + strconv.Itoa(d.DriverMinor) } +// MinimumMemory reports the amount of memory that should be set aside +// on the device for overhead (e.g. VRAM consumed by context structures independent +// of model allocations) +func (d DeviceInfo) MinimumMemory() uint64 { + if d.Library == "Metal" { + return 512 * format.MebiByte + } + return 457 * format.MebiByte +} + +// Sort by Free Space. 
+// iGPUs are reported first, thus Reverse() yields the largest discrete GPU first +type ByFreeMemory []DeviceInfo + +func (a ByFreeMemory) Len() int { return len(a) } +func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] } +func (a ByFreeMemory) Less(i, j int) bool { + if a[i].Integrated && !a[j].Integrated { + return true + } else if !a[i].Integrated && a[j].Integrated { + return false + } + return a[i].FreeMemory < a[j].FreeMemory +} + +func ByLibrary(l []DeviceInfo) [][]DeviceInfo { + resp := [][]DeviceInfo{} + libs := []string{} + for _, info := range l { + found := false + requested := info.Library + for i, lib := range libs { + if lib == requested { + resp[i] = append(resp[i], info) + found = true + break + } + } + if !found { + libs = append(libs, requested) + resp = append(resp, []DeviceInfo{info}) + } + } + return resp +} + +func LibraryPaths(l []DeviceInfo) []string { + var gpuLibs []string + for _, gpu := range l { + for _, dir := range gpu.LibraryPath { + needed := true + for _, existing := range gpuLibs { + if dir == existing { + needed = false + break + } + } + if needed { + gpuLibs = append(gpuLibs, dir) + } + } + } + return gpuLibs +} + type DeviceComparison int const ( @@ -336,3 +421,133 @@ func (a DeviceInfo) IsBetter(b DeviceInfo) bool { sort.Sort(sort.Reverse(sort.StringSlice(cmp))) return cmp[0] == bLibSplit[1] } + +// For each GPU, check if it does NOT support flash attention +func FlashAttentionSupported(l []DeviceInfo) bool { + for _, gpu := range l { + supportsFA := gpu.Library == "cpu" || + gpu.Name == "Metal" || gpu.Library == "Metal" || + (gpu.Library == "CUDA" && gpu.DriverMajor >= 7 && !(gpu.ComputeMajor == 7 && gpu.ComputeMinor == 2)) || + gpu.Library == "ROCm" + + if !supportsFA { + return false + } + } + return true +} + +// Given the list of GPUs this instantiation is targeted for, +// figure out the visible devices environment variables +func GetVisibleDevicesEnv(l []DeviceInfo) map[string]string { + if len(l) == 0 { + return nil + } + env := map[string]string{} + for _, d := range l { + d.updateVisibleDevicesEnv(env) + } + return env +} + +func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string) { + var envVar string + switch d.Library { + case "ROCm": + envVar = "ROCR_VISIBLE_DEVICES" + if runtime.GOOS != "linux" { + envVar = "HIP_VISIBLE_DEVICES" + } + case "Vulkan": + envVar = "GGML_VK_VISIBLE_DEVICES" + default: + return + } + v, existing := env[envVar] + if existing { + v = v + "," + } + if d.FilteredID != "" { + v = v + d.FilteredID + } else { + v = v + d.ID + } + env[envVar] = v +} + +type BaseRunner interface { + // GetPort returns the localhost port number the runner is running on + GetPort() int + + // HasExited indicates if the runner is no longer running. This can be used during + // bootstrap to detect if a given filtered device is incompatible and triggered an assert + HasExited() bool +} + +type RunnerDiscovery interface { + BaseRunner + + // GetDeviceInfos will perform a query of the underlying device libraries + // for device identification and free VRAM information + // During bootstrap scenarios, this routine may take seconds to complete + GetDeviceInfos(ctx context.Context) []DeviceInfo +} + +type FilteredRunnerDiscovery interface { + RunnerDiscovery + + // GetActiveDeviceIDs returns the filtered set of devices actively in + // use by this runner for running models. If the runner is a bootstrap runner, no devices + // will be active yet so no device IDs are returned. 
+ // This routine will not query the underlying device and will return immediately + GetActiveDeviceIDs() []DeviceID +} + +func GetDevicesFromRunner(ctx context.Context, runner BaseRunner) ([]DeviceInfo, error) { + var moreDevices []DeviceInfo + port := runner.GetPort() + tick := time.Tick(10 * time.Millisecond) + for { + select { + case <-ctx.Done(): + return nil, fmt.Errorf("failed to finish discovery before timeout") + case <-tick: + r, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/info", port), nil) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + r.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(r) + if err != nil { + // slog.Warn("failed to send request", "error", err) + if runner.HasExited() { + return nil, fmt.Errorf("runner crashed") + } + continue + } + defer resp.Body.Close() + + if resp.StatusCode == http.StatusNotFound { + // old runner, fall back to bootstrapping model + return nil, fmt.Errorf("llamarunner free vram reporting not supported") + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + slog.Warn("failed to read response", "error", err) + continue + } + if resp.StatusCode != 200 { + logutil.Trace("runner failed to discover free VRAM", "status", resp.StatusCode, "response", body) + return nil, fmt.Errorf("runner error: %s", string(body)) + } + + if err := json.Unmarshal(body, &moreDevices); err != nil { + slog.Warn("unmarshal encode response", "error", err) + continue + } + return moreDevices, nil + } + } +} diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index f1cd3fea26..548545cb7e 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -84,11 +84,11 @@ function buildCPU() { Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}" New-Item "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ItemType Directory -ea 0 - & cmake --fresh --preset CPU --install-prefix $script:DIST_DIR + & cmake -B build\cpu --preset CPU --install-prefix $script:DIST_DIR if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - & cmake --build --preset CPU --config Release --parallel $script:JOBS + & cmake --build build\cpu --target ggml-cpu --config Release --parallel $script:JOBS if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - & cmake --install build --component CPU --strip + & cmake --install build\cpu --component CPU --strip if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } } @@ -105,11 +105,11 @@ function buildCUDA11() { $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }} write-host "Building CUDA v11 backend libraries $cuda" $env:CUDAToolkit_ROOT=$cuda - & cmake --fresh --preset "CUDA 11" -T cuda="$cuda" -DCMAKE_CUDA_COMPILER="$cuda\bin\nvcc.exe" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v11" + & cmake -B build\cuda_v11 --preset "CUDA 11" -T cuda="$cuda" -DCMAKE_CUDA_COMPILER="$cuda\bin\nvcc.exe" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v11" if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - & cmake --build --preset "CUDA 11" --config Release --parallel $script:JOBS + & cmake --build build\cuda_v11 --target ggml-cuda --config Release --parallel $script:JOBS if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - & cmake --install build --component "CUDA" --strip + & cmake --install build\cuda_v11 --component "CUDA" 
--strip if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } } @@ -124,11 +124,11 @@ function buildCUDA12() { $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12_8")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }} write-host "Building CUDA v12 backend libraries $cuda" $env:CUDAToolkit_ROOT=$cuda - & cmake --fresh --preset "CUDA 12" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v12" + & cmake -B build\cuda_v12 --preset "CUDA 12" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v12" if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - & cmake --build --preset "CUDA 12" --config Release --parallel $script:JOBS + & cmake --build build\cuda_v12 --target ggml-cuda --config Release --parallel $script:JOBS if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - & cmake --install build --component "CUDA" --strip + & cmake --install build\cuda_v12 --component "CUDA" --strip if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } } @@ -143,11 +143,11 @@ function buildCUDA13() { $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V13")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }} $env:CUDAToolkit_ROOT=$cuda write-host "Building CUDA v13 backend libraries $cuda" - & cmake --fresh --preset "CUDA 13" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v13" + & cmake -B build\cuda_v13 --preset "CUDA 13" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v13" if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - & cmake --build --preset "CUDA 13" --config Release --parallel $script:JOBS + & cmake --build build\cuda_v13 --target ggml-cuda --config Release --parallel $script:JOBS if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - & cmake --install build --component "CUDA" --strip + & cmake --install build\cuda_v13 --component "CUDA" --strip if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } } @@ -165,7 +165,7 @@ function buildROCm() { $env:HIPCXX="${env:HIP_PATH}\bin\clang++.exe" $env:HIP_PLATFORM="amd" $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - & cmake --fresh --preset "ROCm 6" -G Ninja -DOLLAMA_RUNNER_DIR="rocm" ` + & cmake --fresh -B build\rocm --preset "ROCm 6" -G Ninja -DOLLAMA_RUNNER_DIR="rocm" ` -DCMAKE_C_COMPILER=clang ` -DCMAKE_CXX_COMPILER=clang++ ` -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" ` @@ -175,9 +175,9 @@ function buildROCm() { $env:HIPCXX="" $env:HIP_PLATFORM="" $env:CMAKE_PREFIX_PATH="" - & cmake --build --preset "ROCm 6" --config Release --parallel $script:JOBS + & cmake --build build\rocm --target ggml-hip --config Release --parallel $script:JOBS if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - & cmake --install build --component "HIP" --strip + & cmake --install build\rocm --component "HIP" --strip if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} Remove-Item -Path $script:DIST_DIR\lib\ollama\rocm\rocblas\library\*gfx906* -ErrorAction SilentlyContinue } diff --git a/server/routes_debug_test.go b/server/routes_debug_test.go index 466951a1d5..bf822c68b6 100644 --- a/server/routes_debug_test.go +++ b/server/routes_debug_test.go @@ -9,9 +9,9 @@ import ( "github.com/gin-gonic/gin" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/discover" "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/ml" ) func TestGenerateDebugRenderOnly(t *testing.T) { @@ -37,9 +37,9 @@ func TestGenerateDebugRenderOnly(t *testing.T) { loaded: make(map[string]*runnerRef), 
newServerFn: newMockServer(&mock), getGpuFn: getGpuFn, - getCpuFn: getCpuFn, + getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { // add small delay to simulate loading time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ @@ -230,9 +230,9 @@ func TestChatDebugRenderOnly(t *testing.T) { loaded: make(map[string]*runnerRef), newServerFn: newMockServer(&mock), getGpuFn: getGpuFn, - getCpuFn: getCpuFn, + getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { // add small delay to simulate loading time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ diff --git a/server/routes_generate_renderer_test.go b/server/routes_generate_renderer_test.go index ea18b1e55a..e6473e0876 100644 --- a/server/routes_generate_renderer_test.go +++ b/server/routes_generate_renderer_test.go @@ -12,9 +12,9 @@ import ( "github.com/google/go-cmp/cmp" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/discover" "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/ml" ) // TestGenerateWithBuiltinRenderer tests that api/generate uses built-in renderers @@ -42,9 +42,9 @@ func TestGenerateWithBuiltinRenderer(t *testing.T) { loaded: make(map[string]*runnerRef), newServerFn: newMockServer(&mock), getGpuFn: getGpuFn, - getCpuFn: getCpuFn, + getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ llama: &mock, @@ -226,9 +226,9 @@ func TestGenerateWithDebugRenderOnly(t *testing.T) { loaded: make(map[string]*runnerRef), newServerFn: newMockServer(&mock), getGpuFn: getGpuFn, - getCpuFn: getCpuFn, + getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ llama: &mock, diff --git a/server/routes_generate_test.go b/server/routes_generate_test.go index bcd774fd11..4c6b934b3e 100644 --- a/server/routes_generate_test.go +++ b/server/routes_generate_test.go @@ -17,9 +17,9 @@ import ( "github.com/google/go-cmp/cmp" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/discover" "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/ml" ) type mockRunner struct { @@ -48,8 +48,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error return } -func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) { - return func(_ discover.GpuInfoList, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) { +func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) { + return func(_ ml.SystemInfo, _ 
[]ml.DeviceInfo, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) { return mock, nil } } @@ -157,9 +157,9 @@ func TestGenerateChat(t *testing.T) { loaded: make(map[string]*runnerRef), newServerFn: newMockServer(&mock), getGpuFn: getGpuFn, - getCpuFn: getCpuFn, + getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { // add small delay to simulate loading time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ @@ -768,9 +768,9 @@ func TestGenerate(t *testing.T) { loaded: make(map[string]*runnerRef), newServerFn: newMockServer(&mock), getGpuFn: getGpuFn, - getCpuFn: getCpuFn, + getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { // add small delay to simulate loading time.Sleep(time.Millisecond) req.successCh <- &runnerRef{ @@ -1193,9 +1193,9 @@ func TestChatWithPromptEndingInThinkTag(t *testing.T) { loaded: make(map[string]*runnerRef), newServerFn: newMockServer(mock), getGpuFn: getGpuFn, - getCpuFn: getCpuFn, + getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { time.Sleep(time.Millisecond) req.successCh <- &runnerRef{llama: mock} return false diff --git a/server/routes_harmony_streaming_test.go b/server/routes_harmony_streaming_test.go index caf2cf6d5b..8e58ad9629 100644 --- a/server/routes_harmony_streaming_test.go +++ b/server/routes_harmony_streaming_test.go @@ -14,9 +14,9 @@ import ( "github.com/gin-gonic/gin" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/discover" "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llm" + "github.com/ollama/ollama/ml" ) func getTestTools() []api.Tool { @@ -275,9 +275,9 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) { loaded: make(map[string]*runnerRef), newServerFn: newMockServer(&mock), getGpuFn: getGpuFn, - getCpuFn: getCpuFn, + getSystemInfoFn: getSystemInfoFn, waitForRecovery: 100 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { req.successCh <- &runnerRef{ llama: &mock, } @@ -426,9 +426,9 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) { loaded: make(map[string]*runnerRef), newServerFn: newMockServer(&mock), getGpuFn: getGpuFn, - getCpuFn: getCpuFn, + getSystemInfoFn: getSystemInfoFn, waitForRecovery: 100 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { + loadFn: func(req *LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { req.successCh <- &runnerRef{ llama: &mock, } @@ -608,9 +608,9 @@ func TestChatHarmonyParserStreaming(t *testing.T) { loaded: make(map[string]*runnerRef), newServerFn: newMockServer(&mock), getGpuFn: getGpuFn, - getCpuFn: getCpuFn, + getSystemInfoFn: getSystemInfoFn, waitForRecovery: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { + loadFn: func(req 
*LlmRequest, _ *ggml.GGML, _ ml.SystemInfo, _ []ml.DeviceInfo, _ bool) bool { req.successCh <- &runnerRef{ llama: &mock, } diff --git a/server/sched.go b/server/sched.go index 7c63995310..e262d26fd0 100644 --- a/server/sched.go +++ b/server/sched.go @@ -5,12 +5,9 @@ import ( "errors" "fmt" "log/slog" - "os" "reflect" - "runtime" "slices" "sort" - "strconv" "strings" "sync" "time" @@ -52,12 +49,10 @@ type Scheduler struct { activeLoading llm.LlamaServer loaded map[string]*runnerRef - loadFn func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool - newServerFn func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) - getGpuFn func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList - getCpuFn func() discover.GpuInfo - - // waitForRecovery sets the limit for how long to wait for memory usage to recover after unload before scheduling the next model + loadFn func(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool + newServerFn func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) + getGpuFn func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo + getSystemInfoFn func() ml.SystemInfo waitForRecovery time.Duration } @@ -77,8 +72,8 @@ func InitScheduler(ctx context.Context) *Scheduler { unloadedCh: make(chan any, maxQueue), loaded: make(map[string]*runnerRef), newServerFn: llm.NewLlamaServer, - getGpuFn: discover.GetGPUInfo, - getCpuFn: discover.GetCPUInfo, + getGpuFn: discover.GPUDevices, + getSystemInfoFn: discover.GetSystemInfo, waitForRecovery: 5 * time.Second, } sched.loadFn = sched.load @@ -133,6 +128,8 @@ func (s *Scheduler) Run(ctx context.Context) { } func (s *Scheduler) processPending(ctx context.Context) { + maxRunners := envconfig.MaxRunners() + for { select { case <-ctx.Done(): @@ -152,7 +149,7 @@ func (s *Scheduler) processPending(ctx context.Context) { s.loadedMu.Lock() runner := s.loaded[pending.model.ModelPath] loadedCount := len(s.loaded) - runnersSnapshot := make([]discover.FilteredRunnerDiscovery, 0, len(s.loaded)) + runnersSnapshot := make([]ml.FilteredRunnerDiscovery, 0, len(s.loaded)) for _, r := range s.loaded { runnersSnapshot = append(runnersSnapshot, r) } @@ -167,39 +164,29 @@ func (s *Scheduler) processPending(ctx context.Context) { pending.useLoadedRunner(runner, s.finishedReqCh) break } - } else if envconfig.MaxRunners() > 0 && loadedCount >= int(envconfig.MaxRunners()) { + } else if maxRunners > 0 && loadedCount >= int(maxRunners) { slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount) runnerToExpire = s.findRunnerToUnload() } else { // Either no models are loaded or below envconfig.MaxRunners // Get a refreshed GPU list - var gpus discover.GpuInfoList + var gpus []ml.DeviceInfo if pending.opts.NumGPU == 0 { - gpus = discover.GpuInfoList{s.getCpuFn()} + gpus = []ml.DeviceInfo{} } else { gpus = s.getGpuFn(ctx, runnersSnapshot) } - - if envconfig.MaxRunners() <= 0 { - // No user specified MaxRunners, so figure out what automatic setting to use - // If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs - // if any GPU has unreliable free memory reporting, 1x the number of GPUs - allReliable := true - for _, gpu := range gpus { - if 
gpu.UnreliableFreeMemory { - allReliable = false - break - } - } - if allReliable { - // HACK - os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(defaultModelsPerGPU*len(gpus))) - slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners(), "gpu_count", len(gpus)) + systemInfo := s.getSystemInfoFn() + if maxRunners <= 0 { + // No user specified MaxRunners, so figure out what automatic setting to use for the next load attempt + if pending.opts.NumGPU == 0 { + // Need to get actual GPU list to set the correct default max models + g := s.getGpuFn(ctx, runnersSnapshot) + maxRunners = uint(defaultModelsPerGPU * max(len(g), 1)) } else { - // HACK - os.Setenv("OLLAMA_MAX_LOADED_MODELS", strconv.Itoa(len(gpus))) - slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency") + maxRunners = uint(defaultModelsPerGPU * max(len(gpus), 1)) } + slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "gpu_count", len(gpus)) } // Load model for fitting @@ -215,14 +202,14 @@ func (s *Scheduler) processPending(ctx context.Context) { if loadedCount == 0 { // No models loaded. Load the model but prefer the best fit. slog.Debug("loading first model", "model", pending.model.ModelPath) - s.loadFn(pending, ggml, gpus, false) + s.loadFn(pending, ggml, systemInfo, gpus, false) break } // More than one loaded model, so we have to see if the // new one fits - needEvict := s.loadFn(pending, ggml, gpus, true) + needEvict := s.loadFn(pending, ggml, systemInfo, gpus, true) if !needEvict { slog.Debug("new model fits with existing models, loading") break @@ -353,7 +340,7 @@ func (s *Scheduler) processCompleted(ctx context.Context) { runner.refMu.Unlock() } else { slog.Debug("starting background wait for VRAM recovery", "runner", runner) - runnersSnapshot := make([]discover.FilteredRunnerDiscovery, 0, len(s.loaded)) + runnersSnapshot := make([]ml.FilteredRunnerDiscovery, 0, len(s.loaded)) for _, r := range s.loaded { runnersSnapshot = append(runnersSnapshot, r) } @@ -395,7 +382,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm // load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs // (if any). Returns whether the scheduler needs to evict a model to make this one fit. 
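// Editorial sketch (not part of the patch): the doc comment above describes the
// contract that load() returns true when an already-loaded model must be evicted
// before the pending one can fit. tryLoad below is a stand-in for that contract,
// not the real Scheduler.load; the VRAM numbers are illustrative only.
package main

import "fmt"

// tryLoad mirrors the documented behaviour: when requireFull is set and the
// model does not fit in the available VRAM, signal that the scheduler should
// evict a runner instead of loading partially.
func tryLoad(freeVRAM, modelVRAM uint64, requireFull bool) (needEvict bool) {
	if modelVRAM <= freeVRAM {
		return false // fits fully, nothing to evict
	}
	return requireFull // a partial/CPU load is acceptable only when requireFull is false
}

func main() {
	// First model: nothing else is loaded, so the scheduler passes requireFull=false.
	fmt.Println(tryLoad(8, 12, false)) // false
	// Later models: must fit fully, otherwise an existing runner is unloaded first.
	fmt.Println(tryLoad(8, 12, true)) // true
}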
-func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool { +func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool { numParallel := max(int(envconfig.NumParallel()), 1) // Embedding models should always be loaded with parallel=1 @@ -420,7 +407,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis if llama == nil { var err error - llama, err = s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel) + llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel) if err != nil { // some older models are not compatible with newer versions of llama.cpp // show a generalized compatibility error until there is a better way to @@ -443,9 +430,16 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis s.loadedMu.Unlock() - gpuIDs, err := llama.Load(req.ctx, gpus, requireFull) + gpuIDs, err := llama.Load(req.ctx, systemInfo, gpus, requireFull) if err != nil { if errors.Is(err, llm.ErrLoadRequiredFull) { + if !requireFull { + // No other models loaded, yet we still don't fit, so report an error + slog.Info("model is too large for system memory", "requireFull", requireFull) + s.activeLoading.Close() + s.activeLoading = nil + req.errCh <- err + } return true } @@ -456,6 +450,20 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis return false } + // Determine if we have discrete GPUs which we should monitor VRAM usage on during shutdown + discreteGPUs := false +iGPUScan: + for _, devid := range gpuIDs { + for _, dev := range gpus { + if dev.DeviceID == devid { + if !dev.Integrated { + discreteGPUs = true + break iGPUScan + } + } + } + } + runner := &runnerRef{ model: req.model, modelPath: req.model.ModelPath, @@ -463,6 +471,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis Options: &req.opts, sessionDuration: sessionDuration, gpus: gpuIDs, + discreteGPUs: discreteGPUs, vramSize: llama.VRAMSize(), totalSize: llama.TotalSize(), loading: true, @@ -510,7 +519,10 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis return false } -func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) { +func (s *Scheduler) updateFreeSpace(allGpus []ml.DeviceInfo) { + if len(allGpus) == 0 { + return + } predMap := map[ml.DeviceID]uint64{} // Sum up the total predicted usage per GPU for all runners s.loadedMu.Lock() runners := make([]*runnerRef, 0, len(s.loaded)) @@ -554,12 +566,13 @@ type runnerRef struct { refMu sync.Mutex refCount uint // prevent unloading if > 0 - llama llm.LlamaServer - pid int - loading bool // True only during initial load, then false forever - gpus []ml.DeviceID // Recorded at time of provisioning - vramSize uint64 - totalSize uint64 + llama llm.LlamaServer + pid int + loading bool // True only during initial load, then false forever + gpus []ml.DeviceID // Recorded at time of provisioning + discreteGPUs bool // True if all devices are discrete GPUs - used to skip VRAM recovery check for iGPUs + vramSize uint64 + totalSize uint64 sessionDuration time.Duration expireTimer *time.Timer @@ -627,14 +640,12 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool // a before and after GPU memory allocation. 
The returned channel // will be notified when we're done waiting, or have timed out and should // proceed anyway -func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []discover.FilteredRunnerDiscovery) chan any { +func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []ml.FilteredRunnerDiscovery) chan any { finished := make(chan any, 1) - // CPU or Metal don't need checking, so no waiting required - // windows can page VRAM, only cuda currently can report accurate used vram usage - if len(runner.gpus) == 0 || - (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "Metal")) || - (runtime.GOOS == "windows" && runner.gpus[0].Library != "CUDA") { + // CPU, Metal and iGPUs don't need checking, so no waiting required + if len(runner.gpus) == 0 || !runner.discreteGPUs || + (len(runner.gpus) == 1 && runner.gpus[0].Library == "Metal") { finished <- struct{}{} slog.Debug("no need to wait for VRAM recovery", "runner", runner) return finished @@ -668,7 +679,11 @@ func (s *Scheduler) waitForVRAMRecovery(runner *runnerRef, runners []discover.Fi totalMemoryNow += gpu.TotalMemory freeMemoryNow += gpu.FreeMemory } - logutil.Trace("gpu VRAM convergence", "percent", int(max(float32(freeMemoryNow-freeMemoryBefore), 0.0)/float32(runner.vramSize)*100)) + if freeMemoryNow > freeMemoryBefore { + logutil.Trace("gpu VRAM convergence", "percent", int(float32(freeMemoryNow-freeMemoryBefore)/float32(runner.vramSize)*100)) + } else { + logutil.Trace("gpu VRAM convergence", "percent", 0) + } // If we're within ~75% of the estimated memory usage recovered, bail out if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.vramSize)*0.75 { slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "free_before", format.HumanBytes2(freeMemoryBefore), "free_now", format.HumanBytes2(freeMemoryNow), "runner", runner) diff --git a/server/sched_test.go b/server/sched_test.go index 66d43338e3..316a817fe2 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -13,7 +13,6 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/app/lifecycle" - "github.com/ollama/ollama/discover" "github.com/ollama/ollama/format" "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/llm" @@ -50,11 +49,12 @@ func TestSchedLoad(t *testing.T) { sessionDuration: &api.Duration{Duration: 2 * time.Second}, } // Fail to load model first - s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { return nil, errors.New("something failed to load model blah") } - gpus := discover.GpuInfoList{} - s.load(req, f, gpus, false) + gpus := []ml.DeviceInfo{} + systemInfo := ml.SystemInfo{} + s.load(req, f, systemInfo, gpus, false) require.Empty(t, req.successCh) require.Len(t, req.errCh, 1) s.loadedMu.Lock() @@ -64,11 +64,11 @@ func TestSchedLoad(t *testing.T) { require.Contains(t, err.Error(), "this model may be incompatible") server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}} - s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + s.newServerFn = func(systemInfo 
ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { server.modelPath = model return server, nil } - s.load(req, f, gpus, false) + s.load(req, f, systemInfo, gpus, false) select { case err := <-req.errCh: require.NoError(t, err) @@ -82,7 +82,7 @@ func TestSchedLoad(t *testing.T) { req.model.ModelPath = "dummy_model_path" server.waitResp = errors.New("wait failure") - s.load(req, f, gpus, false) + s.load(req, f, systemInfo, gpus, false) select { case err := <-req.errCh: require.Contains(t, err.Error(), "wait failure") @@ -106,7 +106,7 @@ type reqBundle struct { f *ggml.GGML } -func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { +func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { scenario.srv.modelPath = model return scenario.srv, nil } @@ -152,20 +152,20 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vra return b } -func getGpuFn(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList { +func getGpuFn(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo { slog.Info("test getGpuFn called", "runners", runners) - g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}} + g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}} g.TotalMemory = 24 * format.GigaByte g.FreeMemory = 12 * format.GigaByte - return []discover.GpuInfo{g} + return []ml.DeviceInfo{g} } -func getCpuFn() discover.GpuInfo { - slog.Info("test getCpuFn called") - g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "cpu"}} - g.TotalMemory = 32 * format.GigaByte - g.FreeMemory = 26 * format.GigaByte - return g +func getSystemInfoFn() ml.SystemInfo { + slog.Info("test getSystemInfoFn called") + return ml.SystemInfo{ + TotalMemory: 32 * format.GigaByte, + FreeMemory: 26 * format.GigaByte, + } } func TestSchedRequestsSameModelSameRequest(t *testing.T) { @@ -174,7 +174,7 @@ func TestSchedRequestsSameModelSameRequest(t *testing.T) { s := InitScheduler(ctx) s.waitForRecovery = 10 * time.Millisecond s.getGpuFn = getGpuFn - s.getCpuFn = getCpuFn + s.getSystemInfoFn = getSystemInfoFn a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil) b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0}, nil) b.req.model = a.req.model @@ -218,7 +218,7 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) { s := InitScheduler(ctx) s.waitForRecovery = 10 * time.Millisecond s.getGpuFn = getGpuFn - s.getCpuFn = getCpuFn + s.getSystemInfoFn = getSystemInfoFn a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil) b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond}, nil) tmpModel := *a.req.model @@ -251,12 +251,12 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) { a.ctxDone() // Report recovered VRAM usage time.Sleep(1 * time.Millisecond) - s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList { - slog.Info("XXX altered getGpuFn called") - g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}} + 
s.getGpuFn = func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo { + slog.Info("altered getGpuFn called") + g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}} g.TotalMemory = 24 * format.GigaByte g.FreeMemory = 24 * format.GigaByte - return []discover.GpuInfo{g} + return []ml.DeviceInfo{g} } select { case resp := <-b.req.successCh: @@ -271,26 +271,26 @@ func TestSchedRequestsSimpleReloadSameModel(t *testing.T) { } func TestSchedRequestsMultipleLoadedModels(t *testing.T) { - ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond) + slog.Info("TestRequestsMultipleLoadedModels") + ctx, done := context.WithTimeout(t.Context(), 1000*time.Millisecond) defer done() s := InitScheduler(ctx) s.waitForRecovery = 10 * time.Millisecond - s.getGpuFn = getGpuFn // 1 metal GPU - s.getCpuFn = getCpuFn // 1 CPU + s.getGpuFn = getGpuFn // 1 Metal GPU + s.getSystemInfoFn = getSystemInfoFn // Multiple loaded models - a := newScenarioRequest(t, ctx, "model-a-1g-gpu", 1*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 1 * format.GigaByte}) + a := newScenarioRequest(t, ctx, "model-a-1g-gpu", 1*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 1 * format.GigaByte}) a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond} - b := newScenarioRequest(t, ctx, "model-b-10g-gpu", 10*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 10 * format.GigaByte}) + b := newScenarioRequest(t, ctx, "model-b-10g-gpu", 10*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 10 * format.GigaByte}) b.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond} c := newScenarioRequest(t, ctx, "model-c-10g-cpu", 10*format.GigaByte, nil, nil /* No GPU load */) c.req.opts.NumGPU = 0 // CPU load, will be allowed b.req.sessionDuration = &api.Duration{Duration: 10 * time.Millisecond} // longer than b to cause the scheduler to favor unloading b over c - d := newScenarioRequest(t, ctx, "model-d-10g-gpu", 13*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 13 * format.GigaByte}) // Needs prior unloaded + d := newScenarioRequest(t, ctx, "model-d-10g-gpu", 13*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 13 * format.GigaByte}) // Needs prior unloaded - t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1") s.newServerFn = a.newServer - slog.Info("a") + slog.Info("Loading A") s.pendingReqCh <- a.req s.Run(ctx) select { @@ -309,7 +309,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) { t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0") s.newServerFn = b.newServer - slog.Info("b") + slog.Info("Loading B") s.pendingReqCh <- b.req select { case resp := <-b.req.successCh: @@ -327,7 +327,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) { // This is a CPU load with NumGPU = 0 so it should load s.newServerFn = c.newServer - slog.Info("c") + slog.Info("Loading C") s.pendingReqCh <- c.req select { case resp := <-c.req.successCh: @@ -337,6 +337,7 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) { case err := <-c.req.errCh: t.Fatal(err.Error()) case <-ctx.Done(): + slog.Info("FAIL: scheduler state", "s.loaded", s.loaded) t.Fatal("timeout") } s.loadedMu.Lock() @@ -361,11 +362,11 @@ func TestSchedRequestsMultipleLoadedModels(t *testing.T) { b.ctxDone() // Report recovered VRAM usage so scheduler will finish waiting and unload time.Sleep(1 * time.Millisecond) - s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList { - g := 
discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}} + s.getGpuFn = func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo { + g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}} g.TotalMemory = 24 * format.GigaByte g.FreeMemory = 24 * format.GigaByte - return []discover.GpuInfo{g} + return []ml.DeviceInfo{g} } select { case resp := <-d.req.successCh: @@ -404,7 +405,7 @@ func TestSchedGetRunner(t *testing.T) { s := InitScheduler(ctx) s.waitForRecovery = 10 * time.Millisecond s.getGpuFn = getGpuFn - s.getCpuFn = getCpuFn + s.getSystemInfoFn = getSystemInfoFn s.newServerFn = a.newServer slog.Info("a") successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration) @@ -462,13 +463,14 @@ func TestSchedExpireRunner(t *testing.T) { } var f *ggml.GGML - gpus := discover.GpuInfoList{} + gpus := []ml.DeviceInfo{} + systemInfo := ml.SystemInfo{} server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}} - s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { server.modelPath = model return server, nil } - s.load(req, f, gpus, false) + s.load(req, f, systemInfo, gpus, false) select { case err := <-req.errCh: @@ -497,19 +499,15 @@ func TestSchedExpireRunner(t *testing.T) { // TODO - add one scenario that triggers the bogus finished event with positive ref count func TestSchedPrematureExpired(t *testing.T) { - ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond) + ctx, done := context.WithTimeout(t.Context(), 1000*time.Millisecond) defer done() // Same model, same request - scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil, nil) + scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 100 * time.Millisecond}, nil) s := InitScheduler(ctx) s.waitForRecovery = 10 * time.Millisecond - s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList { - g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}} - g.TotalMemory = 24 * format.GigaByte - g.FreeMemory = 12 * format.GigaByte - return []discover.GpuInfo{g} - } + s.getGpuFn = getGpuFn + s.getSystemInfoFn = getSystemInfoFn s.newServerFn = scenario1a.newServer successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration) require.Len(t, s.pendingReqCh, 1) @@ -574,7 +572,7 @@ func TestSchedUseLoadedRunner(t *testing.T) { func TestSchedUpdateFreeSpace(t *testing.T) { ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) defer done() - gpus := discover.GpuInfoList{ + gpus := []ml.DeviceInfo{ { DeviceID: ml.DeviceID{ ID: "1", @@ -756,8 +754,12 @@ func (s *mockLlm) ModelPath() string { return s.modelPath } -func (s *mockLlm) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) { +func (s *mockLlm) Load(ctx context.Context, sytemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) { if requireFull { + if len(gpus) == 0 { + slog.Info("mockLlm.Load CPU based load") + return nil, nil + } for _, g := range gpus { if g.FreeMemory >= s.vramSize { return []ml.DeviceID{g.DeviceID}, nil
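// Editorial sketch (not part of the patch): intended usage of ml.ByFreeMemory
// per its doc comment above — sort.Reverse yields the discrete GPU with the
// most free memory first and keeps integrated GPUs last. The device values
// here are made up for illustration.
package main

import (
	"fmt"
	"sort"

	"github.com/ollama/ollama/ml"
)

func main() {
	devs := []ml.DeviceInfo{
		{DeviceID: ml.DeviceID{Library: "CUDA", ID: "gpu0"}, FreeMemory: 8 << 30},
		{DeviceID: ml.DeviceID{Library: "CUDA", ID: "gpu1"}, FreeMemory: 24 << 30},
		{DeviceID: ml.DeviceID{Library: "ROCm", ID: "igpu"}, Integrated: true, FreeMemory: 32 << 30},
	}
	sort.Sort(sort.Reverse(ml.ByFreeMemory(devs)))
	for _, d := range devs {
		fmt.Println(d.ID, d.FreeMemory) // gpu1, gpu0, then the iGPU
	}
}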
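// A second editorial sketch (also not part of the patch): one way the map
// returned by ml.GetVisibleDevicesEnv could be applied to a runner subprocess
// environment. Spawning via exec.Command is an assumption for illustration;
// the patch itself only adds the env-building helper.
package main

import (
	"fmt"
	"os"
	"os/exec"

	"github.com/ollama/ollama/ml"
)

func main() {
	devs := []ml.DeviceInfo{
		{DeviceID: ml.DeviceID{Library: "ROCm", ID: "0"}},
		{DeviceID: ml.DeviceID{Library: "ROCm", ID: "1"}, FilteredID: "2"},
	}
	// On Linux this yields ROCR_VISIBLE_DEVICES=0,2 (the filtered ID wins when set).
	env := ml.GetVisibleDevicesEnv(devs)

	cmd := exec.Command("echo", "runner placeholder")
	cmd.Env = os.Environ()
	for k, v := range env {
		cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%s", k, v))
	}
	out, _ := cmd.CombinedOutput()
	fmt.Print(string(out))
}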