mirror of
https://github.com/ollama/ollama.git
synced 2025-11-11 07:17:26 +01:00
Use runners for GPU discovery (#12090)
This revamps how we discover GPUs in the system by leveraging the Ollama runner. This should eliminate inconsistency between our GPU discovery and the runners capabilities at runtime, particularly for cases where we try to filter out unsupported GPUs. Now the runner does that implicitly based on the actual device list. In some cases free VRAM reporting can be unreliable which can lead to scheduling mistakes, so this also includes a patch to leverage more reliable VRAM reporting libraries if available. Automatic workarounds have been removed as only one GPU leveraged this, which is now documented. This GPU will soon fall off the support matrix with the next ROCm bump. Additional cleanup of the scheduler and discovery packages can be done in the future once we have switched on the new memory management code, and removed support for the llama runner.
This commit is contained in:
139
llm/server.go
139
llm/server.go
@@ -66,7 +66,7 @@ func (e filteredEnv) LogValue() slog.Value {
|
||||
|
||||
type LlamaServer interface {
|
||||
ModelPath() string
|
||||
Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error
|
||||
Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error)
|
||||
Ping(ctx context.Context) error
|
||||
WaitUntilRunning(ctx context.Context) error
|
||||
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
|
||||
@@ -76,8 +76,11 @@ type LlamaServer interface {
|
||||
Close() error
|
||||
VRAMSize() uint64 // Total VRAM across all GPUs
|
||||
TotalSize() uint64
|
||||
VRAMByGPU(gpuID string) uint64
|
||||
VRAMByGPU(id ml.DeviceID) uint64
|
||||
Pid() int
|
||||
GetPort() int
|
||||
GetDeviceInfos(ctx context.Context) []ml.DeviceInfo
|
||||
HasExited() bool
|
||||
}
|
||||
|
||||
// llmServer is an instance of a runner hosting a single model
|
||||
@@ -331,6 +334,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
|
||||
if gpu.DependencyPath != nil {
|
||||
slog.Debug("adding gpu dependency paths", "paths", gpu.DependencyPath)
|
||||
libraryPaths = append(gpu.DependencyPath, libraryPaths...)
|
||||
ggmlPaths = append(ggmlPaths, gpu.DependencyPath...)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -361,12 +365,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
|
||||
|
||||
s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))
|
||||
|
||||
envWorkarounds := []string{}
|
||||
for _, gpu := range gpus {
|
||||
envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
|
||||
}
|
||||
// Always filter down the set of GPUs in case there are any unsupported devices that might crash
|
||||
envWorkarounds = append(envWorkarounds, gpus.GetVisibleDevicesEnv()...)
|
||||
envWorkarounds := gpus.GetVisibleDevicesEnv()
|
||||
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
|
||||
|
||||
// Update or add the path variable with our adjusted version
|
||||
@@ -496,7 +496,7 @@ type LoadResponse struct {
|
||||
|
||||
var ErrLoadRequiredFull = errors.New("unable to load full model on GPU")
|
||||
|
||||
func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
|
||||
func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
|
||||
systemInfo := discover.GetSystemInfo()
|
||||
systemTotalMemory := systemInfo.System.TotalMemory
|
||||
systemFreeMemory := systemInfo.System.FreeMemory
|
||||
@@ -509,7 +509,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
|
||||
g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
|
||||
} else {
|
||||
slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
|
||||
return ErrLoadRequiredFull
|
||||
return nil, ErrLoadRequiredFull
|
||||
}
|
||||
}
|
||||
|
||||
@@ -518,13 +518,13 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
|
||||
|
||||
if len(gpus) > 1 || gpus[0].Library != "cpu" {
|
||||
switch {
|
||||
case gpus[0].Library == "metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
|
||||
case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.System.TotalMemory:
|
||||
// disable partial offloading when model is greater than total system memory as this
|
||||
// can lead to locking up the system
|
||||
s.options.NumGPU = 0
|
||||
case gpus[0].Library != "metal" && s.estimate.Layers == 0:
|
||||
case gpus[0].Library != "Metal" && s.estimate.Layers == 0:
|
||||
// Don't bother loading into the GPU if no layers can fit
|
||||
gpus = discover.GetCPUInfo()
|
||||
gpus = discover.GpuInfoList{discover.GetCPUInfo()}
|
||||
case s.options.NumGPU < 0 && s.estimate.Layers > 0 && gpus[0].Library != "cpu":
|
||||
s.options.NumGPU = s.estimate.Layers
|
||||
}
|
||||
@@ -537,7 +537,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
|
||||
available := systemInfo.System.FreeMemory + systemInfo.System.FreeSwap
|
||||
if systemMemoryRequired > available {
|
||||
slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.System.TotalMemory), "free", format.HumanBytes2(systemInfo.System.FreeMemory), "swap", format.HumanBytes2(systemInfo.System.FreeSwap))
|
||||
return fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
|
||||
return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -552,7 +552,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
|
||||
|
||||
// mmap has issues with partial offloading on metal
|
||||
for _, g := range gpus {
|
||||
if g.Library == "metal" &&
|
||||
if g.Library == "Metal" &&
|
||||
uint64(s.options.NumGPU) > 0 &&
|
||||
uint64(s.options.NumGPU) < s.ggml.KV().BlockCount()+1 {
|
||||
s.options.UseMMap = new(bool)
|
||||
@@ -563,7 +563,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
|
||||
// Windows CUDA should not use mmap for best performance
|
||||
// Linux with a model larger than free space, mmap leads to thrashing
|
||||
// For CPU loads we want the memory to be allocated, not FS cache
|
||||
if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && s.options.UseMMap == nil) ||
|
||||
if (runtime.GOOS == "windows" && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) ||
|
||||
(runtime.GOOS == "linux" && systemInfo.System.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) ||
|
||||
(gpus[0].Library == "cpu" && s.options.UseMMap == nil) ||
|
||||
(s.options.UseMMap != nil && !*s.options.UseMMap) {
|
||||
@@ -572,12 +572,12 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
|
||||
}
|
||||
|
||||
if err := s.waitUntilRunnerLaunched(ctx); err != nil {
|
||||
return err
|
||||
return nil, err
|
||||
}
|
||||
|
||||
resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
|
||||
if err != nil {
|
||||
return err
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// On the Ollama engine, we can print out a summary of the memory allocations.
|
||||
@@ -588,16 +588,16 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
|
||||
|
||||
if !resp.Success {
|
||||
slog.Warn("failed to allocate memory for model", "memory", resp.Memory)
|
||||
return errors.New("failed to allocate memory for model")
|
||||
return nil, errors.New("failed to allocate memory for model")
|
||||
}
|
||||
|
||||
// The llama engine does its memory allocations together with model loading, so we
|
||||
// need to wait until it is done to ensure that we have accurate memory data before
|
||||
// loading the next model
|
||||
if s.textProcessor == nil {
|
||||
return s.WaitUntilRunning(ctx)
|
||||
return uniqueDeviceIDs(s.loadRequest.GPULayers), s.WaitUntilRunning(ctx)
|
||||
} else {
|
||||
return nil
|
||||
return uniqueDeviceIDs(s.loadRequest.GPULayers), nil
|
||||
}
|
||||
}
|
||||
|
||||
@@ -610,7 +610,7 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu
|
||||
|
||||
gpuLayers := make(ml.GPULayersList, len(gpus))
|
||||
for i := range gpuLayers {
|
||||
gpuLayers[i].ID = gpus[i].ID
|
||||
gpuLayers[i].DeviceID = gpus[i].DeviceID
|
||||
}
|
||||
|
||||
var sum float32
|
||||
@@ -658,7 +658,9 @@ func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus discover.Gpu
|
||||
//
|
||||
// This process is repeated for higher levels of loading the model (fit, allocate, commit). The earlier levels are quicker,
|
||||
// allowing for faster iteration, but may return less information.
|
||||
func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
|
||||
//
|
||||
// Returns the list of GPU IDs that were used in the final allocation on success
|
||||
func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
|
||||
var success bool
|
||||
defer func() {
|
||||
if !success {
|
||||
@@ -683,7 +685,7 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
|
||||
if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory {
|
||||
available = 0
|
||||
}
|
||||
slog.Info("gpu memory", "id", gpu.ID,
|
||||
slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library,
|
||||
"available", format.HumanBytes2(available),
|
||||
"free", format.HumanBytes2(gpu.FreeMemory),
|
||||
"minimum", format.HumanBytes2(gpu.MinimumMemory),
|
||||
@@ -696,11 +698,11 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
|
||||
|
||||
gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
|
||||
if err != nil {
|
||||
return err
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := s.waitUntilRunnerLaunched(ctx); err != nil {
|
||||
return err
|
||||
return nil, err
|
||||
}
|
||||
|
||||
nextOperation:
|
||||
@@ -710,7 +712,7 @@ nextOperation:
|
||||
s.loadRequest.GPULayers = gpuLayers
|
||||
resp, err := s.initModel(ctx, s.loadRequest, operation)
|
||||
if err != nil {
|
||||
return err
|
||||
return nil, err
|
||||
}
|
||||
|
||||
resp.Memory.Log(slog.LevelDebug)
|
||||
@@ -722,7 +724,7 @@ nextOperation:
|
||||
for {
|
||||
newGPULayers, err := s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
|
||||
if err != nil {
|
||||
return err
|
||||
return nil, err
|
||||
}
|
||||
|
||||
slog.Debug("new layout created", "layers", newGPULayers)
|
||||
@@ -756,7 +758,7 @@ nextOperation:
|
||||
newGPULayers, err = s.createLayout(systemInfo, gpus, s.mem, requireFull, backoff)
|
||||
s.options.NumGPU = -1
|
||||
if err != nil {
|
||||
return err
|
||||
return nil, err
|
||||
}
|
||||
|
||||
slog.Debug("new layout created", "layers", newGPULayers)
|
||||
@@ -764,7 +766,7 @@ nextOperation:
|
||||
s.loadRequest.GPULayers = newGPULayers
|
||||
resp, err = s.initModel(ctx, s.loadRequest, operation)
|
||||
if err != nil {
|
||||
return err
|
||||
return nil, err
|
||||
}
|
||||
|
||||
resp.Memory.Log(slog.LevelDebug)
|
||||
@@ -773,7 +775,7 @@ nextOperation:
|
||||
if resp.Success {
|
||||
verifyGPULayers, err := s.createLayout(systemInfo, gpus, &resp.Memory, requireFull, backoff)
|
||||
if err != nil {
|
||||
return err
|
||||
return nil, err
|
||||
}
|
||||
|
||||
slog.Debug("verifying layout", "layers", verifyGPULayers)
|
||||
@@ -798,7 +800,7 @@ nextOperation:
|
||||
}
|
||||
|
||||
if s.options.NumGPU >= 0 {
|
||||
return fmt.Errorf("memory layout cannot be allocated with num_gpu = %v", s.options.NumGPU)
|
||||
return nil, fmt.Errorf("memory layout cannot be allocated with num_gpu = %v", s.options.NumGPU)
|
||||
}
|
||||
|
||||
// Memory allocation failed even though we created a layout that we thought should
|
||||
@@ -808,7 +810,7 @@ nextOperation:
|
||||
// space.
|
||||
if backoff > 1 {
|
||||
slog.Warn("memory layout cannot be allocated", "memory", resp.Memory)
|
||||
return errors.New("memory layout cannot be allocated")
|
||||
return nil, errors.New("memory layout cannot be allocated")
|
||||
} else if backoff == 0 {
|
||||
backoff = 0.01
|
||||
} else {
|
||||
@@ -823,7 +825,7 @@ nextOperation:
|
||||
s.loadRequest.GPULayers = gpuLayers
|
||||
resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit)
|
||||
if err != nil {
|
||||
return err
|
||||
return nil, err
|
||||
}
|
||||
|
||||
success = resp.Success
|
||||
@@ -831,10 +833,27 @@ nextOperation:
|
||||
|
||||
if !success {
|
||||
slog.Warn("failed to commit memory for model", "memory", resp.Memory)
|
||||
return errors.New("failed to commit memory for model")
|
||||
return nil, errors.New("failed to commit memory for model")
|
||||
}
|
||||
|
||||
return nil
|
||||
return uniqueDeviceIDs(gpuLayers), nil
|
||||
}
|
||||
|
||||
func uniqueDeviceIDs(gpuLayers ml.GPULayersList) []ml.DeviceID {
|
||||
devices := []ml.DeviceID{}
|
||||
for _, layer := range gpuLayers {
|
||||
new := true
|
||||
for _, ID := range devices {
|
||||
if layer.DeviceID == ID {
|
||||
new = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if new {
|
||||
devices = append(devices, layer.DeviceID)
|
||||
}
|
||||
}
|
||||
return devices
|
||||
}
|
||||
|
||||
// createLayout uses the current best view of memory requirements and creates a layout of model layers on GPUs.
|
||||
@@ -879,7 +898,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
|
||||
for i := range gl {
|
||||
found := false
|
||||
for j := range memory.GPUs {
|
||||
if gl[i].ID == memory.GPUs[j].ID {
|
||||
if gl[i].DeviceID == memory.GPUs[j].DeviceID {
|
||||
if memory.GPUs[j].Graph != 0 {
|
||||
lastUsedGPU = i
|
||||
}
|
||||
@@ -891,7 +910,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
|
||||
gl[i].FreeMemory = 0
|
||||
}
|
||||
|
||||
slog.Debug("available gpu", "id", gl[i].ID,
|
||||
slog.Debug("available gpu", "id", gl[i].ID, "library", gl[i].Library,
|
||||
"available layer vram", format.HumanBytes2(gl[i].FreeMemory),
|
||||
"backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory),
|
||||
"overhead", format.HumanBytes2(envconfig.GpuOverhead()),
|
||||
@@ -918,7 +937,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
|
||||
var vramSize uint64
|
||||
for _, gl := range gpuLayers {
|
||||
for _, gpu := range memory.GPUs {
|
||||
if gl.ID == gpu.ID {
|
||||
if gl.DeviceID == gpu.DeviceID {
|
||||
vramSize += gpu.Graph
|
||||
break
|
||||
}
|
||||
@@ -1039,7 +1058,7 @@ func findBestFit(layers []uint64, gpus discover.GpuInfoList, requestedLayers int
|
||||
// greedyFit assigns layers incrementally to GPUs, spilling over as each runs out of free space
|
||||
func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, requestedLayers int) (gpuLayers ml.GPULayersList) {
|
||||
device := len(gpus) - 1
|
||||
gpuLayers = ml.GPULayersList{{ID: gpus[device].ID}}
|
||||
gpuLayers = ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}
|
||||
freeSpace := uint64(float32(gpus[device].FreeMemory) * capacity)
|
||||
for i := len(layers) - 1; i >= 0; i-- {
|
||||
if requestedLayers >= 0 && len(layers)-1-i >= requestedLayers {
|
||||
@@ -1057,7 +1076,7 @@ func greedyFit(layers []uint64, gpus discover.GpuInfoList, capacity float32, req
|
||||
if device < 0 {
|
||||
return gpuLayers
|
||||
}
|
||||
gpuLayers = append(ml.GPULayersList{{ID: gpus[device].ID}}, gpuLayers...)
|
||||
gpuLayers = append(ml.GPULayersList{{DeviceID: gpus[device].DeviceID}}, gpuLayers...)
|
||||
freeSpace = uint64(float32(gpus[device].FreeMemory) * capacity)
|
||||
}
|
||||
}
|
||||
@@ -1312,6 +1331,17 @@ func (s *llmServer) Pid() int {
|
||||
return -1
|
||||
}
|
||||
|
||||
// GetPort returns the local port the runner subprocess is listening on.
// (Named with a Get prefix to satisfy the LlamaServer interface.)
func (s *llmServer) GetPort() int {
	return s.port
}
|
||||
|
||||
func (s *llmServer) HasExited() bool {
|
||||
if s.cmd != nil && s.cmd.ProcessState != nil && s.cmd.ProcessState.ExitCode() >= 0 {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
var grammarJSON = `
|
||||
root ::= object
|
||||
value ::= object | array | string | number | ("true" | "false" | "null") ws
|
||||
@@ -1386,7 +1416,7 @@ type CompletionResponse struct {
|
||||
|
||||
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
|
||||
slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
|
||||
slog.Log(ctx, logutil.LevelTrace, "completion request", "prompt", req.Prompt)
|
||||
logutil.Trace("completion request", "prompt", req.Prompt)
|
||||
|
||||
if len(req.Format) > 0 {
|
||||
switch string(req.Format) {
|
||||
@@ -1552,7 +1582,7 @@ type EmbeddingResponse struct {
|
||||
}
|
||||
|
||||
func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, error) {
|
||||
slog.Log(ctx, logutil.LevelTrace, "embedding request", "input", input)
|
||||
logutil.Trace("embedding request", "input", input)
|
||||
|
||||
if err := s.sem.Acquire(ctx, 1); err != nil {
|
||||
if errors.Is(err, context.Canceled) {
|
||||
@@ -1704,9 +1734,9 @@ func (s *llamaServer) TotalSize() uint64 {
|
||||
return s.estimate.TotalSize
|
||||
}
|
||||
|
||||
func (s *llamaServer) VRAMByGPU(gpuID string) uint64 {
|
||||
func (s *llamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
|
||||
for i, gpu := range s.gpus {
|
||||
if gpu.ID == gpuID {
|
||||
if gpu.DeviceID == id {
|
||||
if i < len(s.estimate.GPUSizes) {
|
||||
return s.estimate.GPUSizes[i]
|
||||
}
|
||||
@@ -1715,6 +1745,11 @@ func (s *llamaServer) VRAMByGPU(gpuID string) uint64 {
|
||||
return 0
|
||||
}
|
||||
|
||||
// GetDeviceInfos is a stub for the llama runner, which does not support
// refreshing per-device free VRAM information; it always returns nil.
// ctx is unused but required by the LlamaServer interface.
func (s *llamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
	slog.Debug("llamarunner free vram reporting not supported")
	return nil
}
|
||||
|
||||
func (s *ollamaServer) VRAMSize() uint64 {
|
||||
if s.mem == nil {
|
||||
return 0
|
||||
@@ -1757,16 +1792,28 @@ func (s *ollamaServer) TotalSize() uint64 {
|
||||
return mem
|
||||
}
|
||||
|
||||
func (s *ollamaServer) VRAMByGPU(gpuID string) uint64 {
|
||||
func (s *ollamaServer) VRAMByGPU(id ml.DeviceID) uint64 {
|
||||
if s.mem == nil {
|
||||
return 0
|
||||
}
|
||||
|
||||
for _, g := range s.mem.GPUs {
|
||||
if g.ID == gpuID {
|
||||
if g.DeviceID == id {
|
||||
return g.Size()
|
||||
}
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
func (s *ollamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
|
||||
devices, err := discover.GetDevicesFromRunner(ctx, s)
|
||||
if err != nil {
|
||||
if s.cmd != nil && s.cmd.ProcessState == nil {
|
||||
// Still running but hit an error, log
|
||||
slog.Debug("failure refreshing GPU information", "error", err)
|
||||
}
|
||||
// else no longer running so suppress logging as a failure is expected
|
||||
}
|
||||
return devices
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user