diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 909104c642..0b5d37a7bd 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -797,73 +797,6 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri return } -func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { - if llm.KV().Uint("vision.block_count") == 0 { - return - } - - for name, layer := range llm.Tensors().GroupLayers() { - if name == "v" || strings.HasPrefix(name, "v.") { - for _, tensor := range layer { - weights += tensor.Size() - } - } - } - - imageSize := uint64(llm.KV().Uint("vision.image_size")) - patchSize := uint64(llm.KV().Uint("vision.patch_size")) - if patchSize == 0 { - slog.Warn("unknown patch size for vision model") - return - } - - numChannels := uint64(llm.KV().Uint("vision.num_channels")) - - numPatches := (imageSize / patchSize) * (imageSize / patchSize) - if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok { - numPatches++ - } - - headCount := uint64(llm.KV().Uint("vision.attention.head_count")) - embeddingLength := uint64(llm.KV().Uint("vision.embedding_length")) - - switch llm.KV().Architecture() { - case "mllama": - numPaddedPatches := numPatches + 8 - (numPatches%8)%8 - - maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles")) - - graphSize = 4 * (8 + - imageSize*imageSize*numChannels*maxNumTiles + - embeddingLength*numPatches*maxNumTiles + - 9*embeddingLength*numPaddedPatches*maxNumTiles + - numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount) - case "gemma3", "mistral3": - graphSize = 4 * (imageSize*imageSize*numChannels + - embeddingLength*patchSize + - numPatches*numPatches*headCount) - case "qwen25vl": - maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280)) - - numPatches := maxPixels / (patchSize * patchSize) - - graphSize = 4 * (maxPixels*numChannels + // Original image storage - // Normalized pixels - maxPixels*numChannels + - // Patches storage (numPatches * channels * patchSize^2) - numPatches*numChannels*patchSize*patchSize + - // Self-attention calculations - numPatches*numPatches*headCount + - // Additional buffer for processing - embeddingLength*numPatches) - case "llama4": - // vision graph is computed independently in the same schedule - // and is negligible compared to the worst case text graph - } - - return weights, graphSize -} - // SupportsKVCacheType checks if the requested cache type is supported func (f GGML) SupportsKVCacheType(cacheType string) bool { if cacheType == "" || cacheType == "f16" { diff --git a/llm/memory.go b/llm/memory.go deleted file mode 100644 index 15558109f6..0000000000 --- a/llm/memory.go +++ /dev/null @@ -1,516 +0,0 @@ -package llm - -import ( - "fmt" - "log/slog" - "os" - "slices" - "sort" - "strings" - - "github.com/ollama/ollama/api" - "github.com/ollama/ollama/envconfig" - "github.com/ollama/ollama/format" - "github.com/ollama/ollama/fs/ggml" - "github.com/ollama/ollama/ml" -) - -// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits -// The list of GPUs returned will always be the same brand (library) -// If the model can not be fit fully within the available GPU(s) nil is returned -func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus []ml.DeviceInfo, numParallel int) []ml.DeviceInfo { - for _, gl := range ml.ByLibrary(gpus) { - sgl := append(make([]ml.DeviceInfo, 0, len(gl)), gl...) 
- - // TODO - potentially sort by performance capability, existing models loaded, etc. - // TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them - // Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups - sort.Sort(sort.Reverse(ml.ByFreeMemory(sgl))) - - if !envconfig.SchedSpread() { - // Try to pack into as few GPUs as possible, starting from 1 GPU - for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ { - gpuSubset := sgl[:numGPUs] - ok, estimatedVRAM := predictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel) - - if ok { - slog.Info("new model will fit in available VRAM across minimum required GPUs, loading", - "model", modelPath, - "library", sgl[0].Library, - "parallel", numParallel, - "required", format.HumanBytes2(estimatedVRAM), - "gpus", numGPUs) - return gpuSubset - } - } - } else { - // TODO future refinements - // - if multiple Libraries, see if any single GPU in any Library will fit - // - try subsets of GPUs instead of just falling back to 1 or all in a family - - // Now try all the GPUS (OLLAMA_SCHED_SPREAD is set) - if ok, estimatedVRAM := predictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok { - slog.Info("new model will fit in available VRAM, loading", - "model", modelPath, - "library", sgl[0].Library, - "parallel", numParallel, - "required", format.HumanBytes2(estimatedVRAM), - "gpus", len(sgl)) - return sgl - } - } - } - return nil -} - -// If multiple Libraries are detected, pick the Library which loads the most layers for the model -func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus []ml.DeviceInfo, numParallel int) []ml.DeviceInfo { - byLibrary := ml.ByLibrary(gpus) - if len(byLibrary) <= 1 { - return gpus - } - var bestEstimate uint64 - var bestFit int - for i, gl := range byLibrary { - _, estimatedVRAM := predictServerFit(gl, f, adapters, projectors, opts, numParallel) - if estimatedVRAM > bestEstimate { - bestEstimate = estimatedVRAM - bestFit = i - } - } - return byLibrary[bestFit] -} - -// This algorithm looks for a complete fit to determine if we need to unload other models -func predictServerFit(allGpus []ml.DeviceInfo, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) { - // Split up the GPUs by type and try them - var estimatedVRAM uint64 - for _, gpus := range ml.ByLibrary(allGpus) { - var layerCount int - estimate := estimateGPULayers(gpus, f, projectors, opts, numParallel) - layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize - if opts.NumGPU < 0 { - if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) { - return true, estimatedVRAM - } - } else { - if layerCount > 0 && layerCount >= opts.NumGPU { - return true, estimatedVRAM - } - } - } - return false, estimatedVRAM -} - -func verifyCPUFit(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, systemInfo ml.SystemInfo, numParallel int) bool { - estimate := estimateGPULayers(nil, f, projectors, opts, numParallel) - if estimate.TotalSize > systemInfo.FreeMemory { - return false - } - slog.Info("new model will fit in available system memory for CPU inference, loading", - "model", modelPath, - "parallel", numParallel, - "required", format.HumanBytes2(estimate.TotalSize), - ) - return true -} - -type MemoryEstimate struct { - // How many layers we predict we can load - Layers int - - // The size of the graph which occupies the main GPU 
- Graph uint64 - - // How much VRAM will be allocated given the number of layers we predict - VRAMSize uint64 - - // The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize - TotalSize uint64 - - // For multi-GPU scenarios, this provides the tensor split parameter - TensorSplit []int - - // For multi-GPU scenarios, this is the size in bytes per GPU - GPUSizes []uint64 - - // internal fields for logging purposes - inferenceLibrary string - layersRequested int - layersModel int - availableList []string - kv uint64 - allocationsList []string - memoryWeights uint64 - memoryLayerOutput uint64 - graphFullOffload uint64 - graphPartialOffload uint64 - - projectorWeights, projectorGraph uint64 -} - -// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size -// The GPUs provided must all be the same Library -func estimateGPULayers(gpus []ml.DeviceInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate { - // Graph size for a partial offload, applies to all GPUs - var graphPartialOffload uint64 - - // Graph size when all layers are offloaded, applies to all GPUs - var graphFullOffload uint64 - - // Final graph offload once we know full or partial - var graphOffload uint64 - - // Projectors loaded into GPU0 only - var llamaEngineProjectorWeights uint64 - - // Projectors loaded with output layer - var ollamaEngineProjectorWeights uint64 - var ollamaEngineProjectorGraph uint64 - - // Conditional output size on GPU 0 - var memoryLayerOutput uint64 - - // The sizes of a layer - var layerSize uint64 - - // The sum of all the layer sizes (just for logging) - var memoryWeights uint64 - - // True if all the layers are loaded - var fullyLoaded bool - - // Overflow that didn't fit into the GPU - var overflow uint64 - - overhead := envconfig.GpuOverhead() - availableList := make([]string, len(gpus)) - libraries := []string{} - for i, gpu := range gpus { - availableList[i] = format.HumanBytes2(gpu.FreeMemory) - if !slices.Contains(libraries, gpu.Library) { - libraries = append(libraries, gpu.Library) - } - } - if len(libraries) == 0 { - libraries = []string{"cpu"} - } - slog.Debug("evaluating", "library", strings.Join(libraries, ","), "gpu_count", len(gpus), "available", availableList) - - for _, projector := range projectors { - llamaEngineProjectorWeights += projectorMemoryRequirements(projector) - } - if llamaEngineProjectorWeights == 0 { - ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize() - } - - layers := f.Tensors().GroupLayers() - // add one layer worth of memory as a buffer - if blk0, ok := layers["blk.0"]; ok { - layerSize = blk0.Size() - } else { - slog.Warn("model missing blk.0 layer size") - } - - useFlashAttention := envconfig.FlashAttention(f.FlashAttention()) && - ml.FlashAttentionSupported(gpus) && - f.SupportsFlashAttention() - - var kvct string - if useFlashAttention { - requested := strings.ToLower(envconfig.KvCacheType()) - if f.SupportsKVCacheType(requested) { - kvct = requested - } - } - - kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), numParallel, kvct, useFlashAttention) - - if len(kv) > 0 { - layerSize += kv[0] - } - - var kvTotal uint64 - for _, kvLayer := range kv { - kvTotal += kvLayer - } - - if graphPartialOffload == 0 { - headsKV := f.KV().HeadCountKVMin() - if headsKV == 0 { - headsKV = 1 - } - gqa := f.KV().HeadCountMax() / headsKV - graphPartialOffload 
= gqa * kvTotal / 6 - } - if graphFullOffload == 0 { - graphFullOffload = graphPartialOffload - } - - // on metal there's no partial offload overhead - if len(gpus) > 0 && gpus[0].Library == "Metal" { - graphPartialOffload = graphFullOffload - } else if len(gpus) > 1 { - // multigpu should always use the partial graph size - graphFullOffload = graphPartialOffload - } - - // Output layer handled at the end if we have space - if layer, ok := layers["output_norm"]; ok { - memoryLayerOutput += layer.Size() - } - if layer, ok := layers["output"]; ok { - memoryLayerOutput += layer.Size() - } else if layer, ok := layers["token_embd"]; ok { - memoryLayerOutput += layer.Size() - } - - gpuZeroOverhead := llamaEngineProjectorWeights - - // Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer - var layerCount int - tensorSplit := make([]int, len(gpus)) - gpuAllocations := make([]uint64, len(gpus)) - type gs struct { - i int - g *ml.DeviceInfo - } - gpusWithSpace := []gs{} - for i := range gpus { - var gzo uint64 - if len(gpusWithSpace) == 0 { - gzo = gpuZeroOverhead - } - // Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer - if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory()+2*layerSize { - slog.Debug("gpu has too little memory to allocate any layers", - "id", gpus[i].ID, - "library", gpus[i].Library, - "compute", gpus[i].Compute(), - "driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor), - "name", gpus[i].Name, - "total", format.HumanBytes2(gpus[i].TotalMemory), - "available", format.HumanBytes2(gpus[i].FreeMemory), - "minimum_memory", gpus[i].MinimumMemory, - "layer_size", format.HumanBytes2(layerSize), - "gpu_zer_overhead", format.HumanBytes2(gzo), - "partial_offload", format.HumanBytes2(graphPartialOffload), - "full_offload", format.HumanBytes2(graphFullOffload), - ) - continue - } - gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]}) - gpuAllocations[i] += gpus[i].MinimumMemory() + layerSize // We hold off on graph until we know partial vs. full - } - - var gpuZeroID int - if len(gpusWithSpace) > 0 { - gpuZeroID = gpusWithSpace[0].i - gpuAllocations[gpuZeroID] += gpuZeroOverhead - } else { - overflow += gpuZeroOverhead - } - - // For all the layers, find where they can fit on the GPU(s) - for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- { - // Some models have inconsistent layer sizes - if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { - layerSize = blk.Size() - layerSize += kv[i] - memoryWeights += blk.Size() - } - - if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { - // Stop allocating on GPU(s) once we hit the users target NumGPU - overflow += layerSize - continue - } - - // distribute the layers across the GPU(s) that have space - for j := len(gpusWithSpace); j > 0; j-- { - g := gpusWithSpace[i%j] - used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) - if g.g.FreeMemory > overhead+used+layerSize { - gpuAllocations[g.i] += layerSize - tensorSplit[g.i]++ - layerCount++ - break - } else { - gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...) 
- } - } - - if len(gpusWithSpace) == 0 { - overflow += layerSize - } - } - if layerCount >= int(f.KV().BlockCount()) { - fullyLoaded = true - } - - // Determine if we need to consider output then find where it fits - memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph - if memoryLastLayer > 0 { - if opts.NumGPU < 0 || layerCount < opts.NumGPU { - for j := len(gpusWithSpace); j > 0; j-- { - g := gpusWithSpace[layerCount%j] - used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) - if g.g.FreeMemory > overhead+used+memoryLastLayer { - gpuAllocations[g.i] += memoryLastLayer - tensorSplit[g.i]++ - layerCount++ - break - } - } - } - - if layerCount < int(f.KV().BlockCount())+1 { - fullyLoaded = false - overflow += memoryLastLayer - } - } - - // Add the applicable (full or partial) graph allocations - for i := range gpus { - if tensorSplit[i] <= 0 { - continue - } - if fullyLoaded { - gpuAllocations[i] += graphFullOffload - } else { - gpuAllocations[i] += graphPartialOffload - } - } - if fullyLoaded { - graphOffload = graphFullOffload - } else { - graphOffload = graphPartialOffload - } - - // Summaries for the log - var memoryRequiredPartial, memoryRequiredTotal uint64 - for i := range gpuAllocations { - memoryRequiredPartial += gpuAllocations[i] - } - memoryRequiredTotal = memoryRequiredPartial + overflow - - allocationsList := []string{} - for _, a := range gpuAllocations { - allocationsList = append(allocationsList, format.HumanBytes2(a)) - } - - estimate := MemoryEstimate{ - TotalSize: memoryRequiredTotal, - Layers: 0, - Graph: 0, - VRAMSize: 0, - GPUSizes: []uint64{}, - - inferenceLibrary: strings.Join(libraries, ","), - layersRequested: opts.NumGPU, - layersModel: int(f.KV().BlockCount()) + 1, - availableList: availableList, - kv: kvTotal, - allocationsList: allocationsList, - memoryWeights: memoryWeights, - memoryLayerOutput: memoryLayerOutput, - graphFullOffload: graphFullOffload, - graphPartialOffload: graphPartialOffload, - projectorWeights: llamaEngineProjectorWeights + ollamaEngineProjectorWeights, - projectorGraph: ollamaEngineProjectorGraph, - } - - if len(gpus) == 0 { - return estimate - } - if layerCount == 0 { - slog.Debug("insufficient VRAM to load any model layers") - return estimate - } - estimate.Layers = layerCount - estimate.Graph = graphOffload - estimate.VRAMSize = memoryRequiredPartial - estimate.TotalSize = memoryRequiredTotal - estimate.TensorSplit = tensorSplit - estimate.GPUSizes = gpuAllocations - return estimate -} - -func (m MemoryEstimate) LogValue() slog.Value { - attrs := []slog.Attr{ - slog.String("library", m.inferenceLibrary), - slog.Group( - "layers", - // requested number of layers to offload - "requested", m.layersRequested, - // The number of layers the model has (including output) - "model", m.layersModel, - // estimated number of layers that can be offloaded - "offload", m.Layers, - // multi-gpu split for tensors - "split", m.TensorSplit, - ), - slog.Group( - "memory", - // memory available by GPU for offloading - "available", m.availableList, - "gpu_overhead", format.HumanBytes2(envconfig.GpuOverhead()), - slog.Group( - "required", - // memory required for full offloading - "full", format.HumanBytes2(m.TotalSize), - // memory required to offload layers.estimate layers - "partial", format.HumanBytes2(m.VRAMSize), - // memory of KV cache - "kv", format.HumanBytes2(m.kv), - // Allocations across the GPUs - "allocations", m.allocationsList, - ), - slog.Group( - "weights", - // memory of the 
weights - "total", format.HumanBytes2(m.memoryWeights+m.memoryLayerOutput), - // memory of repeating layers - "repeating", format.HumanBytes2(m.memoryWeights), - // memory of non-repeating layers - "nonrepeating", format.HumanBytes2(m.memoryLayerOutput), - ), - slog.Group( - "graph", - // memory of graph when fully offloaded - "full", format.HumanBytes2(m.graphFullOffload), - // memory of graph when not fully offloaded - "partial", format.HumanBytes2(m.graphPartialOffload), - ), - ), - } - - if m.projectorWeights > 0 { - attrs = append(attrs, slog.Group( - "projector", - "weights", format.HumanBytes2(m.projectorWeights), - "graph", format.HumanBytes2(m.projectorGraph), - )) - } - - return slog.GroupValue(attrs...) -} - -func projectorMemoryRequirements(filename string) (weights uint64) { - file, err := os.Open(filename) - if err != nil { - return 0 - } - defer file.Close() - - ggml, err := ggml.Decode(file, 1024) - if err != nil { - return 0 - } - - for _, layer := range ggml.Tensors().GroupLayers() { - weights += layer.Size() - } - - return weights -} diff --git a/llm/memory_test.go b/llm/memory_test.go deleted file mode 100644 index fce17b9c25..0000000000 --- a/llm/memory_test.go +++ /dev/null @@ -1,130 +0,0 @@ -package llm - -import ( - "bytes" - "fmt" - "os" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "github.com/ollama/ollama/api" - "github.com/ollama/ollama/format" - "github.com/ollama/ollama/fs/ggml" - "github.com/ollama/ollama/ml" -) - -func TestEstimateGPULayers(t *testing.T) { - t.Setenv("OLLAMA_DEBUG", "1") - t.Setenv("OLLAMA_KV_CACHE_TYPE", "") // Ensure default f16 - t.Setenv("OLLAMA_CONTEXT_LENGTH", "2048") - - modelName := "dummy" - f, err := os.CreateTemp(t.TempDir(), modelName) - require.NoError(t, err) - defer f.Close() - inputLayerCount := 5 - - tensors := []*ggml.Tensor{ - {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, - {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, - {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, - {Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, - {Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, - {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, - } - assert.Len(t, tensors, inputLayerCount+1) - err = ggml.WriteGGUF(f, ggml.KV{ - "general.architecture": "llama", - "llama.context_length": uint32(32), - "llama.embedding_length": uint32(4096), - "llama.block_count": uint32(inputLayerCount), - "llama.attention.head_count": uint32(32), - "llama.attention.head_count_kv": uint32(32), - "tokenizer.ggml.tokens": []string{" "}, - "tokenizer.ggml.scores": []float32{0}, - "tokenizer.ggml.token_type": []int32{0}, - }, tensors) - require.NoError(t, err) - - ggml, err := LoadModel(f.Name(), 0) - if err != nil { - t.Fatal(err) - } - - // Simple CPU scenario - gpus := []ml.DeviceInfo{} - projectors := []string{} - opts := api.DefaultOptions() - t.Run("cpu", func(t *testing.T) { - estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1) - assert.Equal(t, 0, estimate.Layers) - 
assert.Equal(t, uint64(0), estimate.Graph) - }) - - // derived from the dummy ggml file above - graphPartialOffload := uint64(202377216) - graphFullOffload := uint64(171968512) - layerSize := uint64(33554436) - projectorSize := uint64(0) - memoryLayerOutput := uint64(4) - - // Dual CUDA scenario with asymmetry - gpuMinimumMemory := uint64(457 * format.MebiByte) - gpus = []ml.DeviceInfo{ - { - DeviceID: ml.DeviceID{ - Library: "CUDA", - }, - }, - { - DeviceID: ml.DeviceID{ - Library: "CUDA", - }, - }, - } - // Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1 - for i, s := range []struct { - layer0, layer1 uint64 - expect0, expect1 int - }{ - {1, 1, 1, 1}, - {2, 1, 2, 1}, - {2, 2, 2, 2}, - {1, 2, 1, 2}, - {3, 3, 3, 3}, - {4, 4, 3, 3}, - {6, 6, 3, 3}, - {0, 3, 0, 3}, - } { - t.Run(fmt.Sprintf("%v", s), func(t *testing.T) { - gpus[0].FreeMemory = 0 - gpus[1].FreeMemory = 0 - gpus[0].FreeMemory += projectorSize - if s.layer0 > 0 { - gpus[0].FreeMemory += memoryLayerOutput - } else { - gpus[1].FreeMemory += memoryLayerOutput - } - gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1 - gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1 - gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload) - gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload) - estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1) - assert.Equal(t, s.expect0+s.expect1, estimate.Layers, "scenario %d: %v", i, s) - assert.Equal(t, []int{s.expect0, s.expect1}, estimate.TensorSplit, "scenario %d: %v", i, s) - var layerSums uint64 - for _, b := range estimate.GPUSizes { - layerSums += b - } - if estimate.Layers < inputLayerCount+1 { - assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate) - assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate) - } else { - assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate) - assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate) - } - }) - } -} diff --git a/llm/server.go b/llm/server.go index 87f97a010f..83690fcdc1 100644 --- a/llm/server.go +++ b/llm/server.go @@ -92,7 +92,8 @@ type llmServer struct { numParallel int modelPath string - loadRequest LoadRequest // Parameters used to initialize the runner + loadRequest LoadRequest // Parameters used to initialize the runner + mem *ml.BackendMemory // Memory allocations for this model // llamaModel is an instance of the cgo llama.cpp model definition // nil if this server is running the new engine @@ -113,15 +114,11 @@ type llmServer struct { type llamaServer struct { llmServer - ggml *ggml.GGML - gpus []ml.DeviceInfo // The set of GPUs covered by the memory estimate - estimate MemoryEstimate + ggml *ggml.GGML } type ollamaServer struct { llmServer - - mem *ml.BackendMemory } // LoadModel will load a model from disk. The model must be in the GGML format. 
@@ -463,169 +460,226 @@ type LoadResponse struct { var ErrLoadRequiredFull = errors.New("unable to load full model on GPU") -func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) { - systemTotalMemory := systemInfo.TotalMemory - systemFreeMemory := systemInfo.FreeMemory - systemSwapFreeMemory := systemInfo.FreeSwap - slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory)) +func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) { + slog.Info("loading model", "model layers", s.totalLayers, "requested", s.options.NumGPU) - if len(gpus) == 0 || s.options.NumGPU == 0 { - if !verifyCPUFit(s.ggml, s.modelPath, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, systemInfo, s.numParallel) { - slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate) - return nil, fmt.Errorf("model requires more system memory than is currently available %w", ErrLoadRequiredFull) + gpus := append(make([]ml.DeviceInfo, 0, len(systemGPUs)), systemGPUs...) + + // Synthesize memory allocation information based on our estimates + s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{ + Name: "CPU", + Weights: make([]uint64, s.totalLayers), + Cache: make([]uint64, s.totalLayers), + }, GPUs: make([]ml.DeviceMemory, len(gpus))} + + for i := range s.mem.GPUs { + s.mem.GPUs[i].Name = gpus[i].Name + s.mem.GPUs[i].DeviceID = gpus[i].DeviceID + s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers) + s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers) + } + + kv, graphPartialOffload, graphFullOffload := s.ggml.GraphSize(uint64(s.options.NumCtx), uint64(s.loadRequest.BatchSize), + s.loadRequest.Parallel, s.loadRequest.KvCacheType, s.loadRequest.FlashAttention) + + // Use the size of one layer as a buffer + layers := s.ggml.Tensors().GroupLayers() + if blk0, ok := layers["blk.0"]; ok { + for i := range gpus { + gpus[i].FreeMemory -= blk0.Size() + kv[0] } } else { - g := pickBestFullFitByLibrary(s.ggml, s.modelPath, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel) - if g == nil { - if !requireFull { - g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel) - } else { - slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate) - return nil, ErrLoadRequiredFull + slog.Warn("model missing blk.0 layer size") + } + + // Assign all the layers to the CPU for now, they will get reassigned later + for i := range s.ggml.KV().BlockCount() { + if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { + s.mem.CPU.Weights[i] = blk.Size() + s.mem.CPU.Cache[i] += kv[i] + } + } + + // We historically haven't included InputWeights in the model size + var outputWeights uint64 + if layer, ok := layers["output_norm"]; ok { + outputWeights += layer.Size() + } + if layer, ok := layers["output"]; ok { + outputWeights += layer.Size() + } else if layer, ok := layers["token_embd"]; ok { + outputWeights += layer.Size() + } + s.mem.CPU.Weights[s.totalLayers-1] = outputWeights + + // The vision projector is always loaded on the first GPU if available. 
+ // This can't be assigned by us, so just subtract it from free space + projectorGPU := -1 + var projectorWeights uint64 + if len(gpus) > 0 { + for _, projector := range s.loadRequest.LoraPath { + projectorWeights += projectorMemoryRequirements(projector) + } + + // llama.cpp uses the first discrete GPU if available, otherwise the first iGPU + firstIntegrated := -1 + for i := range gpus { + if !gpus[i].Integrated { + projectorGPU = i + break + } + if firstIntegrated == -1 { + firstIntegrated = i } } - gpus = g - } - - s.estimate = estimateGPULayers(gpus, s.ggml, []string{s.loadRequest.ProjectorPath}, s.options, s.numParallel) - - if len(gpus) >= 1 { - switch { - case s.options.NumGPU == 0: - gpus = []ml.DeviceInfo{} - case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.TotalMemory: - // disable partial offloading when model is greater than total system memory as this - // can lead to locking up the system - s.options.NumGPU = 0 - gpus = []ml.DeviceInfo{} - case gpus[0].Library != "Metal" && s.estimate.Layers == 0: - // Don't bother loading into the GPU if no layers can fit - gpus = []ml.DeviceInfo{} - case s.options.NumGPU < 0 && s.estimate.Layers > 0: - s.options.NumGPU = s.estimate.Layers + if projectorGPU == -1 { + projectorGPU = firstIntegrated } - } else { - s.options.NumGPU = 0 + + gpus[projectorGPU].FreeMemory -= projectorWeights } - // On linux and windows, over-allocating CPU memory will almost always result in an error - // Darwin has fully dynamic swap so has no direct concept of free swap space - if runtime.GOOS != "darwin" { - systemMemoryRequired := s.estimate.TotalSize - s.estimate.VRAMSize - available := systemInfo.FreeMemory + systemInfo.FreeSwap - if systemMemoryRequired > available { - slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.TotalMemory), "free", format.HumanBytes2(systemInfo.FreeMemory), "swap", format.HumanBytes2(systemInfo.FreeSwap)) - return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available)) + var kvTotal uint64 + for _, kvLayer := range kv { + kvTotal += kvLayer + } + + if graphPartialOffload == 0 { + headsKV := s.ggml.KV().HeadCountKVMin() + if headsKV == 0 { + headsKV = 1 + } + gqa := s.ggml.KV().HeadCountMax() / headsKV + graphPartialOffload = gqa * kvTotal / 6 + } + if graphFullOffload == 0 { + graphFullOffload = graphPartialOffload + } + + // On Metal there's no partial offload overhead + if len(gpus) > 0 && gpus[0].Library == "Metal" { + graphPartialOffload = graphFullOffload + } + + // Create a layout based on the memory data that we've built. The compute graph + // for GPUs is iteratively assigned based on the number of GPUs that are required. 
+ var gpuLayers ml.GPULayersList + for { + prevGPULayers := gpuLayers + + var err error + gpuLayers, err = s.createLayout(systemInfo, gpus, s.mem, requireFull, 0) + if err != nil { + return nil, err + } + + if len(gpuLayers) > len(prevGPULayers) { + for _, gl := range gpuLayers { + for i := range s.mem.GPUs { + if gl.DeviceID == s.mem.GPUs[i].DeviceID { + s.mem.GPUs[i].Graph = max(graphPartialOffload, graphFullOffload) + break + } + } + } + } else { + break } } - slog.Info("offload", "", s.estimate) + // This maintains the historical assignment of graph sizes, though it isn't fully accurate + graphSize := graphFullOffload + if gpuLayers.Sum() < int(s.totalLayers) { + graphSize = graphPartialOffload + } - s.gpus = gpus - s.loadRequest.GPULayers = createGPULayers(s.estimate, s.ggml, gpus, s.options.NumGPU) + // For all layers that we have assigned to GPUs, move them in the memory data so + // that it is reported accurately + for _, gl := range gpuLayers { + for i := range s.mem.GPUs { + if gl.DeviceID == s.mem.GPUs[i].DeviceID { + for _, l := range gl.Layers { + s.mem.GPUs[i].Weights[l] = s.mem.CPU.Weights[l] + s.mem.GPUs[i].Cache[l] = s.mem.CPU.Cache[l] - // Mmap is only supported on the llama engine - if s.textProcessor == nil { - s.loadRequest.UseMmap = true + s.mem.CPU.Weights[l] = 0 + s.mem.CPU.Cache[l] = 0 + } - // mmap has issues with partial offloading on metal - for _, g := range gpus { - if g.Library == "Metal" && - uint64(s.options.NumGPU) > 0 && - uint64(s.options.NumGPU) < s.ggml.KV().BlockCount()+1 { - s.options.UseMMap = new(bool) - *s.options.UseMMap = false + s.mem.GPUs[i].Graph = graphSize + break } } + } - // Windows CUDA should not use mmap for best performance - // Linux with a model larger than free space, mmap leads to thrashing - // For CPU loads we want the memory to be allocated, not FS cache - if (runtime.GOOS == "windows" && len(gpus) > 0 && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) || - (runtime.GOOS == "linux" && systemInfo.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) || - (len(gpus) == 0 && s.options.UseMMap == nil) || - (len(gpus) > 0 && gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) || - (s.options.UseMMap != nil && !*s.options.UseMMap) { - s.loadRequest.UseMmap = false + if projectorGPU > 0 && len(s.mem.GPUs[projectorGPU].Weights) > 0 { + s.mem.GPUs[projectorGPU].Weights[s.totalLayers-1] += projectorWeights + } + + slog.Debug("memory", "estimate", s.mem) + s.mem.Log(slog.LevelInfo) + + // The llama engine uses mmap by default + s.loadRequest.UseMmap = true + + // mmap has issues with partial offloading on metal + for _, g := range gpus { + if g.Library == "Metal" && + uint64(s.options.NumGPU) > 0 && + uint64(s.options.NumGPU) < s.totalLayers { + s.options.UseMMap = new(bool) + *s.options.UseMMap = false } } + // Windows CUDA should not use mmap for best performance + // Linux with a model larger than free space, mmap leads to thrashing + // For CPU loads we want the memory to be allocated, not FS cache + if (runtime.GOOS == "windows" && len(gpus) > 0 && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) || + (runtime.GOOS == "linux" && systemInfo.FreeMemory < s.TotalSize() && s.options.UseMMap == nil) || + (len(gpus) == 0 && s.options.UseMMap == nil) || + (len(gpus) > 0 && gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) || + (s.options.UseMMap != nil && !*s.options.UseMMap) { + s.loadRequest.UseMmap = false + } + if err := s.waitUntilRunnerLaunched(ctx); err != nil { return nil, err } + 
s.loadRequest.GPULayers = gpuLayers resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit) if err != nil { return nil, err } - // On the Ollama engine, we can print out a summary of the memory allocations. - // We don't have this for the llama engine but it does something similar itself. - if s.textProcessor != nil { - resp.Memory.Log(slog.LevelInfo) - } - if !resp.Success { - slog.Warn("failed to allocate memory for model", "memory", resp.Memory) return nil, errors.New("failed to allocate memory for model") } // The llama engine does its memory allocations together with model loading, so we // need to wait until it is done to ensure that we have accurate memory data before // loading the next model - if s.textProcessor == nil { - return uniqueDeviceIDs(s.loadRequest.GPULayers), s.WaitUntilRunning(ctx) - } else { - return uniqueDeviceIDs(s.loadRequest.GPULayers), nil - } + return uniqueDeviceIDs(s.loadRequest.GPULayers), s.WaitUntilRunning(ctx) } -// createGPULayers maps from the tensor splits assigned by the memory estimates to explicit assignment -// of particular layers onto GPUs -func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus []ml.DeviceInfo, numGPU int) ml.GPULayersList { - if numGPU <= 0 || len(gpus) == 0 { - return nil +func projectorMemoryRequirements(filename string) (weights uint64) { + file, err := os.Open(filename) + if err != nil { + return 0 + } + defer file.Close() + + ggml, err := ggml.Decode(file, 1024) + if err != nil { + return 0 } - gpuLayers := make(ml.GPULayersList, len(gpus)) - for i := range gpuLayers { - gpuLayers[i].DeviceID = gpus[i].DeviceID + for _, layer := range ggml.Tensors().GroupLayers() { + weights += layer.Size() } - var sum float32 - splits := make([]float32, len(estimate.TensorSplit)) - // cumulative sum of all splits - for i := range splits { - sum += float32(estimate.TensorSplit[i]) - splits[i] = sum - } - - if sum <= 0 { - return nil - } - - // normalize splits - for i := range splits { - splits[i] /= sum - } - - blocks := int(ggml.KV().BlockCount()) - gpuRangeStart := max(0, blocks-numGPU) - gpuRangeStop := min(gpuRangeStart+numGPU, blocks+1) - for i := range blocks + 1 { - if i < gpuRangeStart || i >= gpuRangeStop { - continue - } - - index := slices.IndexFunc(splits, func(f float32) bool { return float32(i-gpuRangeStart)/float32(gpuRangeStop-gpuRangeStart) < f }) - if index < 0 || index >= len(gpus) { - continue - } - - gpuLayers[index].Layers = append(gpuLayers[index].Layers, i) - } - - return gpuLayers + return weights } // Load finds the optimal layout of layers to offload on GPUs based on no initial information about the size of the model @@ -652,23 +706,6 @@ func (s *ollamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus slog.Info("loading model", "model layers", s.totalLayers, "requested", s.options.NumGPU) - systemTotalMemory := systemInfo.TotalMemory - systemFreeMemory := systemInfo.FreeMemory - systemSwapFreeMemory := systemInfo.FreeSwap - slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory)) - - for _, gpu := range gpus { - available := gpu.FreeMemory - envconfig.GpuOverhead() - gpu.MinimumMemory() - if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory() { - available = 0 - } - slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library, - "available", format.HumanBytes2(available), - "free", format.HumanBytes2(gpu.FreeMemory), - "minimum", 
format.HumanBytes2(gpu.MinimumMemory()), - "overhead", format.HumanBytes2(envconfig.GpuOverhead())) - } - pastAllocations := make(map[uint64]struct{}) var backoff float32 @@ -834,25 +871,22 @@ func uniqueDeviceIDs(gpuLayers ml.GPULayersList) []ml.DeviceID { // - Calculating how much space each GPU has available for layers, based on free memory and space occupied by the graph // - Assigning layers // - Ensuring that we don't exceed limits, such as requirements about partial offloading or system memory -func (s *ollamaServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) { +func (s *llmServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) { if memory == nil { memory = &ml.BackendMemory{CPU: ml.DeviceMemory{ Weights: make([]uint64, s.totalLayers), Cache: make([]uint64, s.totalLayers), }} } - gpuLayers, layers, err := s.buildLayout(systemGPUs, memory, requireFull, backoff) - if err != nil { - return nil, err - } - err = s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers) + gpuLayers, layers := s.buildLayout(systemGPUs, memory, requireFull, backoff) + err := s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers) if err != nil { return nil, err } return gpuLayers, nil } -func (s *ollamaServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, []uint64, error) { +func (s *llmServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, []uint64) { gpus := append(make([]ml.DeviceInfo, 0, len(systemGPUs)), systemGPUs...) sort.Sort(sort.Reverse(ml.ByFreeMemory(gpus))) @@ -910,11 +944,11 @@ func (s *ollamaServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.Backen gpuLayers = libraryGpuLayers } } - return gpuLayers, layers, nil + return gpuLayers, layers } // verifyLayout ensures that we don't exceed limits, such as requirements about partial offloading or system memory -func (s *ollamaServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error { +func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error { // These sizes will only increase as we go through additional iterations and get additional information. 
cpuSize := memory.InputWeights + memory.CPU.Graph var vramSize uint64 @@ -942,11 +976,13 @@ nextLayer: if requireFull { if gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) { + slog.Info("model requires more memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum()) return ErrLoadRequiredFull } if cpuSize > systemInfo.FreeMemory { - return ErrLoadRequiredFull + slog.Info("model requires more system memory than is currently available, evicting a model to make space", "required", cpuSize, "free", systemInfo.FreeMemory) + return fmt.Errorf("model requires more system memory than is currently available %w", ErrLoadRequiredFull) } } @@ -1734,31 +1770,12 @@ func (s *llmServer) Close() error { return nil } -func (s *llamaServer) VRAMSize() uint64 { - return s.estimate.VRAMSize -} - -func (s *llamaServer) TotalSize() uint64 { - return s.estimate.TotalSize -} - -func (s *llamaServer) VRAMByGPU(id ml.DeviceID) uint64 { - for i, gpu := range s.gpus { - if gpu.DeviceID == id { - if i < len(s.estimate.GPUSizes) { - return s.estimate.GPUSizes[i] - } - } - } - return 0 -} - func (s *llamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo { slog.Debug("llamarunner free vram reporting not supported") return nil } -func (s *ollamaServer) VRAMSize() uint64 { +func (s *llmServer) VRAMSize() uint64 { if s.mem == nil { return 0 } @@ -1786,7 +1803,7 @@ func (s *ollamaServer) VRAMSize() uint64 { return mem } -func (s *ollamaServer) TotalSize() uint64 { +func (s *llmServer) TotalSize() uint64 { if s.mem == nil { return 0 } @@ -1800,7 +1817,7 @@ func (s *ollamaServer) TotalSize() uint64 { return mem } -func (s *ollamaServer) VRAMByGPU(id ml.DeviceID) uint64 { +func (s *llmServer) VRAMByGPU(id ml.DeviceID) uint64 { if s.mem == nil { return 0 } diff --git a/server/sched.go b/server/sched.go index 5ae42efd10..c5bc6692da 100644 --- a/server/sched.go +++ b/server/sched.go @@ -437,6 +437,23 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo s.loadedMu.Unlock() + systemTotalMemory := systemInfo.TotalMemory + systemFreeMemory := systemInfo.FreeMemory + systemSwapFreeMemory := systemInfo.FreeSwap + slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory)) + + for _, gpu := range gpus { + available := gpu.FreeMemory - envconfig.GpuOverhead() - gpu.MinimumMemory() + if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory() { + available = 0 + } + slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library, + "available", format.HumanBytes2(available), + "free", format.HumanBytes2(gpu.FreeMemory), + "minimum", format.HumanBytes2(gpu.MinimumMemory()), + "overhead", format.HumanBytes2(envconfig.GpuOverhead())) + } + gpuIDs, err := llama.Load(req.ctx, systemInfo, gpus, requireFull) if err != nil { if errors.Is(err, llm.ErrLoadRequiredFull) {
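
Note on the new flow in `llamaServer.Load`: instead of carrying a separate `MemoryEstimate`, the llama engine now synthesizes a per-layer `ml.BackendMemory` table from the GGUF tensor sizes and the KV-cache estimate, and then reuses the shared `createLayout`/`verifyLayout` path. The sketch below is illustrative only and uses heavily simplified stand-ins for `ml.BackendMemory` and `ml.DeviceMemory` (the real types live in the `ml` package and carry more fields such as `DeviceID`); it shows how per-layer weight and cache bytes roll up into totals in the spirit of `TotalSize`/`VRAMSize`, and how "offloading" a layer is just moving its entries from the CPU slices to a GPU's slices.

```go
package main

import "fmt"

// Simplified stand-ins for ml.DeviceMemory / ml.BackendMemory.
type deviceMemory struct {
	Name    string
	Weights []uint64 // per-layer weight bytes assigned to this device
	Cache   []uint64 // per-layer KV-cache bytes assigned to this device
	Graph   uint64   // compute-graph scratch reserved on this device
}

type backendMemory struct {
	InputWeights uint64
	CPU          deviceMemory
	GPUs         []deviceMemory
}

func sum(d deviceMemory) uint64 {
	n := d.Graph
	for i := range d.Weights {
		n += d.Weights[i] + d.Cache[i]
	}
	return n
}

// totalSize mirrors the spirit of llmServer.TotalSize: every allocation on every device.
func totalSize(m backendMemory) uint64 {
	n := m.InputWeights + sum(m.CPU)
	for _, g := range m.GPUs {
		n += sum(g)
	}
	return n
}

// vramSize mirrors the spirit of llmServer.VRAMSize: GPU-resident allocations only.
func vramSize(m backendMemory) uint64 {
	var n uint64
	for _, g := range m.GPUs {
		n += sum(g)
	}
	return n
}

func main() {
	// Two text layers plus an output layer, all initially attributed to the CPU.
	m := backendMemory{
		CPU:  deviceMemory{Name: "CPU", Weights: []uint64{100, 100, 40}, Cache: []uint64{16, 16, 0}},
		GPUs: []deviceMemory{{Name: "GPU0", Weights: make([]uint64, 3), Cache: make([]uint64, 3)}},
	}

	// Offload layer 0 to GPU0, as the patch does for every layer picked by createLayout.
	m.GPUs[0].Weights[0], m.CPU.Weights[0] = m.CPU.Weights[0], 0
	m.GPUs[0].Cache[0], m.CPU.Cache[0] = m.CPU.Cache[0], 0

	fmt.Println("total:", totalSize(m), "vram:", vramSize(m))
}
```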
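The graph-sizing loop in `llamaServer.Load` converges by alternating layout and graph reservation: each pass assigns layers given the current free-memory figures, then charges `max(graphPartialOffload, graphFullOffload)` to every GPU that received layers, and repeats until the set of participating GPUs stops growing. A minimal sketch of that fixed-point-style loop follows; `assign` is a hypothetical greedy placeholder for the real `createLayout`, which additionally handles `requireFull`, backoff, and per-library grouping.

```go
package main

import "fmt"

// gpu is a simplified stand-in for ml.DeviceInfo plus the per-device graph
// reservation tracked in ml.DeviceMemory.
type gpu struct {
	name  string
	free  uint64 // free bytes reported for the device
	graph uint64 // graph bytes reserved so far
}

// assign greedily packs fixed-size layers onto GPUs, leaving room for any
// graph already reserved on each device.
func assign(gpus []gpu, layerSize uint64, nLayers int) map[string]int {
	out := map[string]int{}
	for l := 0; l < nLayers; l++ {
		for i := range gpus {
			if gpus[i].free >= gpus[i].graph+uint64(out[gpus[i].name]+1)*layerSize {
				out[gpus[i].name]++
				break
			}
		}
	}
	return out
}

func main() {
	gpus := []gpu{{name: "GPU0", free: 1000}, {name: "GPU1", free: 600}}
	const layerSize, nLayers, graphSize = 200, 6, 300

	prev := -1
	var layout map[string]int
	// Re-run the layout until charging the graph to newly used GPUs no longer
	// grows the number of GPUs in use -- the same shape as the loop in Load.
	for {
		layout = assign(gpus, layerSize, nLayers)
		if len(layout) <= prev {
			break
		}
		prev = len(layout)
		for i := range gpus {
			if layout[gpus[i].name] > 0 {
				gpus[i].graph = graphSize // stands in for max(partial, full)
			}
		}
	}
	fmt.Println(layout)
}
```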