diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 909104c642..0b5d37a7bd 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -797,73 +797,6 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri return } -func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { - if llm.KV().Uint("vision.block_count") == 0 { - return - } - - for name, layer := range llm.Tensors().GroupLayers() { - if name == "v" || strings.HasPrefix(name, "v.") { - for _, tensor := range layer { - weights += tensor.Size() - } - } - } - - imageSize := uint64(llm.KV().Uint("vision.image_size")) - patchSize := uint64(llm.KV().Uint("vision.patch_size")) - if patchSize == 0 { - slog.Warn("unknown patch size for vision model") - return - } - - numChannels := uint64(llm.KV().Uint("vision.num_channels")) - - numPatches := (imageSize / patchSize) * (imageSize / patchSize) - if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok { - numPatches++ - } - - headCount := uint64(llm.KV().Uint("vision.attention.head_count")) - embeddingLength := uint64(llm.KV().Uint("vision.embedding_length")) - - switch llm.KV().Architecture() { - case "mllama": - numPaddedPatches := numPatches + 8 - (numPatches%8)%8 - - maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles")) - - graphSize = 4 * (8 + - imageSize*imageSize*numChannels*maxNumTiles + - embeddingLength*numPatches*maxNumTiles + - 9*embeddingLength*numPaddedPatches*maxNumTiles + - numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount) - case "gemma3", "mistral3": - graphSize = 4 * (imageSize*imageSize*numChannels + - embeddingLength*patchSize + - numPatches*numPatches*headCount) - case "qwen25vl": - maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280)) - - numPatches := maxPixels / (patchSize * patchSize) - - graphSize = 4 * (maxPixels*numChannels + // Original image storage - // Normalized pixels - maxPixels*numChannels + - // Patches storage (numPatches * channels * patchSize^2) - numPatches*numChannels*patchSize*patchSize + - // Self-attention calculations - numPatches*numPatches*headCount + - // Additional buffer for processing - embeddingLength*numPatches) - case "llama4": - // vision graph is computed independently in the same schedule - // and is negligible compared to the worst case text graph - } - - return weights, graphSize -} - // SupportsKVCacheType checks if the requested cache type is supported func (f GGML) SupportsKVCacheType(cacheType string) bool { if cacheType == "" || cacheType == "f16" { diff --git a/llm/memory.go b/llm/memory.go deleted file mode 100644 index 15558109f6..0000000000 --- a/llm/memory.go +++ /dev/null @@ -1,516 +0,0 @@ -package llm - -import ( - "fmt" - "log/slog" - "os" - "slices" - "sort" - "strings" - - "github.com/ollama/ollama/api" - "github.com/ollama/ollama/envconfig" - "github.com/ollama/ollama/format" - "github.com/ollama/ollama/fs/ggml" - "github.com/ollama/ollama/ml" -) - -// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits -// The list of GPUs returned will always be the same brand (library) -// If the model can not be fit fully within the available GPU(s) nil is returned -func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, gpus []ml.DeviceInfo, numParallel int) []ml.DeviceInfo { - for _, gl := range ml.ByLibrary(gpus) { - sgl := append(make([]ml.DeviceInfo, 0, len(gl)), gl...) 
- - // TODO - potentially sort by performance capability, existing models loaded, etc. - // TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them - // Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups - sort.Sort(sort.Reverse(ml.ByFreeMemory(sgl))) - - if !envconfig.SchedSpread() { - // Try to pack into as few GPUs as possible, starting from 1 GPU - for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ { - gpuSubset := sgl[:numGPUs] - ok, estimatedVRAM := predictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel) - - if ok { - slog.Info("new model will fit in available VRAM across minimum required GPUs, loading", - "model", modelPath, - "library", sgl[0].Library, - "parallel", numParallel, - "required", format.HumanBytes2(estimatedVRAM), - "gpus", numGPUs) - return gpuSubset - } - } - } else { - // TODO future refinements - // - if multiple Libraries, see if any single GPU in any Library will fit - // - try subsets of GPUs instead of just falling back to 1 or all in a family - - // Now try all the GPUS (OLLAMA_SCHED_SPREAD is set) - if ok, estimatedVRAM := predictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok { - slog.Info("new model will fit in available VRAM, loading", - "model", modelPath, - "library", sgl[0].Library, - "parallel", numParallel, - "required", format.HumanBytes2(estimatedVRAM), - "gpus", len(sgl)) - return sgl - } - } - } - return nil -} - -// If multiple Libraries are detected, pick the Library which loads the most layers for the model -func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []string, opts api.Options, gpus []ml.DeviceInfo, numParallel int) []ml.DeviceInfo { - byLibrary := ml.ByLibrary(gpus) - if len(byLibrary) <= 1 { - return gpus - } - var bestEstimate uint64 - var bestFit int - for i, gl := range byLibrary { - _, estimatedVRAM := predictServerFit(gl, f, adapters, projectors, opts, numParallel) - if estimatedVRAM > bestEstimate { - bestEstimate = estimatedVRAM - bestFit = i - } - } - return byLibrary[bestFit] -} - -// This algorithm looks for a complete fit to determine if we need to unload other models -func predictServerFit(allGpus []ml.DeviceInfo, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) { - // Split up the GPUs by type and try them - var estimatedVRAM uint64 - for _, gpus := range ml.ByLibrary(allGpus) { - var layerCount int - estimate := estimateGPULayers(gpus, f, projectors, opts, numParallel) - layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize - if opts.NumGPU < 0 { - if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) { - return true, estimatedVRAM - } - } else { - if layerCount > 0 && layerCount >= opts.NumGPU { - return true, estimatedVRAM - } - } - } - return false, estimatedVRAM -} - -func verifyCPUFit(f *ggml.GGML, modelPath string, projectors []string, adapters []string, opts api.Options, systemInfo ml.SystemInfo, numParallel int) bool { - estimate := estimateGPULayers(nil, f, projectors, opts, numParallel) - if estimate.TotalSize > systemInfo.FreeMemory { - return false - } - slog.Info("new model will fit in available system memory for CPU inference, loading", - "model", modelPath, - "parallel", numParallel, - "required", format.HumanBytes2(estimate.TotalSize), - ) - return true -} - -type MemoryEstimate struct { - // How many layers we predict we can load - Layers int - - // The size of the graph which occupies the main GPU 
- Graph uint64 - - // How much VRAM will be allocated given the number of layers we predict - VRAMSize uint64 - - // The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize - TotalSize uint64 - - // For multi-GPU scenarios, this provides the tensor split parameter - TensorSplit []int - - // For multi-GPU scenarios, this is the size in bytes per GPU - GPUSizes []uint64 - - // internal fields for logging purposes - inferenceLibrary string - layersRequested int - layersModel int - availableList []string - kv uint64 - allocationsList []string - memoryWeights uint64 - memoryLayerOutput uint64 - graphFullOffload uint64 - graphPartialOffload uint64 - - projectorWeights, projectorGraph uint64 -} - -// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size -// The GPUs provided must all be the same Library -func estimateGPULayers(gpus []ml.DeviceInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate { - // Graph size for a partial offload, applies to all GPUs - var graphPartialOffload uint64 - - // Graph size when all layers are offloaded, applies to all GPUs - var graphFullOffload uint64 - - // Final graph offload once we know full or partial - var graphOffload uint64 - - // Projectors loaded into GPU0 only - var llamaEngineProjectorWeights uint64 - - // Projectors loaded with output layer - var ollamaEngineProjectorWeights uint64 - var ollamaEngineProjectorGraph uint64 - - // Conditional output size on GPU 0 - var memoryLayerOutput uint64 - - // The sizes of a layer - var layerSize uint64 - - // The sum of all the layer sizes (just for logging) - var memoryWeights uint64 - - // True if all the layers are loaded - var fullyLoaded bool - - // Overflow that didn't fit into the GPU - var overflow uint64 - - overhead := envconfig.GpuOverhead() - availableList := make([]string, len(gpus)) - libraries := []string{} - for i, gpu := range gpus { - availableList[i] = format.HumanBytes2(gpu.FreeMemory) - if !slices.Contains(libraries, gpu.Library) { - libraries = append(libraries, gpu.Library) - } - } - if len(libraries) == 0 { - libraries = []string{"cpu"} - } - slog.Debug("evaluating", "library", strings.Join(libraries, ","), "gpu_count", len(gpus), "available", availableList) - - for _, projector := range projectors { - llamaEngineProjectorWeights += projectorMemoryRequirements(projector) - } - if llamaEngineProjectorWeights == 0 { - ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize() - } - - layers := f.Tensors().GroupLayers() - // add one layer worth of memory as a buffer - if blk0, ok := layers["blk.0"]; ok { - layerSize = blk0.Size() - } else { - slog.Warn("model missing blk.0 layer size") - } - - useFlashAttention := envconfig.FlashAttention(f.FlashAttention()) && - ml.FlashAttentionSupported(gpus) && - f.SupportsFlashAttention() - - var kvct string - if useFlashAttention { - requested := strings.ToLower(envconfig.KvCacheType()) - if f.SupportsKVCacheType(requested) { - kvct = requested - } - } - - kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), numParallel, kvct, useFlashAttention) - - if len(kv) > 0 { - layerSize += kv[0] - } - - var kvTotal uint64 - for _, kvLayer := range kv { - kvTotal += kvLayer - } - - if graphPartialOffload == 0 { - headsKV := f.KV().HeadCountKVMin() - if headsKV == 0 { - headsKV = 1 - } - gqa := f.KV().HeadCountMax() / headsKV - graphPartialOffload 
= gqa * kvTotal / 6 - } - if graphFullOffload == 0 { - graphFullOffload = graphPartialOffload - } - - // on metal there's no partial offload overhead - if len(gpus) > 0 && gpus[0].Library == "Metal" { - graphPartialOffload = graphFullOffload - } else if len(gpus) > 1 { - // multigpu should always use the partial graph size - graphFullOffload = graphPartialOffload - } - - // Output layer handled at the end if we have space - if layer, ok := layers["output_norm"]; ok { - memoryLayerOutput += layer.Size() - } - if layer, ok := layers["output"]; ok { - memoryLayerOutput += layer.Size() - } else if layer, ok := layers["token_embd"]; ok { - memoryLayerOutput += layer.Size() - } - - gpuZeroOverhead := llamaEngineProjectorWeights - - // Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer - var layerCount int - tensorSplit := make([]int, len(gpus)) - gpuAllocations := make([]uint64, len(gpus)) - type gs struct { - i int - g *ml.DeviceInfo - } - gpusWithSpace := []gs{} - for i := range gpus { - var gzo uint64 - if len(gpusWithSpace) == 0 { - gzo = gpuZeroOverhead - } - // Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer - if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory()+2*layerSize { - slog.Debug("gpu has too little memory to allocate any layers", - "id", gpus[i].ID, - "library", gpus[i].Library, - "compute", gpus[i].Compute(), - "driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor), - "name", gpus[i].Name, - "total", format.HumanBytes2(gpus[i].TotalMemory), - "available", format.HumanBytes2(gpus[i].FreeMemory), - "minimum_memory", gpus[i].MinimumMemory, - "layer_size", format.HumanBytes2(layerSize), - "gpu_zer_overhead", format.HumanBytes2(gzo), - "partial_offload", format.HumanBytes2(graphPartialOffload), - "full_offload", format.HumanBytes2(graphFullOffload), - ) - continue - } - gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]}) - gpuAllocations[i] += gpus[i].MinimumMemory() + layerSize // We hold off on graph until we know partial vs. full - } - - var gpuZeroID int - if len(gpusWithSpace) > 0 { - gpuZeroID = gpusWithSpace[0].i - gpuAllocations[gpuZeroID] += gpuZeroOverhead - } else { - overflow += gpuZeroOverhead - } - - // For all the layers, find where they can fit on the GPU(s) - for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- { - // Some models have inconsistent layer sizes - if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { - layerSize = blk.Size() - layerSize += kv[i] - memoryWeights += blk.Size() - } - - if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { - // Stop allocating on GPU(s) once we hit the users target NumGPU - overflow += layerSize - continue - } - - // distribute the layers across the GPU(s) that have space - for j := len(gpusWithSpace); j > 0; j-- { - g := gpusWithSpace[i%j] - used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) - if g.g.FreeMemory > overhead+used+layerSize { - gpuAllocations[g.i] += layerSize - tensorSplit[g.i]++ - layerCount++ - break - } else { - gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...) 
- } - } - - if len(gpusWithSpace) == 0 { - overflow += layerSize - } - } - if layerCount >= int(f.KV().BlockCount()) { - fullyLoaded = true - } - - // Determine if we need to consider output then find where it fits - memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph - if memoryLastLayer > 0 { - if opts.NumGPU < 0 || layerCount < opts.NumGPU { - for j := len(gpusWithSpace); j > 0; j-- { - g := gpusWithSpace[layerCount%j] - used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) - if g.g.FreeMemory > overhead+used+memoryLastLayer { - gpuAllocations[g.i] += memoryLastLayer - tensorSplit[g.i]++ - layerCount++ - break - } - } - } - - if layerCount < int(f.KV().BlockCount())+1 { - fullyLoaded = false - overflow += memoryLastLayer - } - } - - // Add the applicable (full or partial) graph allocations - for i := range gpus { - if tensorSplit[i] <= 0 { - continue - } - if fullyLoaded { - gpuAllocations[i] += graphFullOffload - } else { - gpuAllocations[i] += graphPartialOffload - } - } - if fullyLoaded { - graphOffload = graphFullOffload - } else { - graphOffload = graphPartialOffload - } - - // Summaries for the log - var memoryRequiredPartial, memoryRequiredTotal uint64 - for i := range gpuAllocations { - memoryRequiredPartial += gpuAllocations[i] - } - memoryRequiredTotal = memoryRequiredPartial + overflow - - allocationsList := []string{} - for _, a := range gpuAllocations { - allocationsList = append(allocationsList, format.HumanBytes2(a)) - } - - estimate := MemoryEstimate{ - TotalSize: memoryRequiredTotal, - Layers: 0, - Graph: 0, - VRAMSize: 0, - GPUSizes: []uint64{}, - - inferenceLibrary: strings.Join(libraries, ","), - layersRequested: opts.NumGPU, - layersModel: int(f.KV().BlockCount()) + 1, - availableList: availableList, - kv: kvTotal, - allocationsList: allocationsList, - memoryWeights: memoryWeights, - memoryLayerOutput: memoryLayerOutput, - graphFullOffload: graphFullOffload, - graphPartialOffload: graphPartialOffload, - projectorWeights: llamaEngineProjectorWeights + ollamaEngineProjectorWeights, - projectorGraph: ollamaEngineProjectorGraph, - } - - if len(gpus) == 0 { - return estimate - } - if layerCount == 0 { - slog.Debug("insufficient VRAM to load any model layers") - return estimate - } - estimate.Layers = layerCount - estimate.Graph = graphOffload - estimate.VRAMSize = memoryRequiredPartial - estimate.TotalSize = memoryRequiredTotal - estimate.TensorSplit = tensorSplit - estimate.GPUSizes = gpuAllocations - return estimate -} - -func (m MemoryEstimate) LogValue() slog.Value { - attrs := []slog.Attr{ - slog.String("library", m.inferenceLibrary), - slog.Group( - "layers", - // requested number of layers to offload - "requested", m.layersRequested, - // The number of layers the model has (including output) - "model", m.layersModel, - // estimated number of layers that can be offloaded - "offload", m.Layers, - // multi-gpu split for tensors - "split", m.TensorSplit, - ), - slog.Group( - "memory", - // memory available by GPU for offloading - "available", m.availableList, - "gpu_overhead", format.HumanBytes2(envconfig.GpuOverhead()), - slog.Group( - "required", - // memory required for full offloading - "full", format.HumanBytes2(m.TotalSize), - // memory required to offload layers.estimate layers - "partial", format.HumanBytes2(m.VRAMSize), - // memory of KV cache - "kv", format.HumanBytes2(m.kv), - // Allocations across the GPUs - "allocations", m.allocationsList, - ), - slog.Group( - "weights", - // memory of the 
weights - "total", format.HumanBytes2(m.memoryWeights+m.memoryLayerOutput), - // memory of repeating layers - "repeating", format.HumanBytes2(m.memoryWeights), - // memory of non-repeating layers - "nonrepeating", format.HumanBytes2(m.memoryLayerOutput), - ), - slog.Group( - "graph", - // memory of graph when fully offloaded - "full", format.HumanBytes2(m.graphFullOffload), - // memory of graph when not fully offloaded - "partial", format.HumanBytes2(m.graphPartialOffload), - ), - ), - } - - if m.projectorWeights > 0 { - attrs = append(attrs, slog.Group( - "projector", - "weights", format.HumanBytes2(m.projectorWeights), - "graph", format.HumanBytes2(m.projectorGraph), - )) - } - - return slog.GroupValue(attrs...) -} - -func projectorMemoryRequirements(filename string) (weights uint64) { - file, err := os.Open(filename) - if err != nil { - return 0 - } - defer file.Close() - - ggml, err := ggml.Decode(file, 1024) - if err != nil { - return 0 - } - - for _, layer := range ggml.Tensors().GroupLayers() { - weights += layer.Size() - } - - return weights -} diff --git a/llm/memory_test.go b/llm/memory_test.go deleted file mode 100644 index fce17b9c25..0000000000 --- a/llm/memory_test.go +++ /dev/null @@ -1,130 +0,0 @@ -package llm - -import ( - "bytes" - "fmt" - "os" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "github.com/ollama/ollama/api" - "github.com/ollama/ollama/format" - "github.com/ollama/ollama/fs/ggml" - "github.com/ollama/ollama/ml" -) - -func TestEstimateGPULayers(t *testing.T) { - t.Setenv("OLLAMA_DEBUG", "1") - t.Setenv("OLLAMA_KV_CACHE_TYPE", "") // Ensure default f16 - t.Setenv("OLLAMA_CONTEXT_LENGTH", "2048") - - modelName := "dummy" - f, err := os.CreateTemp(t.TempDir(), modelName) - require.NoError(t, err) - defer f.Close() - inputLayerCount := 5 - - tensors := []*ggml.Tensor{ - {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, - {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, - {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, - {Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, - {Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, - {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))}, - } - assert.Len(t, tensors, inputLayerCount+1) - err = ggml.WriteGGUF(f, ggml.KV{ - "general.architecture": "llama", - "llama.context_length": uint32(32), - "llama.embedding_length": uint32(4096), - "llama.block_count": uint32(inputLayerCount), - "llama.attention.head_count": uint32(32), - "llama.attention.head_count_kv": uint32(32), - "tokenizer.ggml.tokens": []string{" "}, - "tokenizer.ggml.scores": []float32{0}, - "tokenizer.ggml.token_type": []int32{0}, - }, tensors) - require.NoError(t, err) - - ggml, err := LoadModel(f.Name(), 0) - if err != nil { - t.Fatal(err) - } - - // Simple CPU scenario - gpus := []ml.DeviceInfo{} - projectors := []string{} - opts := api.DefaultOptions() - t.Run("cpu", func(t *testing.T) { - estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1) - assert.Equal(t, 0, estimate.Layers) - 
assert.Equal(t, uint64(0), estimate.Graph) - }) - - // derived from the dummy ggml file above - graphPartialOffload := uint64(202377216) - graphFullOffload := uint64(171968512) - layerSize := uint64(33554436) - projectorSize := uint64(0) - memoryLayerOutput := uint64(4) - - // Dual CUDA scenario with asymmetry - gpuMinimumMemory := uint64(457 * format.MebiByte) - gpus = []ml.DeviceInfo{ - { - DeviceID: ml.DeviceID{ - Library: "CUDA", - }, - }, - { - DeviceID: ml.DeviceID{ - Library: "CUDA", - }, - }, - } - // Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1 - for i, s := range []struct { - layer0, layer1 uint64 - expect0, expect1 int - }{ - {1, 1, 1, 1}, - {2, 1, 2, 1}, - {2, 2, 2, 2}, - {1, 2, 1, 2}, - {3, 3, 3, 3}, - {4, 4, 3, 3}, - {6, 6, 3, 3}, - {0, 3, 0, 3}, - } { - t.Run(fmt.Sprintf("%v", s), func(t *testing.T) { - gpus[0].FreeMemory = 0 - gpus[1].FreeMemory = 0 - gpus[0].FreeMemory += projectorSize - if s.layer0 > 0 { - gpus[0].FreeMemory += memoryLayerOutput - } else { - gpus[1].FreeMemory += memoryLayerOutput - } - gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1 - gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1 - gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload) - gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload) - estimate := estimateGPULayers(gpus, ggml, projectors, opts, 1) - assert.Equal(t, s.expect0+s.expect1, estimate.Layers, "scenario %d: %v", i, s) - assert.Equal(t, []int{s.expect0, s.expect1}, estimate.TensorSplit, "scenario %d: %v", i, s) - var layerSums uint64 - for _, b := range estimate.GPUSizes { - layerSums += b - } - if estimate.Layers < inputLayerCount+1 { - assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate) - assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate) - } else { - assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate) - assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate) - } - }) - } -} diff --git a/llm/server.go b/llm/server.go index 87f97a010f..83690fcdc1 100644 --- a/llm/server.go +++ b/llm/server.go @@ -92,7 +92,8 @@ type llmServer struct { numParallel int modelPath string - loadRequest LoadRequest // Parameters used to initialize the runner + loadRequest LoadRequest // Parameters used to initialize the runner + mem *ml.BackendMemory // Memory allocations for this model // llamaModel is an instance of the cgo llama.cpp model definition // nil if this server is running the new engine @@ -113,15 +114,11 @@ type llmServer struct { type llamaServer struct { llmServer - ggml *ggml.GGML - gpus []ml.DeviceInfo // The set of GPUs covered by the memory estimate - estimate MemoryEstimate + ggml *ggml.GGML } type ollamaServer struct { llmServer - - mem *ml.BackendMemory } // LoadModel will load a model from disk. The model must be in the GGML format. 
@@ -463,169 +460,226 @@ type LoadResponse struct { var ErrLoadRequiredFull = errors.New("unable to load full model on GPU") -func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) { - systemTotalMemory := systemInfo.TotalMemory - systemFreeMemory := systemInfo.FreeMemory - systemSwapFreeMemory := systemInfo.FreeSwap - slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory)) +func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) { + slog.Info("loading model", "model layers", s.totalLayers, "requested", s.options.NumGPU) - if len(gpus) == 0 || s.options.NumGPU == 0 { - if !verifyCPUFit(s.ggml, s.modelPath, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, systemInfo, s.numParallel) { - slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate) - return nil, fmt.Errorf("model requires more system memory than is currently available %w", ErrLoadRequiredFull) + gpus := append(make([]ml.DeviceInfo, 0, len(systemGPUs)), systemGPUs...) + + // Synthesize memory allocation information based on our estimates + s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{ + Name: "CPU", + Weights: make([]uint64, s.totalLayers), + Cache: make([]uint64, s.totalLayers), + }, GPUs: make([]ml.DeviceMemory, len(gpus))} + + for i := range s.mem.GPUs { + s.mem.GPUs[i].Name = gpus[i].Name + s.mem.GPUs[i].DeviceID = gpus[i].DeviceID + s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers) + s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers) + } + + kv, graphPartialOffload, graphFullOffload := s.ggml.GraphSize(uint64(s.options.NumCtx), uint64(s.loadRequest.BatchSize), + s.loadRequest.Parallel, s.loadRequest.KvCacheType, s.loadRequest.FlashAttention) + + // Use the size of one layer as a buffer + layers := s.ggml.Tensors().GroupLayers() + if blk0, ok := layers["blk.0"]; ok { + for i := range gpus { + gpus[i].FreeMemory -= blk0.Size() + kv[0] } } else { - g := pickBestFullFitByLibrary(s.ggml, s.modelPath, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel) - if g == nil { - if !requireFull { - g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel) - } else { - slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate) - return nil, ErrLoadRequiredFull + slog.Warn("model missing blk.0 layer size") + } + + // Assign all the layers to the CPU for now, they will get reassigned later + for i := range s.ggml.KV().BlockCount() { + if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { + s.mem.CPU.Weights[i] = blk.Size() + s.mem.CPU.Cache[i] += kv[i] + } + } + + // We historically haven't included InputWeights in the model size + var outputWeights uint64 + if layer, ok := layers["output_norm"]; ok { + outputWeights += layer.Size() + } + if layer, ok := layers["output"]; ok { + outputWeights += layer.Size() + } else if layer, ok := layers["token_embd"]; ok { + outputWeights += layer.Size() + } + s.mem.CPU.Weights[s.totalLayers-1] = outputWeights + + // The vision projector is always loaded on the first GPU if available. 
+ // This can't be assigned by us, so just subtract it from free space + projectorGPU := -1 + var projectorWeights uint64 + if len(gpus) > 0 { + for _, projector := range s.loadRequest.LoraPath { + projectorWeights += projectorMemoryRequirements(projector) + } + + // llama.cpp uses the first discrete GPU if available, otherwise the first iGPU + firstIntegrated := -1 + for i := range gpus { + if !gpus[i].Integrated { + projectorGPU = i + break + } + if firstIntegrated == -1 { + firstIntegrated = i } } - gpus = g - } - - s.estimate = estimateGPULayers(gpus, s.ggml, []string{s.loadRequest.ProjectorPath}, s.options, s.numParallel) - - if len(gpus) >= 1 { - switch { - case s.options.NumGPU == 0: - gpus = []ml.DeviceInfo{} - case gpus[0].Library == "Metal" && s.estimate.VRAMSize > systemInfo.TotalMemory: - // disable partial offloading when model is greater than total system memory as this - // can lead to locking up the system - s.options.NumGPU = 0 - gpus = []ml.DeviceInfo{} - case gpus[0].Library != "Metal" && s.estimate.Layers == 0: - // Don't bother loading into the GPU if no layers can fit - gpus = []ml.DeviceInfo{} - case s.options.NumGPU < 0 && s.estimate.Layers > 0: - s.options.NumGPU = s.estimate.Layers + if projectorGPU == -1 { + projectorGPU = firstIntegrated } - } else { - s.options.NumGPU = 0 + + gpus[projectorGPU].FreeMemory -= projectorWeights } - // On linux and windows, over-allocating CPU memory will almost always result in an error - // Darwin has fully dynamic swap so has no direct concept of free swap space - if runtime.GOOS != "darwin" { - systemMemoryRequired := s.estimate.TotalSize - s.estimate.VRAMSize - available := systemInfo.FreeMemory + systemInfo.FreeSwap - if systemMemoryRequired > available { - slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", format.HumanBytes2(available), "total", format.HumanBytes2(systemInfo.TotalMemory), "free", format.HumanBytes2(systemInfo.FreeMemory), "swap", format.HumanBytes2(systemInfo.FreeSwap)) - return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available)) + var kvTotal uint64 + for _, kvLayer := range kv { + kvTotal += kvLayer + } + + if graphPartialOffload == 0 { + headsKV := s.ggml.KV().HeadCountKVMin() + if headsKV == 0 { + headsKV = 1 + } + gqa := s.ggml.KV().HeadCountMax() / headsKV + graphPartialOffload = gqa * kvTotal / 6 + } + if graphFullOffload == 0 { + graphFullOffload = graphPartialOffload + } + + // On Metal there's no partial offload overhead + if len(gpus) > 0 && gpus[0].Library == "Metal" { + graphPartialOffload = graphFullOffload + } + + // Create a layout based on the memory data that we've built. The compute graph + // for GPUs is iteratively assigned based on the number of GPUs that are required. 
+ var gpuLayers ml.GPULayersList + for { + prevGPULayers := gpuLayers + + var err error + gpuLayers, err = s.createLayout(systemInfo, gpus, s.mem, requireFull, 0) + if err != nil { + return nil, err + } + + if len(gpuLayers) > len(prevGPULayers) { + for _, gl := range gpuLayers { + for i := range s.mem.GPUs { + if gl.DeviceID == s.mem.GPUs[i].DeviceID { + s.mem.GPUs[i].Graph = max(graphPartialOffload, graphFullOffload) + break + } + } + } + } else { + break } } - slog.Info("offload", "", s.estimate) + // This maintains the historical assignment of graph sizes, though it isn't fully accurate + graphSize := graphFullOffload + if gpuLayers.Sum() < int(s.totalLayers) { + graphSize = graphPartialOffload + } - s.gpus = gpus - s.loadRequest.GPULayers = createGPULayers(s.estimate, s.ggml, gpus, s.options.NumGPU) + // For all layers that we have assigned to GPUs, move them in the memory data so + // that it is reported accurately + for _, gl := range gpuLayers { + for i := range s.mem.GPUs { + if gl.DeviceID == s.mem.GPUs[i].DeviceID { + for _, l := range gl.Layers { + s.mem.GPUs[i].Weights[l] = s.mem.CPU.Weights[l] + s.mem.GPUs[i].Cache[l] = s.mem.CPU.Cache[l] - // Mmap is only supported on the llama engine - if s.textProcessor == nil { - s.loadRequest.UseMmap = true + s.mem.CPU.Weights[l] = 0 + s.mem.CPU.Cache[l] = 0 + } - // mmap has issues with partial offloading on metal - for _, g := range gpus { - if g.Library == "Metal" && - uint64(s.options.NumGPU) > 0 && - uint64(s.options.NumGPU) < s.ggml.KV().BlockCount()+1 { - s.options.UseMMap = new(bool) - *s.options.UseMMap = false + s.mem.GPUs[i].Graph = graphSize + break } } + } - // Windows CUDA should not use mmap for best performance - // Linux with a model larger than free space, mmap leads to thrashing - // For CPU loads we want the memory to be allocated, not FS cache - if (runtime.GOOS == "windows" && len(gpus) > 0 && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) || - (runtime.GOOS == "linux" && systemInfo.FreeMemory < s.estimate.TotalSize && s.options.UseMMap == nil) || - (len(gpus) == 0 && s.options.UseMMap == nil) || - (len(gpus) > 0 && gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) || - (s.options.UseMMap != nil && !*s.options.UseMMap) { - s.loadRequest.UseMmap = false + if projectorGPU > 0 && len(s.mem.GPUs[projectorGPU].Weights) > 0 { + s.mem.GPUs[projectorGPU].Weights[s.totalLayers-1] += projectorWeights + } + + slog.Debug("memory", "estimate", s.mem) + s.mem.Log(slog.LevelInfo) + + // The llama engine uses mmap by default + s.loadRequest.UseMmap = true + + // mmap has issues with partial offloading on metal + for _, g := range gpus { + if g.Library == "Metal" && + uint64(s.options.NumGPU) > 0 && + uint64(s.options.NumGPU) < s.totalLayers { + s.options.UseMMap = new(bool) + *s.options.UseMMap = false } } + // Windows CUDA should not use mmap for best performance + // Linux with a model larger than free space, mmap leads to thrashing + // For CPU loads we want the memory to be allocated, not FS cache + if (runtime.GOOS == "windows" && len(gpus) > 0 && gpus[0].Library == "CUDA" && s.options.UseMMap == nil) || + (runtime.GOOS == "linux" && systemInfo.FreeMemory < s.TotalSize() && s.options.UseMMap == nil) || + (len(gpus) == 0 && s.options.UseMMap == nil) || + (len(gpus) > 0 && gpus[0].Library == "Vulkan" && s.options.UseMMap == nil) || + (s.options.UseMMap != nil && !*s.options.UseMMap) { + s.loadRequest.UseMmap = false + } + if err := s.waitUntilRunnerLaunched(ctx); err != nil { return nil, err } + 
s.loadRequest.GPULayers = gpuLayers resp, err := s.initModel(ctx, s.loadRequest, LoadOperationCommit) if err != nil { return nil, err } - // On the Ollama engine, we can print out a summary of the memory allocations. - // We don't have this for the llama engine but it does something similar itself. - if s.textProcessor != nil { - resp.Memory.Log(slog.LevelInfo) - } - if !resp.Success { - slog.Warn("failed to allocate memory for model", "memory", resp.Memory) return nil, errors.New("failed to allocate memory for model") } // The llama engine does its memory allocations together with model loading, so we // need to wait until it is done to ensure that we have accurate memory data before // loading the next model - if s.textProcessor == nil { - return uniqueDeviceIDs(s.loadRequest.GPULayers), s.WaitUntilRunning(ctx) - } else { - return uniqueDeviceIDs(s.loadRequest.GPULayers), nil - } + return uniqueDeviceIDs(s.loadRequest.GPULayers), s.WaitUntilRunning(ctx) } -// createGPULayers maps from the tensor splits assigned by the memory estimates to explicit assignment -// of particular layers onto GPUs -func createGPULayers(estimate MemoryEstimate, ggml *ggml.GGML, gpus []ml.DeviceInfo, numGPU int) ml.GPULayersList { - if numGPU <= 0 || len(gpus) == 0 { - return nil +func projectorMemoryRequirements(filename string) (weights uint64) { + file, err := os.Open(filename) + if err != nil { + return 0 + } + defer file.Close() + + ggml, err := ggml.Decode(file, 1024) + if err != nil { + return 0 } - gpuLayers := make(ml.GPULayersList, len(gpus)) - for i := range gpuLayers { - gpuLayers[i].DeviceID = gpus[i].DeviceID + for _, layer := range ggml.Tensors().GroupLayers() { + weights += layer.Size() } - var sum float32 - splits := make([]float32, len(estimate.TensorSplit)) - // cumulative sum of all splits - for i := range splits { - sum += float32(estimate.TensorSplit[i]) - splits[i] = sum - } - - if sum <= 0 { - return nil - } - - // normalize splits - for i := range splits { - splits[i] /= sum - } - - blocks := int(ggml.KV().BlockCount()) - gpuRangeStart := max(0, blocks-numGPU) - gpuRangeStop := min(gpuRangeStart+numGPU, blocks+1) - for i := range blocks + 1 { - if i < gpuRangeStart || i >= gpuRangeStop { - continue - } - - index := slices.IndexFunc(splits, func(f float32) bool { return float32(i-gpuRangeStart)/float32(gpuRangeStop-gpuRangeStart) < f }) - if index < 0 || index >= len(gpus) { - continue - } - - gpuLayers[index].Layers = append(gpuLayers[index].Layers, i) - } - - return gpuLayers + return weights } // Load finds the optimal layout of layers to offload on GPUs based on no initial information about the size of the model @@ -652,23 +706,6 @@ func (s *ollamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, gpus slog.Info("loading model", "model layers", s.totalLayers, "requested", s.options.NumGPU) - systemTotalMemory := systemInfo.TotalMemory - systemFreeMemory := systemInfo.FreeMemory - systemSwapFreeMemory := systemInfo.FreeSwap - slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory)) - - for _, gpu := range gpus { - available := gpu.FreeMemory - envconfig.GpuOverhead() - gpu.MinimumMemory() - if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory() { - available = 0 - } - slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library, - "available", format.HumanBytes2(available), - "free", format.HumanBytes2(gpu.FreeMemory), - "minimum", 
format.HumanBytes2(gpu.MinimumMemory()), - "overhead", format.HumanBytes2(envconfig.GpuOverhead())) - } - pastAllocations := make(map[uint64]struct{}) var backoff float32 @@ -834,25 +871,22 @@ func uniqueDeviceIDs(gpuLayers ml.GPULayersList) []ml.DeviceID { // - Calculating how much space each GPU has available for layers, based on free memory and space occupied by the graph // - Assigning layers // - Ensuring that we don't exceed limits, such as requirements about partial offloading or system memory -func (s *ollamaServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) { +func (s *llmServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, error) { if memory == nil { memory = &ml.BackendMemory{CPU: ml.DeviceMemory{ Weights: make([]uint64, s.totalLayers), Cache: make([]uint64, s.totalLayers), }} } - gpuLayers, layers, err := s.buildLayout(systemGPUs, memory, requireFull, backoff) - if err != nil { - return nil, err - } - err = s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers) + gpuLayers, layers := s.buildLayout(systemGPUs, memory, requireFull, backoff) + err := s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers) if err != nil { return nil, err } return gpuLayers, nil } -func (s *ollamaServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, []uint64, error) { +func (s *llmServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, backoff float32) (ml.GPULayersList, []uint64) { gpus := append(make([]ml.DeviceInfo, 0, len(systemGPUs)), systemGPUs...) sort.Sort(sort.Reverse(ml.ByFreeMemory(gpus))) @@ -910,11 +944,11 @@ func (s *ollamaServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.Backen gpuLayers = libraryGpuLayers } } - return gpuLayers, layers, nil + return gpuLayers, layers } // verifyLayout ensures that we don't exceed limits, such as requirements about partial offloading or system memory -func (s *ollamaServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error { +func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error { // These sizes will only increase as we go through additional iterations and get additional information. 
cpuSize := memory.InputWeights + memory.CPU.Graph var vramSize uint64 @@ -942,11 +976,13 @@ nextLayer: if requireFull { if gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) { + slog.Info("model requires more memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum()) return ErrLoadRequiredFull } if cpuSize > systemInfo.FreeMemory { - return ErrLoadRequiredFull + slog.Info("model requires more system memory than is currently available, evicting a model to make space", "required", cpuSize, "free", systemInfo.FreeMemory) + return fmt.Errorf("model requires more system memory than is currently available %w", ErrLoadRequiredFull) } } @@ -1734,31 +1770,12 @@ func (s *llmServer) Close() error { return nil } -func (s *llamaServer) VRAMSize() uint64 { - return s.estimate.VRAMSize -} - -func (s *llamaServer) TotalSize() uint64 { - return s.estimate.TotalSize -} - -func (s *llamaServer) VRAMByGPU(id ml.DeviceID) uint64 { - for i, gpu := range s.gpus { - if gpu.DeviceID == id { - if i < len(s.estimate.GPUSizes) { - return s.estimate.GPUSizes[i] - } - } - } - return 0 -} - func (s *llamaServer) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo { slog.Debug("llamarunner free vram reporting not supported") return nil } -func (s *ollamaServer) VRAMSize() uint64 { +func (s *llmServer) VRAMSize() uint64 { if s.mem == nil { return 0 } @@ -1786,7 +1803,7 @@ func (s *ollamaServer) VRAMSize() uint64 { return mem } -func (s *ollamaServer) TotalSize() uint64 { +func (s *llmServer) TotalSize() uint64 { if s.mem == nil { return 0 } @@ -1800,7 +1817,7 @@ func (s *ollamaServer) TotalSize() uint64 { return mem } -func (s *ollamaServer) VRAMByGPU(id ml.DeviceID) uint64 { +func (s *llmServer) VRAMByGPU(id ml.DeviceID) uint64 { if s.mem == nil { return 0 } diff --git a/server/sched.go b/server/sched.go index 5ae42efd10..c5bc6692da 100644 --- a/server/sched.go +++ b/server/sched.go @@ -437,6 +437,23 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo s.loadedMu.Unlock() + systemTotalMemory := systemInfo.TotalMemory + systemFreeMemory := systemInfo.FreeMemory + systemSwapFreeMemory := systemInfo.FreeSwap + slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory)) + + for _, gpu := range gpus { + available := gpu.FreeMemory - envconfig.GpuOverhead() - gpu.MinimumMemory() + if gpu.FreeMemory < envconfig.GpuOverhead()+gpu.MinimumMemory() { + available = 0 + } + slog.Info("gpu memory", "id", gpu.ID, "library", gpu.Library, + "available", format.HumanBytes2(available), + "free", format.HumanBytes2(gpu.FreeMemory), + "minimum", format.HumanBytes2(gpu.MinimumMemory()), + "overhead", format.HumanBytes2(envconfig.GpuOverhead())) + } + gpuIDs, err := llama.Load(req.ctx, systemInfo, gpus, requireFull) if err != nil { if errors.Is(err, llm.ErrLoadRequiredFull) {
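
Note on the new flow in `llamaServer.Load`: instead of carrying a separate `MemoryEstimate`, the llama engine now synthesizes a per-layer `ml.BackendMemory` table from the GGUF tensor sizes and the KV-cache estimate, and then reuses the shared `createLayout`/`verifyLayout` path. The sketch below is illustrative only and uses heavily simplified stand-ins for `ml.BackendMemory` and `ml.DeviceMemory` (the real types live in the `ml` package and carry more fields such as `DeviceID`); it shows how per-layer weight and cache bytes roll up into totals in the spirit of `TotalSize`/`VRAMSize`, and how "offloading" a layer is just moving its entries from the CPU slices to a GPU's slices.

```go
package main

import "fmt"

// Simplified stand-ins for ml.DeviceMemory / ml.BackendMemory.
type deviceMemory struct {
	Name    string
	Weights []uint64 // per-layer weight bytes assigned to this device
	Cache   []uint64 // per-layer KV-cache bytes assigned to this device
	Graph   uint64   // compute-graph scratch reserved on this device
}

type backendMemory struct {
	InputWeights uint64
	CPU          deviceMemory
	GPUs         []deviceMemory
}

func sum(d deviceMemory) uint64 {
	n := d.Graph
	for i := range d.Weights {
		n += d.Weights[i] + d.Cache[i]
	}
	return n
}

// totalSize mirrors the spirit of llmServer.TotalSize: every allocation on every device.
func totalSize(m backendMemory) uint64 {
	n := m.InputWeights + sum(m.CPU)
	for _, g := range m.GPUs {
		n += sum(g)
	}
	return n
}

// vramSize mirrors the spirit of llmServer.VRAMSize: GPU-resident allocations only.
func vramSize(m backendMemory) uint64 {
	var n uint64
	for _, g := range m.GPUs {
		n += sum(g)
	}
	return n
}

func main() {
	// Two text layers plus an output layer, all initially attributed to the CPU.
	m := backendMemory{
		CPU:  deviceMemory{Name: "CPU", Weights: []uint64{100, 100, 40}, Cache: []uint64{16, 16, 0}},
		GPUs: []deviceMemory{{Name: "GPU0", Weights: make([]uint64, 3), Cache: make([]uint64, 3)}},
	}

	// Offload layer 0 to GPU0, as the patch does for every layer picked by createLayout.
	m.GPUs[0].Weights[0], m.CPU.Weights[0] = m.CPU.Weights[0], 0
	m.GPUs[0].Cache[0], m.CPU.Cache[0] = m.CPU.Cache[0], 0

	fmt.Println("total:", totalSize(m), "vram:", vramSize(m))
}
```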
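The graph-sizing loop in `llamaServer.Load` converges by alternating layout and graph reservation: each pass assigns layers given the current free-memory figures, then charges `max(graphPartialOffload, graphFullOffload)` to every GPU that received layers, and repeats until the set of participating GPUs stops growing. A minimal sketch of that fixed-point-style loop follows; `assign` is a hypothetical greedy placeholder for the real `createLayout`, which additionally handles `requireFull`, backoff, and per-library grouping.

```go
package main

import "fmt"

// gpu is a simplified stand-in for ml.DeviceInfo plus the per-device graph
// reservation tracked in ml.DeviceMemory.
type gpu struct {
	name  string
	free  uint64 // free bytes reported for the device
	graph uint64 // graph bytes reserved so far
}

// assign greedily packs fixed-size layers onto GPUs, leaving room for any
// graph already reserved on each device.
func assign(gpus []gpu, layerSize uint64, nLayers int) map[string]int {
	out := map[string]int{}
	for l := 0; l < nLayers; l++ {
		for i := range gpus {
			if gpus[i].free >= gpus[i].graph+uint64(out[gpus[i].name]+1)*layerSize {
				out[gpus[i].name]++
				break
			}
		}
	}
	return out
}

func main() {
	gpus := []gpu{{name: "GPU0", free: 1000}, {name: "GPU1", free: 600}}
	const layerSize, nLayers, graphSize = 200, 6, 300

	prev := -1
	var layout map[string]int
	// Re-run the layout until charging the graph to newly used GPUs no longer
	// grows the number of GPUs in use -- the same shape as the loop in Load.
	for {
		layout = assign(gpus, layerSize, nLayers)
		if len(layout) <= prev {
			break
		}
		prev = len(layout)
		for i := range gpus {
			if layout[gpus[i].name] > 0 {
				gpus[i].graph = graphSize // stands in for max(partial, full)
			}
		}
	}
	fmt.Println(layout)
}
```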