diff --git a/llm/memory.go b/llm/memory.go index 384e2dc60..766e9e444 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -182,7 +182,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, gzo = gpuZeroOverhead } // Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer - if (gpus[i].FreeMemory - overhead) < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize { + if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize { slog.Debug("gpu has too little memory to allocate any layers", "id", gpus[i].ID, "library", gpus[i].Library, @@ -228,7 +228,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, for j := len(gpusWithSpace); j > 0; j-- { g := gpusWithSpace[i%j] used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) - if (g.g.FreeMemory - overhead) > used+layerSize { + if g.g.FreeMemory > overhead+used+layerSize { gpuAllocations[g.i] += layerSize layerCounts[g.i]++ layerCount++ @@ -251,7 +251,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, for j := len(gpusWithSpace); j > 0; j-- { g := gpusWithSpace[layerCount%j] used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) - if (g.g.FreeMemory - overhead) > used+memoryLayerOutput { + if g.g.FreeMemory > overhead+used+memoryLayerOutput { gpuAllocations[g.i] += memoryLayerOutput layerCounts[g.i]++ layerCount++