ggml: Remove allocation status reporting

For each memory allocation we report the size of the (attempted)
allocation and whether it succeeded or failed. The latter status
reporting proved not to be very useful in practice, as systems
such as Windows can automatically overflow from VRAM into RAM,
resulting in successful allocations even when there isn't
enough memory where we wanted it.

As a result, this information is only used for debug logging,
which isn't valuable enough to justify the amount of code. It
also isn't fully accurate, as a request split across multiple
allocations can partially fail.
Author:    Jesse Gross
Date:      2025-09-22 17:27:03 -07:00
Committed: Jesse Gross
Parent:    0469861d9d
Commit:    734b57da0e
9 changed files with 86 additions and 201 deletions
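
The diff below captures the core of the change: per-allocation entries in
ml.BackendMemory shrink from a small struct to a plain byte count. As a rough
sketch of the prior shape, assuming identifiers (MemoryStatus and its constant
names, the ID field) that may not match the original code exactly:

    // Sketch of the pre-change types; MemoryStatus and its constant names
    // are illustrative assumptions, not necessarily the original identifiers.
    package ml

    type MemoryStatus int

    const (
        Unallocated MemoryStatus = iota
        Allocated
        Failed
    )

    // Memory tracked both the size of an allocation and whether it succeeded.
    type Memory struct {
        Size   uint64
        Status MemoryStatus
    }

    // DeviceMemory held one Memory entry per layer for weights and cache.
    type DeviceMemory struct {
        ID      string
        Weights []Memory
        Cache   []Memory
    }

After this commit the status goes away entirely, so Weights and Cache become
[]uint64 and callers assign sizes directly, as the test update shows.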


@@ -155,18 +155,18 @@ func TestLLMServerFitGPU(t *testing.T) {
 			}
 			s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{
-				Weights: make([]ml.Memory, s.totalLayers),
-				Cache:   make([]ml.Memory, s.totalLayers),
+				Weights: make([]uint64, s.totalLayers),
+				Cache:   make([]uint64, s.totalLayers),
 			}, GPUs: make([]ml.DeviceMemory, len(gpus))}
 			for i := range tt.layers {
-				s.mem.CPU.Weights[i].Size = uint64(tt.layers[i])
+				s.mem.CPU.Weights[i] = uint64(tt.layers[i])
 			}
 			for i := range s.mem.GPUs {
 				s.mem.GPUs[i].ID = fmt.Sprintf("gpu%d", i)
-				s.mem.GPUs[i].Weights = make([]ml.Memory, s.totalLayers)
-				s.mem.GPUs[i].Cache = make([]ml.Memory, s.totalLayers)
+				s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers)
+				s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers)
 			}
 			gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, tt.requireFull, 0)
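
With the status field gone, aggregating a device's requirement is plain
arithmetic over the byte counts. A minimal usage sketch under the same assumed
field layout (the total helper is illustrative, not part of the repository):

    package main

    import "fmt"

    // DeviceMemory mirrors the simplified layout exercised by the test above:
    // per-layer byte counts instead of structs carrying an allocation status.
    type DeviceMemory struct {
        ID      string
        Weights []uint64
        Cache   []uint64
    }

    // total sums the per-layer weight and cache sizes for one device.
    func total(d DeviceMemory) uint64 {
        var sum uint64
        for _, w := range d.Weights {
            sum += w
        }
        for _, c := range d.Cache {
            sum += c
        }
        return sum
    }

    func main() {
        d := DeviceMemory{
            ID:      "gpu0",
            Weights: []uint64{512 << 20, 512 << 20}, // two layers, 512 MiB each
            Cache:   []uint64{64 << 20, 64 << 20},
        }
        fmt.Printf("%s needs %d bytes\n", d.ID, total(d))
    }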