diff --git a/llama/patches/0014-graph-memory-reporting-on-failure.patch b/llama/patches/0014-graph-memory-reporting-on-failure.patch index 26fe8a8e03..a9fc420f19 100644 --- a/llama/patches/0014-graph-memory-reporting-on-failure.patch +++ b/llama/patches/0014-graph-memory-reporting-on-failure.patch @@ -4,48 +4,38 @@ Date: Fri, 18 Apr 2025 15:58:19 -0700 Subject: [PATCH] graph memory reporting on failure --- - ggml/include/ggml-alloc.h | 6 ++++++ - ggml/include/ggml-backend.h | 6 ++++++ - ggml/src/ggml-alloc.c | 38 +++++++++++++++++++++++++++++++++---- - ggml/src/ggml-backend.cpp | 10 ++++++++++ - 4 files changed, 56 insertions(+), 4 deletions(-) + ggml/include/ggml-alloc.h | 1 + + ggml/include/ggml-backend.h | 1 + + ggml/src/ggml-alloc.c | 36 ++++++++++++++++++++++++++++++++---- + ggml/src/ggml-backend.cpp | 7 +++++++ + 4 files changed, 41 insertions(+), 4 deletions(-) diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h -index 2cb150fd..781b1e10 100644 +index 2cb150fd2..7ab3f0192 100644 --- a/ggml/include/ggml-alloc.h +++ b/ggml/include/ggml-alloc.h -@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph +@@ -65,6 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n( + GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph); GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id); ++GGML_API size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id); -+struct ggml_allocr_buffer_status { -+ size_t size; -+ bool allocated; -+}; -+GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id); -+ // Utils // Create a buffer and allocate all the tensors in a ggml_context - GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h -index a2977ea2..8a91b381 100644 +index a2977ea2e..e8cf30841 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h -@@ -304,6 +304,12 @@ extern "C" { +@@ -303,6 +303,7 @@ extern "C" { + GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched); GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); ++ GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); -+ struct ggml_backend_buffer_status { -+ size_t size; -+ bool allocated; -+ }; -+ GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); -+ GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); - diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c -index 8b6e6028..41c8c4a2 100644 +index 8b6e60283..b58bd671d 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -350,6 +350,7 @@ struct node_alloc { @@ -108,11 +98,11 @@ index 8b6e6028..41c8c4a2 100644 } bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { -@@ -920,6 +932,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { +@@ -920,6 +932,22 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { return 
ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); } -+struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) { ++size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) { + GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers); + + for (int i = 0; i < buffer_id; i++) { @@ -121,34 +111,29 @@ index 8b6e6028..41c8c4a2 100644 + // (See above.) However, we need a different check because multiple buffers might be NULL in our + // case and we still want to know the attempted size. + -+ struct ggml_allocr_buffer_status status = {0, true}; -+ return status; ++ return 0; + } + } + -+ struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL}; -+ return status; ++ return galloc->buffer_sizes[buffer_id]; +} + // utils static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) { diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp -index 97f47abd..eded0291 100644 +index 97f47abd2..d02a40e60 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp -@@ -1631,6 +1631,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe +@@ -1631,6 +1631,13 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe return ggml_gallocr_get_buffer_size(sched->galloc, backend_index); } -+struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) { ++size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) { + int backend_index = ggml_backend_sched_backend_id(sched, backend); + GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); + -+ struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index); -+ struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated}; -+ -+ return status; ++ return ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index); +} + void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { diff --git a/llm/server.go b/llm/server.go index 75f049bc05..5a435c9f8e 100644 --- a/llm/server.go +++ b/llm/server.go @@ -853,19 +853,19 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d if memory == nil { memory = &ml.BackendMemory{CPU: ml.DeviceMemory{ - Weights: make([]ml.Memory, s.totalLayers), - Cache: make([]ml.Memory, s.totalLayers), + Weights: make([]uint64, s.totalLayers), + Cache: make([]uint64, s.totalLayers), }} } layers := make([]uint64, len(memory.CPU.Weights)) for i := range layers { for j := range memory.GPUs { - layers[i] += memory.GPUs[j].Weights[i].Size - layers[i] += memory.GPUs[j].Cache[i].Size + layers[i] += memory.GPUs[j].Weights[i] + layers[i] += memory.GPUs[j].Cache[i] } - layers[i] += memory.CPU.Weights[i].Size - layers[i] += memory.CPU.Cache[i].Size + layers[i] += memory.CPU.Weights[i] + layers[i] += memory.CPU.Cache[i] logutil.Trace("layer to assign", "layer", i, "size", format.HumanBytes2(layers[i])) } @@ -880,11 +880,11 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d found := false for j := range memory.GPUs { if gl[i].ID == memory.GPUs[j].ID { - if memory.GPUs[j].Graph.Size != 0 { + if memory.GPUs[j].Graph != 0 { lastUsedGPU = i } - reserved := 
uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory + envconfig.GpuOverhead() + memory.GPUs[j].Graph.Size + reserved := uint64(float32(gl[i].FreeMemory)*backoff) + gl[i].MinimumMemory + envconfig.GpuOverhead() + memory.GPUs[j].Graph if gl[i].FreeMemory > reserved { gl[i].FreeMemory -= reserved } else { @@ -895,7 +895,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d "available layer vram", format.HumanBytes2(gl[i].FreeMemory), "backoff", fmt.Sprintf("%.2f", backoff), "minimum", format.HumanBytes2(gl[i].MinimumMemory), "overhead", format.HumanBytes2(envconfig.GpuOverhead()), - "graph", format.HumanBytes2(memory.GPUs[j].Graph.Size)) + "graph", format.HumanBytes2(memory.GPUs[j].Graph)) found = true break @@ -914,12 +914,12 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d } // These sizes will only increase as we go through additional iterations and get additional information. - cpuSize := memory.InputWeights.Size + memory.CPU.Graph.Size + cpuSize := memory.InputWeights + memory.CPU.Graph var vramSize uint64 for _, gl := range gpuLayers { for _, gpu := range memory.GPUs { if gl.ID == gpu.ID { - vramSize += gpu.Graph.Size + vramSize += gpu.Graph break } } @@ -1723,21 +1723,21 @@ func (s *ollamaServer) VRAMSize() uint64 { var mem uint64 for _, g := range s.mem.GPUs { - mem += g.Allocated() + mem += g.Size() } // Some elements are always on CPU. However, if we have allocated all layers // on the GPU then include the CPU components as well, to represent complete offloading. noCPULayers := true for i := range s.mem.CPU.Weights { - if s.mem.CPU.Weights[i].Size != 0 || s.mem.CPU.Cache[i].Size != 0 { + if s.mem.CPU.Weights[i] != 0 || s.mem.CPU.Cache[i] != 0 { noCPULayers = false break } } if noCPULayers { - mem += s.mem.InputWeights.Size - mem += s.mem.CPU.Graph.Size + mem += s.mem.InputWeights + mem += s.mem.CPU.Graph } return mem @@ -1748,10 +1748,10 @@ func (s *ollamaServer) TotalSize() uint64 { return 0 } - mem := s.mem.InputWeights.Size - mem += s.mem.CPU.Allocated() + mem := s.mem.InputWeights + mem += s.mem.CPU.Size() for _, g := range s.mem.GPUs { - mem += g.Allocated() + mem += g.Size() } return mem @@ -1764,7 +1764,7 @@ func (s *ollamaServer) VRAMByGPU(gpuID string) uint64 { for _, g := range s.mem.GPUs { if g.ID == gpuID { - return g.Allocated() + return g.Size() } } diff --git a/llm/server_test.go b/llm/server_test.go index 4eed82bce3..db143a829d 100644 --- a/llm/server_test.go +++ b/llm/server_test.go @@ -155,18 +155,18 @@ func TestLLMServerFitGPU(t *testing.T) { } s.mem = &ml.BackendMemory{CPU: ml.DeviceMemory{ - Weights: make([]ml.Memory, s.totalLayers), - Cache: make([]ml.Memory, s.totalLayers), + Weights: make([]uint64, s.totalLayers), + Cache: make([]uint64, s.totalLayers), }, GPUs: make([]ml.DeviceMemory, len(gpus))} for i := range tt.layers { - s.mem.CPU.Weights[i].Size = uint64(tt.layers[i]) + s.mem.CPU.Weights[i] = uint64(tt.layers[i]) } for i := range s.mem.GPUs { s.mem.GPUs[i].ID = fmt.Sprintf("gpu%d", i) - s.mem.GPUs[i].Weights = make([]ml.Memory, s.totalLayers) - s.mem.GPUs[i].Cache = make([]ml.Memory, s.totalLayers) + s.mem.GPUs[i].Weights = make([]uint64, s.totalLayers) + s.mem.GPUs[i].Cache = make([]uint64, s.totalLayers) } gpuLayers, err := s.createLayout(systemInfo, gpus, s.mem, tt.requireFull, 0) diff --git a/ml/backend.go b/ml/backend.go index 455715b0d1..de3d7ec039 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -158,40 +158,6 @@ func (e ErrNoMem) Error() string { return 
fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory) } -type AllocationStatus int - -const ( - // Unallocated memory - have not yet attempted to allocate - Unallocated AllocationStatus = iota - - // Failed memory - tried to allocate the memory and did not succeed - Failed - - // Allocated memory = tried and succeeded to allocate memory - Allocated -) - -// Memory is the size of an allocation and whether it was successful. -type Memory struct { - Size uint64 - Status AllocationStatus -} - -func (m Memory) String() string { - s := fmt.Sprint(m.Size) - - switch m.Status { - case Unallocated: - s += "U" - case Failed: - s += "F" - case Allocated: - s += "A" - } - - return s -} - // DeviceMemory provides a breakdown of the memory needed // per device, such as a CPU or GPU. type DeviceMemory struct { @@ -204,39 +170,32 @@ type DeviceMemory struct { ID string // Weights is the per-layer memory needed for the model weights. - Weights []Memory + Weights []uint64 // Cache is the per-layer memory needed for the KV cache. - Cache []Memory + Cache []uint64 // Graph is the size of the compute graph. It is not per-layer. - Graph Memory + Graph uint64 } -// Allocated returns the total size of the memory that has been successfully -// allocated on this device -func (m DeviceMemory) Allocated() uint64 { - var mem uint64 +func sumMemory(mem []uint64) uint64 { + var sum uint64 - for _, w := range m.Weights { - if w.Status == Allocated { - mem += w.Size - } - } - for _, c := range m.Cache { - if c.Status == Allocated { - mem += c.Size - } - } - if m.Graph.Status == Allocated { - mem += m.Graph.Size + for _, m := range mem { + sum += m } - return mem + return sum } -func memoryPresent(mem []Memory) bool { - return slices.ContainsFunc(mem, func(m Memory) bool { return m.Size != 0 }) +// Size returns the total size of the memory required by this device +func (m DeviceMemory) Size() uint64 { + return sumMemory(m.Weights) + sumMemory(m.Cache) + m.Graph +} + +func memoryPresent(mem []uint64) bool { + return slices.ContainsFunc(mem, func(m uint64) bool { return m != 0 }) } func (m DeviceMemory) LogValue() slog.Value { @@ -249,7 +208,7 @@ func (m DeviceMemory) LogValue() slog.Value { attrs = append(attrs, slog.Any("Cache", m.Cache)) } - if m.Graph.Size != 0 { + if m.Graph != 0 { attrs = append(attrs, slog.Any("Graph", m.Graph)) } @@ -267,7 +226,7 @@ func (m DeviceMemory) LogValue() slog.Value { // accommodate that to make forward progress. type BackendMemory struct { // InputWeights are always located on the CPU and cannot be moved - InputWeights Memory + InputWeights uint64 // CPU model components are located in system memory. This does not // include unified memory allocated through the GPU. @@ -279,7 +238,7 @@ type BackendMemory struct { func (m BackendMemory) LogValue() slog.Value { var attrs []slog.Attr - if m.InputWeights.Size != 0 { + if m.InputWeights != 0 { attrs = append(attrs, slog.Any("InputWeights", m.InputWeights)) } @@ -291,17 +250,7 @@ func (m BackendMemory) LogValue() slog.Value { return slog.GroupValue(attrs...) 
} -func sumMemory(mem []Memory) uint64 { - var sum uint64 - - for _, m := range mem { - sum += m.Size - } - - return sum -} - -// Log prints a high level summary of the memory (allocated or not) +// Log prints a high level summary of the memory func (m BackendMemory) Log(level slog.Level) { var total uint64 @@ -311,7 +260,7 @@ func (m BackendMemory) Log(level slog.Level) { total += sum } } - if sum := m.InputWeights.Size + sumMemory(m.CPU.Weights); sum > 0 { + if sum := m.InputWeights + sumMemory(m.CPU.Weights); sum > 0 { slog.Log(context.TODO(), level, "model weights", "device", m.CPU.Name, "size", format.HumanBytes2(sum)) total += sum } @@ -328,12 +277,12 @@ func (m BackendMemory) Log(level slog.Level) { } for _, gpu := range m.GPUs { - if sum := gpu.Graph.Size; sum > 0 { + if sum := gpu.Graph; sum > 0 { slog.Log(context.TODO(), level, "compute graph", "device", gpu.Name, "size", format.HumanBytes2(sum)) total += sum } } - if sum := m.CPU.Graph.Size; sum > 0 { + if sum := m.CPU.Graph; sum > 0 { slog.Log(context.TODO(), level, "compute graph", "device", m.CPU.Name, "size", format.HumanBytes2(sum)) total += sum } diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 49dc3e1ab2..18b48117df 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -169,8 +169,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { var props C.struct_ggml_backend_dev_props C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props) requiredMemory.CPU.ID = C.GoString(props.id) - requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1) - requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1) + requiredMemory.CPU.Weights = make([]uint64, blocks+1) + requiredMemory.CPU.Cache = make([]uint64, blocks+1) // create list of buffer types for each gpu var gpuDeviceBufferTypes []deviceBufferType @@ -188,8 +188,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { var props C.struct_ggml_backend_dev_props C.ggml_backend_dev_get_props(d, &props) requiredMemory.GPUs[i].ID = C.GoString(props.id) - requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1) - requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1) + requiredMemory.GPUs[i].Weights = make([]uint64, blocks+1) + requiredMemory.GPUs[i].Cache = make([]uint64, blocks+1) } // inputs always use cpu @@ -275,13 +275,9 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt)) if layer == -1 { - // Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case - if params.AllocMemory { - requiredMemory.InputWeights.Status = ml.Allocated - } - requiredMemory.InputWeights.Size += uint64(size) + requiredMemory.InputWeights += uint64(size) } else { - btDeviceMemory[bt].Weights[layer].Size += uint64(size) + btDeviceMemory[bt].Weights[layer] += uint64(size) } //nolint:staticcheck // TODO: check if buffer type supports this tensor @@ -349,18 +345,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { } b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt) - if params.AllocMemory { - for i := range btDeviceMemory[bt].Weights { - if btDeviceMemory[bt].Weights[i].Size != 0 { - if b != nil { - btDeviceMemory[bt].Weights[i].Status = ml.Allocated - } else { - btDeviceMemory[bt].Weights[i].Status = ml.Failed - } - } - } - } - if b == nil { for _, b := range bbs { C.ggml_backend_buffer_free(b) @@ -795,24 +779,15 @@ func (c 
*Context) Reserve() { // Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations for _, bt := range c.b.schedBufts { - c.b.btDeviceMemory[bt].Graph = ml.Memory{} + c.b.btDeviceMemory[bt].Graph = 0 } for i := range c.b.schedBackends { - bufferStatus := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i]) - - graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph - graph.Size += uint64(bufferStatus.size) - if c.b.allocMemory { - if bufferStatus.allocated && graph.Status != ml.Failed { - graph.Status = ml.Allocated - } else { - graph.Status = ml.Failed - } - } + bufferSize := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i]) + c.b.btDeviceMemory[c.b.schedBufts[i]].Graph += uint64(bufferSize) logutil.Trace("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), - "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferStatus.size))) + "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferSize))) } if !reserved { @@ -862,16 +837,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor { b := C.ggml_backend_buft_alloc_buffer(c.buft, size) if c.layer >= 0 { - cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer] - - cache.Size += uint64(size) - if c.b.allocMemory { - if b != nil { - cache.Status = ml.Allocated - } else { - cache.Status = ml.Failed - } - } + c.b.btDeviceMemory[c.buft].Cache[c.layer] += uint64(size) } if b == nil { diff --git a/ml/backend/ggml/ggml/include/ggml-alloc.h b/ml/backend/ggml/ggml/include/ggml-alloc.h index 781b1e100c..7ab3f0192a 100644 --- a/ml/backend/ggml/ggml/include/ggml-alloc.h +++ b/ml/backend/ggml/ggml/include/ggml-alloc.h @@ -65,12 +65,7 @@ GGML_API bool ggml_gallocr_reserve_n( GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph); GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id); - -struct ggml_allocr_buffer_status { - size_t size; - bool allocated; -}; -GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id); +GGML_API size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id); // Utils // Create a buffer and allocate all the tensors in a ggml_context diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h index fda5ceb249..7989a5e27e 100644 --- a/ml/backend/ggml/ggml/include/ggml-backend.h +++ b/ml/backend/ggml/ggml/include/ggml-backend.h @@ -306,12 +306,7 @@ extern "C" { GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched); GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); - - struct ggml_backend_buffer_status { - size_t size; - bool allocated; - }; - GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); + GGML_API size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); diff --git a/ml/backend/ggml/ggml/src/ggml-alloc.c 
b/ml/backend/ggml/ggml/src/ggml-alloc.c index 41c8c4a2f4..b58bd671d5 100644 --- a/ml/backend/ggml/ggml/src/ggml-alloc.c +++ b/ml/backend/ggml/ggml/src/ggml-alloc.c @@ -932,7 +932,7 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); } -struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) { +size_t ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) { GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers); for (int i = 0; i < buffer_id; i++) { @@ -941,13 +941,11 @@ struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gal // (See above.) However, we need a different check because multiple buffers might be NULL in our // case and we still want to know the attempted size. - struct ggml_allocr_buffer_status status = {0, true}; - return status; + return 0; } } - struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL}; - return status; + return galloc->buffer_sizes[buffer_id]; } // utils diff --git a/ml/backend/ggml/ggml/src/ggml-backend.cpp b/ml/backend/ggml/ggml/src/ggml-backend.cpp index 6556943b02..6c12e40f8f 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp @@ -1656,14 +1656,11 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe return ggml_gallocr_get_buffer_size(sched->galloc, backend_index); } -struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) { +size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) { int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); - struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index); - struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated}; - - return status; + return ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index); } void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
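
Below is a short usage sketch (not part of the diff) showing how a caller might consume the simplified API after this change: ggml_backend_sched_get_attempted_buffer_size() now returns a plain size_t instead of a status struct, so when graph reservation fails the attempted per-backend compute buffer sizes can simply be logged or summed. The reporting function name and surrounding setup are hypothetical; only the ggml calls themselves come from the existing API plus the function added by this patch.

#include <stdio.h>
#include "ggml-backend.h"

// Log the compute buffer size each backend attempted to allocate when graph
// reservation fails. With this patch the attempted size is reported even if
// the underlying buffer allocation did not succeed.
static void report_attempted_sizes(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    if (ggml_backend_sched_reserve(sched, graph)) {
        return; // reservation succeeded, nothing to report
    }
    for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); i++) {
        ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
        size_t attempted = ggml_backend_sched_get_attempted_buffer_size(sched, backend);
        fprintf(stderr, "%s: attempted compute buffer size %zu bytes\n",
                ggml_backend_name(backend), attempted);
    }
}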