ggml: Free ggml_backend_buffer_t when releasing buffer

When ggml_backend_buffer_free() is called, the device memory
is released but not all backends consistently release the actual
ggml_backend_buffer_t in system RAM, causing a memory leak.

Bug #10040
This commit is contained in:
Jesse Gross
2025-04-14 12:12:36 -07:00
committed by Jesse Gross
parent 0fe487e732
commit ccb7eb8135
6 changed files with 224 additions and 53 deletions

View File

@@ -541,6 +541,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
free(ctx->buffers);
free(ctx);
delete buffer;
}
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -1864,6 +1865,11 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_aligned_free(buffer->context, buffer->size);
delete buffer;
}
static void ggml_backend_cpu_ptr_buffer_free_buffer(ggml_backend_buffer_t buffer) {
delete buffer;
}
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -1911,7 +1917,7 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
};
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
/* .free_buffer = */ ggml_backend_cpu_ptr_buffer_free_buffer, // ptr is not owned by the buffer but need to free the buffer itself
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,

View File

@@ -784,6 +784,7 @@ struct ggml_backend_cuda_split_buffer_context {
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
delete ctx;
delete buffer;
}
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1056,6 +1057,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
CUDA_CHECK(cudaFreeHost(buffer->context));
delete buffer;
}
static void * ggml_cuda_host_malloc(size_t size) {