From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Wed, 23 Jul 2025 11:58:49 -0700
Subject: [PATCH] ggml: No-alloc mode

Callers can set a scheduler to be no-alloc, meaning that it does not
allocate memory for tensors or operations. This can be used for
calculating memory requirements. Tensors and graphs must be recreated
with no-alloc set to false before loading data.
---
 ggml/include/ggml-backend.h     |   1 +
 ggml/src/ggml-backend-impl.h    |  16 +++
 ggml/src/ggml-backend.cpp       |  72 ++++++++++-
 ggml/src/ggml-cuda/common.cuh   |  48 ++++++-
 ggml/src/ggml-cuda/ggml-cuda.cu | 217 ++++++++++++++++++++++++++------
 5 files changed, 310 insertions(+), 44 deletions(-)

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 2773cc310..ae94887dd 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -291,6 +291,7 @@ extern "C" {
 
     // Initialize a backend scheduler, backends with low index are given priority over backends with high index
     GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new_ext(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload, bool alloc_buffers);
     GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
 
     // Initialize backend buffers from a measure graph
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index c36c12d65..369e9e25a 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -26,12 +26,17 @@ extern "C" {
         size_t                (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
         // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
         bool                  (*is_host)       (ggml_backend_buffer_type_t buft);
+
+        // (optional) returns a dummy buffer that is equivalent to one created by alloc_buffer but without actually being backed
+        // by memory
+        ggml_backend_buffer_t (*noalloc_buffer)(ggml_backend_buffer_type_t buft, size_t size);
     };
 
     struct ggml_backend_buffer_type {
         struct ggml_backend_buffer_type_i  iface;
         ggml_backend_dev_t device;
         void * context;
+        bool no_alloc;
     };
 
     //
@@ -63,6 +68,7 @@ extern "C" {
         void * context;
         size_t size;
         enum ggml_backend_buffer_usage usage;
+        bool no_alloc;
     };
 
     GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -114,6 +120,16 @@ extern "C" {
         void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
         // wait for an event on on a different stream
        void (*event_wait)  (ggml_backend_t backend, ggml_backend_event_t event);
+
+        // (optional) reserves intermediate buffers needed for the computation
+        // if alloc is true, memory is actually allocated, otherwise the required amount is just returned by buffer_size
+        enum ggml_status (*graph_reserve) (ggml_backend_t backend, struct ggml_cgraph * cgraph, bool alloc);
+
+        // (optional) returns the memory needed after calling graph_reserve
+        size_t (*buffer_size) (ggml_backend_t backend);
+
+        // (optional) frees memory from intermediate buffers that was allocated either by graph_compute or graph_reserve
+        void (*reset) (ggml_backend_t backend);
     };
 
     struct ggml_backend {
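Note on usage: the new mode is meant to be driven as a two-pass flow: create a no-alloc scheduler, reserve a worst-case graph to obtain sizes, then discard everything and rebuild with real allocations before loading data. The sketch below is illustrative only and is not part of the patch; it sticks to the CPU backend so it has no CUDA dependency, and the tensor shapes, graph_size value, and omitted error handling are placeholders.

#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include <stdio.h>

int main(void) {
    ggml_backend_t cpu = ggml_backend_cpu_init();
    ggml_backend_t backends[1] = { cpu };
    ggml_backend_buffer_type_t bufts[1] = { ggml_backend_get_default_buffer_type(cpu) };

    // build a small graph; no tensor data is allocated (no_alloc context)
    struct ggml_init_params ip = { ggml_tensor_overhead()*8 + ggml_graph_overhead(), NULL, true };
    struct ggml_context * ctx = ggml_init(ip);
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 256, 256);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 256, 256);
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, ggml_mul_mat(ctx, a, b));

    // pass 1: no-alloc scheduler, only sizes are computed
    ggml_backend_sched_t sched = ggml_backend_sched_new_ext(backends, bufts, 1, 2048, false, true, /*alloc_buffers=*/false);
    ggml_backend_sched_reserve(sched, gf);
    printf("compute buffer: %zu bytes\n", ggml_backend_sched_get_buffer_size(sched, cpu));
    ggml_backend_sched_free(sched);

    // pass 2: per the commit message, rebuild tensors/graph and the scheduler with
    // alloc_buffers=true before loading data and computing
    sched = ggml_backend_sched_new_ext(backends, bufts, 1, 2048, false, true, /*alloc_buffers=*/true);
    // ... rebuild ctx/gf, ggml_backend_sched_reserve(sched, gf), load data, compute ...

    ggml_backend_sched_free(sched);
    ggml_free(ctx);
    ggml_backend_free(cpu);
    return 0;
}

For device backends, ggml_backend_sched_get_attempted_buffer_size() (modified later in this patch) additionally folds in the backend's own scratch estimate reported through the new buffer_size() hook.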
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index d02a40e60..6b4dee4c7 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
         return ggml_backend_buffer_init(buft, {}, NULL, 0);
     }
 
+    if (buft->no_alloc) {
+        ggml_backend_buffer_t buf;
+
+        if (buft->iface.noalloc_buffer != NULL) {
+            buf = buft->iface.noalloc_buffer(buft, size);
+        } else {
+            buf = ggml_backend_buffer_init(buft, {}, NULL, size);
+        }
+
+        buf->no_alloc = true;
+        return buf;
+    }
+
     return buft->iface.alloc_buffer(buft, size);
 }
 
@@ -89,7 +102,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
         /* .buft      = */ buft,
         /* .context   = */ context,
         /* .size      = */ size,
-        /* .usage     = */ GGML_BACKEND_BUFFER_USAGE_ANY
+        /* .usage     = */ GGML_BACKEND_BUFFER_USAGE_ANY,
+        /* .no_alloc  = */ false
     };
 
     return buffer;
@@ -119,6 +133,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
         return NULL;
     }
 
+    // If we aren't allocating memory, return a placeholder non-NULL pointer
+    // that meets alignment requirements
+    if (buffer->no_alloc) {
+        return (void *)ggml_backend_buffer_get_alignment(buffer);
+    }
+
     void * base = buffer->iface.get_base(buffer);
 
     GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
@@ -663,6 +683,12 @@ struct ggml_backend_sched {
     bool op_offload;
 
     int debug;
+
+    // allocate buffers on attached ggml_backend_buffer_type_t's and during reservation
+    // if false, dummy buffers are used for faster memory sizing calculations
+    // the scheduler needs to be recreated with allocated buffers before it can be used
+    // for computation
+    bool alloc_buffers;
 };
 
 #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@@ -1449,6 +1475,17 @@ ggml_backend_sched_t ggml_backend_sched_new(
         size_t graph_size,
         bool parallel,
         bool op_offload) {
+    return ggml_backend_sched_new_ext(backends, bufts, n_backends, graph_size, parallel, op_offload, true);
+}
+
+ggml_backend_sched_t ggml_backend_sched_new_ext(
+        ggml_backend_t * backends,
+        ggml_backend_buffer_type_t * bufts,
+        int n_backends,
+        size_t graph_size,
+        bool parallel,
+        bool op_offload,
+        bool alloc_buffers) {
     GGML_ASSERT(n_backends > 0);
     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
     GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -1490,10 +1527,13 @@ ggml_backend_sched_t ggml_backend_sched_new(
                 sched->events[b][c] = ggml_backend_event_new(backends[b]->device);
             }
         }
+
+        sched->bufts[b]->no_alloc = !alloc_buffers;
     }
 
     sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
 
     sched->op_offload = op_offload;
+    sched->alloc_buffers = alloc_buffers;
 
     ggml_backend_sched_reset(sched);
@@ -1508,6 +1548,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
         for (int c = 0; c < sched->n_copies; c++) {
             ggml_backend_event_free(sched->events[b][c]);
         }
+
+        if (sched->backends[b]->iface.reset != NULL) {
+            sched->backends[b]->iface.reset(sched->backends[b]);
+        }
     }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
@@ -1547,6 +1591,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
         return false;
     }
 
+    if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
+        return false;
+    }
+
+    struct ggml_backend_sched_split * splits = sched->splits;
+    for (int i = 0; i < sched->n_splits; i++) {
+        struct ggml_backend_sched_split * split = &splits[i];
+        int split_backend_id = split->backend_id;
+        ggml_backend_t split_backend = sched->backends[split_backend_id];
+
+        if (split_backend->iface.graph_reserve != NULL) {
+            enum ggml_status ec = split_backend->iface.graph_reserve(split_backend, &split->graph, sched->alloc_buffers);
+            if (ec != GGML_STATUS_SUCCESS) {
+                return false;
+            }
+        }
+    }
+
     ggml_backend_sched_reset(sched);
 
     return true;
@@ -1635,7 +1697,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched,
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
 
-    return ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
+    size_t size = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
+
+    if (backend->iface.buffer_size != NULL) {
+        size += backend->iface.buffer_size(backend);
+    }
+
+    return size;
 }
 
 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
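Note on the placeholder base pointer above: the allocator only ever does offset arithmetic on the value returned by ggml_backend_buffer_get_base(), so returning the alignment yields tensor addresses that are non-NULL, distinct, and correctly aligned without any backing memory. The toy program below is not ggml code (the names and offsets are made up); it just demonstrates that property.

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uintptr_t alignment = 128;          // e.g. what the CUDA buffer type reports
    void * fake_base = (void *)alignment;     // placeholder base, same trick as the patch

    // offsets a tensor allocator might hand out inside the (virtual) buffer
    uintptr_t offsets[3] = { 0, 512, 4096 };
    for (int i = 0; i < 3; i++) {
        void * fake_data = (char *)fake_base + offsets[i];
        printf("tensor %d -> %p, aligned=%d\n", i, fake_data,
               (int)((uintptr_t)fake_data % alignment == 0));
    }
    return 0;
}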
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 2e5d48797..b915ee1b8 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -35,6 +35,31 @@
 #include "vendors/cuda.h"
 #endif // defined(GGML_USE_HIP)
 
+extern bool reserving_graph;
+
+// If we are reserving the graph, pointers might be invalid and will fail if cudaMemcpyAsync tries to validate them.
+// However, since we don't actually expect a result, we don't need to actually do the memcpy.
+static cudaError_t cudaMemcpyAsyncReserve ( void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0 ) {
+    if (!reserving_graph) {
+        return cudaMemcpyAsync(dst, src, count, kind, stream);
+    } else {
+        return cudaSuccess;
+    }
+}
+
+static cudaError_t cudaMemcpy2DAsyncReserve ( void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, cudaMemcpyKind kind, cudaStream_t stream = 0 ) {
+    if (!reserving_graph) {
+        return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
+    } else {
+        return cudaSuccess;
+    }
+}
+
+#undef cudaMemcpyAsync
+#define cudaMemcpyAsync cudaMemcpyAsyncReserve
+#undef cudaMemcpy2DAsync
+#define cudaMemcpy2DAsync cudaMemcpy2DAsyncReserve
+
 #define STRINGIZE_IMPL(...) #__VA_ARGS__
 #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
 
@@ -771,6 +796,9 @@ struct ggml_cuda_pool {
 
     virtual void * alloc(size_t size, size_t * actual_size) = 0;
     virtual void free(void * ptr, size_t size) = 0;
+
+    virtual bool alloc_memory() = 0;
+    virtual size_t alloc_size() = 0;
 };
 
 template<typename T>
@@ -914,11 +942,11 @@ struct ggml_backend_cuda_context {
     // pool
     std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
 
-    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);
+    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device, bool alloc);
 
     ggml_cuda_pool & pool(int device) {
         if (pools[device] == nullptr) {
-            pools[device] = new_pool_for_device(device);
+            pools[device] = new_pool_for_device(device, true);
         }
         return *pools[device];
     }
@@ -926,4 +954,20 @@ struct ggml_backend_cuda_context {
     ggml_cuda_pool & pool() {
         return pool(device);
     }
+
+    void pool_set_alloc(bool alloc) {
+        GGML_ASSERT(pools[device] == nullptr || pools[device]->alloc_memory() == alloc);
+
+        if (pools[device] == nullptr) {
+            pools[device] = new_pool_for_device(device, alloc);
+        }
+    }
+
+    size_t pool_get_alloc_size() {
+        if (pools[device] == nullptr) {
+            return 0;
+        }
+
+        return pools[device]->alloc_size();
+    }
 };
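Note: the cudaMemcpyAsync/cudaMemcpy2DAsync overrides above use a plain interposition pattern: define a flag-aware wrapper in terms of the real call, then redefine the original name so that every later call site in the file goes through the wrapper and the copy becomes a no-op while a reservation is in progress. A self-contained toy of the same pattern (made-up names, no CUDA required):

#include <string.h>
#include <stdio.h>

static int reserving = 0;                        // stands in for reserving_graph

static void copy_bytes(void * dst, const void * src, size_t n) {  // the "real" call
    memcpy(dst, src, n);
}

static void copy_bytes_reserve(void * dst, const void * src, size_t n) {
    if (!reserving) {
        copy_bytes(dst, src, n);                 // normal path
    }                                            // while sizing: succeed without touching pointers
}

// from here on, every call site in the file is silently redirected to the wrapper
#define copy_bytes copy_bytes_reserve

int main(void) {
    char dst[4] = {0}, src[4] = "abc";
    copy_bytes(dst, src, sizeof(src));           // really copies
    printf("copied: %s\n", dst);

    reserving = 1;
    copy_bytes((void *)0x80, (const void *)0x100, 1u << 20);  // fake pointers, safely skipped
    return 0;
}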
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c7f9dc3a5..d5abe09e0 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
 
 // #define DEBUG_CUDA_MALLOC
 
+#define CUDA_ALIGNMENT 128
+
 // buffer pool for cuda (legacy)
 struct ggml_cuda_pool_leg : public ggml_cuda_pool {
     static const int MAX_BUFFERS = 256;
@@ -362,9 +364,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 
     ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
     size_t pool_size = 0;
+    bool allocate = true;
+    size_t last_alloc = 0;
 
-    explicit ggml_cuda_pool_leg(int device) :
-        device(device) {
+    explicit ggml_cuda_pool_leg(int device, bool alloc) :
+        device(device),
+        allocate(alloc) {
     }
 
     ~ggml_cuda_pool_leg() {
@@ -372,7 +377,9 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         for (int i = 0; i < MAX_BUFFERS; ++i) {
             ggml_cuda_buffer & b = buffer_pool[i];
             if (b.ptr != nullptr) {
-                CUDA_CHECK(cudaFree(b.ptr));
+                if (allocate) {
+                    CUDA_CHECK(cudaFree(b.ptr));
+                }
                 pool_size -= b.size;
             }
         }
@@ -420,8 +427,15 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         void * ptr;
         size_t look_ahead_size = (size_t) (1.05 * size);
         look_ahead_size = 256 * ((look_ahead_size + 255)/256);
-        ggml_cuda_set_device(device);
-        CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
+        if (allocate) {
+            ggml_cuda_set_device(device);
+            if (ggml_cuda_device_malloc(&ptr, look_ahead_size, device) != cudaSuccess) {
+                last_alloc = look_ahead_size;
+                throw std::bad_alloc();
+            }
+        } else {
+            ptr = (void *)CUDA_ALIGNMENT;
+        }
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
@@ -441,10 +455,20 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
             }
         }
         GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
-        ggml_cuda_set_device(device);
-        CUDA_CHECK(cudaFree(ptr));
+        if (allocate) {
+            ggml_cuda_set_device(device);
+            CUDA_CHECK(cudaFree(ptr));
+        }
         pool_size -= size;
     }
+
+    bool alloc_memory() override {
+        return allocate;
+    }
+
+    size_t alloc_size() override {
+        return pool_size + last_alloc;
+    }
 };
 
 // pool with virtual memory
@@ -456,18 +480,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     CUdeviceptr pool_addr = 0;
     size_t pool_used = 0;
     size_t pool_size = 0;
+    bool allocate = true;
+    size_t last_alloc = 0;
     size_t granularity;
 #if defined(GGML_USE_HIP)
     std::vector<std::pair<CUdeviceptr, size_t>> mappings;
 #endif
 
-    explicit ggml_cuda_pool_vmm(int device) :
+    explicit ggml_cuda_pool_vmm(int device, bool alloc) :
         device(device),
-        granularity(ggml_cuda_info().devices[device].vmm_granularity) {
+        granularity(ggml_cuda_info().devices[device].vmm_granularity),
+        allocate(alloc) {
+        if (!allocate) {
+            pool_addr = (CUdeviceptr)CUDA_ALIGNMENT;
+        }
     }
 
     ~ggml_cuda_pool_vmm() {
-        if (pool_addr != 0) {
+        if (pool_addr != 0 && allocate) {
 #if defined(GGML_USE_HIP)
             // Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
             for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
@@ -494,35 +524,49 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 
             GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
 
-            // allocate more physical memory
-            CUmemAllocationProp prop = {};
-            prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-            prop.location.id = device;
-            CUmemGenericAllocationHandle handle;
-            CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
-
-            // reserve virtual address space (if not already reserved)
-            if (pool_addr == 0) {
-                CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
-            }
+            if (allocate) {
+                // allocate more physical memory
+                CUmemAllocationProp prop = {};
+                prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+                prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+                prop.location.id = device;
+                CUmemGenericAllocationHandle handle;
+                if (cuMemCreate(&handle, reserve_size, &prop, 0) != CUDA_SUCCESS) {
+                    last_alloc = reserve_size;
+                    throw std::bad_alloc();
+                }
 
-            // map at the end of the pool
-            CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
-            CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
-#if defined(GGML_USE_HIP)
-            mappings.push_back({start_ptr, reserve_size});
-#endif
+                // reserve virtual address space (if not already reserved)
+                if (pool_addr == 0) {
+                    CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
+                }
 
-            // the memory allocation handle is no longer needed after mapping
-            CU_CHECK(cuMemRelease(handle));
+                // map at the end of the pool
+                CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
+                if (cuMemMap(start_ptr, reserve_size, 0, handle, 0) != CUDA_SUCCESS) {
+                    last_alloc = reserve_size;
+                    CU_CHECK(cuMemRelease(handle));
+                    throw std::bad_alloc();
+                }
+
+                // the memory allocation handle is no longer needed after mapping
+                CU_CHECK(cuMemRelease(handle));
+
+                // set access
+                CUmemAccessDesc access = {};
+                access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+                access.location.id = device;
+                access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+                if (cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1) != CUDA_SUCCESS) {
+                    CU_CHECK(cuMemUnmap(start_ptr, reserve_size));
+                    last_alloc = reserve_size;
+                    throw std::bad_alloc();
+                }
 
-            // set access
-            CUmemAccessDesc access = {};
-            access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-            access.location.id = device;
-            access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-            CU_CHECK(cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1));
+            #if defined(GGML_USE_HIP)
+                mappings.push_back({start_ptr, reserve_size});
+            #endif
+            }
 
             // add to the pool
             pool_size += reserve_size;
@@ -555,16 +599,24 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         // all deallocations must be in reverse order of the allocations
         GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
     }
+
+    bool alloc_memory() override {
+        return allocate;
+    }
+
+    size_t alloc_size() override {
+        return pool_size + last_alloc;
+    }
 };
 #endif // defined(GGML_USE_VMM)
 
-std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
+std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device, bool alloc) {
 #if defined(GGML_USE_VMM)
     if (ggml_cuda_info().devices[device].vmm) {
-        return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
+        return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device, alloc));
     }
 #endif // defined(GGML_USE_VMM)
-    return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
+    return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device, alloc));
 }
 
 // destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error
@@ -748,11 +800,20 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
 }
 
 static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return 128;
+    return CUDA_ALIGNMENT;
 
     GGML_UNUSED(buft);
 }
 
+static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_noalloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+
+    void * dev_ptr = (void *)ggml_backend_cuda_buffer_type_get_alignment(buft);
+    ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
+
+    return ggml_backend_buffer_init(buft, {}, ctx, size);
+}
+
 static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     size_t size = ggml_nbytes(tensor);
     int64_t ne0 = tensor->ne[0];
@@ -776,6 +837,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface
     /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
     /* .is_host          = */ NULL,
+    /* .noalloc_buffer   = */ ggml_backend_cuda_buffer_type_noalloc_buffer,
 };
 
 ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
@@ -2936,6 +2998,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
 
 static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
     bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
+
     // flag used to determine whether it is an integrated_gpu
     const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
 
@@ -2951,6 +3014,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             continue;
         }
 
+        // When reserving, we are forcing CUDA graphs but this operation is not graph-safe so we need to skip it
+        if (reserving_graph && node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
+            continue;
+        }
+
         static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
         if (!disable_fusion) {
             if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
@@ -3022,6 +3090,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 
 static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+    cuda_ctx->pool_set_alloc(true);
 
     ggml_cuda_set_device(cuda_ctx->device);
 
@@ -3101,6 +3170,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     return GGML_STATUS_SUCCESS;
 }
 
+// This is used to skip operations that are not graph safe during the reservation process.
+bool reserving_graph = false;
+
+static enum ggml_status ggml_backend_cuda_graph_reserve(ggml_backend_t backend, ggml_cgraph * cgraph, bool alloc) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+    cuda_ctx->pool_set_alloc(alloc);
+
+    #ifdef USE_CUDA_GRAPH
+    if (cuda_ctx->cuda_graph == nullptr) {
+        cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
+    }
+    #endif
+
+    ggml_cuda_set_device(cuda_ctx->device);
+
+    {
+        std::lock_guard<std::mutex> lock(ggml_cuda_lock);
+        ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed);
+    }
+
+    reserving_graph = true;
+
+    // Create cuBLAS handles early to avoid synchronous allocations during graph capture.
+    cuda_ctx->cublas_handle();
+
+    CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
+
+    enum ggml_status result = GGML_STATUS_SUCCESS;
+
+    try {
+        bool use_cuda_graph = false;
+        bool cuda_graph_update_required = false;
+        bool graph_evaluated_or_captured = false;
+
+        evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
+    } catch (const std::exception &e) {
+        result = GGML_STATUS_FAILED;
+    }
+
+    cudaGraph_t graph;
+    CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &graph));
+    CUDA_CHECK(cudaGraphDestroy(graph));
+
+    reserving_graph = false;
+
+    {
+        std::lock_guard<std::mutex> lock(ggml_cuda_lock);
+        if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) {
+            ggml_cuda_lock_cv.notify_all();
+        }
+    }
+
+    return result;
+}
+
+static size_t ggml_backend_cuda_buffer_size(ggml_backend_t backend) {
+    ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
+    return ctx->pool_get_alloc_size();
+}
+
+static void ggml_backend_cuda_reset(ggml_backend_t backend) {
+    ggml_backend_cuda_context * ctx = (ggml_backend_cuda_context *)backend->context;
+    ctx->pools[ctx->device] = NULL;
+}
+
 static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
@@ -3140,6 +3274,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
     /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
     /* .event_record            = */ ggml_backend_cuda_event_record,
     /* .event_wait              = */ ggml_backend_cuda_event_wait,
+    /* .graph_reserve           = */ ggml_backend_cuda_graph_reserve,
+    /* .buffer_size             = */ ggml_backend_cuda_buffer_size,
+    /* .reset                   = */ ggml_backend_cuda_reset,
 };
 
 static ggml_guid_t ggml_backend_cuda_guid() {
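Note on consuming the estimates: after a no-alloc reserve, the per-backend number can be compared against the device's free memory before the scheduler is recreated with alloc_buffers=true. A hypothetical helper along these lines (not part of the patch; it assumes ggml_backend_sched_get_attempted_buffer_size() is declared in ggml-backend.h in this tree, since its definition is modified above):

#include "ggml-backend.h"

// Returns true if the no-alloc estimate for this backend fits in its device's free memory.
static bool backend_estimate_fits(ggml_backend_sched_t sched, ggml_backend_t backend) {
    size_t free_mem = 0, total_mem = 0;
    ggml_backend_dev_memory(ggml_backend_get_device(backend), &free_mem, &total_mem);

    // includes the backend's own scratch estimate via the new buffer_size() hook
    size_t need = ggml_backend_sched_get_attempted_buffer_size(sched, backend);
    return need <= free_mem;
}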