diff --git a/llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch b/llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch deleted file mode 100644 index 85cba5b3a7..0000000000 --- a/llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch +++ /dev/null @@ -1,58 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Oliver Simons -Date: Tue, 22 Jul 2025 11:02:28 +0200 -Subject: [PATCH] Enable CUDA Graphs for gemma3n. - -Similar to -https://github.com/ggml-org/llama.cpp/pull/14741, -though ollama has a slightly different model graph -than llama.cpp which requires different workaround -checks. ---- - ggml/src/ggml-cuda/ggml-cuda.cu | 18 ++++++++++++++++++ - 1 file changed, 18 insertions(+) - -diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 5b852f69..827e3205 100644 ---- a/ggml/src/ggml-cuda/ggml-cuda.cu -+++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2689,14 +2689,26 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud - // Loop over nodes in GGML graph to obtain info needed for CUDA graph - cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); - -+ // This fix was added in llama.cpp and Ollama in parallel, but with -+ // different tensor names. -+ // llama.cpp: https://github.com/ggml-org/llama.cpp/pull/14741 -+ // ollama: https://github.com/ollama/ollama/pull/11525 -+ -+ const std::string gemma3n_per_layer_proj_src1_name_ollama = " (reshaped)"; -+ const std::string gemma3n_node_name_ollama = "node_"; -+ - const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected"; - const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj"; -+ -+ const std::string ffn_moe_bias_suffix = "_exps.bias"; -+ - const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased"; - const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased"; - const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased"; - const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out"; - const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d"; - -+ - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - -@@ -2720,6 +2732,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud - - if (node->op == GGML_OP_ADD && - node->src[1] && node->src[1]->ne[1] > 1 && -+ // ollama -+ // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n -+ // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here -+ !(node->ne[0] == 256 && node->ne[2] == 1 && node->ne[3] == 1 && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name_ollama) != std::string::npos : false && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name_ollama : false) && -+ node->src[1] ? std::string(node->src[1]->name).find(ffn_moe_bias_suffix) == std::string::npos : false && -+ // upstream - (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && - (node->src[1] ? 
node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && - strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && diff --git a/llama/patches/0019-ggml-Add-batch-size-hint.patch b/llama/patches/0019-ggml-Add-batch-size-hint.patch new file mode 100644 index 0000000000..76d61e2d6f --- /dev/null +++ b/llama/patches/0019-ggml-Add-batch-size-hint.patch @@ -0,0 +1,300 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jesse Gross +Date: Tue, 28 Oct 2025 17:36:54 -0700 +Subject: [PATCH] ggml: Add batch size hint + +Some operations use heuristics to determine the batch size, which +affects offloading decisions. However, these are not always +accurate when looking at single operations. This provides an +explicit signal on the batch size from higher layers to ensure +consistent performance. +--- + ggml/include/ggml-backend.h | 5 ++- + ggml/src/ggml-backend-impl.h | 4 +-- + ggml/src/ggml-backend.cpp | 19 +++++++---- + ggml/src/ggml-blas/ggml-blas.cpp | 3 +- + ggml/src/ggml-cpu/ggml-cpu.cpp | 4 ++- + ggml/src/ggml-cuda/ggml-cuda.cu | 48 +++++++++++++++++----------- + ggml/src/ggml-metal/ggml-metal.cpp | 4 ++- + ggml/src/ggml-vulkan/ggml-vulkan.cpp | 3 +- + 8 files changed, 58 insertions(+), 32 deletions(-) + +diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h +index 229bf387b..2763f2bd6 100644 +--- a/ggml/include/ggml-backend.h ++++ b/ggml/include/ggml-backend.h +@@ -98,7 +98,7 @@ extern "C" { + + GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); +- GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); ++ GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size); + + // NOTE: will be removed, use device version instead + GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); +@@ -307,6 +307,9 @@ extern "C" { + GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload); + GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); + ++ // Provide a hint on the batch size to optimize processing (uses heuristics if unset) ++ GGML_API void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size); ++ + // Initialize backend buffers from a measure graph + GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success + +diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h +index 6792ba986..0f5b03cef 100644 +--- a/ggml/src/ggml-backend-impl.h ++++ b/ggml/src/ggml-backend-impl.h +@@ -106,8 +106,8 @@ extern "C" { + // compute the graph with the plan + enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + +- // compute graph (always async if supported by the backend) +- enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); ++ // compute graph (always async if supported by the backend). 
batch_size may be -1 if unknown ++ enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size); + + // (optional) event synchronization + // record an event on this stream +diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp +index cb2b99562..41eef3b5f 100644 +--- a/ggml/src/ggml-backend.cpp ++++ b/ggml/src/ggml-backend.cpp +@@ -348,14 +348,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba + } + + enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +- enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph); ++ enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, -1); + ggml_backend_synchronize(backend); + return err; + } + +-enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { ++enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { + GGML_ASSERT(backend); +- return backend->iface.graph_compute(backend, cgraph); ++ return backend->iface.graph_compute(backend, cgraph, batch_size); + } + + bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { +@@ -722,6 +722,8 @@ struct ggml_backend_sched { + + bool op_offload; + ++ int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics ++ + int debug; + }; + +@@ -814,7 +816,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st + if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); + // check if a backend with higher prio wants to offload the op +- if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { ++ if (sched->op_offload && (sched->batch_size < 0 || sched->batch_size >= 32) && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { + for (int b = 0; b < src_backend_id; b++) { + if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { + SET_CAUSE(tensor, "1.off"); +@@ -1550,7 +1552,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s + } + + if (!sched->callback_eval) { +- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); ++ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, sched->batch_size); + if (ec != GGML_STATUS_SUCCESS) { + return ec; + } +@@ -1572,7 +1574,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s + + struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); + +- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv); ++ enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, sched->batch_size); + if (ec != GGML_STATUS_SUCCESS) { + return ec; + } +@@ -1651,6 +1653,7 @@ ggml_backend_sched_t ggml_backend_sched_new( + + sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); + sched->op_offload = op_offload; ++ sched->batch_size = -1; + + ggml_backend_sched_reset(sched); + +@@ -1682,6 +1685,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { + free(sched); + } + ++void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size) { ++ 
sched->batch_size = batch_size; ++} ++ + void ggml_backend_sched_reset(ggml_backend_sched_t sched) { + GGML_ASSERT(sched); + // reset state for the next run +diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp +index 5b888cdd8..88d088952 100644 +--- a/ggml/src/ggml-blas/ggml-blas.cpp ++++ b/ggml/src/ggml-blas/ggml-blas.cpp +@@ -224,7 +224,7 @@ static void ggml_backend_blas_free(ggml_backend_t backend) { + delete backend; + } + +-static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { ++static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; + + for (int i = 0; i < cgraph->n_nodes; i++) { +@@ -254,6 +254,7 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, + return GGML_STATUS_SUCCESS; + + GGML_UNUSED(backend); ++ GGML_UNUSED(batch_size); + } + + static struct ggml_backend_i blas_backend_i = { +diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp +index 3191faaa4..32f14c811 100644 +--- a/ggml/src/ggml-cpu/ggml-cpu.cpp ++++ b/ggml/src/ggml-cpu/ggml-cpu.cpp +@@ -164,7 +164,7 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe + GGML_UNUSED(backend); + } + +-static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { ++static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + + struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); +@@ -184,6 +184,8 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s + cplan.abort_callback_data = cpu_ctx->abort_callback_data; + + return ggml_graph_compute(cgraph, &cplan); ++ ++ GGML_UNUSED(batch_size); + } + + static const struct ggml_backend_i ggml_backend_cpu_i = { +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index 5b852f690..c555cd30f 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -2684,7 +2684,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { + + #ifdef USE_CUDA_GRAPH + static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, +- bool use_cuda_graph) { ++ int batch_size, bool use_cuda_graph) { + + // Loop over nodes in GGML graph to obtain info needed for CUDA graph + cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); +@@ -2718,24 +2718,34 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud + #endif + } + +- if (node->op == GGML_OP_ADD && +- node->src[1] && node->src[1]->ne[1] > 1 && +- (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && +- (node->src[1] ? 
node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && +- strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && +- strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 && +- strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 && +- strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 && +- strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) { +- // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation +- // by means of matching node names. See +- // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and +- // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773, +- // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. +- use_cuda_graph = false; ++ // If we have an explicit batch size hint then we don't need to use the tensor name heuristics ++ if (batch_size >= 0) { ++ if (batch_size > 1) { ++ use_cuda_graph = false; + #ifndef NDEBUG +- GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); ++ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%d]\n", __func__, batch_size); + #endif ++ } ++ } else { ++ if (node->op == GGML_OP_ADD && ++ node->src[1] && node->src[1]->ne[1] > 1 && ++ (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && ++ (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && ++ strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && ++ strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 && ++ strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 && ++ strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 && ++ strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) { ++ // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation ++ // by means of matching node names. See ++ // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and ++ // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773, ++ // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. 
++ use_cuda_graph = false; ++#ifndef NDEBUG ++ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); ++#endif ++ } + } + + if (node->op == GGML_OP_CPY) { +@@ -3132,7 +3142,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx + } + } + +-static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ++static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + + ggml_cuda_set_device(cuda_ctx->device); +@@ -3170,7 +3180,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, + if (use_cuda_graph) { + cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph); + +- use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph); ++ use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, batch_size, use_cuda_graph); + + // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates. + if (use_cuda_graph && cuda_graph_update_required) { +diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp +index f2ff9f322..05ff6a5a6 100644 +--- a/ggml/src/ggml-metal/ggml-metal.cpp ++++ b/ggml/src/ggml-metal/ggml-metal.cpp +@@ -410,10 +410,12 @@ static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml + GGML_UNUSED(dst); + } + +-static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ++static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { + ggml_metal_t ctx = (ggml_metal_t)backend->context; + + return ggml_metal_graph_compute(ctx, cgraph); ++ ++ GGML_UNUSED(batch_size); + } + + static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) { +diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +index ed83236f4..bd3ece516 100644 +--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp ++++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp +@@ -12015,7 +12015,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru + return num_adds; + } + +-static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ++static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { + VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + +@@ -12211,6 +12211,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg + return GGML_STATUS_SUCCESS; + + UNUSED(backend); ++ UNUSED(batch_size); + } + + // Sort the graph for improved parallelism. diff --git a/llama/patches/0022-ggml-No-alloc-mode.patch b/llama/patches/0022-ggml-No-alloc-mode.patch index 019cb8869b..d03c6c8480 100644 --- a/llama/patches/0022-ggml-No-alloc-mode.patch +++ b/llama/patches/0022-ggml-No-alloc-mode.patch @@ -16,7 +16,7 @@ must be recreated with no-alloc set to false before loading data. 
5 files changed, 310 insertions(+), 44 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h -index 229bf387..1ff53ed0 100644 +index 2763f2bd6..b3b5b356a 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -305,6 +305,7 @@ extern "C" { @@ -26,9 +26,9 @@ index 229bf387..1ff53ed0 100644 + GGML_API ggml_backend_sched_t ggml_backend_sched_new_ext(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload, bool alloc_buffers); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); - // Initialize backend buffers from a measure graph + // Provide a hint on the batch size to optimize processing (uses heuristics if unset) diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h -index 6792ba98..3c3f22fc 100644 +index 0f5b03cef..7bdf9d81f 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -26,12 +26,17 @@ extern "C" { @@ -75,7 +75,7 @@ index 6792ba98..3c3f22fc 100644 struct ggml_backend { diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp -index cb2b9956..6ef5eeaf 100644 +index 41eef3b5f..c81a2e48a 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -41,6 +41,19 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t @@ -121,8 +121,8 @@ index cb2b9956..6ef5eeaf 100644 void * base = buffer->iface.get_base(buffer); GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL"); -@@ -723,6 +743,12 @@ struct ggml_backend_sched { - bool op_offload; +@@ -725,6 +745,12 @@ struct ggml_backend_sched { + int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics int debug; + @@ -134,7 +134,7 @@ index cb2b9956..6ef5eeaf 100644 }; #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) -@@ -1606,6 +1632,17 @@ ggml_backend_sched_t ggml_backend_sched_new( +@@ -1608,6 +1634,17 @@ ggml_backend_sched_t ggml_backend_sched_new( size_t graph_size, bool parallel, bool op_offload) { @@ -152,7 +152,7 @@ index cb2b9956..6ef5eeaf 100644 GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU); -@@ -1647,10 +1684,13 @@ ggml_backend_sched_t ggml_backend_sched_new( +@@ -1649,11 +1686,14 @@ ggml_backend_sched_t ggml_backend_sched_new( sched->events[b][c] = ggml_backend_event_new(backends[b]->device); } } @@ -162,11 +162,12 @@ index cb2b9956..6ef5eeaf 100644 sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); sched->op_offload = op_offload; + sched->batch_size = -1; + sched->alloc_buffers = alloc_buffers; ggml_backend_sched_reset(sched); -@@ -1665,6 +1705,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { +@@ -1668,6 +1708,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { for (int c = 0; c < sched->n_copies; c++) { ggml_backend_event_free(sched->events[b][c]); } @@ -177,7 +178,7 @@ index cb2b9956..6ef5eeaf 100644 } ggml_gallocr_free(sched->galloc); ggml_free(sched->ctx); -@@ -1708,6 +1752,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * +@@ -1715,6 +1759,24 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * return false; } @@ -202,7 +203,7 @@ index cb2b9956..6ef5eeaf 100644 ggml_backend_sched_reset(sched); return true; -@@ -1813,7 +1875,13 @@ size_t 
ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, +@@ -1820,7 +1882,13 @@ size_t ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); @@ -218,7 +219,7 @@ index cb2b9956..6ef5eeaf 100644 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh -index e0abde54..28d6bcd7 100644 +index e0abde542..28d6bcd71 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -35,6 +35,31 @@ @@ -299,7 +300,7 @@ index e0abde54..28d6bcd7 100644 + } }; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 827e3205..811462c7 100644 +index f4d4a4267..ac70dcac8 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() { @@ -540,7 +541,7 @@ index 827e3205..811462c7 100644 }; ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { -@@ -3011,6 +3073,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, +@@ -3003,6 +3065,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) { @@ -548,7 +549,7 @@ index 827e3205..811462c7 100644 // flag used to determine whether it is an integrated_gpu const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated; -@@ -3026,6 +3089,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx +@@ -3018,6 +3081,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx continue; } @@ -560,15 +561,15 @@ index 827e3205..811462c7 100644 static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr); if (!disable_fusion) { -@@ -3152,6 +3220,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx +@@ -3144,6 +3212,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx - static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + cuda_ctx->pool_set_alloc(true); ggml_cuda_set_device(cuda_ctx->device); -@@ -3231,6 +3300,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, +@@ -3223,6 +3292,71 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, return GGML_STATUS_SUCCESS; } @@ -640,7 +641,7 @@ index 827e3205..811462c7 100644 static void ggml_backend_cuda_event_record(ggml_backend_t backend, ggml_backend_event_t event) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; -@@ -3271,6 +3405,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = { +@@ -3263,6 +3397,9 @@ static const ggml_backend_i ggml_backend_cuda_interface = { /* .event_record = */ ggml_backend_cuda_event_record, /* .event_wait = */ ggml_backend_cuda_event_wait, /* .graph_optimize = */ NULL, diff --git 
a/ml/backend.go b/ml/backend.go index bf390c0121..b07039e217 100644 --- a/ml/backend.go +++ b/ml/backend.go @@ -106,6 +106,11 @@ type Context interface { Arange(start, stop, step float32, dtype DType) Tensor Forward(...Tensor) Context + + // SetBatchSize provides a hint on the batch size to optimize processing + // Uses heuristics if not set + SetBatchSize(int) + Compute(...Tensor) ComputeWithNotify(func(), ...Tensor) // notify callback once compute has begun diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 8c782d7340..eb02c3b12c 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -386,7 +386,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) { C.int(len(schedBackends)), C.size_t(maxGraphNodes), C._Bool(false), - C._Bool(false), + C._Bool(true), C._Bool(params.AllocMemory), ) @@ -749,6 +749,9 @@ type Context struct { ctx *C.struct_ggml_context graph *C.struct_ggml_cgraph + // batchSize is a hint to optimize processing + batchSize int + // buft is the buffer type used for new tensors buft C.ggml_backend_buffer_type_t @@ -805,6 +808,10 @@ func (c *Context) Forward(tensors ...ml.Tensor) ml.Context { return c } +func (c *Context) SetBatchSize(batchSize int) { + c.batchSize = batchSize +} + func (c *Context) Compute(tensors ...ml.Tensor) { c.ComputeWithNotify(nil, tensors...) } @@ -815,6 +822,11 @@ func (c *Context) ComputeWithNotify(cb func(), tensors ...ml.Tensor) { if cb != nil { go cb() } + + if c.batchSize > 0 { + C.ggml_backend_sched_set_batch_size(c.b.sched, C.int(c.batchSize)) + } + if status := C.ggml_backend_sched_graph_compute_async(c.b.sched, c.graph); status != C.GGML_STATUS_SUCCESS { panic(fmt.Errorf("error computing ggml graph: %v", status)) } @@ -836,6 +848,10 @@ func (c *Context) ComputeWithNotify(cb func(), tensors ...ml.Tensor) { } func (c *Context) Reserve() { + if c.batchSize > 0 { + C.ggml_backend_sched_set_batch_size(c.b.sched, C.int(c.batchSize)) + } + reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph) slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched)) diff --git a/ml/backend/ggml/ggml/include/ggml-backend.h b/ml/backend/ggml/ggml/include/ggml-backend.h index 8098352431..1cab4bb3f2 100644 --- a/ml/backend/ggml/ggml/include/ggml-backend.h +++ b/ml/backend/ggml/ggml/include/ggml-backend.h @@ -98,7 +98,7 @@ extern "C" { GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); - GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size); // NOTE: will be removed, use device version instead GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); @@ -317,6 +317,9 @@ extern "C" { GGML_API ggml_backend_sched_t ggml_backend_sched_new_ext(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload, bool alloc_buffers); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); + // Provide a hint on the batch size to optimize processing (uses heuristics if unset) + GGML_API void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size); + // Initialize 
backend buffers from a measure graph GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success diff --git a/ml/backend/ggml/ggml/src/ggml-backend-impl.h b/ml/backend/ggml/ggml/src/ggml-backend-impl.h index 43c91d9f27..21b35ac5c7 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend-impl.h +++ b/ml/backend/ggml/ggml/src/ggml-backend-impl.h @@ -112,8 +112,8 @@ extern "C" { // compute the graph with the plan enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - // compute graph (always async if supported by the backend) - enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + // compute graph (always async if supported by the backend). batch_size may be -1 if unknown + enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size); // (optional) event synchronization // record an event on this stream diff --git a/ml/backend/ggml/ggml/src/ggml-backend.cpp b/ml/backend/ggml/ggml/src/ggml-backend.cpp index 0b757af594..9b0a9b91ff 100644 --- a/ml/backend/ggml/ggml/src/ggml-backend.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp @@ -368,14 +368,14 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba } enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph); + enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, -1); ggml_backend_synchronize(backend); return err; } -enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { GGML_ASSERT(backend); - return backend->iface.graph_compute(backend, cgraph); + return backend->iface.graph_compute(backend, cgraph, batch_size); } bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { @@ -750,6 +750,8 @@ struct ggml_backend_sched { bool op_offload; + int batch_size; // a hint on the batch size to optimize processing, -1 to use heuristics + int debug; // allocate buffers on attached ggml_backend_buffer_type_t's and during reservation @@ -848,7 +850,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); // check if a backend with higher prio wants to offload the op - if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { + if (sched->op_offload && (sched->batch_size < 0 || sched->batch_size >= 32) && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) { for (int b = 0; b < src_backend_id; b++) { if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) { SET_CAUSE(tensor, "1.off"); @@ -1584,7 +1586,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } if (!sched->callback_eval) { - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, sched->batch_size); if (ec != GGML_STATUS_SUCCESS) { return ec; } 
@@ -1606,7 +1608,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); - enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv); + enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, sched->batch_size); if (ec != GGML_STATUS_SUCCESS) { return ec; } @@ -1698,6 +1700,7 @@ ggml_backend_sched_t ggml_backend_sched_new_ext( sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); sched->op_offload = op_offload; + sched->batch_size = -1; sched->alloc_buffers = alloc_buffers; ggml_backend_sched_reset(sched); @@ -1734,6 +1737,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { free(sched); } +void ggml_backend_sched_set_batch_size(ggml_backend_sched_t sched, int batch_size) { + sched->batch_size = batch_size; +} + void ggml_backend_sched_reset(ggml_backend_sched_t sched) { GGML_ASSERT(sched); // reset state for the next run diff --git a/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp b/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp index 2a9ff7f666..6a38a51a29 100644 --- a/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp @@ -224,7 +224,7 @@ static void ggml_backend_blas_free(ggml_backend_t backend) { delete backend; } -static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { @@ -254,6 +254,7 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); + GGML_UNUSED(batch_size); } static struct ggml_backend_i blas_backend_i = { diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp index 3191faaa4c..32f14c811c 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -164,7 +164,7 @@ static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backe GGML_UNUSED(backend); } -static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, int batch_size) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool); @@ -184,6 +184,8 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s cplan.abort_callback_data = cpu_ctx->abort_callback_data; return ggml_graph_compute(cgraph, &cplan); + + GGML_UNUSED(batch_size); } static const struct ggml_backend_i ggml_backend_cpu_i = { diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index d62f412d69..e9b73147b3 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2775,31 +2775,19 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { #ifdef USE_CUDA_GRAPH static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, - bool use_cuda_graph) { + 
int batch_size, bool use_cuda_graph) { // Loop over nodes in GGML graph to obtain info needed for CUDA graph cuda_ctx->cuda_graph->cpy_dest_ptrs.clear(); - // This fix was added in llama.cpp and Ollama in parallel, but with - // different tensor names. - // llama.cpp: https://github.com/ggml-org/llama.cpp/pull/14741 - // ollama: https://github.com/ollama/ollama/pull/11525 - - const std::string gemma3n_per_layer_proj_src1_name_ollama = " (reshaped)"; - const std::string gemma3n_node_name_ollama = "node_"; - const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected"; const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj"; - - const std::string ffn_moe_bias_suffix = "_exps.bias"; - const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased"; const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased"; const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased"; const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out"; const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d"; - for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -2821,30 +2809,34 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud #endif } - if (node->op == GGML_OP_ADD && - node->src[1] && node->src[1]->ne[1] > 1 && - // ollama - // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n - // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here - !(node->ne[0] == 256 && node->ne[2] == 1 && node->ne[3] == 1 && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name_ollama) != std::string::npos : false && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name_ollama : false) && - node->src[1] ? std::string(node->src[1]->name).find(ffn_moe_bias_suffix) == std::string::npos : false && - // upstream - (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && - (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && - strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && - strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 && - strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 && - strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 && - strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) { - // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation - // by means of matching node names. See - // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and - // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773, - // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. 
- use_cuda_graph = false; + // If we have an explicit batch size hint then we don't need to use the tensor name heuristics + if (batch_size >= 0) { + if (batch_size > 1) { + use_cuda_graph = false; #ifndef NDEBUG - GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%d]\n", __func__, batch_size); #endif + } + } else { + if (node->op == GGML_OP_ADD && + node->src[1] && node->src[1]->ne[1] > 1 && + (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && + (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) && + strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 && + strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 && + strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 && + strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 && + strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) { + // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation + // by means of matching node names. See + // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and + // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773, + // Generally, changes in batch size or context size can cause changes to the grid size of some kernels. + use_cuda_graph = false; +#ifndef NDEBUG + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); +#endif + } } if (node->op == GGML_OP_CPY) { @@ -3247,7 +3239,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx } } -static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; cuda_ctx->pool_set_alloc(true); @@ -3286,7 +3278,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, if (use_cuda_graph) { cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph); - use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph); + use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, batch_size, use_cuda_graph); // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates. 
if (use_cuda_graph && cuda_graph_update_required) { diff --git a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.cpp b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.cpp index f356e4a0aa..032dee76d7 100644 --- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.cpp +++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.cpp @@ -410,10 +410,12 @@ static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml GGML_UNUSED(dst); } -static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { ggml_metal_t ctx = (ggml_metal_t)backend->context; return ggml_metal_graph_compute(ctx, cgraph); + + GGML_UNUSED(batch_size); } static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) { diff --git a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 0bbcecd01f..cc68e79686 100644 --- a/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ml/backend/ggml/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -12039,7 +12039,7 @@ static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const stru return num_adds; } -static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, int batch_size) { VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)"); ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; @@ -12235,6 +12235,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg return GGML_STATUS_SUCCESS; UNUSED(backend); + UNUSED(batch_size); } // Sort the graph for improved parallelism. diff --git a/runner/ollamarunner/multimodal.go b/runner/ollamarunner/multimodal.go index 78ceb771c0..6af89021c7 100644 --- a/runner/ollamarunner/multimodal.go +++ b/runner/ollamarunner/multimodal.go @@ -86,6 +86,9 @@ func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Ten computeCtx.Forward(tensors...) entry.data = make([][]float32, len(entry.mm)) + // Multimodal processing is computationally intensive, so treat it similarly to a large batch + computeCtx.SetBatchSize(512) + if !reserve { computeCtx.Compute(tensors...) diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index 962931fec7..3e8c1e2276 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -598,6 +598,7 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er // Actual batchInputs values will be injected into the batch.Inputs tensor before calling Compute batch.Inputs = nextBatch.ctx.Input().Empty(ml.DTypeI32, len(batchInputs)) batch.Outputs = nextBatch.ctx.Input().FromInts(batchOutputs, len(batchOutputs)) + nextBatch.ctx.SetBatchSize(len(batchInputs)) nextBatch.modelOutput, err = model.Forward(nextBatch.ctx, s.model, batch) if err != nil { err = fmt.Errorf("failed to build graph: %w", err) @@ -1108,6 +1109,7 @@ func (s *Server) reserveWorstCaseGraph(prompt bool) error { return err } + ctx.SetBatchSize(batchSize) ctx.Forward(t).Reserve() return nil
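
For reference, a minimal sketch of how a caller is expected to use the new scheduler-level hint, assuming a scheduler and graph have already been built elsewhere; `forward_with_hint` and `n_batch_tokens` are hypothetical names, not part of this diff.

    #include "ggml-backend.h"

    // Compute a graph while passing the known token batch size down to the
    // backends. Per this patch, the scheduler compares the hint against its
    // >=32-token offload threshold, and the CUDA backend disables CUDA graphs
    // when the hinted batch size is greater than 1 instead of relying on
    // tensor-name heuristics.
    static enum ggml_status forward_with_hint(ggml_backend_sched_t sched,
                                              struct ggml_cgraph * graph,
                                              int n_batch_tokens) {
        // Pass -1 when the batch size is unknown; backends then fall back to
        // their existing heuristics, matching the pre-patch behavior.
        ggml_backend_sched_set_batch_size(sched, n_batch_tokens > 0 ? n_batch_tokens : -1);
        return ggml_backend_sched_graph_compute_async(sched, graph);
    }

This mirrors the Go side of the diff, where Context.SetBatchSize stores the hint and ComputeWithNotify/Reserve forward it via ggml_backend_sched_set_batch_size before ggml_backend_sched_graph_compute_async.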