diff --git a/llama/patches/0019-metal-add-mean-kernel-14267.patch b/llama/patches/0019-metal-add-mean-kernel-14267.patch
index a52f0fdfea..e65aeb7b47 100644
--- a/llama/patches/0019-metal-add-mean-kernel-14267.patch
+++ b/llama/patches/0019-metal-add-mean-kernel-14267.patch
@@ -16,7 +16,7 @@ ggml-ci
  2 files changed, 67 insertions(+), 14 deletions(-)
 
 diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index ee4f2dcb..f20f5615 100644
+index a9eeebc6..110c9ece 100644
 --- a/ggml/src/ggml-metal/ggml-metal.m
 +++ b/ggml/src/ggml-metal/ggml-metal.m
 @@ -489,6 +489,7 @@ enum ggml_metal_kernel_type {
diff --git a/llama/patches/0020-CUDA-add-mean-operation-14313.patch b/llama/patches/0020-CUDA-add-mean-operation-14313.patch
index efcb1e8bca..2f4e37949b 100644
--- a/llama/patches/0020-CUDA-add-mean-operation-14313.patch
+++ b/llama/patches/0020-CUDA-add-mean-operation-14313.patch
@@ -52,7 +52,7 @@ index 64fb4ff4..5b9a0fe3 100644
  static __device__ __forceinline__ float warp_reduce_max(float x) {
  #pragma unroll
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 4c829153..9e64e5ae 100644
+index d6960174..2b9fabf4 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -35,6 +35,7 @@
diff --git a/llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch b/llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch
new file mode 100644
index 0000000000..b9dd6cdc62
--- /dev/null
+++ b/llama/patches/0021-Enable-CUDA-Graphs-for-gemma3n.patch
@@ -0,0 +1,50 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Oliver Simons
+Date: Tue, 22 Jul 2025 11:02:28 +0200
+Subject: [PATCH] Enable CUDA Graphs for gemma3n.
+
+Similar to
+https://github.com/ggml-org/llama.cpp/pull/14741,
+though ollama has a slightly different model graph
+than llama.cpp which requires different workaround
+checks.
+---
+ ggml/src/ggml-cuda/ggml-cuda.cu | 16 ++++++++++++----
+ 1 file changed, 12 insertions(+), 4 deletions(-)
+
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index 2b9fabf4..28ccf4be 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
++++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -2474,6 +2474,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
+     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
+     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
+ 
++    const std::string gemma3n_per_layer_proj_src1_name = " (reshaped)";
++    const std::string gemma3n_node_name = "node_";
++
+     for (int i = 0; i < cgraph->n_nodes; i++) {
+         ggml_tensor * node = cgraph->nodes[i];
+ 
+@@ -2495,12 +2498,17 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
+ #endif
+         }
+ 
+-        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
+-            // disable CUDA graphs for batch size > 1 for now.
+-            // Changes in batch size or context size can cause changes to the grid size of some kernels.
++        // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n
++        // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
++        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && !(node->ne[0] == 256
++                                                                                 && node->ne[2] == 1
++                                                                                 && node->ne[3] == 1
++                                                                                 && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos : false
++                                                                                 && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false)) {
++            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
+             use_cuda_graph = false;
+ #ifndef NDEBUG
+-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
++            GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+ #endif
+         }
+ 
diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
index 2b9fabf4f5..28ccf4beff 100644
--- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2474,6 +2474,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
 
+    const std::string gemma3n_per_layer_proj_src1_name = " (reshaped)";
+    const std::string gemma3n_node_name = "node_";
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
@@ -2495,12 +2498,17 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
         }
 
-        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
-            // disable CUDA graphs for batch size > 1 for now.
-            // Changes in batch size or context size can cause changes to the grid size of some kernels.
+        // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n
+        // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
+        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && !(node->ne[0] == 256
+                                                                                 && node->ne[2] == 1
+                                                                                 && node->ne[3] == 1
+                                                                                 && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos : false
+                                                                                 && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false)) {
+            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
             use_cuda_graph = false;
 #ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+            GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
         }
 
diff --git a/model/models/gemma3n/model_text.go b/model/models/gemma3n/model_text.go
index 715b8a0eac..b75a2abb37 100644
--- a/model/models/gemma3n/model_text.go
+++ b/model/models/gemma3n/model_text.go
@@ -203,10 +203,9 @@ func (a AltUp) Predict(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions
 	coefficients := a.PredictionCoefficient.Forward(ctx, modalities)
 	coefficients = coefficients.Reshape(ctx, opts.altupInputs, opts.altupInputs, coefficients.Dim(1), coefficients.Dim(2))
 
-	hiddenStates = hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
-	predictions := coefficients.Mulmat(ctx, hiddenStates)
-	predictions = predictions.Add(ctx, hiddenStates)
-	return predictions.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
+	predictions := coefficients.Mulmat(ctx, hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx))
+	predictions = predictions.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
+	return predictions.Add(ctx, hiddenStates)
 }
 
 func (a AltUp) Correct(ctx ml.Context, predictions, activated, one ml.Tensor, opts *TextOptions) ml.Tensor {
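Note on the exclusion check added above: it keys off the ADD node's shape and the names of its two sources, and when it matches, the node is exempted from the "batch size > 1" CUDA-graph heuristic. The sketch below is a rough, non-authoritative restatement of that shape/name test as a standalone predicate; `tensor_stub` is a hypothetical stand-in for ggml_tensor (only the fields the check touches), not ggml's real definition, and the explicit parentheses reflect the intent described in the patch comments rather than the literal expression in the diff.

    #include <cstdint>
    #include <string>

    // Hypothetical stand-in for ggml_tensor; only the fields the check reads.
    struct tensor_stub {
        int64_t            ne[4];   // dimensions
        std::string        name;    // node name
        const tensor_stub *src[2];  // source operands
    };

    // Sketch of the predicate: true when an ADD node looks like gemma3n's
    // project_per_layer_input output, the node the patch exempts from the
    // batch-size heuristic in check_node_graph_compatibility_and_refresh_copy_ops.
    bool looks_like_gemma3n_per_layer_proj(const tensor_stub * node) {
        const std::string gemma3n_per_layer_proj_src1_name = " (reshaped)";
        const std::string gemma3n_node_name                = "node_";

        return node->ne[0] == 256
            && node->ne[2] == 1
            && node->ne[3] == 1
            && (node->src[0] ? node->src[0]->name.find(gemma3n_node_name) != std::string::npos : false)
            && (node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false);
    }

When a node satisfies this test, the patched code leaves use_cuda_graph enabled even though node->src[1]->ne[1] > 1.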