From d724caced3ce21f08924d4b7801f94ce6638f6ea Mon Sep 17 00:00:00 2001
From: Gabe Goodhart
Date: Wed, 30 Jul 2025 13:55:21 -0400
Subject: [PATCH] fix: Remove Gemma3n CUDA Graphs patch

It was implemented upstream:
https://github.com/ggml-org/llama.cpp/pull/14741

Branch: GraniteFour

Signed-off-by: Gabe Goodhart
---
 ...ch => 0020-BF16-macos-version-guard.patch} |  0
 .../0020-Enable-CUDA-Graphs-for-gemma3n.patch | 50 -------------------
 2 files changed, 50 deletions(-)
 rename llama/patches/{0021-BF16-macos-version-guard.patch => 0020-BF16-macos-version-guard.patch} (100%)
 delete mode 100644 llama/patches/0020-Enable-CUDA-Graphs-for-gemma3n.patch

diff --git a/llama/patches/0021-BF16-macos-version-guard.patch b/llama/patches/0020-BF16-macos-version-guard.patch
similarity index 100%
rename from llama/patches/0021-BF16-macos-version-guard.patch
rename to llama/patches/0020-BF16-macos-version-guard.patch
diff --git a/llama/patches/0020-Enable-CUDA-Graphs-for-gemma3n.patch b/llama/patches/0020-Enable-CUDA-Graphs-for-gemma3n.patch
deleted file mode 100644
index b9dd6cdc62..0000000000
--- a/llama/patches/0020-Enable-CUDA-Graphs-for-gemma3n.patch
+++ /dev/null
@@ -1,50 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Oliver Simons
-Date: Tue, 22 Jul 2025 11:02:28 +0200
-Subject: [PATCH] Enable CUDA Graphs for gemma3n.
-
-Similar to
-https://github.com/ggml-org/llama.cpp/pull/14741,
-though ollama has a slightly different model graph
-than llama.cpp which requires different workaround
-checks.
----
- ggml/src/ggml-cuda/ggml-cuda.cu | 16 ++++++++++++----
- 1 file changed, 12 insertions(+), 4 deletions(-)
-
-diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 2b9fabf4..28ccf4be 100644
---- a/ggml/src/ggml-cuda/ggml-cuda.cu
-+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2474,6 +2474,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
-     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
-     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
-
-+    const std::string gemma3n_per_layer_proj_src1_name = " (reshaped)";
-+    const std::string gemma3n_node_name = "node_";
-+
-     for (int i = 0; i < cgraph->n_nodes; i++) {
-         ggml_tensor * node = cgraph->nodes[i];
-
-@@ -2495,12 +2498,17 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
- #endif
-         }
-
--        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
--            // disable CUDA graphs for batch size > 1 for now.
--            // Changes in batch size or context size can cause changes to the grid size of some kernels.
-+        // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n
-+        // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
-+        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && !(node->ne[0] == 256
-+                                                                                    && node->ne[2] == 1
-+                                                                                    && node->ne[3] == 1
-+                                                                                    && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos : false
-+                                                                                    && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false)) {
-+            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
-             use_cuda_graph = false;
- #ifndef NDEBUG
--            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
-+            GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
- #endif
-         }
-
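For context on what is being dropped: the deleted patch carved a Gemma3n-specific exception out of the "batch size > 1 disables CUDA graphs" heuristic by fingerprinting the `project_per_layer_input` ADD node via its shape (ne[0] == 256, ne[2] == 1, ne[3] == 1) and its source-tensor names. Below is a minimal standalone C++ sketch of that predicate; `tensor_stub` and `is_gemma3n_per_layer_proj` are invented names for illustration, not ggml API. Note also that the inline condition in the deleted hunk chains `?:` inside `&&` without parentheses, so it parses as a nested conditional whose else-branch is constant false and whose src[1] name comparison is never evaluated; the sketch spells out the grouping the indentation apparently intended.

#include <cstdint>
#include <string>

// Minimal stand-in for the handful of ggml_tensor fields the check reads
// (the real ggml_tensor stores `name` as a fixed-size char array).
struct tensor_stub {
    int64_t       ne[4];   // tensor dimensions
    std::string   name;    // node name assigned by the graph builder
    tensor_stub * src[2];  // source tensors of the op (may be null)
};

// True when `node` matches the Gemma3n per-layer-projection fingerprint,
// letting the caller keep CUDA graphs enabled despite ne[1] > 1.
// ne[1] (the layer count) differs between gemma3n:2b and gemma3n:4b,
// which is why it is not pinned here.
static bool is_gemma3n_per_layer_proj(const tensor_stub * node) {
    const bool shape_ok = node->ne[0] == 256 && node->ne[2] == 1 && node->ne[3] == 1;
    const bool src0_ok  = node->src[0] != nullptr
                          && node->src[0]->name.find("node_") != std::string::npos;
    const bool src1_ok  = node->src[1] != nullptr
                          && node->src[1]->name == " (reshaped)";
    return shape_ok && src0_ok && src1_ok;
}

With upstream llama.cpp now handling this case via ggml-org/llama.cpp#14741, the local workaround and its fragile name matching can be dropped wholesale, which is what this commit does.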