From f804e8a46005b36e6f14577f4226cf2046abce12 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Mon, 18 Aug 2025 17:45:40 -0700
Subject: [PATCH] disable output_all (#11959)

---
 llama/llama.cpp/src/llama-context.cpp              |  3 +--
 .../0019-Enable-CUDA-Graphs-for-gemma3n.patch      |  2 +-
 .../patches/0023-decode-disable-output_all.patch   | 23 +++++++++++++++++++
 3 files changed, 25 insertions(+), 3 deletions(-)
 create mode 100644 llama/patches/0023-decode-disable-output_all.patch

diff --git a/llama/llama.cpp/src/llama-context.cpp b/llama/llama.cpp/src/llama-context.cpp
index 26a5cf9c3f..6ece5263bc 100644
--- a/llama/llama.cpp/src/llama-context.cpp
+++ b/llama/llama.cpp/src/llama-context.cpp
@@ -962,8 +962,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     const int64_t n_vocab = vocab.n_tokens();
     const int64_t n_embd = hparams.n_embd;
 
-    // when computing embeddings, all tokens are output
-    const bool output_all = cparams.embeddings;
+    const bool output_all = false;
 
     if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
diff --git a/llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch b/llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch
index c8b227c0c8..db1303b32a 100644
--- a/llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch
+++ b/llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch
@@ -13,7 +13,7 @@ checks.
  1 file changed, 18 insertions(+)
 
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 57eae461..9db0c8b5 100644
+index 57eae461..c7f9dc3a 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
 @@ -2671,12 +2671,24 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
diff --git a/llama/patches/0023-decode-disable-output_all.patch b/llama/patches/0023-decode-disable-output_all.patch
new file mode 100644
index 0000000000..dc326ae647
--- /dev/null
+++ b/llama/patches/0023-decode-disable-output_all.patch
@@ -0,0 +1,23 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Michael Yang
+Date: Mon, 18 Aug 2025 16:58:39 -0700
+Subject: [PATCH] decode: disable output_all
+
+---
+ src/llama-context.cpp | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/src/llama-context.cpp b/src/llama-context.cpp
+index 26a5cf9c..6ece5263 100644
+--- a/src/llama-context.cpp
++++ b/src/llama-context.cpp
+@@ -962,8 +962,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
+     const int64_t n_vocab = vocab.n_tokens();
+     const int64_t n_embd = hparams.n_embd;
+ 
+-    // when computing embeddings, all tokens are output
+-    const bool output_all = cparams.embeddings;
++    const bool output_all = false;
+ 
+     if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
+         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);