From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Mon, 18 Aug 2025 16:58:39 -0700
Subject: [PATCH] decode: disable output_all

---
 src/llama-context.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index e7526e7d..53a5e3a9 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -974,8 +974,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     const int64_t n_vocab = vocab.n_tokens();
     const int64_t n_embd = hparams.n_embd;
 
-    // when computing embeddings, all tokens are output
-    const bool output_all = cparams.embeddings;
+    const bool output_all = false;
 
     if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
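
Note (not part of the patch): with output_all forced to false, no token is implicitly marked as an output during decode, even when embeddings are enabled; callers must flag output tokens per position in the batch. Below is a minimal caller-side sketch under that assumption, using the upstream llama.cpp C API (llama_batch_init, llama_decode, llama_get_logits_ith); the function name decode_prompt is illustrative only.

// Caller-side sketch: mark only the last token of the prompt as an output,
// since output_all no longer requests outputs for every token.
#include "llama.h"

int decode_prompt(llama_context * ctx, const llama_token * tokens, int n_tokens) {
    llama_batch batch = llama_batch_init(n_tokens, /*embd*/ 0, /*n_seq_max*/ 1);

    for (int i = 0; i < n_tokens; ++i) {
        batch.token[i]     = tokens[i];
        batch.pos[i]       = i;
        batch.n_seq_id[i]  = 1;
        batch.seq_id[i][0] = 0;
        batch.logits[i]    = (i == n_tokens - 1); // explicitly request output here
    }
    batch.n_tokens = n_tokens;

    const int ret = llama_decode(ctx, batch);
    if (ret == 0) {
        // logits are only available for tokens that were marked above
        const float * logits = llama_get_logits_ith(ctx, n_tokens - 1);
        (void) logits;
    }

    llama_batch_free(batch);
    return ret;
}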