llama: Ensure KV cache is fully defragmented.
Sometimes the KV cache requires defragmentation even without triggering the threshold heuristic. In this case, decoding will not be able to find a KV cache slot. This is particularly difficult for the caller to handle if it happens in between ubatches. To avoid this, we should immediately trigger a defrag.

In addition, a heavily fragmented cache can require more than max_moves to defragment. Currently, we stop when we hit the limit, but this can leave a cache that still does not have adequate space even after defragmentation is triggered. Instead, we should do multiple batches of processing until everything is complete.

Fixes #7949
commit 08a832b482
parent 2ddc32d5c5

llama/llama.cpp (vendored) | 99 lines changed
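The decode-time half of the fix is easiest to see in isolation. Below is a rough, self-contained sketch with toy types (toy_kv_cache, find_slot, and defrag are stand-ins, not the real llama.cpp API) of the retry that llama_decode_internal now performs when slot allocation fails, instead of surfacing an error to the caller mid-ubatch:

// Toy model of the decode-time fallback: if no KV cache slot is found,
// trigger a defrag immediately and retry once before failing the ubatch.
#include <cstdio>

struct toy_kv_cache {
    int  free_cells;   // total free cells
    bool fragmented;   // free cells exist, but no contiguous run is large enough
};

static bool find_slot(const toy_kv_cache & kv, int n_tokens) {
    return kv.free_cells >= n_tokens && !kv.fragmented;
}

static void defrag(toy_kv_cache & kv) {
    kv.fragmented = false;             // compact the free cells into one run
}

static int decode_ubatch(toy_kv_cache & kv, int n_tokens) {
    bool slot = find_slot(kv, n_tokens);
    if (!slot) {                       // previously this was an immediate failure
        defrag(kv);                    // now: defrag right away, bypassing the threshold heuristic
        slot = find_slot(kv, n_tokens);
    }
    if (!slot) {
        return 1;                      // genuinely out of space; the caller sees the error
    }
    kv.free_cells -= n_tokens;
    return 0;
}

int main() {
    toy_kv_cache kv{32, true};
    std::printf("decode -> %d\n", decode_ubatch(kv, 8));   // 0: succeeds after the retry
    return 0;
}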
llama/llama.cpp (vendored)

@@ -3051,6 +3051,13 @@ struct llama_kv_cache {
     }
 };
 
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+    uint32_t src;
+    uint32_t dst;
+    uint32_t len;
+};
+
 struct llama_control_vector {
     std::vector<struct ggml_tensor *> tensors; // per layer
     std::vector<ggml_context_ptr> ctxs;
@@ -10828,35 +10835,23 @@ struct llm_build_context {
         return gf;
     }
 
-    struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+    struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        for (uint32_t i = 0; i < ids.size(); ++i) {
-            const uint32_t id = ids[i];
-
-            if (i == id || id == ids.size()) {
-                continue;
-            }
-
-            uint32_t nm = 1;
-
-            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
-                nm++;
-            }
-
+        for (const auto & move : moves) {
             for (int il = 0; il < n_layer; ++il) {
                 const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
                 const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
 
                 ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
+                        n_embd_k_gqa, move.len,
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));
 
                 ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
+                        n_embd_k_gqa, move.len,
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));
 
                 ggml_tensor * view_v_src;
                 ggml_tensor * view_v_dst;
@@ -10864,31 +10859,29 @@ struct llm_build_context {
                 if (flash_attn) {
                     // NOTE: the V cache is not transposed when using flash attention
                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            n_embd_v_gqa, nm,
+                            n_embd_v_gqa, move.len,
                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));
 
                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            n_embd_v_gqa, nm,
+                            n_embd_v_gqa, move.len,
                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
                 } else {
                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            nm, n_embd_v_gqa,
+                            move.len, n_embd_v_gqa,
                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                            ggml_row_size(kv_self.v_l[il]->type, i));
+                            ggml_row_size(kv_self.v_l[il]->type, move.src));
 
                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            nm, n_embd_v_gqa,
+                            move.len, n_embd_v_gqa,
                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                            ggml_row_size(kv_self.v_l[il]->type, id));
+                            ggml_row_size(kv_self.v_l[il]->type, move.dst));
                 }
 
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
             }
-
-            i += nm - 1;
         }
 
         //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -17351,7 +17344,7 @@ struct llm_build_context {
     }
 };
 
-static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<struct llama_kv_defrag_move> & moves) {
     llama_ubatch dummy = {};
     dummy.equal_seqs = true;
 
@@ -17361,7 +17354,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
 
     llm.init();
 
-    struct ggml_cgraph * result = llm.build_defrag(ids);
+    struct ggml_cgraph * result = llm.build_defrag(moves);
 
     llm.free();
 
@@ -18377,7 +18370,12 @@ static int llama_decode_internal(
             kv_self.head = 0;
         }
 
-        const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+        auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+        if (!slot) {
+            llama_kv_cache_defrag(kv_self);
+            llama_kv_cache_update(&lctx);
+            slot = llama_kv_cache_find_slot(kv_self, ubatch);
+        }
         if (!slot) {
             return 1;
         }
@@ -18782,8 +18780,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 
     //const int64_t t_start = ggml_time_us();
 
-    // number of cells moved
-    uint32_t n_moves = 0;
+    // groups of cells moved
+    std::vector<struct llama_kv_defrag_move> moves;
 
     // each move requires 6*n_layer tensors (see build_defrag)
     //  - source view, destination view, copy operation
@@ -18847,19 +18845,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
         // are we moving a continuous block of memory?
         bool cont = false;
 
-        // should we stop searching for the next move?
-        bool stop = false;
-
         // go back and move the nf cells to the hole
        for (; i1 < n_kv; ++i1) {
             auto & cell1 = kv_self.cells[i1];
 
             if (cell1.is_empty() || ids[i1] != n_kv) {
-                if (n_moves == max_moves) {
-                    stop = true;
-                    break;
-                }
-
                 cont = false;
                 continue;
             }
@@ -18875,8 +18865,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             kv_self.head = n_used;
 
             if (!cont) {
-                n_moves++;
+                moves.push_back({i1, i0 + nf, 1});
                 cont = true;
+            } else {
+                moves.back().len++;
             }
 
             nf++;
@@ -18886,22 +18878,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             }
         }
 
-        if (stop || n_moves == max_moves) {
-            break;
-        }
-
         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
 
         i0 += nh - 1;
     }
 
-    if (n_moves == 0) {
+    if (moves.size() == 0) {
         return;
     }
 
-    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
-    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
+    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", moves.size());
 
 #if 0
     // CPU defrag
@@ -18976,11 +18962,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 #else
     // ggml_graph defrag
 
-    ggml_backend_sched_reset(lctx.sched.get());
+    for (std::size_t i = 0; i < moves.size(); i += max_moves) {
+        std::vector<struct llama_kv_defrag_move> chunk;
+        auto end = std::min(i + max_moves, moves.size());
+        chunk.assign(moves.begin() + i, moves.begin() + end);
 
-    ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
+        ggml_backend_sched_reset(lctx.sched.get());
+
+        //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
+        ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
 
-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+        llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+    }
 #endif
 
     //const int64_t t_end = ggml_time_us();
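The rewritten build_defrag() consumes contiguous blocks of cells described by the new llama_kv_defrag_move struct rather than the raw per-cell ids vector. Here is a small, self-contained sketch of that grouping; the coalesce() helper and its (src, dst) pair input are hypothetical, since the real code builds moves directly while scanning kv_self.cells:

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Mirrors the struct added in this commit.
struct llama_kv_defrag_move {
    uint32_t src;
    uint32_t dst;
    uint32_t len;
};

// Hypothetical helper: collapse single-cell (src, dst) moves into contiguous blocks,
// the same grouping llama_kv_cache_defrag_internal() performs while scanning cells.
static std::vector<llama_kv_defrag_move> coalesce(const std::vector<std::pair<uint32_t, uint32_t>> & cells) {
    std::vector<llama_kv_defrag_move> moves;
    for (const auto & c : cells) {
        if (!moves.empty() &&
            moves.back().src + moves.back().len == c.first &&
            moves.back().dst + moves.back().len == c.second) {
            moves.back().len++;                       // extend the current block
        } else {
            moves.push_back({c.first, c.second, 1});  // start a new block
        }
    }
    return moves;
}

int main() {
    // cells 10..12 move to 2..4 (one block), cell 20 moves to 6 (a second block)
    const auto moves = coalesce({{10, 2}, {11, 3}, {12, 4}, {20, 6}});
    for (const auto & m : moves) {
        std::printf("src=%u dst=%u len=%u\n", m.src, m.dst, m.len);
    }
    return 0;
}

Each block then turns into a single set of ggml_view_2d copies per layer inside build_defrag(). The commit also carries the same change as a new 242-line patch file, shown next.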
@@ -0,0 +1,242 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Fri, 13 Dec 2024 16:11:59 -0800
Subject: [PATCH] llama: Ensure KV cache is fully defragmented.

Sometimes the KV cache requires defragmentation even without
triggering the threshold heuristic. In this case, decoding
will not be able to find a KV cache slot. This is particularly
difficult for the caller to handle if it happens in between
ubatches. To avoid this, we should immediately trigger a defrag.

In addition, a heavily fragmented cache can require more than
max_moves to defragment. Currently, we stop when we hit the limit
but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.
---
 src/llama.cpp | 99 ++++++++++++++++++++++++---------------------------
 1 file changed, 46 insertions(+), 53 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 4778a9ed..654e32bc 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3025,6 +3025,13 @@ struct llama_kv_cache {
     }
 };
 
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+    uint32_t src;
+    uint32_t dst;
+    uint32_t len;
+};
+
 struct llama_control_vector {
     std::vector<struct ggml_tensor *> tensors; // per layer
     std::vector<ggml_context_ptr> ctxs;
@@ -10802,35 +10809,23 @@ struct llm_build_context {
         return gf;
     }
 
-    struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+    struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        for (uint32_t i = 0; i < ids.size(); ++i) {
-            const uint32_t id = ids[i];
-
-            if (i == id || id == ids.size()) {
-                continue;
-            }
-
-            uint32_t nm = 1;
-
-            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
-                nm++;
-            }
-
+        for (const auto & move : moves) {
             for (int il = 0; il < n_layer; ++il) {
                 const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
                 const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
 
                 ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
+                        n_embd_k_gqa, move.len,
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));
 
                 ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
+                        n_embd_k_gqa, move.len,
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));
 
                 ggml_tensor * view_v_src;
                 ggml_tensor * view_v_dst;
@@ -10838,31 +10833,29 @@ struct llm_build_context {
                 if (flash_attn) {
                     // NOTE: the V cache is not transposed when using flash attention
                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            n_embd_v_gqa, nm,
+                            n_embd_v_gqa, move.len,
                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));
 
                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            n_embd_v_gqa, nm,
+                            n_embd_v_gqa, move.len,
                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
                 } else {
                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            nm, n_embd_v_gqa,
+                            move.len, n_embd_v_gqa,
                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                            ggml_row_size(kv_self.v_l[il]->type, i));
+                            ggml_row_size(kv_self.v_l[il]->type, move.src));
 
                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            nm, n_embd_v_gqa,
+                            move.len, n_embd_v_gqa,
                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                            ggml_row_size(kv_self.v_l[il]->type, id));
+                            ggml_row_size(kv_self.v_l[il]->type, move.dst));
                 }
 
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
             }
-
-            i += nm - 1;
         }
 
         //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -17325,7 +17318,7 @@ struct llm_build_context {
     }
 };
 
-static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<struct llama_kv_defrag_move> & moves) {
     llama_ubatch dummy = {};
     dummy.equal_seqs = true;
 
@@ -17335,7 +17328,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
 
     llm.init();
 
-    struct ggml_cgraph * result = llm.build_defrag(ids);
+    struct ggml_cgraph * result = llm.build_defrag(moves);
 
     llm.free();
 
@@ -18351,7 +18344,12 @@ static int llama_decode_internal(
             kv_self.head = 0;
         }
 
-        const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+        auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+        if (!slot) {
+            llama_kv_cache_defrag(kv_self);
+            llama_kv_cache_update(&lctx);
+            slot = llama_kv_cache_find_slot(kv_self, ubatch);
+        }
         if (!slot) {
             return 1;
         }
@@ -18756,8 +18754,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 
     //const int64_t t_start = ggml_time_us();
 
-    // number of cells moved
-    uint32_t n_moves = 0;
+    // groups of cells moved
+    std::vector<struct llama_kv_defrag_move> moves;
 
     // each move requires 6*n_layer tensors (see build_defrag)
     //  - source view, destination view, copy operation
@@ -18821,19 +18819,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
         // are we moving a continuous block of memory?
         bool cont = false;
 
-        // should we stop searching for the next move?
-        bool stop = false;
-
         // go back and move the nf cells to the hole
         for (; i1 < n_kv; ++i1) {
             auto & cell1 = kv_self.cells[i1];
 
             if (cell1.is_empty() || ids[i1] != n_kv) {
-                if (n_moves == max_moves) {
-                    stop = true;
-                    break;
-                }
-
                 cont = false;
                 continue;
             }
@@ -18849,8 +18839,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             kv_self.head = n_used;
 
             if (!cont) {
-                n_moves++;
+                moves.push_back({i1, i0 + nf, 1});
                 cont = true;
+            } else {
+                moves.back().len++;
             }
 
             nf++;
@@ -18860,22 +18852,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             }
         }
 
-        if (stop || n_moves == max_moves) {
-            break;
-        }
-
         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
 
         i0 += nh - 1;
     }
 
-    if (n_moves == 0) {
+    if (moves.size() == 0) {
         return;
     }
 
-    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
-    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
+    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", moves.size());
 
 #if 0
     // CPU defrag
@@ -18950,11 +18936,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 #else
     // ggml_graph defrag
 
-    ggml_backend_sched_reset(lctx.sched.get());
+    for (std::size_t i = 0; i < moves.size(); i += max_moves) {
+        std::vector<struct llama_kv_defrag_move> chunk;
+        auto end = std::min(i + max_moves, moves.size());
+        chunk.assign(moves.begin() + i, moves.begin() + end);
 
-    ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
+        ggml_backend_sched_reset(lctx.sched.get());
+
+        //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
+        ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
 
-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+        llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+    }
 #endif
 
     //const int64_t t_end = ggml_time_us();
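The second half of the fix is that defragmentation no longer stops at max_moves: the full move list is split into chunks and one defrag graph is built and computed per chunk. A compact, self-contained sketch of that loop follows; build_and_run here is a hypothetical stand-in for the ggml_backend_sched_reset / llama_build_graph_defrag / llama_graph_compute sequence in the real code:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

struct llama_kv_defrag_move {
    unsigned src, dst, len;
};

// Process every move, max_moves at a time, so even a heavily fragmented cache
// is fully defragmented instead of being cut off at the graph-size limit.
template <typename Fn>
static void defrag_in_chunks(const std::vector<llama_kv_defrag_move> & moves,
                             std::size_t max_moves, Fn build_and_run) {
    for (std::size_t i = 0; i < moves.size(); i += max_moves) {
        const std::size_t end = std::min(i + max_moves, moves.size());
        std::vector<llama_kv_defrag_move> chunk(moves.begin() + i, moves.begin() + end);
        build_and_run(chunk);          // one graph build + compute per chunk
    }
}

int main() {
    std::vector<llama_kv_defrag_move> moves(250, {0, 0, 1});   // pretend we found 250 blocks
    std::size_t graphs = 0;
    defrag_in_chunks(moves, 100, [&](const std::vector<llama_kv_defrag_move> & chunk) {
        graphs++;
        std::printf("graph %zu handles %zu moves\n", graphs, chunk.size());
    });
    return 0;
}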
@@ -433,14 +433,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 
 	err := s.lc.Decode(batch)
 	if err != nil {
-		if errors.Is(err, llama.ErrKvCacheFull) {
-			slog.Debug("defragmenting kv cache")
-			s.cache.lc.KvCacheDefrag()
-			err = s.lc.Decode(batch)
-		}
-		if err != nil {
-			return fmt.Errorf("failed to decode batch: %w", err)
-		}
+		return fmt.Errorf("failed to decode batch: %w", err)
 	}
 
 	if crossAttention {