Mirror of https://github.com/ollama/ollama.git
Fix embeddings memory corruption (#6467)

* Fix embeddings memory corruption

  The patch was leading to a buffer overrun corruption. Once removed, though, parallelism in server.cpp led to hitting an assert due to slot/seq IDs being >= token count. To work around this, only use slot 0 for embeddings.

* Fix embed integration test assumption

  The token eval count has changed with recent llama.cpp bumps (0.3.5+).
llm/ext_server/server.cpp (vendored) | 8 +++++++-
@@ -1429,7 +1429,13 @@ struct llama_server_context
         switch (task.type)
         {
             case TASK_TYPE_COMPLETION: {
-                server_slot *slot = prefix_slot(task.data["prompt"]);
+                server_slot *slot = nullptr;
+                if (task.embedding_mode) {
+                    // Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0
+                    slot = slots[0].available() ? &slots[0] : nullptr;
+                } else {
+                    slot = prefix_slot(task.data["prompt"]);
+                }
                 if (slot == nullptr)
                 {
                     // if no slot is available, we defer this task for processing later
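For readers skimming the patch, the slot-selection rule it introduces can be restated as a small self-contained sketch. The types and the prefix_slot() helper below are stand-ins for illustration only, not the actual definitions in llm/ext_server/server.cpp.

#include <vector>

// Stand-in slot type; the real server_slot in server.cpp carries much more state.
struct ServerSlot {
    int  id = 0;
    bool busy = false;
    bool available() const { return !busy; }
};

// Hypothetical stand-in for prefix_slot(): in the real server it picks the slot
// whose cached prompt shares the longest prefix with the new prompt.
ServerSlot *prefix_slot(std::vector<ServerSlot> &slots) {
    for (auto &s : slots) {
        if (s.available()) return &s;
    }
    return nullptr;
}

// The rule from the patch: embedding requests always run on slot 0, because the
// sequence id (== slot id) must stay below the token count; completion requests
// keep using prefix matching across all slots.
ServerSlot *pick_slot(std::vector<ServerSlot> &slots, bool embedding_mode) {
    if (embedding_mode) {
        // If slot 0 is busy, return nullptr so the task is deferred, mirroring the diff.
        return slots[0].available() ? &slots[0] : nullptr;
    }
    return prefix_slot(slots);
}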