Mirror of https://github.com/ollama/ollama.git, synced 2025-11-11 19:57:43 +01:00
chore: update mllama to use ollama engine (#10637)
llama/llama.cpp/include/llama.h (vendored): 6 changed lines
@@ -258,7 +258,6 @@ extern "C" {

         llama_token * token;
         float * embd;
-        int32_t n_embd;
         llama_pos * pos;
         int32_t * n_seq_id;
         llama_seq_id ** seq_id;
@@ -366,7 +365,6 @@ extern "C" {
         bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
         bool no_perf;    // whether to measure performance timings
         bool op_offload; // whether to offload host tensor operations to device
-        bool cross_attn; // whether to use cross attention
     };

     // model quantization parameters
@@ -466,10 +464,6 @@ extern "C" {
                      struct llama_context_params params),
             "use llama_init_from_model instead");

-    // TODO (jmorganca): this should most likely be passed in as part of a batch
-    // and not set on the context for all batches.
-    LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
-
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);

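For context, a minimal sketch of how a caller drove the now-removed cross-attention hooks before this commit. Only cross_attn, llama_set_cross_attention, llama_init_from_model, and llama_free come from the diff above; llama_context_default_params is standard llama.cpp API, and make_cross_attn_ctx is a hypothetical helper name used purely for illustration, not code from the ollama tree.

    #include "llama.h"

    // Sketch only, not ollama's actual code: create a context with the
    // (now removed) cross-attention hooks enabled.
    static struct llama_context * make_cross_attn_ctx(struct llama_model * model) {
        struct llama_context_params cparams = llama_context_default_params();
        cparams.cross_attn = true;   // removed context flag from the second hunk

        struct llama_context * ctx = llama_init_from_model(model, cparams);
        if (ctx != NULL) {
            // removed API from the third hunk: signal that cross-attention
            // state has been set for upcoming decode calls
            llama_set_cross_attention(ctx, true);
        }
        return ctx;   // caller releases it with llama_free(ctx)
    }

With this change, mllama's cross-attention path is handled by the ollama engine instead, so none of these symbols remain in the vendored header.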