chore: update mllama to use ollama engine (#10637)

Michael Yang
2025-05-13 17:36:02 -07:00
committed by GitHub
parent 0478d440f0
commit 23125648b8
67 changed files with 785 additions and 4354 deletions


@@ -258,7 +258,6 @@ extern "C" {
llama_token * token;
float * embd;
int32_t n_embd;
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
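For context: the deleted n_embd field came from ollama's mllama patch. When a batch carried raw embeddings (for example, projected image features) rather than token ids, n_embd told the decoder the per-token embedding width. A minimal sketch of how such a batch might have been filled under the old patched header; the helper name is hypothetical, and n_tokens is the standard llama_batch count field:

#include "llama.h"   // ollama's vendored header, prior to this commit
#include <stddef.h>

// Hypothetical helper: build a batch that feeds n_tokens embeddings of
// width n_embd straight into the model (token ids left NULL).
static struct llama_batch embd_batch(float * embd, int32_t n_tokens, int32_t n_embd) {
    struct llama_batch batch = {0};
    batch.n_tokens = n_tokens;
    batch.token    = NULL;    // embeddings are supplied instead of token ids
    batch.embd     = embd;    // n_tokens * n_embd floats, row-major
    batch.n_embd   = n_embd;  // patched field, removed by this commit
    // pos / n_seq_id / seq_id left NULL here for brevity; real callers
    // may need to populate them before llama_decode.
    return batch;
}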
@@ -366,7 +365,6 @@ extern "C" {
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
bool op_offload; // whether to offload host tensor operations to device
bool cross_attn; // whether to use cross attention
};
// model quantization parameters
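The cross_attn flag removed above lived on llama_context_params. A minimal sketch of how it was presumably enabled before this change; llama_context_default_params and llama_init_from_model are the real llama.cpp API (the latter is named in the deprecation message below), while the flag itself is the patched field being deleted:

#include "llama.h"

// Sketch only: create a context with the (now removed) cross-attention flag set.
struct llama_context * new_cross_attn_context(struct llama_model * model) {
    struct llama_context_params params = llama_context_default_params();
    params.cross_attn = true;  // patched field, deleted by this commit
    return llama_init_from_model(model, params);
}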
@@ -466,10 +464,6 @@ extern "C" {
struct llama_context_params params),
"use llama_init_from_model instead");
// TODO (jmorganca): this should most likely be passed in as part of a batch
// and not set on the context for all batches.
LLAMA_API void llama_set_cross_attention(struct llama_context * ctx, bool cross_attn_state);
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
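For reference, a sketch of how the removed setter was presumably called around decode. The signature comes straight from the deleted declaration and llama_decode is the upstream llama.cpp entry point; the wrapper itself is illustrative:

#include "llama.h"

// Toggle cross-attention state before decoding image-conditioned tokens.
// Per the TODO above, this applied to the whole context, not per batch.
int decode_with_cross_attention(struct llama_context * ctx, struct llama_batch batch) {
    llama_set_cross_attention(ctx, true);   // declaration removed by this commit
    int ret = llama_decode(ctx, batch);     // 0 on success
    llama_set_cross_attention(ctx, false);
    return ret;
}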