// mirror of https://github.com/ollama/ollama.git
// synced 2025-03-19 14:21:57 +01:00
// 162 lines, 7.8 KiB, C, vendored
/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "ggml.h"
#include "ggml-backend.h"

// give the declarations below C linkage when this header is included from C++
#ifdef __cplusplus
extern "C" {
#endif

// the compute plan that needs to be prepared for ggml_graph_compute()
|
|
// since https://github.com/ggerganov/ggml/issues/287
|
|
struct ggml_cplan {
|
|
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
|
|
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
|
|
|
int n_threads;
|
|
struct ggml_threadpool * threadpool;
|
|
|
|
// abort ggml_graph_compute when true
|
|
ggml_abort_callback abort_callback;
|
|
void * abort_callback_data;
|
|
};
|
|
|
|
// numa strategies
// note: GGML_NUMA_STRATEGY_COUNT has no explicit value and must stay last,
//       so that it equals the number of strategies listed before it
enum ggml_numa_strategy {
    GGML_NUMA_STRATEGY_DISABLED   = 0,
    GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
    GGML_NUMA_STRATEGY_ISOLATE    = 2,
    GGML_NUMA_STRATEGY_NUMACTL    = 3,
    GGML_NUMA_STRATEGY_MIRROR     = 4,
    GGML_NUMA_STRATEGY_COUNT
};

GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
|
GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
|
|
|
// scalar helpers for int32 / float values
// NOTE(review): behaviour is not visible from this header; judging by the names,
// ggml_new_* create a tensor in `ctx` and ggml_set_* assign `value` - confirm in the implementation
GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);

GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);

GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
|
GGML_BACKEND_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
|
|
|
GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
|
GGML_BACKEND_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
|
|
|
GGML_BACKEND_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
|
GGML_BACKEND_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
|
|
|
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
|
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
|
|
|
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
|
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
|
GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
|
|
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
|
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
|
|
|
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
|
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
|
GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
|
|
const struct ggml_cgraph * cgraph,
|
|
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
|
struct ggml_threadpool * threadpool /* = NULL */ );
|
|
GGML_BACKEND_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
|
|
|
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
|
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
|
GGML_BACKEND_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
|
|
|
//
|
|
// system info
|
|
//
|
|
|
|
// x86
|
|
GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
|
|
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
|
|
GGML_BACKEND_API int ggml_cpu_has_avx (void);
|
|
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
|
|
GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
|
|
GGML_BACKEND_API int ggml_cpu_has_f16c (void);
|
|
GGML_BACKEND_API int ggml_cpu_has_fma (void);
|
|
GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
|
|
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
|
|
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
|
|
GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
|
|
GGML_BACKEND_API int ggml_cpu_has_amx_int8 (void);
|
|
// ARM
|
|
GGML_BACKEND_API int ggml_cpu_has_neon (void);
|
|
GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
|
|
GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
|
|
GGML_BACKEND_API int ggml_cpu_has_dotprod (void);
|
|
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
|
|
GGML_BACKEND_API int ggml_cpu_has_sve (void);
|
|
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
|
|
// other
|
|
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
|
|
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
|
|
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
|
|
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
|
|
|
|
// Internal types and functions exposed for tests and benchmarks
|
|
|
|
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
|
const void * GGML_RESTRICT y, size_t by, int nrc);
|
|
|
|
struct ggml_type_traits_cpu {
|
|
ggml_from_float_t from_float;
|
|
ggml_vec_dot_t vec_dot;
|
|
enum ggml_type vec_dot_type;
|
|
int64_t nrows; // number of rows to process simultaneously
|
|
};
|
|
|
|
GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
|
|
|
|
GGML_BACKEND_API void ggml_cpu_init(void);
|
|
|
|
//
|
|
// CPU backend
|
|
//
|
|
|
|
GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
|
|
|
|
GGML_BACKEND_API bool ggml_backend_is_cpu (ggml_backend_t backend);
|
|
GGML_BACKEND_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
|
GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
|
GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
|
|
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
|
|
|
// closes the extern "C" block opened near the top of this header
#ifdef __cplusplus
}
#endif