From 7555ea44f81843a65d373e2bd20936adaea67c28 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Wed, 20 Dec 2023 10:36:01 -0800
Subject: [PATCH] Revamp the dynamic library shim

This switches the default llama.cpp build to be CPU based, and builds the
GPU variants as dynamically loaded libraries which we can select at
runtime.

This also bumps the ROCm library to version 6, since 5.7 builds don't
work with the latest ROCm library that just shipped.
---
 Dockerfile.build                    |   2 +-
 gpu/gpu.go                          |   6 +-
 gpu/gpu_darwin.go                   |   1 +
 gpu/types.go                        |   1 +
 llm/{rocm_shim.c => dynamic_shim.c} |  70 ++++++++--------
 llm/dynamic_shim.h                  |  74 +++++++++++++++++
 llm/ext_server.go                   |   7 +-
 llm/llama.cpp/gen_common.sh         |  10 +--
 llm/llama.cpp/gen_linux.sh          |  98 +++++++++++-----------
 llm/llama.go                        |  60 +++++++-------
 llm/llm.go                          |  23 ++++--
 llm/rocm_shim.h                     |  73 -----------------
 llm/shim_darwin.go                  |   6 +-
 llm/shim_ext_server.go              | 121 ++++++++++++----------------
 14 files changed, 272 insertions(+), 280 deletions(-)
 rename llm/{rocm_shim.c => dynamic_shim.c} (55%)
 create mode 100644 llm/dynamic_shim.h
 delete mode 100644 llm/rocm_shim.h

diff --git a/Dockerfile.build b/Dockerfile.build
index 5499b0a19..6b7e3c4df 100644
--- a/Dockerfile.build
+++ b/Dockerfile.build
@@ -3,7 +3,7 @@ FROM --platform=linux/amd64 ubuntu:20.04 AS base-amd64
 ARG CUDA_VERSION=11.3.1-1
 ARG CMAKE_VERSION=3.22.1
 # ROCm only supports amd64
-ARG ROCM_VERSION=5.7
+ARG ROCM_VERSION=6.0
 # Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html
 
 RUN apt-get update && \
diff --git a/gpu/gpu.go b/gpu/gpu.go
index d03812c8f..91ced3a80 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -65,7 +65,7 @@ func GetGPUInfo() GpuInfo {
 	}
 
 	var memInfo C.mem_info_t
-	resp := GpuInfo{"", 0, 0}
+	resp := GpuInfo{"", "", 0, 0}
 	if gpuHandles.cuda != nil {
 		C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
 		if memInfo.err != nil {
@@ -73,6 +73,7 @@ func GetGPUInfo() GpuInfo {
 			C.free(unsafe.Pointer(memInfo.err))
 		} else {
 			resp.Driver = "CUDA"
+			resp.Library = "cuda_server"
 		}
 	} else if gpuHandles.rocm != nil {
 		C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
@@ -81,11 +82,14 @@
 			C.free(unsafe.Pointer(memInfo.err))
 		} else {
 			resp.Driver = "ROCM"
+			resp.Library = "rocm_server"
 		}
 	}
 	if resp.Driver == "" {
 		C.cpu_check_ram(&memInfo)
 		resp.Driver = "CPU"
+		// In the future we may offer multiple CPU variants to tune CPU features
+		resp.Library = "default"
 	}
 	if memInfo.err != nil {
 		log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err))
diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go
index 14bd2655e..ccf67b517 100644
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -21,6 +21,7 @@ func GetGPUInfo() GpuInfo {
 	return GpuInfo{
 		Driver:      "METAL",
+		Library:     "default",
 		TotalMemory: 0,
 		FreeMemory:  0,
 	}
diff --git a/gpu/types.go b/gpu/types.go
index a84a0a8d5..a56da45ea 100644
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -3,6 +3,7 @@ package gpu
 
 // Beginning of an `ollama info` command
 type GpuInfo struct {
 	Driver      string `json:"driver,omitempty"`
+	Library     string `json:"library,omitempty"`
 	TotalMemory uint64 `json:"total_memory,omitempty"`
 	FreeMemory  uint64 `json:"free_memory,omitempty"`
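Unlike the human-oriented Driver string, the new Library field names the runtime payload to load: "cuda_server", "rocm_server", or "default" for the built-in CPU path. As a purely illustrative sketch of how a caller can key off it (the helper names here are hypothetical, not taken from this patch):

```go
package main

import "fmt"

// GpuInfo mirrors the struct from gpu/types.go in this patch.
type GpuInfo struct {
	Driver      string
	Library     string
	TotalMemory uint64
	FreeMemory  uint64
}

// pickServer is a hypothetical helper: "default" (or empty) means the
// statically linked CPU server; anything else names a dynamically loaded
// variant, looked up in a map of extracted library paths.
func pickServer(info GpuInfo, available map[string]string) (path string, dynamic bool) {
	if info.Library == "" || info.Library == "default" {
		return "", false // use the built-in CPU server
	}
	if p, ok := available[info.Library]; ok {
		return p, true
	}
	return "", false // GPU lib not extracted on this system; fall back to CPU
}

func main() {
	avail := map[string]string{"cuda_server": "/tmp/ollama/libcuda_server.so"}
	path, dyn := pickServer(GpuInfo{Driver: "CUDA", Library: "cuda_server"}, avail)
	fmt.Println(path, dyn) // /tmp/ollama/libcuda_server.so true
}
```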
diff --git a/llm/rocm_shim.c b/llm/dynamic_shim.c
index e8304aa02..8b5d67c91 100644
--- a/llm/rocm_shim.c
+++ b/llm/dynamic_shim.c
@@ -1,4 +1,4 @@
-#include "rocm_shim.h"
+#include "dynamic_shim.h"
 
 #include <stdio.h>
 #include <string.h>
@@ -28,8 +28,8 @@ inline static char *LOAD_ERR() {
 #define UNLOAD_LIBRARY(handle) dlclose(handle)
 #endif
 
-void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
-                    ext_server_resp_t *err) {
+void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
+                       ext_server_resp_t *err) {
   int i = 0;
   struct lookup {
     char *s;
@@ -57,11 +57,8 @@ void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
   s->handle = LOAD_LIBRARY(libPath, RTLD_NOW);
   if (!s->handle) {
     err->id = -1;
-    snprintf(
-        err->msg, err->msg_len,
-        "Unable to load rocm server library: %s (If you have a Radeon card, "
-        "did you install the ROCM libraries?)",
-        LOAD_ERR());
+    snprintf(err->msg, err->msg_len,
+             "Unable to load dynamic server library: %s", LOAD_ERR());
     return;
   }
 
@@ -77,64 +74,63 @@ void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
   }
 }
 
-inline void rocm_shim_llama_server_init(struct rocm_llama_server s,
-                                        ext_server_params_t *sparams,
-                                        ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
+                                           ext_server_params_t *sparams,
+                                           ext_server_resp_t *err) {
   s.llama_server_init(sparams, err);
 }
 
-inline void rocm_shim_llama_server_start(struct rocm_llama_server s) {
+inline void dynamic_shim_llama_server_start(struct dynamic_llama_server s) {
   s.llama_server_start();
 }
 
-inline void rocm_shim_llama_server_stop(struct rocm_llama_server s) {
+inline void dynamic_shim_llama_server_stop(struct dynamic_llama_server s) {
   s.llama_server_stop();
 }
 
-inline void rocm_shim_llama_server_completion(struct rocm_llama_server s,
-                                              const char *json_req,
-                                              ext_server_resp_t *resp) {
+inline void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
+                                                 const char *json_req,
+                                                 ext_server_resp_t *resp) {
   s.llama_server_completion(json_req, resp);
 }
 
-inline void rocm_shim_llama_server_completion_next_result(
-    struct rocm_llama_server s, const int task_id,
+inline void dynamic_shim_llama_server_completion_next_result(
+    struct dynamic_llama_server s, const int task_id,
     ext_server_task_result_t *result) {
   s.llama_server_completion_next_result(task_id, result);
 }
 
-inline void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s,
-                                                     const int task_id,
-                                                     ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_completion_cancel(
    struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
   s.llama_server_completion_cancel(task_id, err);
 }
 
-inline void rocm_shim_llama_server_release_task_result(
-    struct rocm_llama_server s, ext_server_task_result_t *result) {
+inline void dynamic_shim_llama_server_release_task_result(
+    struct dynamic_llama_server s, ext_server_task_result_t *result) {
   s.llama_server_release_task_result(result);
 }
 
-inline void rocm_shim_llama_server_tokenize(struct rocm_llama_server s,
-                                            const char *json_req,
-                                            char **json_resp,
-                                            ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
+                                               const char *json_req,
+                                               char **json_resp,
+                                               ext_server_resp_t *err) {
   s.llama_server_tokenize(json_req, json_resp, err);
 }
 
-inline void rocm_shim_llama_server_detokenize(struct rocm_llama_server s,
-                                              const char *json_req,
-                                              char **json_resp,
-                                              ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
+                                                 const char *json_req,
+                                                 char **json_resp,
+                                                 ext_server_resp_t *err) {
   s.llama_server_detokenize(json_req, json_resp, err);
 }
 
-inline void rocm_shim_llama_server_embedding(struct rocm_llama_server s,
-                                             const char *json_req,
-                                             char **json_resp,
-                                             ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
+                                                const char *json_req,
+                                                char **json_resp,
+                                                ext_server_resp_t *err) {
   s.llama_server_embedding(json_req, json_resp, err);
 }
 
-inline void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s,
-                                                     char **json_resp) {
+inline void dynamic_shim_llama_server_release_json_resp(
+    struct dynamic_llama_server s, char **json_resp) {
   s.llama_server_release_json_resp(json_resp);
 }
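dynamic_shim_init fills the struct by walking a name-to-slot lookup table, dlsym-ing each entry, and failing fast when a symbol is missing; the inline wrappers exist only because Go cannot call C function pointers directly. A minimal Go rendering of the same table-driven idea (purely illustrative; the real lookup happens in C via dlsym):

```go
package main

import "fmt"

type server struct {
	start func()
	stop  func()
}

// resolve mimics the lookup loop in dynamic_shim_init: each table entry
// pairs a symbol name with the slot it populates, and one missing symbol
// aborts the whole load. The symbols map stands in for dlsym here.
func resolve(symbols map[string]func(), s *server) error {
	table := []struct {
		name string
		slot *func()
	}{
		{"llama_server_start", &s.start},
		{"llama_server_stop", &s.stop},
	}
	for _, e := range table {
		fn, ok := symbols[e.name]
		if !ok {
			return fmt.Errorf("symbol lookup for %s failed", e.name)
		}
		*e.slot = fn
	}
	return nil
}

func main() {
	syms := map[string]func(){
		"llama_server_start": func() { fmt.Println("started") },
		"llama_server_stop":  func() { fmt.Println("stopped") },
	}
	var s server
	if err := resolve(syms, &s); err != nil {
		panic(err)
	}
	s.start()
	s.stop()
}
```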
diff --git a/llm/dynamic_shim.h b/llm/dynamic_shim.h
new file mode 100644
index 000000000..5e4e78b76
--- /dev/null
+++ b/llm/dynamic_shim.h
@@ -0,0 +1,74 @@
+#include <stdlib.h>
+
+#include "server.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+struct dynamic_llama_server {
+  void *handle;
+  void (*llama_server_init)(ext_server_params_t *sparams,
+                            ext_server_resp_t *err);
+  void (*llama_server_start)();
+  void (*llama_server_stop)();
+  void (*llama_server_completion)(const char *json_req,
+                                  ext_server_resp_t *resp);
+  void (*llama_server_completion_next_result)(const int task_id,
+                                              ext_server_task_result_t *result);
+  void (*llama_server_completion_cancel)(const int task_id,
+                                         ext_server_resp_t *err);
+  void (*llama_server_release_task_result)(ext_server_task_result_t *result);
+  void (*llama_server_tokenize)(const char *json_req, char **json_resp,
+                                ext_server_resp_t *err);
+  void (*llama_server_detokenize)(const char *json_req, char **json_resp,
+                                  ext_server_resp_t *err);
+  void (*llama_server_embedding)(const char *json_req, char **json_resp,
+                                 ext_server_resp_t *err);
+  void (*llama_server_release_json_resp)(char **json_resp);
+};
+
+void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
+                       ext_server_resp_t *err);
+
+// No good way to call C function pointers from Go so inline the indirection
+void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
+                                    ext_server_params_t *sparams,
+                                    ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_start(struct dynamic_llama_server s);
+
+void dynamic_shim_llama_server_stop(struct dynamic_llama_server s);
+
+void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
+                                          const char *json_req,
+                                          ext_server_resp_t *resp);
+
+void dynamic_shim_llama_server_completion_next_result(
+    struct dynamic_llama_server s, const int task_id,
+    ext_server_task_result_t *result);
+
+void dynamic_shim_llama_server_completion_cancel(struct dynamic_llama_server s,
+                                                 const int task_id,
+                                                 ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_release_task_result(
+    struct dynamic_llama_server s, ext_server_task_result_t *result);
+
+void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
+                                        const char *json_req, char **json_resp,
+                                        ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
+                                          const char *json_req,
+                                          char **json_resp,
+                                          ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
+                                         const char *json_req, char **json_resp,
+                                         ext_server_resp_t *err);
+void dynamic_shim_llama_server_release_json_resp(struct dynamic_llama_server s,
+                                                 char **json_resp);
+
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
diff --git a/llm/ext_server.go b/llm/ext_server.go
index ab74eb00a..048b1a65c 100644
--- a/llm/ext_server.go
+++ b/llm/ext_server.go
@@ -17,7 +17,10 @@ package llm
 #cgo linux CFLAGS: -D_GNU_SOURCE
 #cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
 #cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libollama.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/examples/server/libext_server.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/common/libcommon.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libllama.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libggml_static.a
 #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
 #cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincuda/dist/bin
 #cgo windows LDFLAGS: -lext_server_shared -lpthread
@@ -121,7 +124,7 @@ func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
 	C.llama_server_release_json_resp(json_resp)
 }
 
-func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+func newDefaultExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
 	server := &llamaExtServer{opts}
 	return newExtServer(server, model, adapters, projectors, numLayers, opts)
 }
diff --git a/llm/llama.cpp/gen_common.sh b/llm/llama.cpp/gen_common.sh
index 83a21cf91..c6b84f7da 100644
--- a/llm/llama.cpp/gen_common.sh
+++ b/llm/llama.cpp/gen_common.sh
@@ -6,7 +6,7 @@ init_vars() {
     CMAKE_DEFS="-DLLAMA_ACCELERATE=on"
     # TODO - LLAMA_K_QUANTS is stale and needs to be mapped to newer cmake settings
     CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
-    if echo "${CGO_CFLAGS}" | grep -- '-g' > /dev/null ; then
+    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on ${CMAKE_DEFS}"
    else
        # TODO - add additional optimization flags...
@@ -15,7 +15,7 @@ init_vars() {
 }
 
 git_module_setup() {
-    if [ -n "${OLLAMA_SKIP_PATCHING}" ] ; then
+    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
        echo "Skipping submodule initialization"
        return
    fi
@@ -25,13 +25,13 @@ git_module_setup() {
 }
 
 apply_patches() {
-    if [ -n "${OLLAMA_SKIP_PATCHING}" ] ; then
+    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
        echo "Skipping submodule patching"
        return
    fi
    # Workaround git apply not handling creation well for iteration
    rm -f gguf/examples/server/server.h
-    for patch in ${PATCHES} ; do
+    for patch in ${PATCHES}; do
        git -C gguf apply ../patches/${patch}
    done
 }
@@ -39,4 +39,4 @@ apply_patches() {
 build() {
     cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
     cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
-}
\ No newline at end of file
+}
diff --git a/llm/llama.cpp/gen_linux.sh b/llm/llama.cpp/gen_linux.sh
index 3608ddd6e..e3cb87a84 100755
--- a/llm/llama.cpp/gen_linux.sh
+++ b/llm/llama.cpp/gen_linux.sh
@@ -1,81 +1,81 @@
 #!/bin/bash
 # This script is intended to run inside the go generate
-# working directory must be ../llm/llama.cpp
+# working directory must be llm/llama.cpp
+
+# First we build our default built-in library which will be linked into the CGO
+# binary as a normal dependency. This default build is CPU based.
+#
+# Then we build a CUDA dynamic library (although statically linked with the CUDA
+# library dependencies for maximum portability)
+#
+# Then if we detect ROCm, we build a dynamically loaded ROCm lib. It's
+# particularly important for ROCm to be a dynamic lib, even if it's the only
+# GPU library detected, because we can't redistribute its object files and
+# must rely on dynamic ROCm libraries at runtime; linking them into the main
+# binary could prevent the server from starting when they aren't present.

 set -ex
 set -o pipefail
 echo "Starting linux generate script"
-if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ] ; then
+if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ]; then
     export CUDACXX=/usr/local/cuda/bin/nvcc
 fi
+COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
+OLLAMA_DYN_LIB_DIR="gguf/build/lib"
+mkdir -p ${OLLAMA_DYN_LIB_DIR}
+touch ${OLLAMA_DYN_LIB_DIR}/.generated
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
 apply_patches
-if [ -d /usr/local/cuda/lib64/ ] ; then
-    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-else
-    CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-fi
-BUILD_DIR="gguf/build/cuda"
-LIB_DIR="${BUILD_DIR}/lib"
-mkdir -p ../../dist/
+
+#
+# CPU first for the default library
+#
+CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
+BUILD_DIR="gguf/build/cpu"
 build
-if [ -d /usr/local/cuda/lib64/ ] ; then
-    pwd
-    ar -M <
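Per the diffstat, llm/llama.go and llm/llm.go are also updated to wire this library selection into server startup. A rough, purely illustrative sketch of that kind of dispatch (all names hypothetical except AvailableShims, which the shim_ext_server.go changes below populate):

```go
package main

import "fmt"

// AvailableShims maps a shim name (e.g. "rocm_server") to the path of the
// extracted dynamic library; nativeInit in shim_ext_server.go fills it in.
var AvailableShims = map[string]string{}

// newLlmServer sketches the dispatch this patch enables (hypothetical
// signature): use a dynamic shim when the detected library has one,
// otherwise fall back to the built-in CPU server.
func newLlmServer(library, model string) string {
	if library != "default" {
		if path, ok := AvailableShims[library]; ok {
			return fmt.Sprintf("dynamic shim %q from %s for %s", library, path, model)
		}
		// Requested GPU shim wasn't built or extracted; degrade to CPU.
	}
	return "built-in CPU server for " + model
}

func main() {
	AvailableShims["rocm_server"] = "/tmp/ollama/librocm_server.so"
	fmt.Println(newLlmServer("rocm_server", "llama2"))
	fmt.Println(newLlmServer("cuda_server", "llama2")) // not extracted -> CPU
}
```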
diff --git a/llm/rocm_shim.h b/llm/rocm_shim.h
deleted file mode 100644
--- a/llm/rocm_shim.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#include <stdlib.h>
-
-#include "server.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-struct rocm_llama_server {
-  void *handle;
-  void (*llama_server_init)(ext_server_params_t *sparams,
-                            ext_server_resp_t *err);
-  void (*llama_server_start)();
-  void (*llama_server_stop)();
-  void (*llama_server_completion)(const char *json_req,
-                                  ext_server_resp_t *resp);
-  void (*llama_server_completion_next_result)(const int task_id,
-                                              ext_server_task_result_t *result);
-  void (*llama_server_completion_cancel)(const int task_id,
-                                         ext_server_resp_t *err);
-  void (*llama_server_release_task_result)(ext_server_task_result_t *result);
-  void (*llama_server_tokenize)(const char *json_req, char **json_resp,
-                                ext_server_resp_t *err);
-  void (*llama_server_detokenize)(const char *json_req, char **json_resp,
-                                  ext_server_resp_t *err);
-  void (*llama_server_embedding)(const char *json_req, char **json_resp,
-                                 ext_server_resp_t *err);
-  void (*llama_server_release_json_resp)(char **json_resp);
-};
-
-void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
-                    ext_server_resp_t *err);
-
-// No good way to call C function pointers from Go so inline the indirection
-void rocm_shim_llama_server_init(struct rocm_llama_server s,
-                                 ext_server_params_t *sparams,
-                                 ext_server_resp_t *err);
-
-void rocm_shim_llama_server_start(struct rocm_llama_server s);
-
-void rocm_shim_llama_server_stop(struct rocm_llama_server s);
-
-void rocm_shim_llama_server_completion(struct rocm_llama_server s,
-                                       const char *json_req,
-                                       ext_server_resp_t *resp);
-
-void rocm_shim_llama_server_completion_next_result(
-    struct rocm_llama_server s, const int task_id,
-    ext_server_task_result_t *result);
-
-void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s,
-                                              const int task_id,
-                                              ext_server_resp_t *err);
-
-void rocm_shim_llama_server_release_task_result(
-    struct rocm_llama_server s, ext_server_task_result_t *result);
-
-void rocm_shim_llama_server_tokenize(struct rocm_llama_server s,
-                                     const char *json_req, char **json_resp,
-                                     ext_server_resp_t *err);
-
-void rocm_shim_llama_server_detokenize(struct rocm_llama_server s,
-                                       const char *json_req, char **json_resp,
-                                       ext_server_resp_t *err);
-
-void rocm_shim_llama_server_embedding(struct rocm_llama_server s,
-                                      const char *json_req, char **json_resp,
-                                      ext_server_resp_t *err);
-void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s,
-                                              char **json_resp);
-
-#ifdef __cplusplus
-}
-#endif
\ No newline at end of file
diff --git a/llm/shim_darwin.go b/llm/shim_darwin.go
index f63ce8c89..98e7a7d5e 100644
--- a/llm/shim_darwin.go
+++ b/llm/shim_darwin.go
@@ -12,13 +12,13 @@ import (
 //go:embed llama.cpp/gguf/ggml-metal.metal
 var libEmbed embed.FS
 
-func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
 	// should never happen...
-	return nil, fmt.Errorf("ROCM GPUs not supported on Mac")
+	return nil, fmt.Errorf("Dynamic library loading not supported on Mac")
 }
 
 func nativeInit(workdir string) error {
-	err := extractLib(workdir, "llama.cpp/gguf/ggml-metal.metal")
+	_, err := extractDynamicLibs(workdir, "llama.cpp/gguf/ggml-metal.metal")
 	if err != nil {
 		if err == payloadMissing {
 			// TODO perhaps consider this a hard failure on arm macs?
Radeon GPUs are not supported") type shimExtServer struct { - s C.struct_rocm_llama_server + s C.struct_dynamic_llama_server options api.Options } @@ -40,50 +40,58 @@ var shimMutex sync.Mutex var llm *shimExtServer func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) { - C.rocm_shim_llama_server_init(llm.s, sparams, err) + C.dynamic_shim_llama_server_init(llm.s, sparams, err) } func (llm *shimExtServer) llama_server_start() { - C.rocm_shim_llama_server_start(llm.s) + C.dynamic_shim_llama_server_start(llm.s) } func (llm *shimExtServer) llama_server_stop() { - C.rocm_shim_llama_server_stop(llm.s) + C.dynamic_shim_llama_server_stop(llm.s) } func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) { - C.rocm_shim_llama_server_completion(llm.s, json_req, resp) + C.dynamic_shim_llama_server_completion(llm.s, json_req, resp) } func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) { - C.rocm_shim_llama_server_completion_next_result(llm.s, task_id, resp) + C.dynamic_shim_llama_server_completion_next_result(llm.s, task_id, resp) } func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) { - C.rocm_shim_llama_server_completion_cancel(llm.s, task_id, err) + C.dynamic_shim_llama_server_completion_cancel(llm.s, task_id, err) } func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) { - C.rocm_shim_llama_server_release_task_result(llm.s, result) + C.dynamic_shim_llama_server_release_task_result(llm.s, result) } func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { - C.rocm_shim_llama_server_tokenize(llm.s, json_req, json_resp, err) + C.dynamic_shim_llama_server_tokenize(llm.s, json_req, json_resp, err) } func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { - C.rocm_shim_llama_server_detokenize(llm.s, json_req, json_resp, err) + C.dynamic_shim_llama_server_detokenize(llm.s, json_req, json_resp, err) } func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { - C.rocm_shim_llama_server_embedding(llm.s, json_req, json_resp, err) + C.dynamic_shim_llama_server_embedding(llm.s, json_req, json_resp, err) } func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) { - C.rocm_shim_llama_server_release_json_resp(llm.s, json_resp) + C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp) } -func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { - if !ShimPresent { - return nil, RocmShimMissing +func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { + shimMutex.Lock() + defer shimMutex.Unlock() + libPath := C.CString(library) + defer C.free(unsafe.Pointer(libPath)) + resp := newExtServerResp(128) + defer freeExtServerResp(resp) + var srv C.struct_dynamic_llama_server + C.dynamic_shim_init(libPath, &srv, &resp) + if resp.id < 0 { + return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg)) } - log.Printf("Loading ROCM llm server") - if llm == nil { - return nil, fmt.Errorf("nativeInit wasnt called or libary load failed") + llm = &shimExtServer{ + s: srv, + options: opts, } - llm.options = opts + log.Printf("Loading 
Dynamic Shim llm server: %s", library) return newExtServer(llm, model, adapters, projectors, numLayers, opts) } @@ -108,64 +116,37 @@ func (llm *shimExtServer) Close() { } func nativeInit(workdir string) error { - err := extractLib(workdir, "llama.cpp/gguf/build/*/lib/*rocm_server*") + libs, err := extractDynamicLibs(workdir, "llama.cpp/gguf/build/lib/*server*") if err != nil { if err == payloadMissing { - log.Printf("%s", RocmShimMissing) + log.Printf("%s", payloadMissing) return nil } return err - } else { - ShimPresent = true + } + for _, lib := range libs { + libName := strings.Split(strings.TrimPrefix(filepath.Base(lib), "lib"), ".")[0] + AvailableShims[libName] = lib } - // Verify we have permissions - either running as root, or we have group access to the driver - fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666) - if err != nil { - if errors.Is(err, fs.ErrPermission) { - log.Fatalf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.") - return err - } else if errors.Is(err, fs.ErrNotExist) { - // expected behavior without a radeon card - return nil + // Only check ROCm access if we have the dynamic lib loaded + if _, rocmPresent := AvailableShims["rocm_server"]; rocmPresent { + // Verify we have permissions - either running as root, or we have group access to the driver + fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666) + if err != nil { + if errors.Is(err, fs.ErrPermission) { + log.Fatalf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.") + return err + } else if errors.Is(err, fs.ErrNotExist) { + // expected behavior without a radeon card + return nil + } + + return fmt.Errorf("failed to check permission on /dev/kfd: %w", err) } + fd.Close() - return fmt.Errorf("failed to check permission on /dev/kfd: %w", err) } - fd.Close() - shimMutex.Lock() - defer shimMutex.Unlock() - if llm != nil { - return nil - } - var libName string - switch runtime.GOOS { - case "darwin": - // shouldn't happen - return nil - case "linux": - libName = "librocm_server.so" - case "windows": - libName = "rocm_server.dll" - default: - // shouldn't happen - return nil - } - libPath := C.CString(filepath.Join(workdir, libName)) - defer C.free(unsafe.Pointer(libPath)) - resp := newExtServerResp(128) - defer freeExtServerResp(resp) - var srv C.struct_rocm_llama_server - C.rocm_shim_init(libPath, &srv, &resp) - if resp.id < 0 { - // TODO - consider softening this failure mode to allow fall-back to the CUDA based built-in llm - // and run against CPU - return fmt.Errorf("Unable to load AMD GPU library: %s", C.GoString(resp.msg)) - } - llm = &shimExtServer{ - s: srv, - options: api.DefaultOptions(), - } return nil }