Revamp the dynamic library shim

This switches the default llama.cpp build to be CPU-based, and builds the GPU variants
as dynamically loaded libraries that we can select at runtime.

This also bumps the ROCm library to version 6, since builds against 5.7 don't work
with the latest ROCm release that just shipped.
Author: Daniel Hiltgen
Date: 2023-12-20 10:36:01 -08:00
Commit: 7555ea44f8 (parent: 1d1eb1688c)

14 changed files with 272 additions and 280 deletions
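The runtime selection described in the commit message works roughly like this: the CPU build of llama.cpp stays linked into the binary, the GPU variants ship as shared libraries that are extracted at startup and recorded in the AvailableShims map, and a caller picks one by passing its path to newDynamicShimExtServer. A rough sketch of such a dispatch, assuming the package context of the file changed below; the "cuda_server" key, the preference order, and the newDefaultExtServer fallback are illustrative assumptions rather than code from this commit:

// Hypothetical dispatch sketch, not code from this commit. AvailableShims,
// newDynamicShimExtServer, extServer, and api.Options come from the file
// shown below; newDefaultExtServer stands in for the built-in CPU server.
func chooseExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
	// Prefer a GPU shim when one was extracted at startup; the key names and
	// their ordering here are assumptions for illustration.
	for _, name := range []string{"cuda_server", "rocm_server"} {
		if lib, ok := AvailableShims[name]; ok {
			srv, err := newDynamicShimExtServer(lib, model, adapters, projectors, numLayers, opts)
			if err == nil {
				return srv, nil
			}
			log.Printf("failed to load %s shim: %v, trying next option", name, err)
		}
	}
	// Fall back to the statically linked CPU implementation.
	return newDefaultExtServer(model, adapters, projectors, numLayers, opts)
}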


@@ -5,7 +5,7 @@ package llm
 /*
 #include <stdlib.h>
-#include "rocm_shim.h"
+#include "dynamic_shim.h"
 */
 import "C"
@@ -18,20 +18,20 @@ import (
 	"log"
 	"os"
 	"path/filepath"
-	"runtime"
+	"strings"
 	"sync"
 	"unsafe"
 	"github.com/jmorganca/ollama/api"
 )
-//go:embed llama.cpp/gguf/build/*/lib/*
+//go:embed llama.cpp/gguf/build/lib/*
 var libEmbed embed.FS
-var RocmShimMissing = fmt.Errorf("ROCm shim library not included in this build of ollama. Radeon GPUs are not supported")
 type shimExtServer struct {
-	s       C.struct_rocm_llama_server
+	s       C.struct_dynamic_llama_server
 	options api.Options
 }
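The narrowed //go:embed pattern above feeds extractDynamicLibs, which nativeInit calls further down: it globs the embedded filesystem and copies each matching payload into the work directory so the dynamic shim can load it by path. A minimal sketch of that idea, assuming the io, io/fs, os, path/filepath, and fmt imports plus the payloadMissing error from this package; the commit's real helper may differ in its details:

// Hypothetical sketch of extractDynamicLibs; the real implementation in this
// commit may differ. Copy every embedded payload matching glob into workdir
// and return the on-disk paths, or report payloadMissing if none are embedded.
func extractDynamicLibs(workdir, glob string) ([]string, error) {
	files, err := fs.Glob(libEmbed, glob)
	if err != nil || len(files) == 0 {
		return nil, payloadMissing
	}
	libs := make([]string, 0, len(files))
	for _, file := range files {
		src, err := libEmbed.Open(file)
		if err != nil {
			return nil, fmt.Errorf("read payload %s: %w", file, err)
		}
		dest := filepath.Join(workdir, filepath.Base(file))
		dst, err := os.OpenFile(dest, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
		if err != nil {
			src.Close()
			return nil, fmt.Errorf("create %s: %w", dest, err)
		}
		_, err = io.Copy(dst, src)
		src.Close()
		dst.Close()
		if err != nil {
			return nil, fmt.Errorf("extract %s: %w", dest, err)
		}
		libs = append(libs, dest)
	}
	return libs, nil
}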
@@ -40,50 +40,58 @@ var shimMutex sync.Mutex
 var llm *shimExtServer
 func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_init(llm.s, sparams, err)
+	C.dynamic_shim_llama_server_init(llm.s, sparams, err)
 }
 func (llm *shimExtServer) llama_server_start() {
-	C.rocm_shim_llama_server_start(llm.s)
+	C.dynamic_shim_llama_server_start(llm.s)
 }
 func (llm *shimExtServer) llama_server_stop() {
-	C.rocm_shim_llama_server_stop(llm.s)
+	C.dynamic_shim_llama_server_stop(llm.s)
 }
 func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_completion(llm.s, json_req, resp)
+	C.dynamic_shim_llama_server_completion(llm.s, json_req, resp)
 }
 func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
-	C.rocm_shim_llama_server_completion_next_result(llm.s, task_id, resp)
+	C.dynamic_shim_llama_server_completion_next_result(llm.s, task_id, resp)
 }
 func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_completion_cancel(llm.s, task_id, err)
+	C.dynamic_shim_llama_server_completion_cancel(llm.s, task_id, err)
 }
 func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
-	C.rocm_shim_llama_server_release_task_result(llm.s, result)
+	C.dynamic_shim_llama_server_release_task_result(llm.s, result)
 }
 func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
+	C.dynamic_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
 }
 func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
+	C.dynamic_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
 }
 func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
+	C.dynamic_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
 }
 func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
-	C.rocm_shim_llama_server_release_json_resp(llm.s, json_resp)
+	C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp)
 }
-func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
-	if !ShimPresent {
-		return nil, RocmShimMissing
+func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+	shimMutex.Lock()
+	defer shimMutex.Unlock()
+	libPath := C.CString(library)
+	defer C.free(unsafe.Pointer(libPath))
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	var srv C.struct_dynamic_llama_server
+	C.dynamic_shim_init(libPath, &srv, &resp)
+	if resp.id < 0 {
+		return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
 	}
-	log.Printf("Loading ROCM llm server")
-	if llm == nil {
-		return nil, fmt.Errorf("nativeInit wasnt called or libary load failed")
+	llm = &shimExtServer{
+		s:       srv,
+		options: opts,
 	}
-	llm.options = opts
+	log.Printf("Loading Dynamic Shim llm server: %s", library)
 	return newExtServer(llm, model, adapters, projectors, numLayers, opts)
 }
@@ -108,64 +116,37 @@ func (llm *shimExtServer) Close() {
 }
 func nativeInit(workdir string) error {
-	err := extractLib(workdir, "llama.cpp/gguf/build/*/lib/*rocm_server*")
+	libs, err := extractDynamicLibs(workdir, "llama.cpp/gguf/build/lib/*server*")
 	if err != nil {
 		if err == payloadMissing {
-			log.Printf("%s", RocmShimMissing)
+			log.Printf("%s", payloadMissing)
 			return nil
 		}
 		return err
-	} else {
-		ShimPresent = true
 	}
+	for _, lib := range libs {
+		libName := strings.Split(strings.TrimPrefix(filepath.Base(lib), "lib"), ".")[0]
+		AvailableShims[libName] = lib
+	}
-	// Verify we have permissions - either running as root, or we have group access to the driver
-	fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
-	if err != nil {
-		if errors.Is(err, fs.ErrPermission) {
-			log.Fatalf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
-			return err
-		} else if errors.Is(err, fs.ErrNotExist) {
-			// expected behavior without a radeon card
-			return nil
-		}
-		return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
-	}
-	fd.Close()
+	// Only check ROCm access if we have the dynamic lib loaded
+	if _, rocmPresent := AvailableShims["rocm_server"]; rocmPresent {
+		// Verify we have permissions - either running as root, or we have group access to the driver
+		fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
+		if err != nil {
+			if errors.Is(err, fs.ErrPermission) {
+				log.Fatalf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
+				return err
+			} else if errors.Is(err, fs.ErrNotExist) {
+				// expected behavior without a radeon card
+				return nil
+			}
+			return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
+		}
+		fd.Close()
+	}
-	shimMutex.Lock()
-	defer shimMutex.Unlock()
-	if llm != nil {
-		return nil
-	}
-	var libName string
-	switch runtime.GOOS {
-	case "darwin":
-		// shouldn't happen
-		return nil
-	case "linux":
-		libName = "librocm_server.so"
-	case "windows":
-		libName = "rocm_server.dll"
-	default:
-		// shouldn't happen
-		return nil
-	}
-	libPath := C.CString(filepath.Join(workdir, libName))
-	defer C.free(unsafe.Pointer(libPath))
-	resp := newExtServerResp(128)
-	defer freeExtServerResp(resp)
-	var srv C.struct_rocm_llama_server
-	C.rocm_shim_init(libPath, &srv, &resp)
-	if resp.id < 0 {
-		// TODO - consider softening this failure mode to allow fall-back to the CUDA based built-in llm
-		// and run against CPU
-		return fmt.Errorf("Unable to load AMD GPU library: %s", C.GoString(resp.msg))
-	}
-	llm = &shimExtServer{
-		s:       srv,
-		options: api.DefaultOptions(),
-	}
 	return nil
 }
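The "rocm_server" key checked in nativeInit comes from the base-name mangling in the extraction loop: drop a leading "lib" prefix and everything from the first dot onward. A small standalone illustration of that mapping (the file names are examples, not the exact payloads a given build ships):

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

func main() {
	// Example payload paths as they might land in the work directory.
	for _, lib := range []string{
		"/tmp/ollama/librocm_server.so",
		"/tmp/ollama/libcuda_server.so",
		"rocm_server.dll",
	} {
		// Same mangling as the nativeInit loop: trim "lib", keep everything up to the first dot.
		name := strings.Split(strings.TrimPrefix(filepath.Base(lib), "lib"), ".")[0]
		fmt.Printf("%s -> AvailableShims[%q]\n", lib, name)
	}
}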