Revamp the dynamic library shim

This switches the default llama.cpp build to be CPU-based, and builds the GPU variants
as dynamically loaded libraries that we can select at runtime.

This also bumps the ROCm library to version 6, since builds against 5.7 don't work
with the latest ROCm release that just shipped.
Author: Daniel Hiltgen
Date: 2023-12-20 10:36:01 -08:00
Commit: 7555ea44f8 (parent: 1d1eb1688c)

14 changed files with 272 additions and 280 deletions
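The runtime selection described in the commit message works roughly like this: the CPU build of llama.cpp stays linked into the binary, the GPU variants ship as shared libraries that are extracted at startup and recorded in the AvailableShims map, and a caller picks one by passing its path to newDynamicShimExtServer. A rough sketch of such a dispatch, assuming the package context of the file changed below; the "cuda_server" key, the preference order, and the newDefaultExtServer fallback are illustrative assumptions rather than code from this commit:

// Hypothetical dispatch sketch, not code from this commit. AvailableShims,
// newDynamicShimExtServer, extServer, and api.Options come from the file
// shown below; newDefaultExtServer stands in for the built-in CPU server.
func chooseExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
	// Prefer a GPU shim when one was extracted at startup; the key names and
	// their ordering here are assumptions for illustration.
	for _, name := range []string{"cuda_server", "rocm_server"} {
		if lib, ok := AvailableShims[name]; ok {
			srv, err := newDynamicShimExtServer(lib, model, adapters, projectors, numLayers, opts)
			if err == nil {
				return srv, nil
			}
			log.Printf("failed to load %s shim: %v, trying next option", name, err)
		}
	}
	// Fall back to the statically linked CPU implementation.
	return newDefaultExtServer(model, adapters, projectors, numLayers, opts)
}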


@@ -5,7 +5,7 @@ package llm
 /*
 #include <stdlib.h>
-#include "rocm_shim.h"
+#include "dynamic_shim.h"
 */
 import "C"
@@ -18,20 +18,20 @@ import (
 	"log"
 	"os"
 	"path/filepath"
-	"runtime"
+	"strings"
 	"sync"
 	"unsafe"
 	"github.com/jmorganca/ollama/api"
 )
-//go:embed llama.cpp/gguf/build/*/lib/*
+//go:embed llama.cpp/gguf/build/lib/*
 var libEmbed embed.FS
-var RocmShimMissing = fmt.Errorf("ROCm shim library not included in this build of ollama. Radeon GPUs are not supported")
 type shimExtServer struct {
-	s       C.struct_rocm_llama_server
+	s       C.struct_dynamic_llama_server
 	options api.Options
 }
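The narrowed //go:embed pattern above feeds extractDynamicLibs, which nativeInit calls further down: it globs the embedded filesystem and copies each matching payload into the work directory so the dynamic shim can load it by path. A minimal sketch of that idea, assuming the io, io/fs, os, path/filepath, and fmt imports plus the payloadMissing error from this package; the commit's real helper may differ in its details:

// Hypothetical sketch of extractDynamicLibs; the real implementation in this
// commit may differ. Copy every embedded payload matching glob into workdir
// and return the on-disk paths, or report payloadMissing if none are embedded.
func extractDynamicLibs(workdir, glob string) ([]string, error) {
	files, err := fs.Glob(libEmbed, glob)
	if err != nil || len(files) == 0 {
		return nil, payloadMissing
	}
	libs := make([]string, 0, len(files))
	for _, file := range files {
		src, err := libEmbed.Open(file)
		if err != nil {
			return nil, fmt.Errorf("read payload %s: %w", file, err)
		}
		dest := filepath.Join(workdir, filepath.Base(file))
		dst, err := os.OpenFile(dest, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
		if err != nil {
			src.Close()
			return nil, fmt.Errorf("create %s: %w", dest, err)
		}
		_, err = io.Copy(dst, src)
		src.Close()
		dst.Close()
		if err != nil {
			return nil, fmt.Errorf("extract %s: %w", dest, err)
		}
		libs = append(libs, dest)
	}
	return libs, nil
}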
@@ -40,50 +40,58 @@ var shimMutex sync.Mutex
 var llm *shimExtServer
 func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_init(llm.s, sparams, err)
+	C.dynamic_shim_llama_server_init(llm.s, sparams, err)
 }
 func (llm *shimExtServer) llama_server_start() {
-	C.rocm_shim_llama_server_start(llm.s)
+	C.dynamic_shim_llama_server_start(llm.s)
 }
 func (llm *shimExtServer) llama_server_stop() {
-	C.rocm_shim_llama_server_stop(llm.s)
+	C.dynamic_shim_llama_server_stop(llm.s)
 }
 func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_completion(llm.s, json_req, resp)
+	C.dynamic_shim_llama_server_completion(llm.s, json_req, resp)
 }
 func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
-	C.rocm_shim_llama_server_completion_next_result(llm.s, task_id, resp)
+	C.dynamic_shim_llama_server_completion_next_result(llm.s, task_id, resp)
 }
 func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_completion_cancel(llm.s, task_id, err)
+	C.dynamic_shim_llama_server_completion_cancel(llm.s, task_id, err)
 }
 func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
-	C.rocm_shim_llama_server_release_task_result(llm.s, result)
+	C.dynamic_shim_llama_server_release_task_result(llm.s, result)
 }
 func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
+	C.dynamic_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
 }
 func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
+	C.dynamic_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
 }
 func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
+	C.dynamic_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
 }
 func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
-	C.rocm_shim_llama_server_release_json_resp(llm.s, json_resp)
+	C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp)
 }
-func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
-	if !ShimPresent {
-		return nil, RocmShimMissing
+func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+	shimMutex.Lock()
+	defer shimMutex.Unlock()
+	libPath := C.CString(library)
+	defer C.free(unsafe.Pointer(libPath))
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	var srv C.struct_dynamic_llama_server
+	C.dynamic_shim_init(libPath, &srv, &resp)
+	if resp.id < 0 {
+		return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
 	}
-	log.Printf("Loading ROCM llm server")
-	if llm == nil {
-		return nil, fmt.Errorf("nativeInit wasnt called or libary load failed")
+	llm = &shimExtServer{
+		s:       srv,
+		options: opts,
 	}
-	llm.options = opts
+	log.Printf("Loading Dynamic Shim llm server: %s", library)
 	return newExtServer(llm, model, adapters, projectors, numLayers, opts)
 }
@@ -108,64 +116,37 @@ func (llm *shimExtServer) Close() {
 }
 func nativeInit(workdir string) error {
-	err := extractLib(workdir, "llama.cpp/gguf/build/*/lib/*rocm_server*")
+	libs, err := extractDynamicLibs(workdir, "llama.cpp/gguf/build/lib/*server*")
 	if err != nil {
 		if err == payloadMissing {
-			log.Printf("%s", RocmShimMissing)
+			log.Printf("%s", payloadMissing)
 			return nil
 		}
 		return err
-	} else {
-		ShimPresent = true
 	}
+	for _, lib := range libs {
+		libName := strings.Split(strings.TrimPrefix(filepath.Base(lib), "lib"), ".")[0]
+		AvailableShims[libName] = lib
+	}
-	// Verify we have permissions - either running as root, or we have group access to the driver
-	fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
-	if err != nil {
-		if errors.Is(err, fs.ErrPermission) {
-			log.Fatalf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
-			return err
-		} else if errors.Is(err, fs.ErrNotExist) {
-			// expected behavior without a radeon card
-			return nil
-		}
-		return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
-	}
-	fd.Close()
+	// Only check ROCm access if we have the dynamic lib loaded
+	if _, rocmPresent := AvailableShims["rocm_server"]; rocmPresent {
+		// Verify we have permissions - either running as root, or we have group access to the driver
+		fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
+		if err != nil {
+			if errors.Is(err, fs.ErrPermission) {
+				log.Fatalf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
+				return err
+			} else if errors.Is(err, fs.ErrNotExist) {
+				// expected behavior without a radeon card
+				return nil
+			}
+			return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
+		}
+		fd.Close()
+	}
-	shimMutex.Lock()
-	defer shimMutex.Unlock()
-	if llm != nil {
-		return nil
-	}
-	var libName string
-	switch runtime.GOOS {
-	case "darwin":
-		// shouldn't happen
-		return nil
-	case "linux":
-		libName = "librocm_server.so"
-	case "windows":
-		libName = "rocm_server.dll"
-	default:
-		// shouldn't happen
-		return nil
-	}
-	libPath := C.CString(filepath.Join(workdir, libName))
-	defer C.free(unsafe.Pointer(libPath))
-	resp := newExtServerResp(128)
-	defer freeExtServerResp(resp)
-	var srv C.struct_rocm_llama_server
-	C.rocm_shim_init(libPath, &srv, &resp)
-	if resp.id < 0 {
-		// TODO - consider softening this failure mode to allow fall-back to the CUDA based built-in llm
-		// and run against CPU
-		return fmt.Errorf("Unable to load AMD GPU library: %s", C.GoString(resp.msg))
-	}
-	llm = &shimExtServer{
-		s:       srv,
-		options: api.DefaultOptions(),
-	}
 	return nil
 }
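The "rocm_server" key checked in nativeInit comes from the base-name mangling in the extraction loop: drop a leading "lib" prefix and everything from the first dot onward. A small standalone illustration of that mapping (the file names are examples, not the exact payloads a given build ships):

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

func main() {
	// Example payload paths as they might land in the work directory.
	for _, lib := range []string{
		"/tmp/ollama/librocm_server.so",
		"/tmp/ollama/libcuda_server.so",
		"rocm_server.dll",
	} {
		// Same mangling as the nativeInit loop: trim "lib", keep everything up to the first dot.
		name := strings.Split(strings.TrimPrefix(filepath.Base(lib), "lib"), ".")[0]
		fmt.Printf("%s -> AvailableShims[%q]\n", lib, name)
	}
}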