build: Make target improvements (#7499)

* llama: wire up builtin runner

  This adds a new entrypoint into the ollama CLI to run the cgo-built runner. On Mac arm64, this will have GPU support, but on all other platforms it will be the lowest-common-denominator CPU build. After we fully transition to the new Go runners, more tech-debt can be removed and we can stop building the "default" runner via make and rely on the builtin always.

* build: Make target improvements

  Add a few new targets and help for building locally. This also adjusts the runner lookup to favor local builds, then runners relative to the executable, and finally payloads.

* Support customized CPU flags for runners

  This implements a simplified custom CPU flags pattern for the runners. When built without overrides, the runner name contains the vector flag we check for (AVX) to ensure we don't try to run on unsupported systems and crash. If the user builds a customized set, we omit the naming scheme and don't check for compatibility. This avoids checking requirements at runtime, so that logic has been removed as well. This can be used to build GPU runners with no vector flags, or CPU/GPU runners with additional flags (e.g. AVX512) enabled.

* Use relative paths

  If the user checks out the repo in a path that contains spaces, make gets really confused, so use relative paths for everything in-repo to avoid breakage.

* Remove payloads from main binary

* install: clean up prior libraries

  This removes support for v0.3.6 and older versions (before the tar bundle) and ensures we clean up prior libraries before extracting the bundle(s). Without this change, runners and dependent libraries could leak when we update and lead to subtle runtime errors.
2025-09-27 18:17:07 +02:00 · 2024-12-10 09:47:19 -08:00
parent 63269668c0
commit 4879a234c4
58 changed files with 877 additions and 1168 deletions
--- a/llama/Makefile
+++ b/llama/Makefile
@@ -1,57 +0,0 @@
-# top level makefile for Go server
-include make/common-defs.make
-
-RUNNER_TARGETS := default
-
-# Determine which if any GPU runners we should build
-ifeq ($(OS),windows)
-	CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
-	CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
-	CUDA_11:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null)
-	CUDA_12:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null)
-	HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null)
-else ifeq ($(OS),linux)
-	HIP_PATH?=/opt/rocm
-	HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null)
-	CUDA_PATH?=/usr/local/cuda
-	CUDA_11:=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null)
-	CUDA_12:=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null)
-endif
-
-ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),)
-ifneq ($(CUDA_11),)
-	RUNNER_TARGETS += cuda_v11
-endif
-ifneq ($(CUDA_12),)
-	RUNNER_TARGETS += cuda_v12
-endif
-endif
-ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),)
-ifneq ($(HIP_LIB_DIR),)
-	RUNNER_TARGETS += rocm
-endif
-endif
-
-
-all: clean-payload .WAIT runners
-
-runners: $(RUNNER_TARGETS)
-
-$(RUNNER_TARGETS):
-	$(MAKE) -f make/Makefile.$@
-
-help-sync apply-patches create-patches sync:
-	$(MAKE) -f make/Makefile.sync $@
-
-clean:
-	rm -rf $(BUILD_DIR) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
-	go clean -cache
-
-clean-payload:
-	rm -rf $(addprefix $(RUNNERS_PAYLOAD_DIR)/, $(RUNNER_TARGETS) metal cpu cpu_avx cpu_avx2)
-
-.PHONY: all runners clean clean-payload $(RUNNER_TARGETS) .WAIT
-
-# Handy debugging for make variables
-print-%:
-	@echo '$*=$($*)'
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -9,22 +9,24 @@ package llama
 #cgo amd64,avx CXXFLAGS: -mavx
 #cgo amd64,avx2 CFLAGS: -mavx2 -mfma
 #cgo amd64,avx2 CXXFLAGS: -mavx2 -mfma
+#cgo amd64,avx512 CFLAGS: -mavx512f -mavx512dq -mavx512bw
+#cgo amd64,avx512 CXXFLAGS: -mavx512f -mavx512dq -mavx512bw
+#cgo amd64,avx512bf16 CFLAGS: -mavx512bf16 -D__AVX512BF16__
+#cgo amd64,avx512bf16 CXXFLAGS: -mavx512bf16 -D__AVX512BF16__
+#cgo amd64,avx512vbmi CFLAGS: -mavx512vbmi -D__AVX512VBMI__
+#cgo amd64,avx512vbmi CXXFLAGS: -mavx512vbmi -D__AVX512VBMI__
+#cgo amd64,avx512vnni CFLAGS: -mavx512vnni -D__AVX512VNNI__
+#cgo amd64,avx512vnni CXXFLAGS: -mavx512vnni -D__AVX512VNNI__
 #cgo amd64,f16c CFLAGS: -mf16c
 #cgo amd64,f16c CXXFLAGS: -mf16c
 #cgo amd64,fma CFLAGS: -mfma
 #cgo amd64,fma CXXFLAGS: -mfma
-#cgo avx CFLAGS: -mavx
-#cgo avx CXXFLAGS: -mavx
-#cgo avx2 CFLAGS: -mavx2 -mfma -mf16c
-#cgo avx2 CXXFLAGS: -mavx2 -mfma -mf16c
-#cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
-#cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
-#cgo cuda_jetpack5 LDFLAGS: -lggml_cuda_jetpack5 -L/usr/local/cuda-11/lib64
-#cgo cuda_jetpack6 LDFLAGS: -lggml_cuda_jetpack6 -L/usr/local/cuda-12/lib64
-#cgo cuda_v11 LDFLAGS: -lggml_cuda_v11 -L/usr/local/cuda-11/lib64
-#cgo cuda_v12 LDFLAGS: -lggml_cuda_v12 -L/usr/local/cuda-12/lib64
+#cgo cuda_jetpack5 LDFLAGS: -lggml_cuda_jetpack5
+#cgo cuda_jetpack6 LDFLAGS: -lggml_cuda_jetpack6
+#cgo cuda_v11 LDFLAGS: -lggml_cuda_v11
+#cgo cuda_v12 LDFLAGS: -lggml_cuda_v12
 #cgo darwin,amd64 CFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
 #cgo darwin,amd64 CXXFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
 #cgo darwin,amd64 LDFLAGS: -framework Foundation
@@ -36,28 +38,24 @@ package llama
 #cgo darwin,arm64 LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework Accelerate
 #cgo linux CFLAGS: -D_GNU_SOURCE
 #cgo linux CXXFLAGS: -D_GNU_SOURCE
-#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
-#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
+#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/linux-amd64
 #cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
 #cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
-#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/Linux/arm64
+#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/linux-arm64
 #cgo linux,arm64,sve CFLAGS: -march=armv8.6-a+sve
 #cgo linux,arm64,sve CXXFLAGS: -march=armv8.6-a+sve
 #cgo linux,cuda LDFLAGS: -lcuda -lcudart -lcublas -lcublasLt -lpthread -ldl -lrt -lresolv
-#cgo linux,rocm LDFLAGS: -L/opt/rocm/lib -lpthread -ldl -lrt -lresolv
+#cgo linux,rocm LDFLAGS: -lpthread -ldl -lrt -lresolv
 #cgo rocm CFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo rocm CXXFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo rocm LDFLAGS: -L${SRCDIR} -lggml_rocm -lhipblas -lamdhip64 -lrocblas
 #cgo windows CFLAGS: -Wno-discarded-qualifiers -D_WIN32_WINNT=0x602
 #cgo windows CXXFLAGS: -D_WIN32_WINNT=0x602
-#cgo windows LDFLAGS: -lmsvcrt
 #cgo windows LDFLAGS: -lmsvcrt -static-libstdc++ -static-libgcc -static
-#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/Windows/amd64
-#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/Windows/amd64
+#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows-amd64
 #cgo windows,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
 #cgo windows,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
-#cgo windows,arm64 LDFLAGS: -L${SRCDIR}/build/Windows/arm64
-#cgo windows,arm64 LDFLAGS: -L${SRCDIR}/build/Windows/arm64
+#cgo windows,arm64 LDFLAGS: -L${SRCDIR}/build/windows-arm64
 #cgo windows,cuda LDFLAGS: -lcuda -lcudart -lcublas -lcublasLt
 #cgo windows,rocm LDFLAGS: -lggml_rocm -lhipblas -lamdhip64 -lrocblas

--- a/llama/make/Makefile.cuda_v11
+++ b/llama/make/Makefile.cuda_v11
@@ -1,12 +0,0 @@
-# Build rules for CUDA v11 runner
-
-include make/common-defs.make
-
-
-GPU_RUNNER_VARIANT := _v11
-GPU_PATH_ROOT_WIN=$(shell ls -d $(dir $(shell cygpath -m -s "$(CUDA_PATH)\.."))/v11.? 2>/dev/null)
-GPU_PATH_ROOT_LINUX=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null)
-CUDA_ARCHITECTURES?=50;52;53;60;61;62;70;72;75;80;86
-
-include make/cuda.make
-include make/gpu.make
--- a/llama/make/Makefile.cuda_v12
+++ b/llama/make/Makefile.cuda_v12
@@ -1,12 +0,0 @@
-# Build rules for CUDA v12 runner
-
-include make/common-defs.make
-
-
-GPU_RUNNER_VARIANT := _v12
-GPU_PATH_ROOT_WIN=$(shell ls -d $(dir $(shell cygpath -m -s "$(CUDA_PATH)\.."))/v12.? 2>/dev/null)
-GPU_PATH_ROOT_LINUX=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null)
-CUDA_ARCHITECTURES?=60;61;62;70;72;75;80;86;87;89;90;90a
-
-include make/cuda.make
-include make/gpu.make
--- a/llama/make/Makefile.default
+++ b/llama/make/Makefile.default
@@ -1,54 +0,0 @@
-# Build the default runner(s) for the platform which do not rely on 3rd party GPU libraries
-# On Mac arm64, this builds the metal runner
-# On other platforms this builds the CPU runner(s)
-
-include make/common-defs.make
-
-CPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" \"-X=github.com/ollama/ollama/llama.CpuFeatures=$(subst $(space),$(comma),$(TARGET_CPU_FLAGS))\" $(TARGET_LDFLAGS)"
-DEFAULT_RUNNER := $(if $(and $(filter darwin,$(OS)),$(filter arm64,$(ARCH))),metal,cpu)
-RUNNERS := $(DEFAULT_RUNNER)
-ifeq ($(ARCH),amd64)
-ifeq ($(CUSTOM_CPU_FLAGS),)
-	RUNNERS += cpu_avx cpu_avx2
-endif
-endif
-
-DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(RUNNERS)))
-ifneq ($(OS),windows)
-PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT).gz,$(RUNNERS)))
-endif
-BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(RUNNERS)))
-
-all: $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
-
-$(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS=$(CUSTOM_CPU_FLAGS)
-$(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
-	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath $(if $(CUSTOM_CPU_FLAGS),-tags $(subst $(space),$(comma),$(CUSTOM_CPU_FLAGS)))  -o $@ ./runner
-
-$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx"
-$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
-	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner
-
-$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx avx2"
-$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
-	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner
-
-$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
-	@-mkdir -p $(dir $@)
-	cp $< $@
-
-$(RUNNERS_PAYLOAD_DIR)/%/ollama_llama_server$(EXE_EXT).gz: $(RUNNERS_BUILD_DIR)/%/ollama_llama_server$(EXE_EXT)
-	@-mkdir -p $(dir $@)
-	${GZIP} --best -c $< > $@
-
-clean: 
-	rm -f $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
-
-.PHONY: clean all
-
-# Handy debugging for make variables
-print-%:
-	@echo '$*=$($*)'
--- a/llama/make/Makefile.rocm
+++ b/llama/make/Makefile.rocm
@@ -1,109 +0,0 @@
-# Build rules for ROCm runner
-#
-# Note: at present we only support a single ROCm version (whichever is default on the build system)
-# unlike CUDA where we'll build both a v11 and v12 variant.
-
-include make/common-defs.make
-
-HIP_ARCHS_COMMON := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102
-HIP_ARCHS_LINUX := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-
-
-ifeq ($(OS),windows)
-	GPU_LIB_DIR_WIN := $(shell cygpath -m -s "$(HIP_PATH)/bin")
-	CGO_EXTRA_LDFLAGS_WIN := -L$(shell cygpath -m -s "$(HIP_PATH)/lib")
-	GPU_COMPILER_WIN := $(HIP_PATH)/bin/hipcc.bin.exe
-	GPU_COMPILER:=$(GPU_COMPILER_WIN)
-else ifeq ($(OS),linux)
-	GPU_LIB_DIR_LINUX := $(HIP_PATH)/lib
-	GPU_COMPILER_LINUX := $(shell X=$$(which hipcc 2>/dev/null) && echo $$X)
-	GPU_COMPILER:=$(GPU_COMPILER_LINUX)
-	ROCM_TRANSITIVE_LIBS_INITIAL = $(sort $(shell ldd $(GPU_LIBS) | grep "=>" | cut -f2 -d= | cut -f2 -d' '  | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf))
-	GPU_TRANSITIVE_LIBS = $(sort $(shell readlink -f $(ROCM_TRANSITIVE_LIBS_INITIAL)) $(ROCM_TRANSITIVE_LIBS_INITIAL))
-endif
-
-# TODO future multi-variant support for ROCm
-# ROCM_VERSION = $(subst $(space),.,$(wordlist 1,2,$(subst .,$(space),$(word 3,$(subst -,$(space),$(filter HIP version: %,$(shell $(GPU_COMPILER) --version)))))))
-# ifneq (,$(ROCM_VERSION))
-# 	GPU_RUNNER_VARIANT = _v$(ROCM_VERSION)
-# endif
-
-GPU_RUNNER_GO_TAGS := rocm
-GPU_RUNNER_NAME := rocm$(GPU_RUNNER_VARIANT)
-GPU_RUNNER_DRIVER_LIB_LINK := -lamdhip64
-GPU_RUNNER_LIBS_SHORT := hipblas rocblas
-GPU_PATH_ROOT_WIN=$(dir $(GPU_LIB_DIR_WIN))
-GPU_PATH_ROOT_LINUX=$(dir $(GPU_LIB_DIR_LINUX))
-GPU_COMPILER_CFLAGS_WIN = $(CFLAGS) -D_WIN32_WINNT=0x602
-GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -fPIC -D_GNU_SOURCE
-GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602
-GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -fPIC -D_GNU_SOURCE
-
-GPU_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))
-ifeq ($(OS),windows)
-	ROCM_DIST_DEPS_DIR = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH))/lib/ollama
-else ifeq ($(OS),linux)
-	ROCM_DIST_DEPS_DIR = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH)-rocm)/lib/ollama
-endif
-GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(GPU_LIBS)) $(notdir $(GPU_TRANSITIVE_LIBS))))
-ROCBLAS_DIST_DEP_MANIFEST = $(ROCM_DIST_DEPS_DIR)/rocblas/library/TensileManifest.txt
-
-ifeq ($(OS),linux)
-	GPU_COMPILER_FPIC := -fPIC -Wno-unused-function -std=gnu++11
-	GPU_RUNNER_ARCH_FLAGS := $(foreach arch, $(HIP_ARCHS_COMMON) $(HIP_ARCHS_LINUX), --offload-arch=$(arch))
-else ifeq ($(OS),windows)
-	GPU_COMPILER_FPIC := -Xclang --dependent-lib=msvcrt
-	GPU_RUNNER_ARCH_FLAGS := $(foreach arch, $(HIP_ARCHS_COMMON), --offload-arch=$(arch))
-endif
-
-GPU_COMPILER_CUFLAGS = \
-	$(GPU_COMPILER_FPIC) \
-	$(addprefix -m,$(GPU_RUNNER_CPU_FLAGS)) \
-	-mf16c \
-	-mfma \
-	-parallel-jobs=2 \
-	-c \
-	-O3 \
-	-DGGML_USE_CUDA \
-	-DGGML_BUILD=1 \
-	-DGGML_SHARED=1 \
-	-DGGML_CUDA_DMMV_X=32 \
-	-DGGML_CUDA_MMV_Y=1 \
-	-DGGML_SCHED_MAX_COPIES=4 \
-	-DGGML_USE_HIPBLAS \
-	-DGGML_USE_LLAMAFILE \
-	-DHIP_FAST_MATH \
-	-D__HIP_PLATFORM_AMD__=1 \
-	-D__HIP_ROCclr__=1 \
-	-DNDEBUG \
-	-DK_QUANTS_PER_ITERATION=2 \
-	-D_CRT_SECURE_NO_WARNINGS \
-	-D_GNU_SOURCE \
-	-D_XOPEN_SOURCE=600 \
-	-DUSE_PROF_API=1 \
-	-std=gnu++14 \
-	-x hip \
-	-mllvm=-amdgpu-early-inline-all=true \
-	-mllvm=-amdgpu-function-calls=false \
-	-Wno-expansion-to-defined \
-	-Wno-invalid-noreturn \
-	-Wno-ignored-attributes \
-	-Wno-pass-failed \
-	-Wno-deprecated-declarations \
-	-Wno-unused-result \
-	-I.
-
-# Workaround buggy P2P copy on some windows multi-GPU setups
-# This workaround breaks linux systems with small system RAM, so only enable on windows
-ifeq ($(OS),windows)
-	GPU_COMPILER_CUFLAGS += -DGGML_CUDA_NO_PEER_COPY=1
-endif
-
-include make/gpu.make
-
-# Adjust the rules from gpu.make to handle the ROCm dependencies properly
-$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST)
-$(ROCBLAS_DIST_DEP_MANIFEST):
-	@-mkdir -p $(dir $@)
-	@echo "Copying rocblas library..."
-	cd $(GPU_LIB_DIR)/rocblas/library/ && tar cf - . | (cd $(dir $@) && tar xf - )
-	@echo "rocblas library copy complete"
--- a/llama/make/Makefile.sync
+++ b/llama/make/Makefile.sync
@@ -1,191 +0,0 @@
-# Helpers for managing our vendored llama.cpp repo and patch set
-
-REPO_ROOT:=$(dir $(patsubst %/,%,$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))))
-DST_DIR:=$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))
-
-include $(REPO_ROOT)llama/vendoring
-
-LLAMACPP_REPO := $(REPO_ROOT)llama/vendor/
-
-LLAMACPP_PATCH_DIR := $(DST_DIR)patches/
-
-
-help-sync:
-	@echo "The following make targets will help you update llama.cpp to a new base commit, or work on new features/fixes"
-	@echo ""
-	@echo "\tmake apply-patches   # Establish the tracking repo if not already present, reset to the base commit, and apply our patch set"
-	@echo "\tmake sync            # Vendor llama.cpp and ggml from the tracking repo working tree"
-	@echo "\tmake create-patches  # Generate the patch set based on the current commits in the tracking repo since the base commit"
-	@echo ""
-	@echo "For more details on the workflow, see the Vendoring section in ../docs/development.md"
-
-apply-patches: $(LLAMACPP_REPO)
-	@if ! git -C $(LLAMACPP_REPO) --no-pager diff --exit-code ; then \
-  		echo "ERROR: Your llama.cpp repo is dirty.  The apply-patches target requires a clean working tree"; \
-		echo "To clobber: git -C $(LLAMACPP_REPO) reset --hard HEAD" ; \
-  		exit 1; \
-	fi
-	@echo "Checking out $(LLAMACPP_BASE_COMMIT)"
-	@git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT) || \
-		git -C $(LLAMACPP_REPO) fetch --all && git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT)
-	@echo "Applying ollama patches..."
-	@git -c 'user.name=nobody' -c 'user.email=<>' -C $(LLAMACPP_REPO) am -3 $(LLAMACPP_PATCH_DIR)/*.patch || \
-		echo "Please resolve the conflicts in $(LLAMACPP_REPO), and run 'git am --continue' to continue applying subsequent patches"
-	@echo ""
-	@echo "The tracking repo $(LLAMACPP_REPO) is now in a detached state with all patches applied."
-	@echo "Don't forget to commit any changes you make and run 'make create-patches' "
-	
-$(LLAMACPP_REPO):
-	@echo "Cloning llama.cpp to $(LLAMACPP_REPO)"
-	git clone https://github.com/ggerganov/llama.cpp.git $@
-
-create-patches: $(LLAMACPP_REPO)
-	@if ! git -C $(LLAMACPP_REPO) --no-pager diff --exit-code ; then \
-  		echo "ERROR: Your llama.cpp repo is dirty.  You must commit any pending changes for format-patch to generate patches"; \
-  		exit 1; \
-	fi
-	git -C $(LLAMACPP_REPO) format-patch --no-signature --no-numbered --zero-commit -o $(LLAMACPP_PATCH_DIR) $(LLAMACPP_BASE_COMMIT)
-
-# Vendoring template logic
-EXCLUDED_FILES=sgemm.cpp sgemm.h sampling_ext.cpp sampling_ext.h stb_image.h json.hpp llama_darwin.c base64.hpp
-OLLAMA_NATIVE_FILES=mllama.cpp mllama.h llama_darwin.c sampling_ext.cpp sampling_ext.h
-define vendor_file
-$(strip $(addprefix $(2),$(notdir $1))) : $(addprefix $(LLAMACPP_REPO),$(1))
-ifneq ($$(filter-out $(EXCLUDED_FILES),$(notdir $1)),)
-	@echo "vendoring $1"; \
-		mkdir -p $$(dir $$@) && \
-		echo "/**" > $$@ && \
-		echo " * llama.cpp - commit $$(LLAMACPP_BASE_COMMIT) - do not edit this file" >> $$@ && \
-		echo " *" >> $$@ && \
-		sed 's/^/ * /' <$(LLAMACPP_REPO)/LICENSE | sed 's/ *$$$$//' >> $$@ && \
-		echo " */" >> $$@ && \
-		echo "" >> $$@ && \
-		cat $$< >> $$@
-else
-	@echo "vendoring $1"; \
-		mkdir -p $$(dir $$@) && \
-		cat $$< > $$@
-endif
-VENDORED_FILES += $(strip $(addprefix $(2),$(notdir $1)))
-endef
-
-# llama.cpp files -> llama/
-LLAMACPP_FILES=\
-	src/unicode.cpp \
-	src/unicode.h \
-	src/unicode-data.cpp \
-	src/unicode-data.h \
-	src/llama.cpp \
-	src/llama-impl.h \
-	src/llama-vocab.cpp \
-	src/llama-vocab.h \
-	src/llama-grammar.cpp \
-	src/llama-grammar.h \
-	src/llama-sampling.cpp \
-	src/llama-sampling.h \
-	include/llama.h \
-	ggml/src/llamafile/sgemm.cpp \
-	ggml/src/llamafile/sgemm.h
-$(foreach name,$(LLAMACPP_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR))))
-
-# llama.cpp files -> llama/llamafile
-LLAMAFILE_FILES= \
-	ggml/src/llamafile/sgemm.h
-$(foreach name,$(LLAMAFILE_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR)llamafile/)))
-
-# ggml files -> llama/
-GGML_FILES= \
-	ggml/src/ggml.c \
-	ggml/include/ggml.h \
-	ggml/src/ggml-quants.c \
-	ggml/src/ggml-quants.h \
-	ggml/src/ggml-metal.metal \
-	ggml/include/ggml-metal.h \
-	ggml/src/ggml-impl.h \
-	ggml/include/ggml-cuda.h \
-	ggml/src/ggml-cuda.cu \
-	ggml/src/ggml-common.h \
-	ggml/include/ggml-backend.h \
-	ggml/src/ggml-backend.c \
-	ggml/src/ggml-backend-impl.h \
-	ggml/include/ggml-alloc.h \
-	ggml/src/ggml-alloc.c \
-	ggml/src/ggml-aarch64.h \
-	ggml/src/ggml-aarch64.c \
-	ggml/src/ggml-cpu-impl.h \
-	ggml/include/ggml-blas.h \
-	ggml/src/ggml-blas.cpp
-$(foreach name,$(GGML_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR))))
-
-# TODO generalize renaming pattern if we have more of these
-$(DST_DIR)ggml-metal_darwin_arm64.m : $(LLAMACPP_REPO)ggml/src/ggml-metal.m
-	@echo "vendoring $(subst $(LLAMACPP_REPO),,$<)"; \
-		mkdir -p $(dir $@) && \
-		echo "/**" > $@ && \
-		echo " * llama.cpp - commit $(LLAMACPP_BASE_COMMIT) - do not edit this file" >> $@ && \
-		echo " *" >> $@ && \
-		sed 's/^/ * /' <$(LLAMACPP_REPO)/LICENSE | sed 's/ *$$//' >> $@ && \
-		echo " */" >> $@ && \
-		echo "" >> $@ && \
-		cat $< >> $@
-VENDORED_FILES += $(DST_DIR)ggml-metal_darwin_arm64.m
-
-# ggml-cuda -> llama/ggml-cuda/
-GGML_CUDA_FILES= ggml/src/ggml-cuda/*.cu ggml/src/ggml-cuda/*.cuh
-GGML_CUDA_FILES_EXPANDED = $(addprefix ggml/src/ggml-cuda/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_CUDA_FILES)))))
-$(foreach name,$(GGML_CUDA_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DST_DIR)ggml-cuda/)))
-
-GGML_TEMPLATE_FILES= ggml/src/ggml-cuda/template-instances/*.cu
-GGML_TEMPLATE_FILES_EXPANDED = 	$(addprefix ggml/src/ggml-cuda/template-instances/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_TEMPLATE_FILES)))))
-$(foreach name,$(GGML_TEMPLATE_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DST_DIR)ggml-cuda/template-instances/)))
-
-GGML_VENDOR_FILES= ggml/src/ggml-cuda/vendors/*.h
-GGML_VENDOR_FILES_EXPANDED=$(addprefix ggml/src/ggml-cuda/vendors/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_VENDOR_FILES)))))
-$(foreach name,$(GGML_VENDOR_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DST_DIR)ggml-cuda/vendors/)))
-
-# llava -> llama/
-LAVA_FILES= \
-	examples/llava/clip.cpp \
-	examples/llava/clip.h \
-	examples/llava/llava.cpp \
-	examples/llava/llava.h \
-	common/log.h \
-	common/log.cpp \
-	common/stb_image.h
-# These files are mostly used by the llava code
-# and shouldn't be necessary once we use clip.cpp directly
-LAVA_FILES+= \
-	common/common.cpp \
-	common/common.h \
-	common/sampling.cpp \
-	common/sampling.h \
-	common/json.hpp \
-	common/json-schema-to-grammar.cpp \
-	common/json-schema-to-grammar.h \
-	common/base64.hpp
-$(foreach name,$(LAVA_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR))))
-
-$(DST_DIR)build-info.cpp:
-	@echo "Generating $@"
-	@echo "int LLAMA_BUILD_NUMBER = 0;" > $@
-	@echo "char const *LLAMA_COMMIT = \"$(LLAMACPP_BASE_COMMIT)\";" >> $@
-	@echo "char const *LLAMA_COMPILER = \"\";" >> $@
-	@echo "char const *LLAMA_BUILD_TARGET = \"\";" >> $@
-VENDORED_FILES += $(DST_DIR)build-info.cpp
-
-
-sync: $(LLAMACPP_REPO) .WAIT $(VENDORED_FILES) .WAIT remove-stale-files
-
-PATS=*.c *.h *.cpp *.m *.metal *.cu *.cuh
-NATIVE_DIRS=$(DST_DIR) $(DST_DIR)llamafile/ $(DST_DIR)ggml-cuda/ $(DST_DIR)ggml-cuda/template-instances/ $(DST_DIR)ggml-cuda/vendors/
-ALL_NATIVE_FILES=$(foreach dir,$(NATIVE_DIRS),$(wildcard $(addprefix $(dir),$(PATS))))
-EXTRA_NATIVE_FILES=$(filter-out $(VENDORED_FILES) $(addprefix $(DST_DIR),$(OLLAMA_NATIVE_FILES)), $(ALL_NATIVE_FILES))
-remove-stale-files:
-	@rm -f $(EXTRA_NATIVE_FILES)
-
-.PHONY: help-sync apply-patches sync create-patches remove-stale-fails .WAIT 
-
-
-# Handy debugging for make variables
-print-%:
-	@echo '$*=$($*)'
--- a/llama/make/common-defs.make
+++ b/llama/make/common-defs.make
@@ -1,84 +0,0 @@
-# Common definitions for the various Makefiles
-# No rules are defined here so this is safe to include at the beginning of other makefiles
-
-OS := $(shell uname -s)
-ARCH ?= $(subst aarch64,arm64,$(subst x86_64,amd64,$(shell uname -m)))
-ifneq (,$(findstring MINGW,$(OS))$(findstring MSYS,$(OS)))
-	OS := windows
-	ARCH := $(shell systeminfo 2>/dev/null | grep "System Type" | grep ARM64 > /dev/null && echo "arm64" || echo "amd64" )
-else ifeq ($(OS),Linux)
-	OS := linux
-else ifeq ($(OS),Darwin)
-	OS := darwin
-endif
-comma:= ,
-empty:=
-space:= $(empty) $(empty)
-uc = $(subst a,A,$(subst b,B,$(subst c,C,$(subst d,D,$(subst e,E,$(subst f,F,$(subst g,G,$(subst h,H,$(subst i,I,$(subst j,J,$(subst k,K,$(subst l,L,$(subst m,M,$(subst n,N,$(subst o,O,$(subst p,P,$(subst q,Q,$(subst r,R,$(subst s,S,$(subst t,T,$(subst u,U,$(subst v,V,$(subst w,W,$(subst x,X,$(subst y,Y,$(subst z,Z,$1))))))))))))))))))))))))))
-
-export CGO_CFLAGS_ALLOW = -mfma|-mf16c
-export CGO_CXXFLAGS_ALLOW = -mfma|-mf16c
-export HIP_PLATFORM = amd
-export CGO_ENABLED=1
-
-SRC_DIR := $(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))
-BUILD_DIR = $(SRC_DIR)build/$(OS)-$(ARCH)
-DIST_BASE = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH))
-DIST_LIB_DIR = $(DIST_BASE)/lib/ollama
-RUNNERS_DIST_DIR = $(DIST_LIB_DIR)/runners
-RUNNERS_PAYLOAD_DIR = $(abspath $(SRC_DIR)/../build/$(OS)/$(ARCH))
-RUNNERS_BUILD_DIR = $(BUILD_DIR)/runners
-DEFAULT_RUNNER := $(if $(and $(filter darwin,$(OS)),$(filter arm64,$(ARCH))),metal,cpu)
-GZIP:=$(shell command -v pigz 2>/dev/null || echo "gzip")
-ifneq ($(OS),windows)
-	CCACHE:=$(shell command -v ccache 2>/dev/null || echo "")
-endif
-VERSION?=$(shell git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")
-
-# Conditionally enable ccache for cgo builds too
-ifneq ($(CCACHE),)
-	CC=$(CCACHE) gcc
-	CXX=$(CCACHE) g++
-	export CC
-	export CXX
-endif
-
-
-# Override in environment space separated to tune GPU runner CPU vector flags
-ifeq ($(ARCH),amd64)
-	GPU_RUNNER_CPU_FLAGS ?= avx
-endif
-
-ifeq ($(OS),windows)
-	CP := cp
-	SRC_DIR := $(shell cygpath -m -s "$(SRC_DIR)")
-	OBJ_EXT := obj
-	SHARED_EXT := dll
-	EXE_EXT := .exe
-	SHARED_PREFIX := 
-	CPU_FLAG_PREFIX := /arch:
-ifneq ($(HIP_PATH),)
-	# If HIP_PATH has spaces, hipcc trips over them when subprocessing
-	HIP_PATH := $(shell cygpath -m -s "$(patsubst %\,%,$(HIP_PATH))")
-	export HIP_PATH
-endif
-else ifeq ($(OS),linux)
-	CP := cp -af
-	OBJ_EXT := o
-	SHARED_EXT := so
-	SHARED_PREFIX := lib
-	CPU_FLAG_PREFIX := -m
-	HIP_PATH?=/opt/rocm
-else
-	OBJ_EXT := o
-	SHARED_EXT := so
-	CPU_FLAG_PREFIX := -m
-	CP := cp -af
-endif
-
-COMMON_SRCS := \
-	$(wildcard *.c) \
-	$(wildcard *.cpp)
-COMMON_HDRS := \
-	$(wildcard *.h) \
-	$(wildcard *.hpp)
--- a/llama/make/cuda.make
+++ b/llama/make/cuda.make
@@ -1,50 +0,0 @@
-# Common definitions for all cuda versions
-
-ifndef GPU_RUNNER_VARIANT
-dummy:
-	$(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables)
-endif
-
-
-GPU_RUNNER_NAME := cuda$(GPU_RUNNER_VARIANT)
-GPU_RUNNER_GO_TAGS := cuda cuda$(GPU_RUNNER_VARIANT)
-GPU_RUNNER_DRIVER_LIB_LINK := -lcuda
-GPU_RUNNER_LIBS_SHORT := cublas cudart cublasLt
-GPU_LIB_DIR_WIN = $(GPU_PATH_ROOT_WIN)/bin
-GPU_LIB_DIR_LINUX = $(GPU_PATH_ROOT_LINUX)/lib64
-CGO_EXTRA_LDFLAGS_WIN = -L"$(GPU_PATH_ROOT_WIN)/lib/x64"
-GPU_COMPILER_WIN = $(GPU_PATH_ROOT_WIN)/bin/nvcc
-GPU_COMPILER_LINUX = $(GPU_PATH_ROOT_LINUX)/bin/nvcc
-GPU_COMPILER_CFLAGS_WIN = $(CFLAGS) -D_WIN32_WINNT=0x602
-GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
-GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602
-GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
-GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT)*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
-GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
-
-ifeq ($(OS),linux)
-	CUDA_PATH?=/usr/local/cuda
-	GPU_COMPILER_FPIC = -fPIC -Wno-unused-function -std=c++11
-endif
-GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(CUDA_ARCHITECTURES)),--generate-code=arch=compute_$(arch)$(comma)code=[compute_$(arch)$(comma)sm_$(arch)]) \
-	-DGGML_CUDA_USE_GRAPHS=1
-GPU_COMPILER_CUFLAGS = \
-	$(GPU_COMPILER_FPIC) \
-	-Xcompiler "$(addprefix $(CPU_FLAG_PREFIX),$(_OS_GPU_RUNNER_CPU_FLAGS))" \
-	-t2 \
-	-DGGML_CUDA_DMMV_X=32 \
-	-DGGML_CUDA_MMV_Y=1 \
-	-DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
-	-DGGML_USE_CUDA=1 \
-	-DGGML_SHARED=1 \
-	-DGGML_BUILD=1 \
-	-DGGML_USE_LLAMAFILE \
-	-DK_QUANTS_PER_ITERATION=2 \
-	-DNDEBUG \
-	-D_GNU_SOURCE \
-	-D_XOPEN_SOURCE=600 \
-	-Wno-deprecated-gpu-targets \
-	--forward-unknown-to-host-compiler \
-	-use_fast_math \
-	-I. \
-	-O3
--- a/llama/make/gpu.make
+++ b/llama/make/gpu.make
@@ -1,122 +0,0 @@
-# Generalized GPU runner build
-
-ifndef GPU_RUNNER_NAME
-dummy:
-	$(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables)
-endif
-
-ifeq ($(OS),windows)
-	GPU_COMPILER:=$(GPU_COMPILER_WIN)
-	GPU_LIB_DIR:=$(GPU_LIB_DIR_WIN)
-	CGO_EXTRA_LDFLAGS:=$(CGO_EXTRA_LDFLAGS_WIN)
-	GPU_COMPILER_CFLAGS = $(GPU_COMPILER_CFLAGS_WIN)
-	GPU_COMPILER_CXXFLAGS = $(GPU_COMPILER_CXXFLAGS_WIN)
-else ifeq ($(OS),linux)
-	GPU_COMPILER:=$(GPU_COMPILER_LINUX)
-	GPU_LIB_DIR:=$(GPU_LIB_DIR_LINUX)
-	CGO_EXTRA_LDFLAGS:=$(CGO_EXTRA_LDFLAGS_LINUX)
-	GPU_COMPILER_CFLAGS = $(GPU_COMPILER_CFLAGS_LINUX)
-	GPU_COMPILER_CXXFLAGS = $(GPU_COMPILER_CXXFLAGS_LINUX)
-endif
-
-GPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" \"-X=github.com/ollama/ollama/llama.CpuFeatures=$(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS))\" $(TARGET_LDFLAGS)"
-
-# TODO Unify how we handle dependencies in the dist/packaging and install flow
-# today, cuda is bundled, but rocm is split out.  Should split them each out by runner
-DIST_GPU_RUNNER_DEPS_DIR = $(DIST_LIB_DIR)
-
-ifeq ($(OS),windows)
-	_OS_GPU_RUNNER_CPU_FLAGS=$(call uc,$(GPU_RUNNER_CPU_FLAGS))
-else ifeq ($(OS),linux)
-	_OS_GPU_RUNNER_CPU_FLAGS=$(GPU_RUNNER_CPU_FLAGS)
-endif
-
-GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))
-DIST_GPU_RUNNER_LIB_DEPS = $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_RUNNER_LIBS)))
-
-GPU_RUNNER_SRCS := \
-	ggml-cuda.cu \
-	$(filter-out $(wildcard ggml-cuda/fattn*.cu),$(wildcard ggml-cuda/*.cu)) \
-	$(wildcard ggml-cuda/template-instances/mmq*.cu) \
-	ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp ggml-aarch64.c
-GPU_RUNNER_HDRS := \
-	$(wildcard ggml-cuda/*.cuh)
-
-
-# Conditional flags and components to speed up developer builds
-ifneq ($(OLLAMA_FAST_BUILD),)
-	GPU_COMPILER_CUFLAGS += 	\
-		-DGGML_DISABLE_FLASH_ATTN
-else
-	GPU_RUNNER_SRCS += \
-		$(wildcard ggml-cuda/fattn*.cu) \
-		$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu) \
-		$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \
-		$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \
-		$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu)
-endif
-
-GPU_RUNNER_OBJS := $(GPU_RUNNER_SRCS:.cu=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
-GPU_RUNNER_OBJS := $(GPU_RUNNER_OBJS:.c=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
-GPU_RUNNER_OBJS := $(addprefix $(BUILD_DIR)/,$(GPU_RUNNER_OBJS:.cpp=.$(GPU_RUNNER_NAME).$(OBJ_EXT)))
-
-DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)))
-ifneq ($(OS),windows)
-PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT).gz,$(GPU_RUNNER_NAME)))
-endif
-BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)))
-
-
-$(GPU_RUNNER_NAME): $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
-
-# Build targets
-$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cu
-	@-mkdir -p $(dir $@)
-	$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) $(GPU_COMPILER_CUFLAGS) $(GPU_RUNNER_ARCH_FLAGS) -o $@ $<
-$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c
-	@-mkdir -p $(dir $@)
-	$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) -o $@ $<
-$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp
-	@-mkdir -p $(dir $@)
-	$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CXXFLAGS) -o $@ $<
-$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/" $(CGO_EXTRA_LDFLAGS)
-$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
-	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie  $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
-$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
-	@-mkdir -p $(dir $@)
-	$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
-
-# Distribution targets
-$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
-	@-mkdir -p $(dir $@)
-	$(CP) $< $@
-$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_DEPS_LIBS)
-$(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT)
-	@-mkdir -p $(dir $@)
-	$(CP) $< $@
-$(DIST_GPU_RUNNER_LIB_DEPS): 
-	@-mkdir -p $(dir $@)
-	$(CP) $(GPU_LIB_DIR)/$(notdir $@) $(dir $@)
-$(GPU_DIST_DEPS_LIBS): 
-	@-mkdir -p $(dir $@)
-	$(CP) $(dir $(filter %$(notdir $@),$(GPU_LIBS) $(GPU_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@)
-
-# Payload targets
-$(RUNNERS_PAYLOAD_DIR)/%/ollama_llama_server.gz: $(RUNNERS_BUILD_DIR)/%/ollama_llama_server 
-	@-mkdir -p $(dir $@)
-	${GZIP} --best -c $< > $@
-$(RUNNERS_PAYLOAD_DIR)/$(GPU_RUNNER_NAME)/%.gz: $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/%
-	@-mkdir -p $(dir $@)
-	${GZIP} --best -c $< > $@
-
-clean: 
-	rm -f $(GPU_RUNNER_OBJS) $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
-
-.PHONY: clean $(GPU_RUNNER_NAME)
-
-
-# Handy debugging for make variables
-print-%:
-	@echo '$*=$($*)'
-
--- a/llama/runner/cache.go
+++ b/llama/runner/cache.go
@@ -1,4 +1,4 @@
-package main
+package runner

 import (
 	"errors"
--- a/llama/runner/cache_test.go
+++ b/llama/runner/cache_test.go
@@ -1,4 +1,4 @@
-package main
+package runner

 import (
 	"testing"
--- a/llama/runner/image.go
+++ b/llama/runner/image.go
@@ -1,4 +1,4 @@
-package main
+package runner

 import (
 	"errors"
--- a/llama/runner/image_test.go
+++ b/llama/runner/image_test.go
@@ -1,4 +1,4 @@
-package main
+package runner

 import (
 	"reflect"
--- a/llama/runner/requirements.go
+++ b/llama/runner/requirements.go
@@ -1,19 +0,0 @@
-package main
-
-import (
-	"encoding/json"
-	"os"
-
-	"github.com/ollama/ollama/llama"
-	"github.com/ollama/ollama/version"
-)
-
-func printRequirements(fp *os.File) {
-	attrs := map[string]string{
-		"system_info":  llama.PrintSystemInfo(),
-		"version":      version.Version,
-		"cpu_features": llama.CpuFeatures,
-	}
-	enc := json.NewEncoder(fp)
-	_ = enc.Encode(attrs)
-}
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -1,4 +1,4 @@
-package main
+package runner

 import (
 	"context"
@@ -895,32 +895,37 @@ func (s *Server) loadModel(
 	s.ready.Done()
 }

-func main() {
-	mpath := flag.String("model", "", "Path to model binary file")
-	ppath := flag.String("mmproj", "", "Path to projector binary file")
-	parallel := flag.Int("parallel", 1, "Number of sequences to handle simultaneously")
-	batchSize := flag.Int("batch-size", 512, "Batch size")
-	nGpuLayers := flag.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
-	mainGpu := flag.Int("main-gpu", 0, "Main GPU")
-	flashAttention := flag.Bool("flash-attn", false, "Enable flash attention")
-	kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
-	kvCacheType := flag.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
-	port := flag.Int("port", 8080, "Port to expose the server on")
-	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
-	verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
-	noMmap := flag.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
-	mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
-	tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
-	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
-	requirements := flag.Bool("requirements", false, "print json requirement information")
+func Execute(args []string) error {
+	if len(args) > 0 && args[0] == "runner" { // drop the subcommand name; the length guard avoids a panic on empty args
+		args = args[1:]
+	}
+	fs := flag.NewFlagSet("runner", flag.ExitOnError)
+	mpath := fs.String("model", "", "Path to model binary file")
+	ppath := fs.String("mmproj", "", "Path to projector binary file")
+	parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
+	batchSize := fs.Int("batch-size", 512, "Batch size")
+	nGpuLayers := fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
+	mainGpu := fs.Int("main-gpu", 0, "Main GPU")
+	flashAttention := fs.Bool("flash-attn", false, "Enable flash attention")
+	kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
+	kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
+	port := fs.Int("port", 8080, "Port to expose the server on")
+	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
+	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
+	noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
+	mlock := fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
+	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
+	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")

 	var lpaths multiLPath
-	flag.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
+	fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")

-	flag.Parse()
-	if *requirements {
-		printRequirements(os.Stdout)
-		return
+	fs.Usage = func() {
+		fmt.Fprintf(fs.Output(), "Runner usage\n")
+		fs.PrintDefaults()
+	}
+	if err := fs.Parse(args); err != nil {
+		return err
 	}
 	level := slog.LevelInfo
 	if *verbose {
@@ -983,7 +988,8 @@ func main() {
 	listener, err := net.Listen("tcp", addr)
 	if err != nil {
 		fmt.Println("Listen error:", err)
-		return
+		cancel()
+		return err
 	}
 	defer listener.Close()

@@ -999,7 +1005,9 @@ func main() {
 	log.Println("Server listening on", addr)
 	if err := httpServer.Serve(listener); err != nil {
 		log.Fatal("server error:", err)
+		return err
 	}

 	cancel()
+	return nil
 }
--- a/llama/runner/stop.go
+++ b/llama/runner/stop.go
@@ -1,4 +1,4 @@
-package main
+package runner

 import (
 	"strings"
--- a/llama/runner/stop_test.go
+++ b/llama/runner/stop_test.go
@@ -1,4 +1,4 @@
-package main
+package runner

 import (
 	"reflect"