diff --git a/Dockerfile.cpu b/Dockerfile.cpu
new file mode 100644
index 000000000..dd6926df8
--- /dev/null
+++ b/Dockerfile.cpu
@@ -0,0 +1,35 @@
+# Dockerfile variant to ensure we can build CPU only on linux
+FROM --platform=linux/amd64 ubuntu:20.04 AS base-cpu-amd64
+ENV CMAKE_ARCH "x86_64"
+
+FROM --platform=linux/arm64 ubuntu:20.04 AS base-cpu-arm64
+ENV CMAKE_ARCH "aarch64"
+
+FROM base-cpu-${TARGETARCH} AS cpu-builder
+ARG TARGETARCH
+ARG GOFLAGS
+ARG CGO_CFLAGS
+
+# Common toolchain
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y wget make gcc-10 g++-10 cpp-10 git ocl-icd-opencl-dev && \
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 --slave /usr/bin/g++ g++ /usr/bin/g++-10 --slave /usr/bin/gcov gcov /usr/bin/gcov-10
+RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-${CMAKE_ARCH}.sh" -O /tmp/cmake-installer.sh && \
+    chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr
+
+# install go
+ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz
+RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.3.tar.gz
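Two architecture-specific base stages feed a common cpu-builder stage: under BuildKit, TARGETARCH is pre-populated from --platform, so FROM base-cpu-${TARGETARCH} resolves to base-cpu-amd64 or base-cpu-arm64, and CMAKE_ARCH carries the matching CMake installer suffix. A sketch of how this file might be driven; the buildx invocation and image tag are illustrative, not part of the patch:

# buildx sets TARGETARCH for each platform in the list
docker buildx build \
    --platform linux/amd64,linux/arm64 \
    --target cpu-builder \
    -f Dockerfile.cpu \
    -t ollama-cpu-builder .    # hypothetical tag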
diff --git a/gpu/gpu_info_cuda.c b/gpu/gpu_info_cuda.c
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
@@ -20,6 +20,7 @@ const char *cuda_lib_paths[] = {
 #endif
 
 void cuda_init(cuda_init_resp_t *resp) {
+  nvmlReturn_t ret;
   resp->err = NULL;
   const int buflen = 256;
   char buf[buflen + 1];
@@ -56,6 +57,13 @@ void cuda_init(cuda_init_resp_t *resp) {
       return;
     }
   }
+
+  ret = (*resp->ch.initFn)();
+  if (ret != NVML_SUCCESS) {
+    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
+    resp->err = strdup(buf);
+  }
+
   return;
 }
 
@@ -73,17 +81,9 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
     return;
   }
 
-  ret = (*h.initFn)();
-  if (ret != NVML_SUCCESS) {
-    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
-    resp->err = strdup(buf);
-    return;
-  }
-
   // TODO - handle multiple GPUs
   ret = (*h.getHandle)(0, &device);
   if (ret != NVML_SUCCESS) {
-    (*h.shutdownFn)();
     snprintf(buf, buflen, "unable to get device handle: %d", ret);
     resp->err = strdup(buf);
     return;
@@ -91,20 +91,12 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
 
   ret = (*h.getMemInfo)(device, &memInfo);
   if (ret != NVML_SUCCESS) {
-    (*h.shutdownFn)();
     snprintf(buf, buflen, "device memory info lookup failure: %d", ret);
     resp->err = strdup(buf);
     return;
   }
   resp->total = memInfo.total;
   resp->free = memInfo.free;
-
-  ret = (*h.shutdownFn)();
-  if (ret != NVML_SUCCESS) {
-    snprintf(buf, buflen, "nvml vram shutdown failure: %d", ret);
-    resp->err = strdup(buf);
-  }
-
   return;
 }
 #endif // __APPLE__
\ No newline at end of file
diff --git a/gpu/gpu_info_rocm.c b/gpu/gpu_info_rocm.c
index 88bd2dad7..e69d5cbac 100644
--- a/gpu/gpu_info_rocm.c
+++ b/gpu/gpu_info_rocm.c
@@ -20,6 +20,7 @@ const char *rocm_lib_paths[] = {
 #endif
 
 void rocm_init(rocm_init_resp_t *resp) {
+  rsmi_status_t ret;
   resp->err = NULL;
   const int buflen = 256;
   char buf[buflen + 1];
@@ -56,6 +57,13 @@ void rocm_init(rocm_init_resp_t *resp) {
       return;
     }
   }
+
+  ret = (*resp->rh.initFn)(0);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    snprintf(buf, buflen, "rocm vram init failure: %d", ret);
+    resp->err = strdup(buf);
+  }
+
   return;
 }
 
@@ -70,10 +78,8 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
   char buf[buflen + 1];
   int i;
 
-  ret = (*h.initFn)(0);
-  if (ret != RSMI_STATUS_SUCCESS) {
-    snprintf(buf, buflen, "rocm vram init failure: %d", ret);
-    resp->err = strdup(buf);
+  if (h.handle == NULL) {
+    resp->err = strdup("rocm handle isn't initialized");
     return;
   }
 
@@ -89,20 +95,17 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
   // Get total memory - used memory for available memory
   ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
   if (ret != RSMI_STATUS_SUCCESS) {
-    (*h.shutdownFn)();
     snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
     resp->err = strdup(buf);
     return;
   }
   ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
   if (ret != RSMI_STATUS_SUCCESS) {
-    (*h.shutdownFn)();
     snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
     resp->err = strdup(buf);
     return;
   }
-  (*h.shutdownFn)();
   resp->total = totalMem;
   resp->free = totalMem - usedMem;
   return;
diff --git a/llm/ext_server.go b/llm/ext_server.go
index ded424a9b..5fcd8e923 100644
--- a/llm/ext_server.go
+++ b/llm/ext_server.go
@@ -21,17 +21,7 @@ package llm
 #cgo linux CFLAGS: -D_GNU_SOURCE
 #cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
 #cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/examples/server/libext_server.a
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/common/libcommon.a
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libllama.a
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libggml_static.a
-
-// Note: the following requires cuda library presence on linux to build, even if you only have rocm or CPU only
-#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudart_static.a
-#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublas_static.a
-#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublasLt_static.a
-#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudadevrt.a
-#cgo linux LDFLAGS: /usr/local/cuda/lib64/libculibos.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libollama.a
 #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
 #cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincuda/dist/bin
 #cgo windows LDFLAGS: -lext_server_shared -lpthread
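With the individual llama.cpp and CUDA static libraries folded into a single libollama.a, that archive is now the only one the Linux link pulls in, so a machine without the CUDA toolkit can still satisfy the cgo link line. A hypothetical sanity check after running the generator, using the path from the LDFLAGS line above:

# list archive members to confirm the server, common, llama, and ggml
# objects (plus the CUDA ones, when built with CUDA) were aggregated
ar t llm/llama.cpp/gguf/build/cuda/libollama.a | head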
/usr/local/cuda/lib64/libcudadevrt.a \ -# /usr/local/cuda/lib64/libculibos.a + +if [ -d /usr/local/cuda/lib64/ ] ; then + pwd + ar -M <