From e91ae3d47d8153c4b7c10dba031b77d7ae408ef0 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 25 Feb 2025 13:47:36 -0800 Subject: [PATCH] Update ROCm (6.3 linux, 6.2 windows) and CUDA v12.8 (#9304) * Bump cuda and rocm versions Update ROCm to linux:6.3 win:6.2 and CUDA v12 to 12.8. Yum has some silent failure modes, so largely switch to dnf. * Fix windows build script --- .github/workflows/release.yaml | 8 +-- .github/workflows/test.yaml | 2 +- Dockerfile | 28 +++++----- scripts/build_docker.sh | 2 +- scripts/build_linux.sh | 34 ++++++++++-- scripts/build_windows.ps1 | 94 +++++++++++++++++++--------------- 6 files changed, 104 insertions(+), 64 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 37d525e91..12f361408 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -111,13 +111,13 @@ jobs: - os: windows arch: amd64 preset: 'CUDA 12' - install: https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe - cuda-version: '12.4' + install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe + cuda-version: '12.8' - os: windows arch: amd64 preset: 'ROCm 6' - install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe - rocm-version: '6.1' + install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe + rocm-version: '6.2' runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }} environment: release env: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 56a2cc4fd..431bc3282 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -81,7 +81,7 @@ jobs: install: https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_522.06_windows.exe flags: '-DCMAKE_CUDA_ARCHITECTURES=87' - preset: ROCm - install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe + install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe flags: '-DAMDGPU_TARGETS=gfx1010' runs-on: windows steps: diff --git a/Dockerfile b/Dockerfile index 09612824b..46d4713e7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,20 +4,22 @@ ARG FLAVOR=${TARGETARCH} ARG ROCMVERSION=6.3.3 ARG JETPACK5VERSION=r35.4.1 -ARG JETPACK6VERSION=r36.2.0 +ARG JETPACK6VERSION=r36.4.0 ARG CMAKEVERSION=3.31.2 +# CUDA v11 requires gcc v10. 
v10.3 has regressions, so the Rocky Linux 8.5 AppStream repo carries the newest compatible release (10.2.1)
 FROM --platform=linux/amd64 rocm/dev-almalinux-8:${ROCMVERSION}-complete AS base-amd64
-RUN sed -i -e 's/mirror.centos.org/vault.centos.org/g' -e 's/^#.*baseurl=http/baseurl=http/g' -e 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo \
-    && yum install -y yum-utils gcc-toolset-11-gcc gcc-toolset-11-gcc-c++ \
-    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \
-    && curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /usr/local/bin --strip-components 1
-ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:$PATH
+RUN yum install -y yum-utils \
+    && yum-config-manager --add-repo https://dl.rockylinux.org/vault/rocky/8.5/AppStream/\$basearch/os/ \
+    && rpm --import https://dl.rockylinux.org/pub/rocky/RPM-GPG-KEY-Rocky-8 \
+    && dnf install -y yum-utils ccache gcc-toolset-10-gcc-10.2.1-8.2.el8 gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \
+    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
+ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 
-FROM --platform=linux/arm64 rockylinux:8 AS base-arm64
+FROM --platform=linux/arm64 almalinux:8 AS base-arm64
 # install epel-release for ccache
 RUN yum install -y yum-utils epel-release \
-    && yum install -y clang ccache \
+    && dnf install -y clang ccache \
     && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
 ENV CC=clang CXX=clang++
 
@@ -29,7 +31,8 @@ COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
 ENV LDFLAGS=-s
 
 FROM base AS cpu
-# amd64 uses gcc which requires gcc-toolset-11 for AVX extensions while arm64 uses clang
+RUN dnf install -y gcc-toolset-11-gcc gcc-toolset-11-gcc-c++
+ENV PATH=/opt/rh/gcc-toolset-11/root/usr/bin:$PATH
 RUN --mount=type=cache,target=/root/.ccache \
     cmake --preset 'CPU' \
     && cmake --build --parallel --preset 'CPU' \
@@ -37,7 +40,7 @@ RUN --mount=type=cache,target=/root/.ccache \
 
 FROM base AS cuda-11
 ARG CUDA11VERSION=11.3
-RUN yum install -y cuda-toolkit-${CUDA11VERSION//./-}
+RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-}
 ENV PATH=/usr/local/cuda-11/bin:$PATH
 RUN --mount=type=cache,target=/root/.ccache \
     cmake --preset 'CUDA 11' \
@@ -45,8 +48,8 @@ RUN --mount=type=cache,target=/root/.ccache \
     && cmake --install build --component CUDA --strip --parallel 8
 
 FROM base AS cuda-12
-ARG CUDA12VERSION=12.4
-RUN yum install -y cuda-toolkit-${CUDA12VERSION//./-}
+ARG CUDA12VERSION=12.8
+RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-}
 ENV PATH=/usr/local/cuda-12/bin:$PATH
 RUN --mount=type=cache,target=/root/.ccache \
     cmake --preset 'CUDA 12' \
@@ -54,6 +57,7 @@ RUN --mount=type=cache,target=/root/.ccache \
     && cmake --install build --component CUDA --strip --parallel 8
 
 FROM base AS rocm-6
+ENV PATH=/opt/rocm/hcc/bin:/opt/rocm/hip/bin:/opt/rocm/bin:$PATH
 RUN --mount=type=cache,target=/root/.ccache \
     cmake --preset 'ROCm 6' \
     && cmake --build --parallel --preset 'ROCm 6' \
diff --git a/scripts/build_docker.sh b/scripts/build_docker.sh
index 567eb7c7a..1dd8d1f68 100755
--- a/scripts/build_docker.sh
+++ b/scripts/build_docker.sh
@@ -28,7 +28,7 @@ if echo $PLATFORM | grep "amd64" > /dev/null; then
         ${LOAD_OR_PUSH} \
         --platform=linux/amd64 \
         ${OLLAMA_COMMON_BUILD_ARGS} \
-        --target runtime-rocm \
+        --build-arg FLAVOR=rocm \
         -f Dockerfile \
         -t ${FINAL_IMAGE_REPO}:$VERSION-rocm \
         .
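
With the dedicated runtime-rocm image target gone, the ROCm variant is selected entirely by the FLAVOR build argument (the Dockerfile defaults it to ${TARGETARCH}). For reference, a standalone build of that flavor looks roughly like the sketch below; the tag name is a placeholder, not one the scripts use:

    # Build the ROCm image flavor directly (illustrative tag)
    docker buildx build \
        --platform=linux/amd64 \
        --build-arg FLAVOR=rocm \
        -f Dockerfile \
        -t ollama-rocm:dev \
        .
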
diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh
index a0c3d2f00..618722d11 100755
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -22,8 +22,34 @@ docker buildx build \
     -f Dockerfile \
     .
 
-# buildx behavior changes for single vs. multiplatform
-if echo $PLATFORM | grep "," > /dev/null ; then
-    mv -f ./dist/linux_*64/ollama* ./dist/
-    rmdir ./dist/linux_*64
+if echo $PLATFORM | grep "amd64" > /dev/null; then
+    outDir="./dist"
+    if echo $PLATFORM | grep "," > /dev/null ; then
+        outDir="./dist/linux_amd64"
+    fi
+    docker buildx build \
+        --output type=local,dest=${outDir} \
+        --platform=linux/amd64 \
+        ${OLLAMA_COMMON_BUILD_ARGS} \
+        --build-arg FLAVOR=rocm \
+        --target archive \
+        -f Dockerfile \
+        .
+fi
+
+# buildx output layout differs for single- vs. multi-platform builds, so bundle each case separately
+echo "Compressing linux tar bundles..."
+if echo $PLATFORM | grep "," > /dev/null ; then
+    tar c -C ./dist/linux_arm64 --exclude cuda_jetpack5 --exclude cuda_jetpack6 . | pigz -9vc >./dist/ollama-linux-arm64.tgz
+    tar c -C ./dist/linux_arm64 ./lib/ollama/cuda_jetpack5 | pigz -9vc >./dist/ollama-linux-arm64-jetpack5.tgz
+    tar c -C ./dist/linux_arm64 ./lib/ollama/cuda_jetpack6 | pigz -9vc >./dist/ollama-linux-arm64-jetpack6.tgz
+    tar c -C ./dist/linux_amd64 --exclude rocm . | pigz -9vc >./dist/ollama-linux-amd64.tgz
+    tar c -C ./dist/linux_amd64 ./lib/ollama/rocm | pigz -9vc >./dist/ollama-linux-amd64-rocm.tgz
+elif echo $PLATFORM | grep "arm64" > /dev/null ; then
+    tar c -C ./dist/ --exclude cuda_jetpack5 --exclude cuda_jetpack6 bin lib | pigz -9vc >./dist/ollama-linux-arm64.tgz
+    tar c -C ./dist/ ./lib/ollama/cuda_jetpack5 | pigz -9vc >./dist/ollama-linux-arm64-jetpack5.tgz
+    tar c -C ./dist/ ./lib/ollama/cuda_jetpack6 | pigz -9vc >./dist/ollama-linux-arm64-jetpack6.tgz
+elif echo $PLATFORM | grep "amd64" > /dev/null ; then
+    tar c -C ./dist/ --exclude rocm bin lib | pigz -9vc >./dist/ollama-linux-amd64.tgz
+    tar c -C ./dist/ ./lib/ollama/rocm | pigz -9vc >./dist/ollama-linux-amd64-rocm.tgz
 fi
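
The bundles above are ordinary gzip streams (pigz is parallel gzip), so they can be checked or unpacked with standard tools. A minimal sketch, assuming the dist/ layout produced by this script:

    # List the contents of the amd64 bundle without extracting it
    pigz -dc ./dist/ollama-linux-amd64.tgz | tar -tf - | head

    # Unpack over a prefix; bin/ollama and lib/ollama land under /usr
    sudo tar -C /usr -xzf ./dist/ollama-linux-amd64.tgz

Shipping ROCm and the Jetson backends as separate add-on tarballs keeps the default download small; the add-ons unpack over the same prefix.
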
diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1
index 68f3b11d4..465cc5518 100644
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -26,6 +26,9 @@ function checkEnv() {
         $MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation
         $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0]
     }
+    if (-Not (get-command -ErrorAction silent ninja)) {
+        $script:NINJA_DIR=(gci -path (Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation -r -fi ninja.exe) | split-path -parent
+    }
     # Locate CUDA versions
     # Note: this assumes every version found will be built
     $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue')
@@ -75,6 +78,7 @@ function checkEnv() {
     } else {
         write-host "Code signing disabled - please set KEY_CONTAINERS to sign and copy ollama_inc.crt to the top of the source tree"
     }
+    $script:JOBS=((Get-CimInstance Win32_ComputerSystem).NumberOfLogicalProcessors)
 }
 
@@ -83,51 +87,57 @@ function buildOllama() {
     Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
     New-Item "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ItemType Directory -ea 0
-
-    # Default first, then conditionall ROCm and cuda v11
-    write-host "Building Default native backend libraries"
-    $env:CMAKE_GENERATOR="ninja"
-    & cmake --preset Default
+    & cmake --fresh --preset CPU --install-prefix $script:DIST_DIR
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    & cmake --build --preset Default -j 12
+    & cmake --build --preset CPU --parallel $script:JOBS
+    if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+    & cmake --install build --component CPU --strip
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    & cmake --install build -j 12
-
-    # TODO - add steps for v11 and ROCm
-    #
-    # if ("$script:CUDA_DIRS".Contains("v11") -and "$script:CUDA_DIRS".Contains("v12")) {
-    #     # We assume the default is v12, so override for v11
-    #     $origCUDA_PATH=$env:CUDA_PATH
-    #     $hashEnv = @{}
-    #     Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
-    #     $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $v11="$_" }}
-    #     write-host "$v11"
-    #     # $env:CUDA_PATH=$hashEnv[$v11]
-    #     # $env:CUDACXX=$hashEnv[$v11]+"\bin\nvcc.exe"
-    #     $env:CUDAToolkit_ROOT=$hashEnv[$v11]
-    #     # ls env:
-    #     write-host "Building CUDA v11 backend libraries"
-    #     & cmake --preset "CUDA 11"
-    #     $env:CUDA_PATH=$origCUDA_PATH
-    #     exit(1)
-    #     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    #     # & cmake --build --preset "CUDA 11" -j 12
-    #     # if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    # }
-    # if ($env:HIP_PATH) {
-    #     write-host "Building ROCm backend libraries"
-    #     $env:HIPCXX="${env:HIP_PATH}\bin\clang++.exe"
-    #     $env:HIP_PLATFORM="amd"
-    #     $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-    #     & cmake --preset "ROCm"
-    #     $env:HIPCXX=""
-    #     $env:HIP_PLATFORM=""
-    #     $env:CMAKE_PREFIX_PATH=""
-    #     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    #     & cmake --build --preset "ROCm" -j 12
-    #     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    # }
+    $hashEnv = @{}
+    Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value }
+    if ("$script:CUDA_DIRS".Contains("v11")) {
+        $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $v11="$_" }}
+        $env:CUDAToolkit_ROOT=$hashEnv[$v11]
+        write-host "Building CUDA v11 backend libraries"
+        # Note: cuda v11 requires msvc 2019 so force the older generator
+        # to avoid 2022 (or newer) from being used as the default
+        & cmake --fresh --preset "CUDA 11" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        & cmake --build --preset "CUDA 11" --parallel $script:JOBS
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        & cmake --install build --component "CUDA" --strip
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+    }
+    if ("$script:CUDA_DIRS".Contains("v12")) {
+        $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12")) { $v12="$_" }}
+        $env:CUDAToolkit_ROOT=$hashEnv[$v12]
+        write-host "Building CUDA v12 backend libraries"
+        & cmake --fresh --preset "CUDA 12" --install-prefix $script:DIST_DIR
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        & cmake --build --preset "CUDA 12" --parallel $script:JOBS
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        & cmake --install build --component "CUDA" --strip
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+    }
+    if ($env:HIP_PATH) {
+        write-host "Building ROCm backend libraries"
+        if ($null -ne $script:NINJA_DIR) {
+            $env:PATH="$script:NINJA_DIR;$env:PATH"
+        }
+        $env:HIPCXX="${env:HIP_PATH}\bin\clang++.exe"
+        $env:HIP_PLATFORM="amd"
+        $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+        & cmake --fresh --preset "ROCm 6" -G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ --install-prefix $script:DIST_DIR
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        $env:HIPCXX=""
+        $env:HIP_PLATFORM=""
+        $env:CMAKE_PREFIX_PATH=""
+        & cmake --build --preset "ROCm 6" --parallel $script:JOBS
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
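+        # Install just the HIP component; the CPU and CUDA components were
+        # installed by the earlier passes into the same $script:DIST_DIR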
+ & cmake --install build --component "HIP" --strip + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + } } else { write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set" }
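
A final note on the Dockerfile stages: cuda-toolkit-${CUDA12VERSION//./-} uses shell parameter expansion (/bin/sh is bash on these EL8 base images) to derive the RPM package name, replacing every dot in the version with a dash. A quick illustration:

    # ${VAR//./-} replaces every "." with "-"
    CUDA12VERSION=12.8
    echo "cuda-toolkit-${CUDA12VERSION//./-}"   # prints: cuda-toolkit-12-8

So bumping the ARG from 12.4 to 12.8 is enough to select the matching cuda-toolkit-12-8 package from NVIDIA's rhel8 repository.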