From 9ad2e7dfd2215cb89c80b8e91a1514b3e8353391 Mon Sep 17 00:00:00 2001
From: Nikita Sarychev <42014488+sARY77@users.noreply.github.com>
Date: Sat, 4 Jan 2025 16:38:35 -0800
Subject: [PATCH 1/5] Remove obsolete HIP workaround

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c180adc84b861..feafb93a84e3b 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -119,13 +119,6 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
 }
 
 static ggml_cuda_device_info ggml_cuda_init() {
-#ifdef __HIP_PLATFORM_AMD__
-    // Workaround for a rocBLAS bug when using multiple graphics cards:
-    // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
-    rocblas_initialize();
-    CUDA_CHECK(cudaDeviceSynchronize());
-#endif
-
     ggml_cuda_device_info info = {};
 
     cudaError_t err = cudaGetDeviceCount(&info.device_count);

From 7aba1f9cf3f0f2b6f6bc8c6a9fda69c5b1b45b97 Mon Sep 17 00:00:00 2001
From: Nikita Sarychev <42014488+sARY77@users.noreply.github.com>
Date: Sun, 5 Jan 2025 19:51:00 -0800
Subject: [PATCH 2/5] Remove more references to rocBLAS

---
 .devops/nix/package.nix          | 1 -
 .devops/rocm.Dockerfile          | 2 +-
 .github/workflows/build.yml      | 5 +----
 Makefile                         | 2 +-
 cmake/llama-config.cmake.in      | 3 +--
 ggml/src/ggml-cuda/vendors/hip.h | 4 ----
 ggml/src/ggml-hip/CMakeLists.txt | 3 +--
 7 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index 043c4364b956a..8558ef9d813a5 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -93,7 +93,6 @@ let
   rocmBuildInputs = with rocmPackages; [
     clr
     hipblas
-    rocblas
   ];
 
   vulkanBuildInputs = [
diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile
index a8088ea00da5b..2be58f3d6e11b 100644
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -12,7 +12,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
-# This is mostly tied to rocBLAS supported archs.
+# This is mostly tied to HIP supported archs.
 # gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
 # gfx906 is deprecated
 #check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 602cf5220e483..2128eab4fb200 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -344,7 +344,7 @@ jobs:
         id: depends
         run: |
           sudo apt-get update
-          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
+          sudo apt-get install -y build-essential git cmake hipblas-dev
 
       - name: Build with native CMake HIP support
         id: cmake_build
@@ -1105,10 +1105,7 @@ jobs:
           $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
           cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
           cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
-          md "build\bin\rocblas\library\"
           cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
-          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
-          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
 
       - name: Determine tag name
         id: tag
diff --git a/Makefile b/Makefile
index 19ae0d5f1c87b..435241ea886c5 100644
--- a/Makefile
+++ b/Makefile
@@ -781,7 +781,7 @@ endif # GGML_HIP_UMA
 
 	MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
 	MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
-	MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
+	MK_LDFLAGS += -lhipblas -lamdhip64
 
 	HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
 
diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in
index 5c55bc6b822a6..72b263ed60f55 100644
--- a/cmake/llama-config.cmake.in
+++ b/cmake/llama-config.cmake.in
@@ -140,8 +140,7 @@ if (NOT LLAMA_SHARED_LIB)
     if (GGML_HIP)
         find_package(hip     REQUIRED)
         find_package(hipblas REQUIRED)
-        find_package(rocblas REQUIRED)
-        list(APPEND _llama_link_deps hip::host roc::rocblas roc::hipblas)
+        list(APPEND _llama_link_deps hip::host roc::hipblas)
     endif()
 
     if (GGML_SYCL)
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 3205534d66f10..5a5d1eeb16ae6 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -3,10 +3,6 @@
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
-#ifdef __HIP_PLATFORM_AMD__
-// for rocblas_initialize()
-#include "rocblas/rocblas.h"
-#endif // __HIP_PLATFORM_AMD__
 #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt
index b15fbd24d6b36..736c0049cf9b0 100644
--- a/ggml/src/ggml-hip/CMakeLists.txt
+++ b/ggml/src/ggml-hip/CMakeLists.txt
@@ -38,7 +38,6 @@ endif()
 
 find_package(hip     REQUIRED)
 find_package(hipblas REQUIRED)
-find_package(rocblas REQUIRED)
 
 message(STATUS "HIP and hipBLAS found")
 
@@ -101,4 +100,4 @@ if (GGML_STATIC)
     message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
 endif()
 
-target_link_libraries(ggml-hip PRIVATE ggml-base hip::host roc::rocblas roc::hipblas)
+target_link_libraries(ggml-hip PRIVATE ggml-base hip::host roc::hipblas)

From cbf779c45032b8e67827fa8333b72b6b0267c8d1 Mon Sep 17 00:00:00 2001
From: Nikita Sarychev <42014488+sARY77@users.noreply.github.com>
Date: Sat, 25 Jan 2025 17:17:21 -0800
Subject: [PATCH 3/5] Temporarily add logging of free device memory at the end
 of main

---
 examples/main/main.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index da2a03ab9ba10..e96f827f7f3c7 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -3,6 +3,10 @@
 #include "console.h"
 #include "log.h"
 #include "sampling.h"
+// vvv REMOVE BEFORE MERGING
+#include "llama-model.h"
+#include "llama-impl.h"
+// ^^^ REMOVE BEFORE MERGING
 #include "llama.h"
 #include "chat-template.hpp"
 
@@ -912,6 +916,13 @@ int main(int argc, char ** argv) {
     }
 
     LOG("\n\n");
+    // vvv REMOVE BEFORE MERGING
+    for (auto * dev : model->devices) {
+        size_t free, total; // NOLINT
+        ggml_backend_dev_memory(dev, &free, &total);
+        LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
+    }
+    // ^^^ REMOVE BEFORE MERGING
     common_perf_print(ctx, smpl);
 
     common_sampler_free(smpl);

From bb37819954c472297673ce834513890aad024e55 Mon Sep 17 00:00:00 2001
From: Nikita Sarychev <42014488+sARY77@users.noreply.github.com>
Date: Sun, 26 Jan 2025 14:44:04 -0800
Subject: [PATCH 4/5] Address PR feedback

---
 .devops/nix/package.nix          |  1 +
 .devops/rocm.Dockerfile          |  2 +-
 .github/workflows/build.yml      |  5 ++++-
 Makefile                         |  2 +-
 examples/main/main.cpp           | 11 -----------
 ggml/src/ggml-cuda/ggml-cuda.cu  | 14 ++++++++++++++
 ggml/src/ggml-cuda/vendors/hip.h |  4 ++++
 ggml/src/ggml-hip/CMakeLists.txt |  3 ++-
 8 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
index 8558ef9d813a5..043c4364b956a 100644
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -93,6 +93,7 @@ let
   rocmBuildInputs = with rocmPackages; [
     clr
     hipblas
+    rocblas
   ];
 
   vulkanBuildInputs = [
diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile
index 2be58f3d6e11b..a8088ea00da5b 100644
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -12,7 +12,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
-# This is mostly tied to HIP supported archs.
+# This is mostly tied to rocBLAS supported archs.
 # gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
 # gfx906 is deprecated
 #check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7dd4111aef0ba..cd8422f8a266c 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -361,7 +361,7 @@ jobs:
         id: depends
         run: |
           sudo apt-get update
-          sudo apt-get install -y build-essential git cmake hipblas-dev
+          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
 
       - name: Build with native CMake HIP support
         id: cmake_build
@@ -1125,7 +1125,10 @@ jobs:
             -DGGML_HIP=ON `
             -DGGML_RPC=ON
           cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
+          md "build\bin\rocblas\library\"
           cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
 
       - name: Determine tag name
         id: tag
diff --git a/Makefile b/Makefile
index 1c9b92bf3832c..295522ba356b4 100644
--- a/Makefile
+++ b/Makefile
@@ -781,7 +781,7 @@ endif # GGML_HIP_UMA
 
 	MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
 	MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
-	MK_LDFLAGS += -lhipblas -lamdhip64
+	MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
 
 	HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
 
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index e96f827f7f3c7..da2a03ab9ba10 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -3,10 +3,6 @@
 #include "console.h"
 #include "log.h"
 #include "sampling.h"
-// vvv REMOVE BEFORE MERGING
-#include "llama-model.h"
-#include "llama-impl.h"
-// ^^^ REMOVE BEFORE MERGING
 #include "llama.h"
 #include "chat-template.hpp"
 
@@ -916,13 +912,6 @@ int main(int argc, char ** argv) {
     }
 
     LOG("\n\n");
-    // vvv REMOVE BEFORE MERGING
-    for (auto * dev : model->devices) {
-        size_t free, total; // NOLINT
-        ggml_backend_dev_memory(dev, &free, &total);
-        LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
-    }
-    // ^^^ REMOVE BEFORE MERGING
     common_perf_print(ctx, smpl);
 
     common_sampler_free(smpl);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 29d0177c8bc7c..c895381ecae11 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -120,6 +120,20 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
 }
 
 static ggml_cuda_device_info ggml_cuda_init() {
+#ifdef __HIP_PLATFORM_AMD__
+    // Workaround for a rocBLAS bug when using multiple graphics cards:
+    // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+    {
+        char version_string[64];
+        version_string[0] = '\0';
+        const rocblas_status status = rocblas_get_version_string(version_string, sizeof(version_string));
+        if (status != rocblas_status_success || version_string[0] < '4') {
+            rocblas_initialize();
+            CUDA_CHECK(cudaDeviceSynchronize());
+        }
+    }
+#endif
+
     ggml_cuda_device_info info = {};
 
     cudaError_t err = cudaGetDeviceCount(&info.device_count);
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index ef7cd58208b86..8594093f052ef 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -4,6 +4,10 @@
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
 #include <hip/hip_bfloat16.h>
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+#endif // __HIP_PLATFORM_AMD__
 #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt
index 46434f79091c8..ecc3bc66d44c0 100644
--- a/ggml/src/ggml-hip/CMakeLists.txt
+++ b/ggml/src/ggml-hip/CMakeLists.txt
@@ -38,6 +38,7 @@ endif()
 
 find_package(hip     REQUIRED)
 find_package(hipblas REQUIRED)
+find_package(rocblas REQUIRED)
 
 message(STATUS "HIP and hipBLAS found")
 
@@ -110,4 +111,4 @@ if (GGML_STATIC)
     message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
 endif()
 
-target_link_libraries(ggml-hip PRIVATE ggml-base hip::host roc::hipblas)
+target_link_libraries(ggml-hip PRIVATE ggml-base hip::host roc::rocblas roc::hipblas)

From 61d341f8180071177595de39bf9bb23edc85c75b Mon Sep 17 00:00:00 2001
From: Nikita Sarychev <42014488+sARY77@users.noreply.github.com>
Date: Mon, 27 Jan 2025 21:16:54 -0800
Subject: [PATCH 5/5] Address code review feedback

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index c895381ecae11..a0d6a54969c3d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -42,6 +42,7 @@
 #include <algorithm>
 #include <array>
 #include <atomic>
+#include <charconv>
 #include <cinttypes>
 #include <cstddef>
 #include <cstdint>
@@ -124,10 +125,20 @@ static ggml_cuda_device_info ggml_cuda_init() {
     // Workaround for a rocBLAS bug when using multiple graphics cards:
     // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
     {
-        char version_string[64];
-        version_string[0] = '\0';
-        const rocblas_status status = rocblas_get_version_string(version_string, sizeof(version_string));
-        if (status != rocblas_status_success || version_string[0] < '4') {
+        int major_version = 0;
+        size_t version_length = 0;
+        if (rocblas_get_version_string_size(&version_length) == rocblas_status_success) {
+            std::string version(version_length, '\0');
+            if (rocblas_get_version_string(version.data(), version.size()) == rocblas_status_success) {
+                version.resize(::strlen(version.c_str()));
+                int parsed_value = 0;
+                if (std::from_chars(version.c_str(), version.c_str() + version.length(), parsed_value).ec == std::errc()) {
+                    major_version = parsed_value;
+                }
+            }
+        }
+        if (major_version < 4) {
+            GGML_LOG_DEBUG(GGML_CUDA_NAME " calling rocblas_initialize as a workaround for a rocBLAS bug\n");
             rocblas_initialize();
             CUDA_CHECK(cudaDeviceSynchronize());
         }