From 22d51d4035fda968a9b3498a557052141b6b4634 Mon Sep 17 00:00:00 2001 From: Xiaodong Ye Date: Mon, 25 Nov 2024 18:47:58 +0800 Subject: [PATCH 1/4] Fix typo Signed-off-by: Xiaodong Ye --- docker/cuda_simple/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/cuda_simple/Dockerfile b/docker/cuda_simple/Dockerfile index 0bbf20ffe..eb55e89d2 100644 --- a/docker/cuda_simple/Dockerfile +++ b/docker/cuda_simple/Dockerfile @@ -17,7 +17,7 @@ COPY . . ENV CUDA_DOCKER_ARCH=all ENV GGML_CUDA=1 -# Install depencencies +# Install dependencies RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context # Install llama-cpp-python (build with cuda) From d4558170fe731571473220fe00d31bf58e30b330 Mon Sep 17 00:00:00 2001 From: Xiaodong Ye Date: Mon, 25 Nov 2024 18:49:52 +0800 Subject: [PATCH 2/4] Add musa_simple Dockerfile for supporting Moore Threads GPU Signed-off-by: Xiaodong Ye --- docker/README.md | 23 +++++++++++++++++++---- docker/musa_simple/Dockerfile | 27 +++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) create mode 100644 docker/musa_simple/Dockerfile diff --git a/docker/README.md b/docker/README.md index 474503fdf..52544d42b 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,5 +1,5 @@ ### Install Docker Server -> [!IMPORTANT] +> [!IMPORTANT] > This was tested with Docker running on Linux.
If you can get it working on Windows or MacOS, please update this `README.md` with a PR!
[Install Docker Engine](https://docs.docker.com/engine/install) @@ -16,7 +16,7 @@ docker run --cap-add SYS_RESOURCE -e USE_MLOCK=0 -e MODEL=/var/model//` is the full path to the model file on the Docker host system. ### cuda_simple -> [!WARNING] +> [!WARNING] > Nvidia GPU CuBLAS support requires an Nvidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker Nvidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
A simple Dockerfile for CUDA-accelerated CuBLAS, where the model is located outside the Docker image: @@ -30,6 +30,21 @@ where `/` is the full path to the model file on the -------------------------------------------------------------------------- +### musa_simple +> [!WARNING] +> Moore Threads GPU MuBLAS support requires an MTT GPU with sufficient VRAM (approximately as much as the size in the table below) and MT CloudNative Toolkits support (see [download](https://developer.mthreads.com/sdk/download/CloudNative))
+ +A simple Dockerfile for MUSA-accelerated MuBLAS, where the model is located outside the Docker image: + +``` +cd ./musa_simple +docker build -t musa_simple . +docker run --cap-add SYS_RESOURCE -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t musa_simple +``` +where `/` is the full path to the model file on the Docker host system. + +-------------------------------------------------------------------------- + ### "Open-Llama-in-a-box" Download an Apache V2.0 licensed 3B params Open LLaMA model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server: ``` @@ -47,7 +62,7 @@ docker $ ls -lh *.bin lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> q5_1.bin ``` -> [!NOTE] +> [!NOTE] > Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least **TWICE** as much disk space as the size of the model:
@@ -60,5 +75,5 @@ lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> q5_ | 65B | 50 GB | -> [!NOTE] +> [!NOTE] > If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...` diff --git a/docker/musa_simple/Dockerfile b/docker/musa_simple/Dockerfile new file mode 100644 index 000000000..2856ae5f3 --- /dev/null +++ b/docker/musa_simple/Dockerfile @@ -0,0 +1,27 @@ +ARG MUSA_IMAGE="rc3.1.0-devel-ubuntu22.04" +FROM mthreads/musa:${MUSA_IMAGE} + +# We need to set the host to 0.0.0.0 to allow outside access +ENV HOST 0.0.0.0 + +RUN apt-get update && apt-get upgrade -y \ + && apt-get install -y git build-essential \ + python3 python3-pip gcc wget \ + ocl-icd-opencl-dev opencl-headers clinfo \ + libclblast-dev libopenblas-dev \ + && mkdir -p /etc/OpenCL/vendors && cp /driver/etc/OpenCL/vendors/MT.icd /etc/OpenCL/vendors/MT.icd + +COPY . . + +# setting build related env vars +ENV MUSA_DOCKER_ARCH=all +ENV GGML_MUSA=1 + +# Install dependencies +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context + +# Install llama-cpp-python (build with musa) +RUN CMAKE_ARGS="-DGGML_MUSA=on" pip install llama-cpp-python + +# Run the server +CMD python3 -m llama_cpp.server From 0a8b764d39e8c549a29c08f7763aadb4fc971c33 Mon Sep 17 00:00:00 2001 From: Xiaodong Ye Date: Tue, 26 Nov 2024 18:26:54 +0800 Subject: [PATCH 3/4] README.md: Add MUSA as supported backend Signed-off-by: Xiaodong Ye --- README.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index dbaec5077..0dde5f5e9 100644 --- a/README.md +++ b/README.md @@ -200,7 +200,7 @@ CMAKE_ARGS="-DGGML_VULKAN=on" pip install llama-cpp-python To install with SYCL support, set the `GGML_SYCL=on` environment variable before installing: ```bash -source /opt/intel/oneapi/setvars.sh +source /opt/intel/oneapi/setvars.sh CMAKE_ARGS="-DGGML_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pip install llama-cpp-python ``` @@ -211,11 +211,20 @@ CMAKE_ARGS="-DGGML_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pip To install with RPC support, set the `GGML_RPC=on` environment variable before installing: ```bash -source /opt/intel/oneapi/setvars.sh +source /opt/intel/oneapi/setvars.sh CMAKE_ARGS="-DGGML_RPC=on" pip install llama-cpp-python ``` +
+<details>
+<summary>MUSA</summary>
+
+To install with MUSA support, set the `GGML_MUSA=on` environment variable before installing:
+
+```bash
+CMAKE_ARGS="-DGGML_MUSA=on" pip install llama-cpp-python
+```
+</details>
+
### Windows Notes From b69902db6e9ec46b3d5b68d6efa32a6cd3575044 Mon Sep 17 00:00:00 2001 From: Xiaodong Ye Date: Tue, 26 Nov 2024 21:07:43 +0800 Subject: [PATCH 4/4] Set MUSA_DOCKER_ARCH=default Signed-off-by: Xiaodong Ye --- docker/musa_simple/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/musa_simple/Dockerfile b/docker/musa_simple/Dockerfile index 2856ae5f3..44d6fa435 100644 --- a/docker/musa_simple/Dockerfile +++ b/docker/musa_simple/Dockerfile @@ -14,7 +14,7 @@ RUN apt-get update && apt-get upgrade -y \ COPY . . # setting build related env vars -ENV MUSA_DOCKER_ARCH=all +ENV MUSA_DOCKER_ARCH=default ENV GGML_MUSA=1 # Install dependencies
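
Below is a minimal end-to-end sketch for exercising the new `musa_simple` image once the patches above are applied. The port mapping, the `/path/to/models` host directory, the `model.gguf` filename, and the `curl` check are illustrative assumptions, not part of the patches; only the build/run pattern mirrors `docker/README.md`.

```bash
# Build the MUSA-enabled image from the new Dockerfile (run from the repository root)
cd docker/musa_simple
docker build -t musa_simple .

# Run the server; -p 8000:8000 assumes llama_cpp.server's default port,
# and the model path/filename are placeholders for a real GGUF model on the host
docker run --cap-add SYS_RESOURCE -e USE_MLOCK=0 \
  -e MODEL=/var/model/model.gguf \
  -v /path/to/models:/var/model \
  -p 8000:8000 -t musa_simple

# From the host, the OpenAI-compatible API should respond once the model has loaded
curl http://localhost:8000/v1/models
```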