From 97c0138d551655a028beeff80d6cf081d9cccbb7 Mon Sep 17 00:00:00 2001
From: Vishal Chourasia
Date: Tue, 6 May 2025 09:28:09 +0530
Subject: [PATCH] Move page cache via mbind to prevent cross-NUMA access
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On multi-node systems, page cache pages populated by a run of llama-bench
bound to one node remain resident on that node, incurring a cross-NUMA
memory access penalty for subsequent runs of llama-bench bound to a
different node.

This commit introduces an mbind() call, made on a best-effort basis, to
move the pages to the target node where llama-bench is executing,
ensuring optimal NUMA locality. Additionally, the necessary NUMA headers
are included and the build is updated to link against the NUMA library.

Experiments:

1. Run llama-bench on node 1 (base)
2. Run llama-bench on node 0 (regression observed)
3. Run patched llama-bench on node 0 (throughput same as base)

`+ /usr/bin/time -p numactl -N 1 -m 1 $llama-bench -m $models/llama-2-7b-chat.Q8_0.gguf -ngl 0 --prio 0 -b 1 -t 24`

| model                          |       size |     params | backend    | threads | n_batch |          test |                  t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | ------: | ------------: | -------------------: |
| llama 7B Q8_0                  |   6.67 GiB |     6.74 B | CPU        |      24 |       1 |         pp512 |          5.39 ± 0.01 |
| llama 7B Q8_0                  |   6.67 GiB |     6.74 B | CPU        |      24 |       1 |         tg128 |          5.49 ± 0.03 |

build: 35782aee (5014)

real 687.60
user 15653.73
sys 42.67

`+ /usr/bin/time -p numactl -N 0 -m 0 $llama-bench -m $models/llama-2-7b-chat.Q8_0.gguf -ngl 0 --prio 0 -b 1 -t 24`

| model                          |       size |     params | backend    | threads | n_batch |          test |                  t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | ------: | ------------: | -------------------: |
| llama 7B Q8_0                  |   6.67 GiB |     6.74 B | CPU        |      24 |       1 |         pp512 |          4.60 ± 0.01 |
| llama 7B Q8_0                  |   6.67 GiB |     6.74 B | CPU        |      24 |       1 |         tg128 |          4.67 ± 0.03 |

build: 35782aee (5014)

real 805.99
user 18187.26
sys 48.93

`+ /usr/bin/time -p numactl -N 0 -m 0 $patched-llama-bench -m $models/llama-2-7b-chat.Q8_0.gguf -ngl 0 --prio 0 -b 1 -t 24`

| model                          |       size |     params | backend    | threads | n_batch |          test |                  t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | ------: | ------------: | -------------------: |
| llama 7B Q8_0                  |   6.67 GiB |     6.74 B | CPU        |      24 |       1 |         pp512 |          5.35 ± 0.01 |
| llama 7B Q8_0                  |   6.67 GiB |     6.74 B | CPU        |      24 |       1 |         tg128 |          5.46 ± 0.02 |

build: 35782aee (5014)

real 696.12
user 15735.41
sys 44.08

Suggested-by: Ritesh Harjani (IBM)
Signed-off-by: Vishal Chourasia
---
 CMakeLists.txt     |  1 +
 src/CMakeLists.txt | 14 ++++++++++++++
 src/llama-mmap.cpp | 38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+)
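[ Note: this sketch sits below the diffstat, where `git am` ignores it. It is
a minimal standalone illustration of the technique the patch applies inside
llama_mmap: map a file, then bind its pages to the current CPU's node with
mbind(MPOL_MF_MOVE). The file argument, error handling, and build line
(`g++ -O2 demo.cpp -lnuma`) are illustrative assumptions, not code from this
patch. ]

```cpp
#include <numa.h>
#include <numaif.h>
#include <sched.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2 || numa_available() < 0) {
        fprintf(stderr, "usage: %s <file>  (requires a NUMA-capable system)\n", argv[0]);
        return 1;
    }
    int fd = open(argv[1], O_RDONLY);
    struct stat st;
    if (fd < 0 || fstat(fd, &st) != 0) {
        perror("open/fstat");
        return 1;
    }
    void * addr = mmap(nullptr, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    // Build a nodemask holding only the node of the CPU we are running on,
    // the same fallback the patch uses when the calling thread has no
    // explicit memory policy.
    struct bitmask * mask = numa_allocate_nodemask();
    int cpu  = sched_getcpu();
    int node = cpu >= 0 ? numa_node_of_cpu(cpu) : -1;
    if (node >= 0) {
        numa_bitmask_setbit(mask, node);
        // MPOL_MF_MOVE asks the kernel to migrate pages that are already
        // resident on other nodes; failure is non-fatal (best effort).
        if (mbind(addr, st.st_size, MPOL_BIND, mask->maskp, mask->size, MPOL_MF_MOVE) != 0) {
            perror("mbind");
        }
    }
    numa_free_nodemask(mask);

    munmap(addr, st.st_size);
    close(fd);
    return 0;
}
```

MPOL_MF_MOVE migrates only pages mapped exclusively by the calling process;
moving pages shared with other processes requires MPOL_MF_MOVE_ALL and
CAP_SYS_NICE, one reason the patch treats the call as best effort.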
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f73470dffd106..f3eb3ce14d11a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -83,6 +83,7 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
 
 # 3rd party libs
 option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
+option(LLAMA_NUMA "llama: use libnuma to move the model's page cache to the local NUMA node" ON)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
 
 # Required for relocatable CMake package
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 70be604e4b0d3..ca92b714750a8 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -42,6 +42,20 @@ target_compile_features (llama PRIVATE cxx_std_17) # don't bump
 
 target_link_libraries(llama PUBLIC ggml)
 
+if (LLAMA_NUMA)
+    include(CheckIncludeFileCXX)
+    find_library(NUMA_LIB numa)
+    check_include_file_cxx("numa.h" HAVE_NUMA_HEADERS)
+    check_include_file_cxx("numaif.h" HAVE_NUMAIF_HEADERS)
+    if (HAVE_NUMA_HEADERS AND HAVE_NUMAIF_HEADERS AND NUMA_LIB)
+        target_compile_definitions(llama PRIVATE USE_LIBNUMA)
+        target_link_libraries(llama PRIVATE numa)
+        message(STATUS "libnuma found; page cache will be moved to the local node using the mbind() syscall. Disable with LLAMA_NUMA=OFF")
+    else()
+        message(STATUS "Warning: libnuma or its headers not found; page cache migration is disabled. Silence this warning with LLAMA_NUMA=OFF")
+    endif()
+endif()
+
 if (BUILD_SHARED_LIBS)
     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(llama PRIVATE LLAMA_BUILD)
diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index 47497cf953fd3..c3d04fafe27fd 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -10,6 +10,12 @@
 #include <cerrno>
 #include <algorithm>
 
+#ifdef USE_LIBNUMA
+    #include <numa.h>
+    #include <numaif.h>
+    #include <sched.h>
+#endif
+
 #ifdef __has_include
     #if __has_include(<unistd.h>)
         #include <unistd.h>
@@ -273,6 +279,27 @@ struct llama_mmap::impl {
 #ifdef _POSIX_MAPPED_FILES
     std::vector<std::pair<size_t, size_t>> mapped_fragments;
 
+#ifdef USE_LIBNUMA
+    static void move_pages(void * addr, size_t size) {
+        int cpu, ret;
+        struct bitmask * nodemask = numa_allocate_nodemask();
+
+        /* Get the memory policy of the calling thread. */
+        ret = get_mempolicy(nullptr, nodemask->maskp, nodemask->size, nullptr, 0);
+        if (ret || numa_bitmask_weight(nodemask) == 0) {
+            cpu = sched_getcpu();
+            if (cpu >= 0) {
+                numa_bitmask_clearall(nodemask);
+                numa_bitmask_setbit(nodemask, numa_node_of_cpu(cpu));
+            }
+        }
+        if (numa_bitmask_weight(nodemask) == 1) {
+            mbind(addr, size, MPOL_BIND, nodemask->maskp, nodemask->size, MPOL_MF_MOVE);
+        }
+        numa_free_nodemask(nodemask);
+    }
+#endif
+
     impl(struct llama_file * file, size_t prefetch, bool numa) {
         size = file->size();
         int fd = file->file_id();
@@ -291,6 +318,17 @@ struct llama_mmap::impl {
         }
 
         if (prefetch > 0) {
+#ifdef USE_LIBNUMA
+            /*
+             * Given that we already pre-fault all memory when prefetch > 0, it is
+             * necessary to move any page cache pages that might have been
+             * instantiated during previous runs on different NUMA nodes. This call
+             * to move_pages() ensures that all memory-mapped pages are relocated
+             * according to the calling thread's memory policy or the CPU on which
+             * it is running.
+             */
+            move_pages(addr, file->size());
+#endif
             if (posix_madvise(addr, std::min(file->size(), prefetch), POSIX_MADV_WILLNEED)) {
                 LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
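A hedged way to verify the migration (not part of this patch; the file name
`check_nodes.cpp` and the helper `print_node_histogram` are invented for
illustration): sample the backing node of each mapped page with libnuma's
numa_move_pages() in status-only mode, where a null target-node list reports
each page's current node without moving anything. Build with something like
`g++ -O2 check_nodes.cpp -lnuma`.

```cpp
#include <numa.h>
#include <numaif.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>
#include <vector>

// Print how many pages of [addr, addr + size) reside on each NUMA node.
static void print_node_histogram(void * addr, size_t size) {
    const long   page   = sysconf(_SC_PAGESIZE);
    const size_t npages = (size + page - 1) / page;

    std::vector<void *> pages(npages);
    std::vector<int>    status(npages);
    for (size_t i = 0; i < npages; i++) {
        pages[i] = (char *) addr + i * page;
    }

    // A null target-node list turns numa_move_pages() into a pure query:
    // status[i] receives the backing node, or a negative errno if the
    // page is not resident.
    if (numa_move_pages(0, npages, pages.data(), nullptr, status.data(), 0) != 0) {
        perror("numa_move_pages");
        return;
    }

    std::vector<size_t> per_node(numa_max_node() + 1, 0);
    size_t not_resident = 0;
    for (size_t i = 0; i < npages; i++) {
        if (status[i] >= 0 && status[i] < (int) per_node.size()) {
            per_node[status[i]]++;
        } else {
            not_resident++;
        }
    }
    for (int n = 0; n <= numa_max_node(); n++) {
        printf("node %d: %zu pages\n", n, per_node[n]);
    }
    printf("not resident: %zu pages\n", not_resident);
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }
    int fd = open(argv[1], O_RDONLY);
    struct stat st;
    if (fd < 0 || fstat(fd, &st) != 0) {
        perror("open/fstat");
        return 1;
    }
    // MAP_POPULATE pre-faults the mapping so the pages are resident,
    // similar to the prefetch path in llama_mmap.
    void * addr = mmap(nullptr, st.st_size, PROT_READ, MAP_SHARED | MAP_POPULATE, fd, 0);
    if (addr == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    print_node_histogram(addr, (size_t) st.st_size);
    return 0;
}
```

Run against the model file after a node-bound benchmark, this should show the
pages still pinned to the previous node on an unpatched build, and following
the bound node once the patch's move_pages() has run.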