diff --git a/CMakeLists.txt b/CMakeLists.txt
index ac3e9090336d9..715adb13e3120 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -83,6 +83,7 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
 
 # 3rd party libs
 option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
+option(LLAMA_NUMA "llama: use libnuma to get memory policy of the llama-bench" ON)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
 
 # Required for relocatable CMake package
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d4bf37b1cf3e5..17079c7f25c47 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -39,6 +39,19 @@ target_compile_features (llama PRIVATE cxx_std_17) # don't bump
 
 target_link_libraries(llama PUBLIC ggml)
 
+if (LLAMA_NUMA)
+    find_library(NUMA_LIB numa)
+    check_include_file_cxx("numa.h" HAVE_NUMA_HEADERS)
+    check_include_file_cxx("numaif.h" HAVE_NUMAIF_HEADERS)
+    if (HAVE_NUMA_HEADERS AND HAVE_NUMAIF_HEADERS AND NUMA_LIB)
+        target_compile_definitions(llama PRIVATE USE_LIBNUMA)
+        target_link_libraries(llama PRIVATE numa)
+        message(STATUS "libnuma found, page cache will be moved to the local node using mbind() syscall. Disable with LLAMA_NUMA=OFF")
+    else()
+        message(STATUS "Warning: NUMA headers not found - consider disabling this Warning with LLAMA_NUMA=OFF")
+    endif()
+endif()
+
 if (BUILD_SHARED_LIBS)
     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(llama PRIVATE LLAMA_BUILD)
diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index 9da97f1bc5057..44dd861423108 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -10,6 +10,12 @@
 #include
 #include
 
+#ifdef USE_LIBNUMA
+    #include <numa.h>
+    #include <numaif.h>
+    #include <sched.h>
+#endif
+
 #ifdef __has_include
     #if __has_include()
         #include
     #endif
@@ -273,6 +279,27 @@ struct llama_mmap::impl {
 #ifdef _POSIX_MAPPED_FILES
     std::vector<std::pair<size_t, size_t>> mapped_fragments;
 
+#ifdef USE_LIBNUMA
+    static void move_pages(void *addr, size_t size) {
+        int cpu, ret;
+        struct bitmask *nodemask = numa_allocate_nodemask();
+
+        /* Get memory policy of the calling thread. */
+        ret = get_mempolicy(nullptr, nodemask->maskp, nodemask->size, nullptr, 0);
+        if (ret || numa_bitmask_weight(nodemask) == 0) {
+            cpu = sched_getcpu();
+            if (cpu >= 0) {
+                numa_bitmask_clearall(nodemask);
+                numa_bitmask_setbit(nodemask, numa_node_of_cpu(cpu));
+            }
+        }
+        if (numa_bitmask_weight(nodemask) == 1) {
+            mbind(addr, size, MPOL_BIND, nodemask->maskp, nodemask->size, MPOL_MF_MOVE);
+        }
+        numa_free_nodemask(nodemask);
+    }
+#endif
+
     impl(struct llama_file * file, size_t prefetch, bool numa) {
         size = file->size();
         int fd = file->file_id();
@@ -291,6 +318,17 @@ struct llama_mmap::impl {
         }
 
         if (prefetch > 0) {
+#ifdef USE_LIBNUMA
+            /*
+             * Given that we already pre-fault all memory when prefetch > 0, it is
+             * necessary to move any page cache pages that might have been
+             * instantiated during previous runs on different NUMA nodes. This call
+             * to move_pages() ensures that all memory-mapped pages are relocated
+             * according to the calling thread's memory policy or the CPU on which
+             * it is running.
+             */
+            move_pages(addr, file->size());
+#endif
             if (posix_madvise(addr, std::min(file->size(), prefetch), POSIX_MADV_WILLNEED)) {
                 LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
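
For experimenting with the mbind()-based page migration outside of llama.cpp, here is a minimal, self-contained sketch of the same technique. It is not part of the patch: the 64 MiB anonymous mapping and the file name numa_bind.cpp are illustrative stand-ins for the model file mapping handled in llama-mmap.cpp. It assumes a Linux host with the libnuma development headers installed and builds with `g++ numa_bind.cpp -o numa_bind -lnuma`.

```cpp
// Illustrative sketch: bind an mmap()'d region to the NUMA node of the current
// CPU using mbind(MPOL_BIND, ..., MPOL_MF_MOVE), mirroring move_pages() above.
#include <cstdio>
#include <cstring>
#include <sched.h>
#include <sys/mman.h>
#include <numa.h>
#include <numaif.h>

int main() {
    if (numa_available() < 0) {
        fprintf(stderr, "NUMA is not available on this system\n");
        return 1;
    }

    const size_t size = 64 * 1024 * 1024; // stand-in for a model mapping
    void * addr = mmap(nullptr, size, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (addr == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    memset(addr, 0, size); // fault the pages in so there is something to move

    struct bitmask * nodemask = numa_allocate_nodemask();

    // Prefer the calling thread's memory policy; if none is set, fall back to
    // the node of the CPU the thread is currently running on.
    if (get_mempolicy(nullptr, nodemask->maskp, nodemask->size, nullptr, 0) != 0 ||
        numa_bitmask_weight(nodemask) == 0) {
        int cpu = sched_getcpu();
        if (cpu >= 0) {
            numa_bitmask_clearall(nodemask);
            numa_bitmask_setbit(nodemask, numa_node_of_cpu(cpu));
        }
    }

    // Only bind when the mask resolves to exactly one node, as in the patch.
    if (numa_bitmask_weight(nodemask) == 1) {
        if (mbind(addr, size, MPOL_BIND, nodemask->maskp, nodemask->size, MPOL_MF_MOVE) != 0) {
            perror("mbind");
        }
    }

    numa_free_nodemask(nodemask);
    munmap(addr, size);
    return 0;
}
```

Note that MPOL_MF_MOVE only migrates pages that are mapped exclusively by the calling process, so pages shared with other processes stay where they are; binding only when the policy resolves to a single node keeps the behaviour conservative when the thread is already allowed to allocate from several nodes.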