8000 Introduce New Lookup-Table(LUT)-Based Matrix Multiplication Method (TMAC) by QingtaoLi1 · Pull Request #13206 · ggml-org/llama.cpp · GitHub
[go: up one dir, main page]

Skip to content

Introduce New Lookup-Table (LUT)-Based Matrix Multiplication Method (TMAC) #13206

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 20 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "llama.h"

#include <algorithm>
#include <chrono>
#include <cinttypes>
#include <climits>
#include <cmath>
Expand Down
292 changes: 276 additions & 16 deletions convert_hf_to_gguf.py

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,8 @@ set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
# toolchain for vulkan-shaders-gen
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")

option(GGML_TMAC "ggml: use TMAC" OFF)

# extra artifacts
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
Expand All @@ -217,6 +219,9 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
#

set(CMAKE_C_STANDARD 11)
if (GGML_TMAC)
set(CMAKE_C_STANDARD 17)
endif()
set(CMAKE_C_STANDARD_REQUIRED true)

set(CMAKE_CXX_STANDARD 17)
Expand Down
6 changes: 4 additions & 2 deletions ggml/include/ggml-cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ extern "C" {
GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_atomic_store_explicit(struct ggml_threadpool * threadpool, int value);
GGML_BACKEND_API int ggml_threadpool_atomic_fetch_add_explicit(struct ggml_threadpool * threadpool, int value);

// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
Expand Down Expand Up @@ -120,12 +122,12 @@ extern "C" {

GGML_BACKEND_API void ggml_cpu_init(void);

GGML_BACKEND_API void ggml_cpu_tmac_init(const char * fname);

//
// CPU backend
//

GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);

GGML_BACKEND_API bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_BACKEND_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
Expand Down
11 changes: 10 additions & 1 deletion ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,16 @@ extern "C" {
// GGML_TYPE_IQ4_NL_4_4 = 36,
// GGML_TYPE_IQ4_NL_4_8 = 37,
// GGML_TYPE_IQ4_NL_8_8 = 38,
GGML_TYPE_COUNT = 39,
GGML_TYPE_TMAC_BN_0 = 39,
GGML_TYPE_TMAC_W2G64_0 = 40,
GGML_TYPE_TMAC_W2G64_1 = 41,
GGML_TYPE_TMAC_W2G128_0 = 42,
GGML_TYPE_TMAC_W2G128_1 = 43,
GGML_TYPE_TMAC_W4G64_0 = 44,
GGML_TYPE_TMAC_W4G64_1 = 45,
GGML_TYPE_TMAC_W4G128_0 = 46,
GGML_TYPE_TMAC_W4G128_1 = 47,
GGML_TYPE_COUNT = 48,
};

// precision
Expand Down
1 change: 1 addition & 0 deletions ggml/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ add_library(ggml-base
ggml.c
ggml-alloc.c
ggml-backend.cpp
ggml-common.h
ggml-opt.cpp
ggml-threading.cpp
ggml-threading.h
Expand Down
47 changes: 46 additions & 1 deletion ggml/src/ggml-cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
ggml-cpu/amx/amx.h
ggml-cpu/amx/mmq.cpp
ggml-cpu/amx/mmq.h
ggml-cpu/tmac/tmac.cpp
ggml-cpu/tmac/tmac.h
ggml-cpu/tmac/lut_mul_mat.cpp
ggml-cpu/tmac/lut_mul_mat.h
ggml-cpu/tmac/lut_ctor.cpp
ggml-cpu/tmac/lut_ctor.h
ggml-cpu/tmac/tbl.cpp
ggml-cpu/tmac/tbl.h
ggml-cpu/ggml-cpu-impl.h
ggml-cpu/common.h
ggml-cpu/binary-ops.h
Expand Down Expand Up @@ -72,6 +80,22 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
ggml-cpu/llamafile/sgemm.h)
endif()

if (GGML_TMAC)
target_compile_definitions(${GGML_CPU_NAME} PUBLIC GGML_USE_TMAC)
target_include_directories(${GGML_CPU_NAME} PUBLIC ggml-cpu/tmac)
get_target_property(cdefs ${GGML_CPU_NAME} COMPILE_DEFINITIONS)
message(STATUS "GGML_CPU_NAME: ${GGML_CPU_NAME} COMPILE_DEFINITIONS: ${cdefs}")

if ((NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") OR
(NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang"))
message(FATAL_ERROR "Clang is required for T-MAC compilation")
endif()

if (GGML_TMAC_RECHUNK)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE TMAC_RECHUNK)
endif()
endif()

if (GGML_CPU_HBM)
find_library(memkind memkind REQUIRED)

Expand Down Expand Up @@ -145,6 +169,12 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
endif()
endif()
if (GGML_TMAC)
# ARM Windows with LLVM clang GNU interface
# We need fullfp16 for T-MAC
# TODO: check_cxx_source_compiles
list(APPEND ARCH_FLAGS -march=armv8.2a+fp16)
endif()

# show enabled features
if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
Expand Down Expand Up @@ -181,7 +211,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
if (GGML_NATIVE)
include(ggml-cpu/cmake/FindSIMD.cmake)
endif ()
if (GGML_AVX512)
# Can't use GGML_AVX512 with T-MAC and Clang for MSVC
# with error: conflicting types for '_m_prefetchw'
if (GGML_AVX512 AND (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") AND (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang"))
list(APPEND ARCH_FLAGS /arch:AVX512)
# /arch:AVX512 includes: __AVX512F__, __AVX512CD__, __AVX512BW__, __AVX512DQ__, and __AVX512VL__
# MSVC has no compile-time flags enabling specific
Expand Down Expand Up @@ -323,6 +355,19 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64" AND GGML_TMAC)
# We need fullfp16 for T-MAC
# TODO: we need to simplify this logic through check_cxx_source_compiles or Presets?
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
# Device with armv8.7a+ cpu, e.g., WSL on Surface Laptop 7
# based on arm64-windows-llvm.cmake
list(APPEND ARCH_FLAGS -march=armv8.7-a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only)
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
else ()
# Jetson AGX Orin, Raspberry Pi 5
list(APPEND ARCH_FLAGS -march=armv8.2a+fp16)
endif ()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
message(STATUS "loongarch64 detected")

Expand Down
61 changes: 61 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@
#include "llamafile/sgemm.h"
#endif

#ifdef GGML_USE_TMAC
#include "tmac.h"
#endif

#if defined(_MSC_VER)
// disable "possible loss of data" to avoid hundreds of casts
// we should just be careful :)
Expand Down Expand Up @@ -373,6 +377,51 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_TMAC_BN_0] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TMAC_W2G64_0] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TMAC_W2G64_1] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TMAC_W2G128_0] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TMAC_W2G128_1] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TMAC_W4G64_0] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TMAC_W4G64_1] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TMAC_W4G128_0] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
[GGML_TYPE_TMAC_W4G128_1] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
.vec_dot_type = GGML_TYPE_F32,
.nrows = 1,
},
};

const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
Expand Down Expand Up @@ -2639,6 +2688,14 @@ void ggml_threadpool_resume(struct ggml_threadpool * threadpool) {
#endif
}

// Store `value` into the threadpool's shared `current_chunk` counter with a
// relaxed (non-synchronizing) atomic write.
// NOTE(review): presumably exposed so out-of-file kernels (the new T-MAC mat-mul
// path) can reset the work-chunk cursor without touching threadpool internals —
// confirm against the call sites introduced by this PR.
void ggml_threadpool_atomic_store_explicit(struct ggml_threadpool * threadpool, int value) {
// memory_order_relaxed: only atomicity is needed here, no ordering with other
// memory operations is implied by this store.
atomic_store_explicit(&threadpool->current_chunk, value, memory_order_relaxed);
}

// Atomically add `value` to the threadpool's shared `current_chunk` counter and
// return the counter's PREVIOUS value (fetch-and-add semantics), using relaxed
// memory ordering.
// NOTE(review): this is the usual shape of a work-stealing "claim the next chunk"
// primitive; presumably each worker calls it to reserve a chunk index — confirm
// at the call sites introduced by this PR.
int ggml_threadpool_atomic_fetch_add_explicit(struct ggml_threadpool * threadpool, int value) {
// Cast: atomic_fetch_add_explicit returns the underlying atomic's value type;
// the public API deliberately exposes a plain int.
return (int)atomic_fetch_add_explicit(&threadpool->current_chunk, value, memory_order_relaxed);
}

struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads,
Expand Down Expand Up @@ -3406,6 +3463,10 @@ void ggml_cpu_init(void) {
ggml_init_arm_arch_features();
#endif

#ifdef GGML_USE_TMAC
ggml_tmac_init();
#endif

is_first_call = false;
}

Expand Down
7 changes: 7 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "ggml-cpu-traits.h"
#include "ggml-impl.h"
#include "amx/amx.h"
#include "tmac/tmac.h"

#include <cctype>
#include <string>
Expand Down Expand Up @@ -43,6 +44,12 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
}
#endif

#ifdef GGML_USE_TMAC
if (ggml_backend_tmac_buffer_type()) {
bufts.push_back(ggml_backend_tmac_buffer_type());
}
#endif

#ifdef GGML_USE_CPU_KLEIDIAI
if (ggml_backend_cpu_kleidiai_buffer_type()) {
bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type());
Expand Down
9 changes: 9 additions & 0 deletions ggml/src/ggml-cpu/ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4978,6 +4978,15 @@ void ggml_compute_forward_clamp(
case GGML_TYPE_I32:
case GGML_TYPE_I64:
case GGML_TYPE_F64:
case GGML_TYPE_TMAC_BN_0:
case GGML_TYPE_TMAC_W2G64_0:
case GGML_TYPE_TMAC_W2G64_1:
case GGML_TYPE_TMAC_W2G128_0:
case GGML_TYPE_TMAC_W2G128_1:
case GGML_TYPE_TMAC_W4G64_0:
case GGML_TYPE_TMAC_W4G64_1:
case GGML_TYPE_TMAC_W4G128_0:
case GGML_TYPE_TMAC_W4G128_1:
case GGML_TYPE_COUNT:
{
GGML_ABORT("fatal error");
Expand Down
Loading
0