Introduce New Lookup-Table (LUT)-Based Matrix Multiplication Method (TMAC) by QingtaoLi1 · Pull Request #13206 · ggml-org/llama.cpp · GitHub

Open · QingtaoLi1 wants to merge 21 commits into base: master
Changes from 1 commit
Bitnet correct.
QingtaoLi1 committed May 13, 2025
commit 11a099fe61a3d10c02462ee0990d5f9b89f08f2f
6 changes: 4 additions & 2 deletions convert_hf_to_gguf.py
@@ -299,7 +299,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
     # Repack and merge qweight, scales, and qzeros into a single tensor
     # Currently, this logic is nearly impossible to be implemented in quants.py
     def _modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if not self.enable_t_mac:
+        if not self.enable_t_mac or isinstance(self, BitnetModel):
             return self.modify_tensors(data_torch, name, bid)
 
         self._t_mac_raw_shape = None  # reset to make sure old values don't leak into new tensors case
@@ -2270,6 +2270,7 @@ def weight_quant(self, weight: Tensor) -> Tensor:
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         new_name = self.map_tensor_name(name)
 
+        self._t_mac_raw_shape = None
         if any(self.match_model_tensor_name(new_name, key, bid) for key in [
             gguf.MODEL_TENSOR.ATTN_Q,
             gguf.MODEL_TENSOR.ATTN_K,
@@ -2291,7 +2292,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 w = np.round(data / scale + 2).astype(np.uint8)
                 data_torch = torch.from_numpy(preprocess_for_t_mac(w, scale.reshape(1), bits=2))
                 self.quantization_config["bits"] = 2
-                # self.quantization_config["group_size"] = 256
+                self.quantization_config["group_size"] = -1
                 self.quantization_config["sym"] = True
                 self.quantization_config["quant_method"] = "bitnet"
                 self._t_mac_raw_shape = w.shape
@@ -5632,6 +5633,7 @@ class LazyTorchTensor(gguf.LazyBase):
     _dtype_map: dict[torch.dtype, type] = {
         torch.float16: np.float16,
         torch.float32: np.float32,
+        torch.bfloat16: np.float32,
     }
 
     # used for safetensors slices
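Note for reviewers: the `group_size = -1` change above marks BitNet weights as carrying a single per-tensor scale. A minimal NumPy sketch of the 2-bit encoding step the diff performs (the `preprocess_for_t_mac` repacking itself is omitted; the tensor values here are illustrative):

```python
import numpy as np

# Illustrative BitNet b1.58-style weight tensor: ternary multiples of one
# per-tensor scale (group_size = -1 means a single scale for the tensor).
scale = np.float32(0.04)
data = scale * np.array([[-1.0, 0.0, 1.0],
                         [ 1.0, 1.0, -1.0]], dtype=np.float32)

# Same mapping as in the diff: data / scale lands in {-1, 0, +1};
# adding 2 shifts it to {1, 2, 3}, which fits in 2 bits.
w = np.round(data / scale + 2).astype(np.uint8)

assert w.min() >= 0 and w.max() <= 3
print(w)  # [[1 2 3]
          #  [3 3 1]]
```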
21 changes: 19 additions & 2 deletions ggml/src/ggml-cpu/tmac/lut_mul_mat.cpp
@@ -231,7 +231,7 @@ static void aligned_free(void * ptr) {
 
 
 /****** T-MAC meta model info ******/
-static void init_tmac_kernel_config_from_tensor_type(enum ggml_type type, struct tmac_kernel_config * kernel_config) {
+static void init_tmac_kernel_config_from_tensor_type(enum ggml_type type, int M, struct tmac_kernel_config * kernel_config) {
     kernel_config->bits = get_type_bits(type);
     kernel_config->q_group_size = get_type_group_size(type);
     kernel_config->has_zero_point = get_type_has_zero_point(type);
@@ -241,6 +241,22 @@ static void init_tmac_kernel_config_from_tensor_type(enum ggml_type type, struct
     kernel_config->has_scale = true;
     kernel_config->g = 4;
     kernel_config->ngroups_per_elem = 8 / kernel_config->g;
+
+    // Decide q_group_size for BN_0
+    if (kernel_config->q_group_size == -1) {
+        if (M % 256 == 0) {
+            kernel_config->q_group_size = 64;
+        } else if (M % 128 == 0) {
+            kernel_config->q_group_size = 64;
+        } else if (M % 64 == 0) {
+            kernel_config->q_group_size = 64;
+        } else if (M % 32 == 0) {
+            kernel_config->q_group_size = 32;
+        } else {
+            GGML_LOG_ERROR("Unsupported M value. Expected multiple of 32, got %d. Please check all of the model weight shapes.\n", M);
+        }
+    }
+
     if (kernel_config->q_group_size % 64 == 0) {
         kernel_config->act_group_size = 64;
     } else if (kernel_config->q_group_size % 32 == 0) {
@@ -377,7 +393,7 @@ static void ggml_tmac_tune_kernel_config(const struct ggml_tensor * tensor, int
     }
 
     struct tmac_kernel_config kernel_config;
-    init_tmac_kernel_config_from_tensor_type(tensor->type, &kernel_config);
+    init_tmac_kernel_config_from_tensor_type(tensor->type, M, &kernel_config);
 
     // TODO: add more choices for prefilling?
     int N = 1;
@@ -480,6 +496,7 @@ size_t ggml_tmac_get_nbytes(const struct ggml_tensor * tensor) {
     const int scales_size = ggml_tmac_get_scales_size(kernel_config, m, k);
     // Currently, always uses float to store scales or zero points
     size_t nbytes = k * m / 8 * bits + scales_size * sizeof(float);
+    nbytes = GGML_PAD(nbytes, GGUF_DEFAULT_ALIGNMENT);
     // printf("ggml_tmac_get_nbytes: %s --- k=%d, m=%d, w=%d, sc=%d, nbytes: %zu\n", tensor->name, k, m, k * m / 8 * bits, scales_size, nbytes);
     return nbytes;
 }
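Side note on the new `q_group_size` selection: all three `M % 256 / 128 / 64` branches currently pick 64, so in effect only multiples of 32 that are not multiples of 64 get the smaller group. A hedged Python rendering of the decision (simplified; the tail of the `act_group_size` chain is cut off in the diff, so the fallback below is an assumption):

```python
def decide_group_sizes(M: int, q_group_size: int = -1) -> tuple[int, int]:
    """Sketch of the BN_0 group-size decision added above.

    q_group_size == -1 encodes "per-tensor scale"; the kernel still needs a
    concrete group size, derived from the weight dimension M.
    """
    if q_group_size == -1:
        if M % 64 == 0:    # covers the 256/128/64 branches, all of which pick 64
            q_group_size = 64
        elif M % 32 == 0:
            q_group_size = 32
        else:
            # The C code logs GGML_LOG_ERROR here rather than raising.
            raise ValueError(f"Unsupported M value. Expected multiple of 32, got {M}.")

    # Activation group size follows the weight group size.
    act_group_size = 64 if q_group_size % 64 == 0 else 32
    return q_group_size, act_group_size

print(decide_group_sizes(4096))  # (64, 64)
print(decide_group_sizes(96))    # (32, 32)
```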
10 changes: 8 additions & 2 deletions ggml/src/ggml.c
@@ -572,8 +572,8 @@ static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf
 static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_TMAC_BN_0] = {
         .type_name = "tmac_bn_0",
-        .blck_size = 256,
-        .type_size = 4 + 256 * 2 / 8,
+        .blck_size = 64,
+        .type_size = 64 * 2 / 8,
         .is_quantized = false,
     },
     [GGML_TYPE_TMAC_W2G64_0] = {
@@ -1224,6 +1224,12 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
         }
     }
 
+    if (tensor->type == GGML_TYPE_TMAC_BN_0) {
+        // One scale will not exceed one alignment boundary, so we can just add one alignment to the size.
+        nbytes += GGUF_DEFAULT_ALIGNMENT;
+    }
+
+
     return nbytes;
 }
 
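On the size accounting: with the new traits, `ggml_nbytes` counts only the packed 2-bit weights for TMAC_BN_0, and the extra alignment added above leaves room for the per-tensor float scale, which by the comment's reasoning never spans more than one alignment boundary. Worked arithmetic, assuming gguf's usual GGUF_DEFAULT_ALIGNMENT of 32 bytes:

```python
GGUF_DEFAULT_ALIGNMENT = 32  # gguf's default alignment (assumed here)

def tmac_bn_0_nbytes(ne0: int, ne1: int) -> int:
    blck_size = 64
    type_size = 64 * 2 // 8  # 16 bytes of packed 2-bit weights per block
    nbytes = ne0 * ne1 // blck_size * type_size
    # The scale is not part of type_size; one extra alignment covers it.
    return nbytes + GGUF_DEFAULT_ALIGNMENT

# A 4096x4096 ternary matrix: 4 MiB of packed weights plus 32 spare bytes,
# matching GGML_PAD(4 MiB + 4, 32) from ggml_tmac_get_nbytes above.
print(tmac_bn_0_nbytes(4096, 4096))  # 4194336
```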
2 changes: 1 addition & 1 deletion gguf-py/gguf/constants.py
@@ -2036,7 +2036,7 @@ def get_type(val: Any) -> GGUFValueType:
     # - So the size is slightly smaller than the real size
     # - The n_bytes in gguf_reader.py is thus inaccurate
     # - During inference, the accurate nbytes info will be known through ggml_tmac_get_nbytes
-    GGMLQuantizationType.TMAC_BN_0: (256, 4 + 256 * 2 // 8),
+    GGMLQuantizationType.TMAC_BN_0: (64, 64 * 2 // 8),
     GGMLQuantizationType.TMAC_W2G64_0: (64, 4 + 64 * 2 // 8),
     GGMLQuantizationType.TMAC_W2G64_1: (64, 4 + 4 + 64 * 2 // 8),
     GGMLQuantizationType.TMAC_W2G128_0: (128, 4 + 128 * 2 // 8),
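For reference, the effective bits per weight that these (block_size, bytes_per_block) tuples encode; the snippet below is just the table above re-done in plain Python:

```python
# (elements per block, bytes per block), matching the entries above.
TMAC_QUANT_SIZES = {
    "TMAC_BN_0":     (64,  64 * 2 // 8),          # 16 B: 2-bit weights only
    "TMAC_W2G64_0":  (64,  4 + 64 * 2 // 8),      # 20 B: f32 scale + weights
    "TMAC_W2G64_1":  (64,  4 + 4 + 64 * 2 // 8),  # 24 B: scale + zero point
    "TMAC_W2G128_0": (128, 4 + 128 * 2 // 8),     # 36 B: f32 scale + weights
}

for name, (blck, nbytes) in TMAC_QUANT_SIZES.items():
    print(f"{name}: {8 * nbytes / blck:.2f} bits/weight")
# TMAC_BN_0 comes out at exactly 2.00 bits/weight; its per-tensor scale is
# accounted for separately at load time via ggml_tmac_get_nbytes.
```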