Introduce New Lookup-Table (LUT)-Based Matrix Multiplication Method (TMAC) by QingtaoLi1 · Pull Request #13206 · ggml-org/llama.cpp · GitHub

Open · QingtaoLi1 wants to merge 21 commits into base: master
Changes from 1 commit
Bitnet correct.
QingtaoLi1 committed May 13, 2025
commit 11a099fe61a3d10c02462ee0990d5f9b89f08f2f
6 changes: 4 additions & 2 deletions convert_hf_to_gguf.py
@@ -299,7 +299,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
     # Repack and merge qweight, scales, and qzeros into a single tensor
     # Currently, this logic is nearly impossible to be implemented in quants.py
     def _modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if not self.enable_t_mac:
+        if not self.enable_t_mac or isinstance(self, BitnetModel):
             return self.modify_tensors(data_torch, name, bid)
 
         self._t_mac_raw_shape = None  # reset to make sure old values don't leak into new tensors case
@@ -2270,6 +2270,7 @@ def weight_quant(self, weight: Tensor) -> Tensor:
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         new_name = self.map_tensor_name(name)
 
+        self._t_mac_raw_shape = None
         if any(self.match_model_tensor_name(new_name, key, bid) for key in [
             gguf.MODEL_TENSOR.ATTN_Q,
             gguf.MODEL_TENSOR.ATTN_K,
@@ -2291,7 +2292,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 w = np.round(data / scale + 2).astype(np.uint8)
                 data_torch = torch.from_numpy(preprocess_for_t_mac(w, scale.reshape(1), bits=2))
                 self.quantization_config["bits"] = 2
-                # self.quantization_config["group_size"] = 256
+                self.quantization_config["group_size"] = -1
                 self.quantization_config["sym"] = True
                 self.quantization_config["quant_method"] = "bitnet"
                 self._t_mac_raw_shape = w.shape
@@ -5632,6 +5633,7 @@ class LazyTorchTensor(gguf.LazyBase):
     _dtype_map: dict[torch.dtype, type] = {
         torch.float16: np.float16,
         torch.float32: np.float32,
+        torch.bfloat16: np.float32,
     }
 
     # used for safetensors slices
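Note for reviewers: the `group_size = -1` change above marks BitNet weights as carrying a single per-tensor scale. A minimal NumPy sketch of the 2-bit encoding step the diff performs (the `preprocess_for_t_mac` repacking itself is omitted; the tensor values here are illustrative):

```python
import numpy as np

# Illustrative BitNet b1.58-style weight tensor: ternary multiples of one
# per-tensor scale (group_size = -1 means a single scale for the tensor).
scale = np.float32(0.04)
data = scale * np.array([[-1.0, 0.0, 1.0],
                         [ 1.0, 1.0, -1.0]], dtype=np.float32)

# Same mapping as in the diff: data / scale lands in {-1, 0, +1};
# adding 2 shifts it to {1, 2, 3}, which fits in 2 bits.
w = np.round(data / scale + 2).astype(np.uint8)

assert w.min() >= 0 and w.max() <= 3
print(w)  # [[1 2 3]
          #  [3 3 1]]
```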
21 changes: 19 additions & 2 deletions ggml/src/ggml-cpu/tmac/lut_mul_mat.cpp
@@ -231,7 +231,7 @@ static void aligned_free(void * ptr) {
 
 
 /****** T-MAC meta model info ******/
-static void init_tmac_kernel_config_from_tensor_type(enum ggml_type type, struct tmac_kernel_config * kernel_config) {
+static void init_tmac_kernel_config_from_tensor_type(enum ggml_type type, int M, struct tmac_kernel_config * kernel_config) {
     kernel_config->bits = get_type_bits(type);
     kernel_config->q_group_size = get_type_group_size(type);
     kernel_config->has_zero_point = get_type_has_zero_point(type);
@@ -241,6 +241,22 @@ static void init_tmac_kernel_config_from_tensor_type(enum ggml_type type, struct
     kernel_config->has_scale = true;
     kernel_config->g = 4;
     kernel_config->ngroups_per_elem = 8 / kernel_config->g;
+
+    // Decide q_group_size for BN_0
+    if (kernel_config->q_group_size == -1) {
+        if (M % 256 == 0) {
+            kernel_config->q_group_size = 64;
+        } else if (M % 128 == 0) {
+            kernel_config->q_group_size = 64;
+        } else if (M % 64 == 0) {
+            kernel_config->q_group_size = 64;
+        } else if (M % 32 == 0) {
+            kernel_config->q_group_size = 32;
+        } else {
+            GGML_LOG_ERROR("Unsupported M value. Expected multiple of 32, got %d. Please check all of the model weight shapes.\n", M);
+        }
+    }
+
     if (kernel_config->q_group_size % 64 == 0) {
         kernel_config->act_group_size = 64;
     } else if (kernel_config->q_group_size % 32 == 0) {
@@ -377,7 +393,7 @@ static void ggml_tmac_tune_kernel_config(const struct ggml_tensor * tensor, int
     }
 
     struct tmac_kernel_config kernel_config;
-    init_tmac_kernel_config_from_tensor_type(tensor->type, &kernel_config);
+    init_tmac_kernel_config_from_tensor_type(tensor->type, M, &kernel_config);
 
     // TODO: add more choices for prefilling?
     int N = 1;
@@ -480,6 +496,7 @@ size_t ggml_tmac_get_nbytes(const struct ggml_tensor * tensor) {
     const int scales_size = ggml_tmac_get_scales_size(kernel_config, m, k);
     // Currently, always uses float to store scales or zero points
     size_t nbytes = k * m / 8 * bits + scales_size * sizeof(float);
+    nbytes = GGML_PAD(nbytes, GGUF_DEFAULT_ALIGNMENT);
     // printf("ggml_tmac_get_nbytes: %s --- k=%d, m=%d, w=%d, sc=%d, nbytes: %zu\n", tensor->name, k, m, k * m / 8 * bits, scales_size, nbytes);
     return nbytes;
 }
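Side note on the new `q_group_size` selection: all three `M % 256 / 128 / 64` branches currently pick 64, so in effect only multiples of 32 that are not multiples of 64 get the smaller group. A hedged Python rendering of the decision (simplified; the tail of the `act_group_size` chain is cut off in the diff, so the fallback below is an assumption):

```python
def decide_group_sizes(M: int, q_group_size: int = -1) -> tuple[int, int]:
    """Sketch of the BN_0 group-size decision added above.

    q_group_size == -1 encodes "per-tensor scale"; the kernel still needs a
    concrete group size, derived from the weight dimension M.
    """
    if q_group_size == -1:
        if M % 64 == 0:    # covers the 256/128/64 branches, all of which pick 64
            q_group_size = 64
        elif M % 32 == 0:
            q_group_size = 32
        else:
            # The C code logs GGML_LOG_ERROR here rather than raising.
            raise ValueError(f"Unsupported M value. Expected multiple of 32, got {M}.")

    # Activation group size follows the weight group size.
    act_group_size = 64 if q_group_size % 64 == 0 else 32
    return q_group_size, act_group_size

print(decide_group_sizes(4096))  # (64, 64)
print(decide_group_sizes(96))    # (32, 32)
```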
10 changes: 8 additions & 2 deletions ggml/src/ggml.c
@@ -572,8 +572,8 @@ static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf
 static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_TMAC_BN_0] = {
         .type_name = "tmac_bn_0",
-        .blck_size = 256,
-        .type_size = 4 + 256 * 2 / 8,
+        .blck_size = 64,
+        .type_size = 64 * 2 / 8,
         .is_quantized = false,
     },
     [GGML_TYPE_TMAC_W2G64_0] = {
@@ -1224,6 +1224,12 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
         }
     }
 
+    if (tensor->type == GGML_TYPE_TMAC_BN_0) {
+        // One scale will not exceed one alignment boundary, so we can just add one alignment to the size.
+        nbytes += GGUF_DEFAULT_ALIGNMENT;
+    }
+
+
     return nbytes;
 }
 
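On the size accounting: with the new traits, `ggml_nbytes` counts only the packed 2-bit weights for TMAC_BN_0, and the extra alignment added above leaves room for the per-tensor float scale, which by the comment's reasoning never spans more than one alignment boundary. Worked arithmetic, assuming gguf's usual GGUF_DEFAULT_ALIGNMENT of 32 bytes:

```python
GGUF_DEFAULT_ALIGNMENT = 32  # gguf's default alignment (assumed here)

def tmac_bn_0_nbytes(ne0: int, ne1: int) -> int:
    blck_size = 64
    type_size = 64 * 2 // 8  # 16 bytes of packed 2-bit weights per block
    nbytes = ne0 * ne1 // blck_size * type_size
    # The scale is not part of type_size; one extra alignment covers it.
    return nbytes + GGUF_DEFAULT_ALIGNMENT

# A 4096x4096 ternary matrix: 4 MiB of packed weights plus 32 spare bytes,
# matching GGML_PAD(4 MiB + 4, 32) from ggml_tmac_get_nbytes above.
print(tmac_bn_0_nbytes(4096, 4096))  # 4194336
```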
2 changes: 1 addition & 1 deletion gguf-py/gguf/constants.py
@@ -2036,7 +2036,7 @@ def get_type(val: Any) -> GGUFValueType:
     # - So the size is slightly smaller than the real size
     # - The n_bytes in gguf_reader.py is thus inaccurate
     # - During inference, the accurate nbytes info will be known through ggml_tmac_get_nbytes
-    GGMLQuantizationType.TMAC_BN_0: (256, 4 + 256 * 2 // 8),
+    GGMLQuantizationType.TMAC_BN_0: (64, 64 * 2 // 8),
     GGMLQuantizationType.TMAC_W2G64_0: (64, 4 + 64 * 2 // 8),
     GGMLQuantizationType.TMAC_W2G64_1: (64, 4 + 4 + 64 * 2 // 8),
     GGMLQuantizationType.TMAC_W2G128_0: (128, 4 + 128 * 2 // 8),
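For reference, the effective bits per weight that these (block_size, bytes_per_block) tuples encode; the snippet below is just the table above re-done in plain Python:

```python
# (elements per block, bytes per block), matching the entries above.
TMAC_QUANT_SIZES = {
    "TMAC_BN_0":     (64,  64 * 2 // 8),          # 16 B: 2-bit weights only
    "TMAC_W2G64_0":  (64,  4 + 64 * 2 // 8),      # 20 B: f32 scale + weights
    "TMAC_W2G64_1":  (64,  4 + 4 + 64 * 2 // 8),  # 24 B: scale + zero point
    "TMAC_W2G128_0": (128, 4 + 128 * 2 // 8),     # 36 B: f32 scale + weights
}

for name, (blck, nbytes) in TMAC_QUANT_SIZES.items():
    print(f"{name}: {8 * nbytes / blck:.2f} bits/weight")
# TMAC_BN_0 comes out at exactly 2.00 bits/weight; its per-tensor scale is
# accounted for separately at load time via ggml_tmac_get_nbytes.
```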