Align unit test with Triton fill_kv_cache_quant kernel (InternLM#2644)

AllentDan · web-flow · commit 8e794b70ac77 · 2024-10-25T14:26:26.000+08:00
diff --git a/tests/pytorch/kernel/test_fill_kv_cache.py b/tests/pytorch/kernel/test_fill_kv_cache.py
@@ -8,17 +8,13 @@ def _div_up(a, b):
     return (a + b - 1) // b
 
 
-def precise_round(x: torch.Tensor):
-    return x.sign() * (x.abs() + 0.5).floor()
-
-
 def quant(kv: torch.Tensor, nbits: int = 8):
     """Quant kv on the head_dim."""
     amax = kv.amax(dim=-1, keepdim=True)
     amin = kv.amin(dim=-1, keepdim=True)
     scales = (amax - amin) / (2**nbits - 1)
     zeros = -amin / scales
-    q_kv = precise_round((kv - amin) / scales).to(torch.uint8)
+    q_kv = (kv / scales + zeros + 0.5).to(torch.uint8)
     if nbits == 4:
         q_kv1, q_kv2 = q_kv.split(q_kv.shape[-1] // 2, -1)
         q_kv = q_kv1 + q_kv2 * 16