Update auto-tuning support for _scaled_grouped_mm #150944
base: gh/alexsamardzic/1/base
Conversation
🔗 Helpful Links
🧪 See artifacts and rendered test results at hud.pytorch.org/pr/150944
Note: Links to docs will display an error until the docs builds have been completed.
✅ No Failures
As of commit 74b8536 with merge base b03e4f5.
This comment was automatically generated by Dr. CI and updates every 15 minutes.
Validation script

from enum import Enum
from itertools import product
import torch
f_ref = torch._scaled_grouped_mm
f = torch.compile(
f_ref,
options={
"max_autotune": True,
"max_autotune_gemm_backends": "TRITON",
},
)
class MMType(Enum):
MM_2D_2D = 1
MM_2D_3D = 2
MM_3D_2D = 3
MM_3D_3D = 4
def generate_data(
mm_type, group_size, M, N, K, device, dtype_AB, dtype_scale, dtype_offset, strided
):
if mm_type == MMType.MM_2D_2D:
A = torch.randn(M, K * (group_size + strided), device=device).to(dtype_AB)[
:, : K * group_size
]
B = torch.randn(N, K * (group_size + strided), device=device).to(dtype_AB)[
:, : K * group_size
]
A_scale = torch.rand(group_size * M, device=device, dtype=dtype_scale)
B_scale = torch.rand(group_size * N, device=device, dtype=dtype_scale)
offs = torch.arange(K, group_size * K + 1, K, device=device, dtype=dtype_offset)
if mm_type == MMType.MM_2D_3D:
A = torch.randn(M * group_size, K * (1 + strided), device=device).to(dtype_AB)[
:, :K
]
B = torch.randn(
group_size * (1 + strided), N, K * (1 + strided), device=device
).to(dtype_AB)[:: (1 + strided), :, :K]
A_scale = torch.rand(group_size * M, device=device, dtype=dtype_scale)
B_scale = torch.rand(
group_size, N * (1 + strided), device=device, dtype=dtype_scale
)[:, :N]
offs = torch.arange(M, group_size * M + 1, M, device=device, dtype=dtype_offset)
if mm_type == MMType.MM_3D_2D:
A = torch.randn(
group_size * (1 + strided), M, K * (1 + strided), device=device
).to(dtype_AB)[:: (1 + strided), :, :K]
B = torch.randn(N * group_size, K * (1 + strided), device=device).to(dtype_AB)[
:, :K
]
A_scale = torch.rand(
group_size, M * (1 + strided), device=device, dtype=dtype_scale
)[:, :M]
B_scale = torch.rand(group_size * N, device=device, dtype=dtype_scale)
offs = torch.arange(N, group_size * N + 1, N, device=device, dtype=dtype_offset)
if mm_type == MMType.MM_3D_3D:
A = torch.randn(
group_size * (1 + strided), M, K * (1 + strided), device=device
).to(dtype_AB)[:: (1 + strided), :, :K]
B = torch.randn(
group_size * (1 + strided), N, K * (1 + strided), device=device
).to(dtype_AB)[:: (1 + strided), :, :K]
A_scale = torch.rand(group_size, M * (1 + strided), device=device).to(
dtype_scale
)[:, :M]
B_scale = torch.rand(group_size, N * (1 + strided), device=device).to(
dtype_scale
)[:, :N]
offs = None
    if offs is not None:
        # Perturb the offsets so the group sizes are not all equal; check the
        # larger threshold first so that both branches are reachable.
        if offs[0] >= 64:
            offs[0] -= 16
            offs[1] += 16
            offs[2] -= 32
        elif offs[0] >= 32:
            offs[0] -= 16
            offs[2] += 16
return A, B, A_scale, B_scale, offs
def validate():
def validate_helper(
mm_type,
group_size,
M,
N,
K,
device,
dtype_AB,
dtype_scale,
dtype_offset,
dtype_C,
use_fast_accum,
strided,
atol,
rtol,
):
torch._dynamo.reset()
A, B, A_scale, B_scale, offs = generate_data(
mm_type,
group_size,
M,
N,
K,
device,
dtype_AB,
dtype_scale,
dtype_offset,
strided,
)
C_ref = f_ref(
A,
B.transpose(-2, -1),
A_scale,
B_scale,
offs,
out_dtype=dtype_C,
use_fast_accum=use_fast_accum,
)
C = f(
A,
B.transpose(-2, -1),
A_scale,
B_scale,
offs,
out_dtype=dtype_C,
use_fast_accum=use_fast_accum,
)
assert torch.allclose(C, C_ref, atol=atol, rtol=rtol)
device = "cuda"
group_size = 4
M_range = [2**i for i in range(4, 6)]
N_range = [2**i for i in range(5, 8)]
K_range = [2**i for i in range(6, 9)]
dtype_AB = torch.float8_e4m3fn
dtype_scale = torch.float32
dtype_offset = torch.int32
dtype_C = torch.bfloat16
use_fast_accum_range = [False, True]
strided_range = [False, True]
atol = 1e-2
rtol = 1e-2
for mm_type, M, N, K, use_fast_accum, strided in product(
MMType, M_range, N_range, K_range, use_fast_accum_range, strided_range
):
validate_helper(
mm_type,
group_size,
M,
N,
K,
device,
dtype_AB,
dtype_scale,
dtype_offset,
dtype_C,
use_fast_accum,
strided,
atol,
rtol,
)
validate()

(Note: to validate the non-TMA load variant, change ...)

Todo: handle the use_fast_accum case the way CUTLASS does it...
1. Enable strided inputs 2. Implement "2d/2d", "3d/2d" and "3d/3d" combinations of inputs 3. Fix non-TMA load variant 4. Replace experimental_device_tensormap_create2d with _experimental_make_tensor_descriptor 5. Fix cases when group size along K dimension is not multiple of block size along K ghstack-source-id: e6016b7 Pull Request resolved: #150944
1. Enable strided inputs 2. Implement "2d/2d", "3d/2d" and "3d/3d" combinations of inputs 3. Fix non-TMA load variant 4. Replace experimental_device_tensormap_create2d with _experimental_make_tensor_descriptor 5. Fix cases when group size along K dimension is not multiple of block size along K ghstack-source-id: 63a6271 Pull Request resolved: #150944
1. Enable strided inputs 2. Implement "2d/2d", "3d/2d" and "3d/3d" combinations of inputs 3. Fix non-TMA load variant 4. Replace experimental_device_tensormap_create2d with _experimental_make_tensor_descriptor 5. Fix cases when group size along K dimension is not multiple of block size along K ghstack-source-id: bbff45c Pull Request resolved: #150944
1. Enable strided inputs 2. Implement "2d/2d", "3d/2d" and "3d/3d" combinations of inputs 3. Fix non-TMA load variant 4. Replace experimental_device_tensormap_create2d with _experimental_make_tensor_descriptor 5. Fix cases when group size along K dimension is not multiple of block size along K 6. Implemented meta registration ghstack-source-id: 5bbfaae Pull Request resolved: #150944
1. Enable strided inputs 2. Implement "2d/2d", "3d/2d" and "3d/3d" combinations of inputs 3. Fix non-TMA load variant 4. Replace experimental_device_tensormap_create2d with _experimental_make_tensor_descriptor 5. Fix cases when group size along K dimension is not multiple of block size along K 6. Implement meta registration ghstack-source-id: 5bbfaae Pull Request resolved: #150944
@bertmaher @ngimel This PR is ready for review. I'll update the test along the way, and then proceed to grouped (non-scaled) MM auto-tuning support.
1. Enable strided inputs 2. Implement "2d/2d", "3d/2d" and "3d/3d" combinations of inputs 3. Fix non-TMA load variant 4. Replace experimental_device_tensormap_create2d with _experimental_make_tensor_descriptor 5. Fix cases when group size along K dimension is not multiple of block size along K 6. Implement meta registration ghstack-source-id: 111429c Pull Request resolved: #150944
Hey Alex, just wanted to apologize for not getting to this sooner -- it's been a very crazy week or two for me, but this is finally near the top of my queue. Btw, you will probably want something like #152968 to fix a silly bug in autotuning that I introduced.
NP, as soon as this PR or yours gets merged (and I assume that will be soon), I'll rebase my PR and make the appropriate additions for the input layout combinations other than 2d/3d.
1. Enable strided inputs 2. Implement "2d/2d", "3d/2d" and "3d/3d" combinations of inputs 3. Fix non-TMA load variant 4. Replace experimental_device_tensormap_create2d with _experimental_make_tensor_descriptor 5. Fix cases when group size along K dimension is not multiple of block size along K 6. Update meta registration 7. Updated synthetic offsets creation ghstack-source-id: ab5036b Pull Request resolved: #150944
1. Enable strided inputs 2. Implement "2d/2d", "3d/2d" and "3d/3d" combinations of inputs 3. Fix non-TMA load variant 4. Replace experimental_device_tensormap_create2d with _experimental_make_tensor_descriptor 5. Fix cases when group size along K dimension is not multiple of block size along K 6. Update meta registration 7. Updated synthetic offsets creation ghstack-source-id: 8b476d2 Pull Request resolved: #150944
@alexsamardzic I added a meta kernel in my previous PR #153226; can you please rebase to use it?
It is rebased already - this version should have all the same checks as yours, and then some more.
Looks good! Do you have some benchmarking results to make sure that the changes didn't regress the kernel perf compared to the dynamic-M-only kernel? I'm especially curious what perf you are getting compared to eager for the 2d/2d case, where you have to mask all the inputs.
test/test_matmul_cuda.py
Outdated
@@ -1627,11 +1632,16 @@ def test_scaled_grouped_gemm_2d_3d(self, fast_accum, strided, use_torch_compile)
offs = torch.arange(m, n_groups * m + 1, m, device="cuda", dtype=torch.int32)
if check_zero_size:
offs[0] = offs[1]
scale_a = torch.arange(n_groups * m, device="cuda", dtype=torch.float32)
scale_b = torch.ones(n_groups * n, device="cuda", dtype=torch.float32).view(n_groups, n)
scale_a = torch.arange(n_groups * m, device="cuda", dtype=torch.float32)
scale_a, scale_b should be constructed regardless of check_zero_size
Fixed, it seems I messed up indentation while updating this file.
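For reference, a minimal sketch of the construction the review asks for (reusing m, n, n_groups, offs and check_zero_size from the test shown above; only the zero-size adjustment stays under the check):

offs = torch.arange(m, n_groups * m + 1, m, device="cuda", dtype=torch.int32)
if check_zero_size:
    offs[0] = offs[1]
# scales are built unconditionally, for both the zero-size and regular paths
scale_a = torch.arange(n_groups * m, device="cuda", dtype=torch.float32)
scale_b = torch.ones(n_groups * n, device="cuda", dtype=torch.float32).view(n_groups, n)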
@@ -53,8 +47,8 @@ class Config:
num_warps=num_warps,
)
for block_size_m in [64, 128]
for block_size_n in [64, 128, 256]
for block_size_k in [64, 128, 256]
for block_size_n in [64, 128]
Was block size 256 never picked?
Reverted. It seemed rarely picked, but the primary reason for the deletion was to speed up testing; the change then slipped into an update.
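For context, the restored tuning grid corresponds to a config product roughly like the one below; this sketch uses triton.Config directly rather than the file's own Config class, and the num_stages/num_warps pairs are assumptions, not the PR's values:

import triton

# Candidate tile sizes for auto-tuning; 256 stays in the N/K candidate lists.
configs = [
    triton.Config(
        {"BLOCK_M": block_size_m, "BLOCK_N": block_size_n, "BLOCK_K": block_size_k},
        num_stages=num_stages,
        num_warps=num_warps,
    )
    for block_size_m in [64, 128]
    for block_size_n in [64, 128, 256]
    for block_size_k in [64, 128, 256]
    for num_stages, num_warps in [(3, 4), (4, 8)]
]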
m, k1 = m1_size
g, k2, n = m2_size
k = V.graph.sizevars.guard_equals(k1, k2)
if is_nonzero and can_use_triton_kernel(mat_a, mat_b, offs, bias, scale_result): |
Where are you checking that a and b are row- and column-major, respectively? Do you rely on the meta function to check that?
Yes, but do you think guards should be added here too, to check the strides for row-major/column-major ordering?
It's fine not to double-check those, but can you add a comment saying that you are relying on the meta function checks, so that it's easier to keep them in sync?
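For illustration, the relationship could be documented (or locally re-checked) with something like the sketch below; _is_row_major and _is_col_major are hypothetical helpers, not code from the PR:

import torch

def _is_row_major(t: torch.Tensor) -> bool:
    # innermost dimension is contiguous
    return t.stride(-1) == 1

def _is_col_major(t: torch.Tensor) -> bool:
    # second-to-innermost dimension is contiguous
    return t.stride(-2) == 1

# NOTE: the lowering relies on the _scaled_grouped_mm meta function having
# already rejected anything that is not row-major mat_a / column-major mat_b,
# so the checks above are not repeated in the lowering itself.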
Looks good to me! Thank you for doing this. I have a few inline questions/comments
@@ -1643,7 +1653,7 @@ def test_scaled_grouped_gemm_2d_3d(self, fast_accum, strided, use_torch_compile)
ascalelist.append(scale_a[start:offs_cpu[i]])
outlist.append(out[start:offs_cpu[i]])
start = offs_cpu[i]
self.scaled_grouped_mm_helper(alist, b, ascalelist, scale_b, outlist, fast_accum)
self.scaled_grouped_mm_helper(alist, b, ascalelist, scale_b, outlist, fast_accum)
Was this indent accidental?
Yes, reverted - as mentioned above, I apparently messed up some indentation when updating this file.
torch/_meta_registrations.py
Outdated
@@ -6379,6 +6379,165 @@ def ceil_div(a, b):
return torch.empty(self.size(0), mat2.size(1), dtype=_out_dtype, device=self.device)
@register_meta([aten._scaled_grouped_mm.default]) |
Can you avoid moving meta_scaled_grouped_mm around in this PR? It makes it hard to see what all changed. (If it's being moved for a good reason, can you explain what that is? It's hard for me to tell from the diff.)
Moved back. (I'd added it in my PR before it got added into main, and I thought it might make sense to put it right after the meta registration for _scaled_mm.)
I put it next to grouped_mm, which also makes sense; scaled_grouped_mm has two attributes and it's unclear which one is more important ;-)
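For readers following along, the heart of a meta registration is output-shape inference; below is a simplified sketch for the two cases with a 3d B operand (the real meta_scaled_grouped_mm in torch/_meta_registrations.py also validates dtypes, scales, strides and offsets):

import torch

def _scaled_grouped_mm_out_shape(mat_a, mat_b):
    # 2d/3d: A is (total_M, K), B is (G, K, N); offs holds cumulative row
    # offsets into A -> output is (total_M, N)
    if mat_a.dim() == 2 and mat_b.dim() == 3:
        return (mat_a.size(0), mat_b.size(-1))
    # 3d/3d: A is (G, M, K), B is (G, K, N); no offsets -> output is (G, M, N)
    if mat_a.dim() == 3 and mat_b.dim() == 3:
        return (mat_a.size(0), mat_a.size(1), mat_b.size(-1))
    raise NotImplementedError("2d/2d and 3d/2d cases are omitted in this sketch")

# The meta kernel then just returns an empty tensor of this shape with the
# requested out_dtype, e.g. torch.empty(shape, dtype=out_dtype, device="meta").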
@@ -167,46 +123,127 @@ def early_config_prune(configs, named_args):

# Copied from fbgemm grouped_gemm.py
triton_scaled_grouped_mm_source = r"""
{{def_kernel("a_ptr", "b_ptr", "a_scale_ptr", "b_scale_ptr", "m_sizes")}}
{% if A_IS_2D or B_IS_2D %} |
I was kind of contemplating whether it would be better to have a separate kernel for each 2d/3d case, since nested control flow in Jinja templates is kind of hard to read. But it would come with the downside of some amount of code duplication. Just curious to hear your opinion.
For me, it's just a matter of a trade-off... It seems to me the differences between the variations are not big enough to warrant separate kernels; it would be too much code duplication. Also, the non-scaled version could easily be added into the same code, in the same way (which I'm going to do next). Furthermore, the Jinja if-else statements keep the generated code readable (IMO); basically, the generated code is like a separate version for the given 2d/3d case. There are only two cases of nested if-else Jinja statements; I could use some kind of indentation to make them clearer. Overall, I'd prefer to keep it as is throughout development, as it's easier for me to work with this way; once we're happy with it, we could arrange it differently.
Would be cool if torchtitan (which has eager versions of these kernels) also adopted a single-kernel approach, cc @lessw2020
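As a small illustration of the trade-off being discussed, a single Jinja template can pick the per-layout indexing at render time, so each compiled variant contains no runtime layout branching. This is illustrative only, not the PR's actual template; the names inside the rendered text (group_start_m, BLOCK_M, ...) are made up:

from jinja2 import Template

src = Template(r"""
{%- if A_IS_2D and B_IS_2D %}
offs_k = group_start_k + tl.arange(0, BLOCK_K)  # groups tiled along K
{%- elif A_IS_2D %}
offs_m = group_start_m + tl.arange(0, BLOCK_M)  # groups tiled along rows of A
{%- else %}
offs_m = tl.arange(0, BLOCK_M)                  # plain batched (3d) A
{%- endif %}
""")
print(src.render(A_IS_2D=True, B_IS_2D=False))  # renders only the 2d-A branch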
1. Enable strided inputs 2. Implement "2d/2d", "3d/2d" and "3d/3d" combinations of inputs 3. Fix non-TMA load variant 4. Replace experimental_device_tensormap_create2d with _experimental_make_tensor_descriptor 5. Fix cases when group size along K dimension is not multiple of block size along K 6. Update meta registration 7. Updated synthetic offsets creation ghstack-source-id: 894be5f Pull Request resolved: #150944
1. Enable strided inputs 2. Implement "2d/2d", "3d/2d" and "3d/3d" combinations of inputs 3. Fix non-TMA load variant 4. Replace experimental_device_tensormap_create2d with _experimental_make_tensor_descriptor 5. Fix cases when group size along K dimension is not multiple of block size along K 6. Update meta registration 7. Updated synthetic offsets creation ghstack-source-id: 4856f96 Pull Request resolved: #150944
I've been benchmarking over the last two days; indeed, it seems going "fully dynamic" is not good for performance, so I'm probably going to revert some things.
1. Enable strided inputs 2. Implement "2d/2d", "3d/2d" and "3d/3d" combinations of inputs 3. Fix non-TMA load variant 4. Replace experimental_device_tensormap_create2d with _experimental_make_tensor_descriptor 5. Fix cases when group size along K dimension is not multiple of block size along K 6. Update meta registration 7. Updated synthetic offsets creation ghstack-source-id: 067deb6 Pull Request resolved: #150944
1. Enable strided inputs 2. Implement "2d/2d", "3d/2d" and "3d/3d" combinations of inputs 3. Fix non-TMA load variant 4. Replace experimental_device_tensormap_create2d with _experimental_make_tensor_descriptor 5. Fix cases when group size along K dimension is not multiple of block size along K 6. Update meta registration 7. Updated synthetic offsets creation ghstack-source-id: 55e56c4 Pull Request resolved: #150944
FWIW, this blog post https://pytorch.org/blog/metashuffling-accelerating-llama-4-moe-inference/ contains a good set of benchmarking configs (and it's based on the same Triton kernel, save for warp specialization, which typically has only a very small effect).
Here is a benchmarking script, and some benchmarking results (the number of groups is 4 everywhere; "CUTLASS" here means the eager CUTLASS-based kernel).

Benchmarking script

from enum import Enum
import pandas as pd
from tqdm import tqdm
import torch
from triton.testing import do_bench
class MMType(Enum):
MM_2D_2D = 1
MM_2D_3D = 2
MM_3D_2D = 3
MM_3D_3D = 4
def __str__(self):
if self == MMType.MM_2D_2D:
return "2d_2d"
elif self == MMType.MM_2D_3D:
return "2d_3d"
elif self == MMType.MM_3D_2D:
return "3d_2d"
elif self == MMType.MM_3D_3D:
return "3d_3d"
else:
return ""
device = "cuda"
dtype_AB = torch.float8_e4m3fn
dtype_scale = torch.float32
dtype_offset = torch.int32
dtype_C = torch.bfloat16
group_size = 4
use_fast_accum = True
strided = True
f_ref = torch._scaled_grouped_mm
f = torch.compile(
f_ref,
options={
"max_autotune": True,
"max_autotune_gemm_backends": "TRITON",
},
)
def benchmark_microseconds(f, *args, **kwargs):
return do_bench(lambda: f(*args, **kwargs), return_mode="median") * 1e3
def get_problem(mm_type, M, N, K):
if mm_type == MMType.MM_2D_2D:
A = torch.randn(M, K * (group_size + strided), device=device).to(dtype_AB)[
:, : K * group_size
]
B = torch.randn(N, K * (group_size + strided), device=device).to(dtype_AB)[
:, : K * group_size
]
A_scale = torch.rand(group_size * M, device=device, dtype=dtype_scale)
B_scale = torch.rand(group_size * N, device=device, dtype=dtype_scale)
offs = torch.arange(K, group_size * K + 1, K, device=device, dtype=dtype_offset)
if mm_type == MMType.MM_2D_3D:
A = torch.randn(M * group_size, K * (1 + strided), device=device).to(dtype_AB)[
:, :K
]
B = torch.randn(
group_size * (1 + strided), N, K * (1 + strided), device=device
).to(dtype_AB)[:: (1 + strided), :, :K]
A_scale = torch.rand(group_size * M, device=device, dtype=dtype_scale)
B_scale = torch.rand(
group_size, N * (1 + strided), device=device, dtype=dtype_scale
)[:, :N]
offs = torch.arange(M, group_size * M + 1, M, device=device, dtype=dtype_offset)
if mm_type == MMType.MM_3D_2D:
A = torch.randn(
group_size * (1 + strided), M, K * (1 + strided), device=device
).to(dtype_AB)[:: (1 + strided), :, :K]
B = torch.randn(N * group_size, K * (1 + strided), device=device).to(dtype_AB)[
:, :K
]
A_scale = torch.rand(
group_size, M * (1 + strided), device=device, dtype=dtype_scale
)[:, :M]
B_scale = torch.rand(group_size * N, device=device, dtype=dtype_scale)
offs = torch.arange(N, group_size * N + 1, N, device=device, dtype=dtype_offset)
if mm_type == MMType.MM_3D_3D:
A = torch.randn(
group_size * (1 + strided), M, K * (1 + strided), device=device
).to(dtype_AB)[:: (1 + strided), :, :K]
B = torch.randn(
group_size * (1 + strided), N, K * (1 + strided), device=device
).to(dtype_AB)[:: (1 + strided), :, :K]
A_scale = torch.rand(group_size, M * (1 + strided), device=device).to(
dtype_scale
)[:, :M]
B_scale = torch.rand(group_size, N * (1 + strided), device=device).to(
dtype_scale
)[:, :N]
offs = None
if mm_type not in [MMType.MM_2D_3D, MMType.MM_3D_3D]:
if group_size >= 2:
offs[0] = offs[0] + (offs[1] - offs[0]) // 4
if group_size >= 4:
offs[2] = offs[2] + (offs[3] - offs[2]) // 2
return A, B, A_scale, B_scale, offs
def benchmark(mm_type: MMType, m: int, k: int, n: int):
torch._dynamo.reset() # FIXME: remove this!
A, B, A_scale, B_scale, offs = get_problem(mm_type, m, n, k)
eager_time = benchmark_microseconds(
f_ref,
A,
B.transpose(-2, -1),
A_scale,
B_scale,
offs,
out_dtype=dtype_C,
use_fast_accum=use_fast_accum,
)
compiled_time = benchmark_microseconds(
f,
A,
B.transpose(-2, -1),
A_scale,
B_scale,
offs,
out_dtype=dtype_C,
use_fast_accum=use_fast_accum,
)
return {
"m": m,
"n": n,
"k": k,
"Eager (CUTLASS) latency (ms)": eager_time,
"Compiled (Triton) latency (ms)": compiled_time,
"Compiled speedup (d/s)": eager_time / compiled_time,
}
if __name__ == "__main__":
k_vals = (1024, 2048, 4096)
n_vals = (1024, 2048, 4096)
for mm_type in MMType:
results = []
i_range = range(8) if mm_type != MMType.MM_2D_3D else range(4, 10)
for m in tqdm([1 << i for i in i_range]):
for n, k in zip(n_vals, k_vals):
results.append(benchmark(mm_type, m, k, n))
df = pd.DataFrame(results)
df.to_csv(
f"scaled_grouped_mm_{str(mm_type)}_time_results.csv", index=False
)
print(df.to_markdown(index=False)) Benchmarking results for 2D/2D case
Benchmarking results for 2D/3D case
Benchmarking results for 3D/2D case
Benchmarking results for 3D/3D case
Some comments:
Auxiliary script to produce the tables above from the .csv files with dynamic/non-dynamic results

import sys
import pandas as pd
assert len(sys.argv) == 3
df1 = pd.read_csv(sys.argv[1])
df1 = df1.drop(["Compiled speedup (d/s)"], axis=1)
df1 = df1.rename(
columns={
"Eager (CUTLASS) latency (ms)": "CUTLASS latency (D)",
"Compiled (Triton) latency (ms)": "Triton latency (D)",
}
)
df2 = pd.read_csv(sys.argv[2])
df2 = df2.drop(["Compiled speedup (d/s)"], axis=1)
df2 = df2.rename(
columns={
"Eager (CUTLASS) latency (ms)": "CUTLASS latency (ND)",
"Compiled (Triton) latency (ms)": "Triton latency (ND)",
}
)
df = pd.merge(df1, df2, on=["m", "n", "k"])
df.insert(
loc=3,
column="CUTLASS latency",
value=(df["CUTLASS latency (D)"] + df["CUTLASS latency (ND)"]) / 2,
)
df = df.drop(["CUTLASS latency (D)", "CUTLASS latency (ND)"], axis=1)
df["Triton speedup (D)"] = df["CUTLASS latency"] / df["Triton latency (D)"]
df["Triton speedup (ND)"] = df["CUTLASS latency"] / df["Triton latency (ND)"]
float_columns = df.select_dtypes(include=['float64', 'float32']).columns
for col in float_columns:
df[col] = df[col].apply(lambda x: f"{x:.2f}")
print(df.to_markdown(index=False))
Stack from ghstack (oldest at bottom):
cc @voznesenskym @penguinwu @EikanWang @jgong5 @Guobing-Chen @XiaobingSuper @zhuhaozhe @blzheng @wenzhe-nrv @jiayisunx @ipiszy @chenyang78 @kadeng @muchulee8 @amjames @chauhang @aakhundov