Commit 70be5ca

Support tuning of _scaled_grouped_mm
This includes the default aten implementation, as well as a Triton implementation imported from FBGEMM (https://github.com/pytorch/FBGEMM/blob/main/fbgemm_gpu/experimental/gemm/triton_gemm/grouped_gemm.py).

ghstack-source-id: 06a27e2
Pull Request resolved: #150421
1 parent fe96167 commit 70be5ca
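
For context, a minimal usage sketch of the op this commit makes tunable, mirroring the 2d_2d test case added below. This assumes an SM90 GPU with FP8 support; the "max-autotune" compile mode is a standard Inductor knob shown here for illustration, not something introduced by this commit, and whether the Triton choice actually wins depends on autotuning results.

import torch

# Sketch: compiling torch._scaled_grouped_mm lets Inductor pick between the
# default aten kernel and the FBGEMM-derived Triton kernel during autotuning.
m, n, k, n_groups = 16, 16, 16, 4  # all sizes have to be divisible by 16
a = torch.randn(m, k * n_groups, device="cuda").to(torch.float8_e4m3fn)
b = torch.randn(n, k * n_groups, device="cuda").to(torch.float8_e4m3fn)
scale_a = torch.ones(m * n_groups, device="cuda", dtype=torch.float32)
scale_b = torch.ones(n * n_groups, device="cuda", dtype=torch.float32)
offs = torch.arange(k, n_groups * k + 1, k, device="cuda", dtype=torch.int32)

f = torch.compile(torch._scaled_grouped_mm, mode="max-autotune")
out = f(a, b.t(), scale_a, scale_b, offs=offs,
        out_dtype=torch.bfloat16, use_fast_accum=True)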

File tree

6 files changed: +521 additions, -13 deletions

test/test_matmul_cuda.py

Lines changed: 24 additions & 12 deletions

@@ -1440,16 +1440,19 @@ def scaled_grouped_mm_helper(self, alist, blist, ascalelist, bscalelist, outlist
     @unittest.skipIf(not SM90OrLater, "Grouped gemm supported on SM90")
     @parametrize("fast_accum", [False, True])
     @parametrize("strided", [False, True])
-    def test_scaled_grouped_gemm_2d_2d(self, fast_accum, strided):
+    @parametrize("use_torch_compile", [False, True])
+    def test_scaled_grouped_gemm_2d_2d(self, fast_accum, strided, use_torch_compile):
         device = "cuda"
         m, n, k, n_groups = 16, 16, 16, 4  # all sizes have to be divisible by 16
         a = torch.randn(m, k * n_groups + k * int(strided), device=device).to(torch.float8_e4m3fn)[:, :k * n_groups]
         b = torch.randn(n, k * n_groups + k * int(strided), device=device).to(torch.float8_e4m3fn)[:, :k * n_groups]
         scale_a = torch.arange(m * n_groups, device=device, dtype=torch.float32) / 4
         scale_b = torch.arange(n * n_groups, device=device, dtype=torch.float32) / 4
         offs = torch.arange(k, n_groups * k + 1, k, device=device, dtype=torch.int32)
-        out = torch._scaled_grouped_mm(a, b.t(), scale_a, scale_b, offs=offs,
-                                       out_dtype=torch.bfloat16, use_fast_accum=fast_accum)
+        f = torch._scaled_grouped_mm
+        f = torch.compile(f) if use_torch_compile else f
+        out = f(a, b.t(), scale_a, scale_b, offs=offs,
+                out_dtype=torch.bfloat16, use_fast_accum=fast_accum)
         offs_cpu = offs.cpu()
         alist, blist, ascalelist, bscalelist = [], [], [], []
         start = 0
@@ -1466,7 +1469,8 @@ def test_scaled_grouped_gemm_2d_2d(self, fast_accum, strided):
     @unittest.skipIf(not SM90OrLater, "Grouped gemm supported on SM90")
     @parametrize("fast_accum", [False, True])
     @parametrize("strided", [False, True])
-    def test_scaled_grouped_gemm_2d_3d(self, fast_accum, strided):
+    @parametrize("use_torch_compile", [False, True])
+    def test_scaled_grouped_gemm_2d_3d(self, fast_accum, strided, use_torch_compile):
         device = "cuda"
         s_int = int(strided)
         m, n, k, n_groups = 16, 32, 16, 4
@@ -1478,8 +1482,10 @@ def test_scaled_grouped_gemm_2d_3d(self, fast_accum, strided):
         scale_a = torch.arange(n_groups * m, device="cuda", dtype=torch.float32)
         scale_b = torch.ones(n_groups * n, device="cuda", dtype=torch.float32).view(n_groups, n)

-        out = torch._scaled_grouped_mm(a, b.transpose(-2, -1), scale_a, scale_b, offs=offs,
-                                       out_dtype=torch.bfloat16, use_fast_accum=fast_accum)
+        f = torch._scaled_grouped_mm
+        f = torch.compile(f) if use_torch_compile else f
+        out = f(a, b.transpose(-2, -1), scale_a, scale_b, offs=offs,
+                out_dtype=torch.bfloat16, use_fast_accum=fast_accum)

         offs_cpu = offs.cpu()
         alist, ascalelist, outlist = [], [], []
@@ -1496,7 +1502,8 @@ def test_scaled_grouped_gemm_2d_3d(self, fast_accum, strided):
     @unittest.skipIf(not SM90OrLater, "Grouped gemm supported on SM90")
     @parametrize("fast_accum", [False, True])
     @parametrize("strided", [False, True])
-    def test_scaled_grouped_gemm_3d_3d(self, fast_accum, strided):
+    @parametrize("use_torch_compile", [False, True])
+    def test_scaled_grouped_gemm_3d_3d(self, fast_accum, strided, use_torch_compile):
         device = "cuda"
         s_int = int(strided)
         m, n, k, n_groups = 16, 32, 16, 4
@@ -1507,8 +1514,10 @@ def test_scaled_grouped_gemm_3d_3d(self, fast_accum, strided):
         scale_a = torch.ones(n_groups * m, device="cuda", dtype=torch.float32).view(n_groups, m)
         scale_b = torch.ones(n_groups * n, device="cuda", dtype=torch.float32).view(n_groups, n)

-        out = torch._scaled_grouped_mm(a, b.transpose(-2, -1), scale_a, scale_b,
-                                       out_dtype=torch.bfloat16, use_fast_accum=fast_accum)
+        f = torch._scaled_grouped_mm
+        f = torch.compile(f) if use_torch_compile else f
+        out = f(a, b.transpose(-2, -1), scale_a, scale_b,
+                out_dtype=torch.bfloat16, use_fast_accum=fast_accum)

         self.scaled_grouped_mm_helper(a, b, scale_a, scale_b, out, fast_accum)

@@ -1517,7 +1526,8 @@ def test_scaled_grouped_gemm_3d_3d(self, fast_accum, strided):
     @unittest.skipIf(not SM90OrLater, "Grouped gemm supported on SM90")
     @parametrize("fast_accum", [False, True])
     @parametrize("strided", [False, True])
-    def test_scaled_grouped_gemm_3d_2d(self, fast_accum, strided):
+    @parametrize("use_torch_compile", [False, True])
+    def test_scaled_grouped_gemm_3d_2d(self, fast_accum, strided, use_torch_compile):
         device = "cuda"
         s_int = int(strided)
         m, n, k, n_groups = 16, 32, 16, 4
@@ -1529,8 +1539,10 @@ def test_scaled_grouped_gemm_3d_2d(self, fast_accum, strided):
         scale_b = torch.arange(n_groups * n, device="cuda", dtype=torch.float32)
         offs = torch.arange(n, n_groups * n + 1, n, device="cuda", dtype=torch.int32)

-        out = torch._scaled_grouped_mm(a, b.transpose(-2, -1), scale_a, scale_b, offs=offs,
-                                       out_dtype=torch.bfloat16, use_fast_accum=fast_accum)
+        f = torch._scaled_grouped_mm
+        f = torch.compile(f) if use_torch_compile else f
+        out = f(a, b.transpose(-2, -1), scale_a, scale_b, offs=offs,
+                out_dtype=torch.bfloat16, use_fast_accum=fast_accum)
         offs_cpu = offs.cpu()
         blist, bscalelist, outlist = [], [], []
         start = 0

torch/_dynamo/trace_rules.py

Lines changed: 1 addition & 0 deletions

@@ -1577,6 +1577,7 @@
         "torch._scaled_dot_product_flash_attention_for_cpu",
         "torch._scaled_dot_product_cudnn_attention",
         "torch._scaled_mm",
+        "torch._scaled_grouped_mm",
         "torch._shape_as_tensor",
         "torch._sobol_engine_draw",
         "torch._sobol_engine_ff_",

torch/_inductor/graph.py

Lines changed: 1 addition & 0 deletions

@@ -204,6 +204,7 @@ def mark_nodes_dislike_padding(
             aten.convolution,
             aten.convolution_backward,
             aten._scaled_mm,
+            aten._scaled_grouped_mm,
         ]
     )
     # what's a better way to collect the reduction ops?

torch/_inductor/kernel/mm_common.py

Lines changed: 30 additions & 1 deletion

@@ -1,5 +1,6 @@
 # mypy: allow-untyped-defs
 import logging
+from collections.abc import Sequence
 from typing import Any

 import sympy
@@ -10,7 +11,7 @@

 from .. import config as inductor_config
 from ..codegen.wrapper import PythonWrapperCodegen
-from ..ir import ChoiceCaller, Layout
+from ..ir import _IntLike, ChoiceCaller, Layout, TensorBox
 from ..utils import get_num_sms, TMA_DESCRIPTOR_SIZE, use_aten_gemm_kernels


@@ -54,6 +55,11 @@ def persistent_mm_grid(M: int, N: int, meta: dict[str, Any], *, cdiv, min):
     )


+@SymbolicGridFn
+def persistent_grouped_mm_grid(m, n, meta):
+    return (meta["NUM_SMS"], 1, 1)
+
+
 def acc_type(dtype):
     if dtype in (torch.float16, torch.bfloat16):
         return "tl.float32"
@@ -259,3 +265,26 @@ def _is_static_problem(layout: Layout) -> tuple[bool, bool]:
         numel *= dim
     nonzero = numel > 0
     return static_shape, nonzero
+
+
+def check_supported_striding(mat_a: TensorBox, mat_b: TensorBox) -> None:
+    def is_row_major(stride: Sequence[_IntLike]) -> bool:
+        return stride[-1] == 1
+
+    def is_col_major(stride: Sequence[_IntLike]) -> bool:
+        return stride[-2] == 1
+
+    def has_zero_dim(size: Sequence[_IntLike]) -> bool:
+        return bool(size[0] == 0 or size[1] == 0)
+
+    # Check mat_a (self) stride requirements
+    torch._check(
+        is_row_major(mat_a.get_stride()) or has_zero_dim(mat_a.get_size()),
+        lambda: f"mat_a must be row_major, got stride {mat_a.get_stride()}",
+    )
+
+    # Check mat_b stride requirements
+    torch._check(
+        is_col_major(mat_b.get_stride()) or has_zero_dim(mat_b.get_size()),
+        lambda: f"mat_b must be col_major, got stride {mat_b.get_stride()}",
+    )

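The new check_supported_striding helper requires mat_a to be row-major and mat_b to be column-major, with tensors that have a zero-sized dimension exempt. A quick standalone illustration of those stride conditions on plain tensors (the helper itself operates on Inductor TensorBox objects, so this is only an analogy; dtype is irrelevant since only strides matter):

import torch

mat_a = torch.empty(16, 64)      # contiguous, i.e. row-major
mat_b = torch.empty(32, 64).t()  # transposed view, shape (64, 32), column-major

print(mat_a.stride())  # (64, 1) -> row-major: stride[-1] == 1
print(mat_b.stride())  # (1, 64) -> col-major: stride[-2] == 1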