Commit 604837c · pytorch/pytorch
Update auto-tuning support for _scaled_grouped_mm
1. Enable strided inputs
2. Implement "2d/2d", "3d/2d" and "3d/3d" combinations of inputs
3. Fix the non-TMA load variant
4. Replace experimental_device_tensormap_create2d with _experimental_make_tensor_descriptor
5. Fix cases where the group size along the K dimension is not a multiple of the block size along K
6. Implement meta registration

ghstack-source-id: 5bbfaae
Pull Request resolved: #150944
1 parent a3123dd commit 604837c
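
As context for the shape combinations in point 2, here is a minimal sketch of a grouped scaled-MM call in which the first operand is 2d and the second is 3d, so offsets are mandatory. The fp8 dtypes, the column-major layout of `b`, and the scale shapes are assumptions chosen for illustration; only the "offsets are required when an operand is 2d" rule is taken from the Blas.cpp check shown below.

import torch

# Illustrative sketch, not taken from the commit. Assumes a CUDA device with
# fp8 support; shapes, layouts and scale conventions are assumptions.
G, M_total, K, N = 4, 256, 64, 128
a = torch.randn(M_total, K, device="cuda").to(torch.float8_e4m3fn)                  # 2d operand
b = torch.randn(G, N, K, device="cuda").to(torch.float8_e4m3fn).transpose(-2, -1)   # 3d operand, column-major in the last two dims (assumption)
offs = torch.tensor([64, 128, 192, 256], device="cuda", dtype=torch.int32)          # cumulative group ends along M
scale_a = torch.ones(M_total, device="cuda", dtype=torch.float32)                   # per-row scale for the 2d operand (assumption)
scale_b = torch.ones(G, N, device="cuda", dtype=torch.float32)                      # per-group scale for the 3d operand (assumption)
out = torch._scaled_grouped_mm(a, b, scale_a, scale_b, offs=offs)
print(out.shape)  # expected (M_total, N) under these assumptions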

File tree: 7 files changed (+469, -178 lines)


aten/src/ATen/native/cuda/Blas.cpp

Lines changed: 4 additions & 3 deletions
@@ -1533,7 +1533,7 @@ namespace {
       "D, arg ",
       arg_idx);
   TORCH_CHECK(
-      scale.is_contiguous(), "scale_a must be contiguous for arg ", arg_idx);
+      scale.is_contiguous(), "scale must be contiguous for arg ", arg_idx);
   TORCH_CHECK(
       scale.size(0) == mat.size(dim) * scale_multiplier,
       "scale must have the same length as mat for arg ",
@@ -1546,8 +1546,8 @@ namespace {
       "D for arg ",
       arg_idx);
   TORCH_CHECK(
-      scale.stride(1),
-      "scale_a must be contiguous in the last dimension for arg ",
+      scale.stride(1) == 1,
+      "scale must be contiguous in the last dimension for arg ",
       arg_idx);
   TORCH_CHECK(
       scale.size(0) == mat.size(0),
@@ -1611,6 +1611,7 @@ bool use_fast_accum) {
 
 
   TORCH_CHECK(!bias.has_value(), "Bias not supported yet");
+  TORCH_CHECK(!scale_result.has_value(), "Scale result not supported yet");
   TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix");
 
   if (offs.has_value()) {
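
The second hunk fixes a check that was previously vacuous: `scale.stride(1)` is an integer, so the old `TORCH_CHECK` only failed when that stride happened to be zero. A small sketch (the tensor shape here is mine, for illustration) of a scale tensor that the old truthiness check would have accepted but the corrected `== 1` comparison rejects:

import torch

# A (8, 16) scale obtained by slicing every other column of a (8, 32) tensor:
# its last-dimension stride is 2, which is truthy but not contiguous.
scale = torch.randn(8, 32, dtype=torch.float32)[:, ::2]
print(scale.stride())          # (32, 2)
print(bool(scale.stride(1)))   # True  -> old check would pass
print(scale.stride(1) == 1)    # False -> new check fails, as intended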

torch/_inductor/graph.py

Lines changed: 0 additions & 1 deletion
@@ -204,7 +204,6 @@ def mark_nodes_dislike_padding(
             aten.convolution,
             aten.convolution_backward,
             aten._scaled_mm,
-            aten._scaled_grouped_mm,
         ]
     )
     # what's a better way to collect the reduction ops?
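
Dropping aten._scaled_grouped_mm from the dislike-padding set lets Inductor pad this op's inputs again, which is only reasonable because the updated templates accept strided operands (point 1 of the commit message). A rough sketch of the kind of non-contiguous view that padding produces; the shapes, device and dtype are assumptions for illustration:

import torch

# Assumes a CUDA device with fp8 support. A row-major view into a larger,
# padded buffer has stride(0) > size(1), i.e. it is strided but not contiguous.
M, K, K_padded = 64, 48, 64
buf = torch.randn(M, K_padded, device="cuda").to(torch.float8_e4m3fn)
a = buf[:, :K]                 # shape (64, 48), stride (64, 1)
assert not a.is_contiguous()
# With this commit, the grouped-MM templates can consume such views directly
# instead of requiring contiguous inputs.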

torch/_inductor/kernel/mm_common.py

Lines changed: 2 additions & 1 deletion
@@ -56,7 +56,8 @@ def persistent_mm_grid(M: int, N: int, meta: dict[str, Any], *, cdiv, min):
 
 
 @SymbolicGridFn
-def persistent_grouped_mm_grid(m, n, meta):
+def persistent_grouped_mm_grid(*args):
+    meta = args[-1]
     return (meta["NUM_SMS"], 1, 1)
 
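Taking *args and reading the meta dict from the last position keeps the grid function agnostic to how many size arguments the different 2d/3d template variants pass ahead of it; for a persistent kernel the grid is simply one program per SM. A standalone sketch (the leading arguments and the NUM_SMS value are assumptions, not Inductor internals):

# Illustrative only: the same grid function works whatever sizes precede the
# launch meta, because only meta["NUM_SMS"] determines the persistent grid.
def persistent_grouped_mm_grid(*args):
    meta = args[-1]
    return (meta["NUM_SMS"], 1, 1)

meta = {"NUM_SMS": 132}                                   # e.g. an H100; value is an assumption
print(persistent_grouped_mm_grid(1024, 2048, meta))       # 2d/2d-style call -> (132, 1, 1)
print(persistent_grouped_mm_grid(8, 1024, 2048, meta))    # 3d/2d-style call -> (132, 1, 1)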
