[mm_logs][ez] dump tuned mm info at lowering stage (#148363) · pytorch/pytorch@1673bc7 · GitHub
Commit 1673bc7

YUNQIUGUO authored and pytorchmergebot committed
[mm_logs][ez] dump tuned mm info at lowering stage (#148363)
Summary: As title. This would be beneficial for judging e2e perf improvement. An easy first step is to dump mm info at the lowering stage, e.g.:

```
fbsource/fbcode/caffe2/torch/_inductor/kernel/mm.py:525] [0/0] Tuned aten.addmm: m=16, n=6, k=16, layout=FixedLayout('cuda:0', torch.float32, size=[16, 6], stride=[6, 1])
```

Next step: dump overview info at the `post_grad_graph` stage, such as the overall count of `aten.mm` ops in the graph, and visualize it as a table.

Test Plan: By looking very hard at the AOT Inductor bmm and mm UTs.

Differential Revision: D70507880

Pull Request resolved: #148363
Approved by: https://github.com/henrylhtsang
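For a quick sense of what these lines look like in practice, here is a minimal, hypothetical repro (not part of this commit): compile a small matmul on a CUDA machine with INFO-level Inductor logging enabled, e.g. via the `TORCH_LOGS` environment variable.

```python
# Hypothetical repro sketch: run with TORCH_LOGS="inductor" (or otherwise enable
# INFO logging for torch._inductor) so the new "Tuned aten.*" lines are emitted.
import torch

@torch.compile
def f(a, b):
    return a @ b

a = torch.randn(16, 16, device="cuda")
b = torch.randn(16, 6, device="cuda")
f(a, b)  # expect a line like: Tuned aten.mm: m=16, n=6, k=16, mat1_dtype=torch.float32, ...
```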
1 parent edc3ca5 · commit 1673bc7

File tree: 3 files changed, +61 -7 lines changed

torch/_inductor/kernel/bmm.py
torch/_inductor/kernel/mm.py
torch/_inductor/kernel/mm_scaled.py

torch/_inductor/kernel/bmm.py

Lines changed: 10 additions & 0 deletions
@@ -168,6 +168,16 @@ def may_require_contiguous(t, meta_t):
 
     m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout)
 
+    log.info(
+        "Tuned aten.bmm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%s",
+        m,
+        n,
+        k,
+        mat1.get_dtype(),
+        mat2.get_dtype(),
+        layout,
+    )
+
     # options to tune from
     choices = [aten_bmm.bind((mat1, mat2), layout)] if use_aten_gemm_kernels() else []
     if use_triton_template(layout):
torch/_inductor/kernel/mm.py

Lines changed: 41 additions & 7 deletions
@@ -60,7 +60,8 @@
 mm_template = TritonTemplate(
     name="mm",
     grid=mm_grid,
-    source=r"""
+    source=(
+        r"""
 {{def_kernel("A", "B")}}
     M = {{size("A", 0)}}
     N = {{size("B", 1)}}

@@ -125,11 +126,11 @@
     # inductor generates a suffix
     {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
 """
-    if torch.version.hip is None
-    # FIXME: To get around rocm failures like https://github.com/pytorch/pytorch/actions/runs/13123783322/job/36617154943
-    # The only difference between the two templates is M >= BLOCK_M and N >= BLOCK_N checking.
-    # See more details in https://github.com/pytorch/pytorch/pull/146293
-    else r"""
+        if torch.version.hip is None
+        # FIXME: To get around rocm failures like https://github.com/pytorch/pytorch/actions/runs/13123783322/job/36617154943
+        # The only difference between the two templates is M >= BLOCK_M and N >= BLOCK_N checking.
+        # See more details in https://github.com/pytorch/pytorch/pull/146293
+        else r"""
 {{def_kernel("A", "B")}}
     M = {{size("A", 0)}}
     N = {{size("B", 1)}}

@@ -193,7 +194,8 @@
 
     # inductor generates a suffix
     {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
-""",
+"""
+    ),
 )
 
 persistent_tma_mm_template = TritonTemplate(

@@ -357,6 +359,16 @@ def tuned_mm(mat1, mat2, *, layout=None):
     m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout)
     name = "mm"
 
+    log.info(
+        "Tuned aten.mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%s",
+        m,
+        n,
+        k,
+        mat1.get_dtype(),
+        mat2.get_dtype(),
+        layout,
+    )
+
     aten_layout = layout
     if not use_max_autotune():
         aten_layout = FlexibleLayout(

@@ -472,6 +484,17 @@ def tuned_int_mm(mat1, mat2, *, layout=None):
     m, n, k, layout, mat1, mat2 = mm_args(
         mat1, mat2, layout=layout, out_dtype=torch.int32
     )
+
+    log.info(
+        "Tuned aten._int_mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%s",
+        m,
+        n,
+        k,
+        mat1.get_dtype(),
+        mat2.get_dtype(),
+        layout,
+    )
+
     static_shape, is_nonzero = _is_static_problem(layout)
     use_cutlass = static_shape and is_nonzero and use_cutlass_template(layout, m, n, k)

@@ -516,6 +539,17 @@ def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
     ordered_kwargs_for_cpp_kernel = ("beta", "alpha")
     m, n, k, layout, mat1, mat2, inp_expanded = mm_args(mat1, mat2, inp, layout=layout)
     static_shape, is_nonzero = _is_static_problem(layout)
+
+    log.info(
+        "Tuned aten.addmm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%s",
+        m,
+        n,
+        k,
+        mat1.get_dtype(),
+        mat2.get_dtype(),
+        layout,
+    )
+
     if (not is_nonzero) or (not use_max_autotune()):
         # Use a FlexibleLayout if we are not autotuning.
         # This allows padding strides for the output.
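Note that the added calls pass `m`, `n`, `k`, and the dtypes as separate `%s` arguments rather than pre-formatting the string, so interpolation is deferred until a record is actually emitted at INFO level. A tiny standalone illustration of that pattern (hypothetical logger name, not PyTorch code):

```python
# Lazy %-style formatting: the message string is only built if INFO is enabled.
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("mm_logs_demo")

m, n, k = 16, 6, 16
log.info("Tuned aten.mm: m=%s, n=%s, k=%s", m, n, k)
```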

torch/_inductor/kernel/mm_scaled.py

Lines changed: 10 additions & 0 deletions
@@ -509,6 +509,16 @@ def tuned_scaled_mm(
         mat_a, mat_b, layout=layout, out_dtype=out_dtype
     )
 
+    log.info(
+        "Tuned aten._scaled_mm.default: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%s",
+        m,
+        n,
+        k,
+        mat_a.get_dtype(),
+        mat_b.get_dtype(),
+        layout,
+    )
+
     check_supported_striding(mat_a, mat_b)
 
     scale_a, scale_b = realize_inputs(scale_a, scale_b)
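The commit message's next step (an overview table of mm ops per graph) could be prototyped outside of Inductor by aggregating the emitted lines. An illustrative sketch only, not part of this commit; the regex simply matches the message format introduced above:

```python
# Illustrative aggregation of captured "Tuned aten.*" log lines into an
# overview table of (op, m, n, k, count).
import re
from collections import Counter

PATTERN = re.compile(r"Tuned (aten\.\S+): m=(\d+), n=(\d+), k=(\d+)")

def summarize(log_lines):
    counts = Counter()
    for line in log_lines:
        match = PATTERN.search(line)
        if match:
            op, m, n, k = match.groups()
            counts[(op, int(m), int(n), int(k))] += 1
    return counts

sample = [
    "Tuned aten.addmm: m=16, n=6, k=16, mat1_dtype=torch.float32, ...",
    "Tuned aten.mm: m=16, n=6, k=16, mat1_dtype=torch.float32, ...",
    "Tuned aten.mm: m=16, n=6, k=16, mat1_dtype=torch.float32, ...",
]
for (op, m, n, k), count in summarize(sample).items():
    print(f"{op:<20} m={m:<6} n={n:<6} k={k:<6} count={count}")
```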

0 commit comments