[Inductor] Construct subgraph with benchmarking args not example_inputs · pytorch/pytorch@aebd8a6 · GitHub

Commit aebd8a6

[Inductor] Construct subgraph with benchmarking args not example_inputs
If the inputs to a subgraph have FlexibleLayout, the subgraph does not currently freeze those layouts. As a result, the `example_inputs` generated for the subgraph may not be consistent in layout with the `args` passed in for benchmarking.

Differential Revision: [D74900879](https://our.internmc.facebook.com/intern/diff/D74900879/)
ghstack-source-id: f0da12f
Pull Request resolved: #153753
1 parent 9d3b6ee commit aebd8a6
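For context, the mismatch described in the commit message comes down to two tensors that share a shape but not strides. The snippet below is a minimal illustrative sketch (not part of this commit; shapes are arbitrary): if example inputs are materialized with a contiguous layout while Inductor later pads the real benchmarking args, the subgraph would be traced and benchmarked on different memory layouts.

import torch

# Illustrative sketch only (not from this commit; shapes chosen arbitrarily).
# Two tensors with identical shapes can still disagree on strides, e.g. a
# contiguous tensor vs. one whose rows are padded for alignment.
contiguous = torch.empty((4, 14240), dtype=torch.bfloat16)
padded = torch.empty_strided((4, 14240), (14336, 1), dtype=torch.bfloat16)

assert contiguous.shape == padded.shape
assert contiguous.stride() == (14240, 1)
assert padded.stride() == (14336, 1)
assert contiguous.stride() != padded.stride()  # same shape, different layout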

File tree

3 files changed

+122 −44 lines changed

test/inductor/test_subgraph_choice.py

+102-28
@@ -1,23 +1,36 @@
 # Owner(s): ["module: inductor"]
 import functools
 import unittest
+from unittest import mock
+from unittest.mock import MagicMock
 
 import torch
 from torch._dispatch.python import enable_python_dispatcher
 from torch._inductor.codegen.subgraph import SubgraphTemplate
 from torch._inductor.decomposition import select_decomp_table
-from torch._inductor.ir import Buffer, FixedLayout
+from torch._inductor.ir import Buffer, FixedLayout, FlexibleLayout
 from torch._inductor.lowering import register_lowering
-from torch._inductor.select_algorithm import (
-    AlgorithmSelectorCache,
-    autotune_select_algorithm,
-)
+from torch._inductor.select_algorithm import autotune_select_algorithm
 from torch._inductor.test_case import run_tests, TestCase
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.testing._internal.common_utils import skipIfXpu, TEST_WITH_ROCM
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_GPU
 
 
+def decomposeK(a, b, kPartitions):
+    m = a.shape[0]
+    n = b.shape[1]
+    k = a.shape[1]
+
+    B = k // kPartitions
+    a_reshaped = torch.permute(a.reshape(m, B, kPartitions), (1, 0, 2))
+    b_reshaped = b.reshape(B, kPartitions, n)
+    result = torch.bmm(a_reshaped, b_reshaped, out_dtype=torch.float32)
+    result_fp32 = result.to(torch.float32)
+    reduced_buf = torch.sum(result_fp32, 0)
+    return reduced_buf.to(a.dtype)
+
+
 class TestSubgraphChoice(TestCase):
     def setUp(self):
         super().setUp()
@@ -34,6 +47,8 @@ def test_subgraph_decompose_k(self):
         from torch._inductor.kernel.mm import aten_mm
         from torch._inductor.kernel.mm_common import mm_args
 
+        mat1_shape, mat2_shape = (32, 4096), (4096, 32)
+
         @torch.library.custom_op("mylib::matmul_decompose", mutates_args={})
         def matmul_decompose(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
             return a @ b
@@ -42,28 +57,12 @@ def matmul_decompose(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
         def _(a, b):
             return a @ b
 
-        def decomposeK(a, b, kPartitions):
-            m = a.shape[0]
-            n = b.shape[1]
-            k = a.shape[1]
-
-            B = k // kPartitions
-            a_reshaped = torch.permute(a.reshape(m, B, kPartitions), (1, 0, 2))
-            b_reshaped = b.reshape(B, kPartitions, n)
-            result = torch.bmm(a_reshaped, b_reshaped, out_dtype=torch.float32)
-            result_fp32 = result.to(torch.float32)
-            reduced_buf = torch.sum(result_fp32, 0)
-            return reduced_buf.to(a.dtype)
-
-        mat1_shape, mat2_shape = (32, 4096), (4096, 32)
-
         @register_lowering(torch.ops.mylib.matmul_decompose)
         def _(a, b):
             _, _, _, layout, mat1, mat2 = mm_args(a, b)
 
             choices = [aten_mm.bind((mat1, mat2), layout)]
 
-            # TODO (PaulZhang12): Once decomposeK lands in Inductor, move this
             kPartitions = 256
             with enable_python_dispatcher():
                 decompositions = select_decomp_table()
@@ -77,15 +76,10 @@ def _(a, b):
                     ),
                 )
 
-            mat1_tensor, mat2_tensor = (
-                AlgorithmSelectorCache.benchmark_example_value(mat1),
-                AlgorithmSelectorCache.benchmark_example_value(mat2),
-            )
             decompose_k_subgraph_template.maybe_append_choice(
                 choices,
                 input_nodes=(mat1, mat2),
                 layout=layout,
-                example_inputs=[mat1_tensor, mat2_tensor],
             )
 
             # Test benchmarking against aten
@@ -112,8 +106,88 @@ def func(mat1, mat2):
         res = compiled_func(a_in, b_in)
 
         # Check same results of compiled result and regular torch.mm
-        # Relax precision as decomposeK does first accumulation in fp16
-        torch.testing.assert_close(res, a_in @ b_in, atol=1e-1, rtol=1e-1)
+        torch.testing.assert_close(res, a_in @ b_in, atol=1e-2, rtol=1e-2)
+
+    @skipIfXpu
+    @unittest.skipIf(TEST_WITH_ROCM, "decompose_k not supported on ROCm")
+    def test_subgraph_freeze_layout(self):
+        from torch._inductor.kernel.mm_common import mm_args
+
+        M, N, K = (4, 128, 14240)
+        a_in = torch.randn(
+            (M, K), dtype=torch.bfloat16, device=torch.device(f"{GPU_TYPE}:0")
+        )
+        b_in = torch.randn(
+            (K, N), dtype=torch.bfloat16, device=torch.device(f"{GPU_TYPE}:0")
+        )
+
+        @torch.library.custom_op("mylib::matmul_decompose_padding", mutates_args={})
+        def matmul_decompose(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+            return a @ b
+
+        @matmul_decompose.register_fake
+        def _(a, b):
+            return a @ b
+
+        @register_lowering(torch.ops.mylib.matmul_decompose_padding)
+        def _(a, b):
+            _, _, _, layout, mat1, mat2 = mm_args(a, b)
+            mat1_layout = mat1.layout
+            assert isinstance(mat1_layout, FlexibleLayout)
+            mat1_stride = mat1_layout.stride
+
+            choices = []
+
+            kPartitions = 2
+            with enable_python_dispatcher():
+                decompositions = select_decomp_table()
+
+                decompose_k_subgraph_template = SubgraphTemplate(
+                    name="decompose_k_mm",
+                    make_fx_graph=make_fx(
+                        functools.partial(decomposeK, kPartitions=kPartitions),
+                        decompositions,
+                    ),
+                )
+
+            decompose_k_subgraph_template.maybe_append_choice(
+                choices,
+                input_nodes=(mat1, mat2),
+                layout=layout,
+            )
+
+            choice = choices[0]
+            assert isinstance(mat1.layout, FixedLayout)
+
+            # Creating the subgraph choice should have frozen the layout
+            # We ensure padding so the stride should differ
+            assert mat1.layout.stride != mat1_stride
+
+            for example_stride, layout_stride in zip(
+                choice.example_inputs[0].stride(), mat1.layout.stride
+            ):
+                # Example inputs should have same stride as current layout
+                assert example_stride == layout_stride
+
+            return autotune_select_algorithm(
+                "test_subgraph_choice", choices, [a, b], layout
+            )
+
+        def func(mat1, mat2):
+            return torch.ops.mylib.matmul_decompose_padding((mat1 + 1.0), mat2)
+
+        with mock.patch("torch._inductor.ir.V.get_current_node") as get_node_mock:
+            node_mock = MagicMock()
+            node_mock.meta = {"dislike_padding": False}
+            get_node_mock.return_value = node_mock
+
+            compiled_func = torch.compile(func, mode="max-autotune", dynamic=False)
+
+            res = compiled_func(a_in, b_in)
+
+            # Check same results of compiled result and regular torch.mm
+            # Relax precision as decomposeK does first accumulation in fp16
+            torch.testing.assert_close(res, (a_in + 1.0) @ b_in, atol=1e-2, rtol=1e-2)
 
 
 if __name__ == "__main__":
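As a side note, the `decomposeK` helper that the test promotes to module scope implements a standard decompose-K matmul: split the K dimension into batches, run a batched matmul, and reduce over the batch dimension. A standalone sketch of the same idea (plain tensors, no Inductor, shapes matching the test) is:

import torch

# Standalone sketch of the decompose-K idea exercised by the test (no Inductor).
def decompose_k_reference(a, b, k_partitions):
    m, k = a.shape
    n = b.shape[1]
    batches = k // k_partitions
    # (m, k) -> (batches, m, k_partitions); (k, n) -> (batches, k_partitions, n)
    a_batched = a.reshape(m, batches, k_partitions).permute(1, 0, 2)
    b_batched = b.reshape(batches, k_partitions, n)
    partial = torch.bmm(a_batched, b_batched)  # (batches, m, n) partial products
    return partial.sum(dim=0)  # reduce over the split-K batches

a = torch.randn(32, 4096)
b = torch.randn(4096, 32)
torch.testing.assert_close(
    decompose_k_reference(a, b, k_partitions=256), a @ b, atol=1e-4, rtol=1e-4
)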

torch/_inductor/codegen/subgraph.py

+19-10
@@ -5,7 +5,7 @@
 import torch
 from torch._inductor import ir
 from torch._inductor.codegen.common import KernelTemplate
-from torch._inductor.ir import Buffer, Layout
+from torch._inductor.ir import Buffer, ir_node_to_tensor, Layout
 from torch._inductor.runtime.benchmarking import benchmarker
 from torch._inductor.virtualized import V
 
@@ -25,12 +25,17 @@ def __init__(
         input_nodes: list[Buffer],
         layout: Layout,
         description: str,
-        gm: torch.fx.GraphModule,
-        example_inputs: list[Any],
+        make_fx_graph: Callable[..., Any],
     ) -> None:
         super().__init__(name, input_nodes, layout, description)
-        self.gm = gm
-        self.example_inputs = example_inputs
+
+        self.example_inputs = []
+        with V.fake_mode:
+            for inp in self.input_nodes:
+                inp.data.freeze_layout()  # type: ignore[attr-defined]
+                self.example_inputs.append(ir_node_to_tensor(inp))
+
+        self.gm = make_fx_graph(*self.example_inputs)
 
     def __str__(self) -> str:
         return f"SubgraphCaller({self.name})"
@@ -54,14 +59,21 @@ def benchmark(self, *args: list[Any], out: torch.Tensor) -> float:
             name=f"benchmark_{self.name}",
         )
 
+        for ar, example_inp in zip(args, self.example_inputs):
+            # Sanity check that args are same layout as example inputs
+            if isinstance(ar, torch.Tensor):
+                assert isinstance(example_inp, torch.Tensor)
+                assert ar.shape == example_inp.shape
+                assert ar.stride() == example_inp.stride()
+
         with V.set_graph_handler(bm_graph_lowering):
             # Don't bother autotuning on Triton here
             with inductor_config.patch(
                 max_autotune=False,
                 max_autotune_gemm=False,
                 max_autotune_gemm_backends="ATEN",
            ):
-                bm_graph_lowering.run(*self.example_inputs)
+                bm_graph_lowering.run(*args)
             mod = bm_graph_lowering.compile_to_module()
             bm_func = mod.call
 
@@ -139,7 +151,6 @@ def generate(  # type: ignore[override]
         self,
         input_nodes: list[Buffer],
         layout: Layout,
-        example_inputs: list[Any],
         **kwargs: Any,
     ) -> SubgraphChoiceCaller:
         """
@@ -154,13 +165,11 @@ def generate(  # type: ignore[override]
         Returns:
             SubgraphChoiceCaller: A callable object that can be used for autotuning
         """
-        gm = self.make_fx_graph(*example_inputs)
 
         return SubgraphChoiceCaller(
             name=self.name,
             input_nodes=input_nodes,
             layout=layout,
             description="",
-            gm=gm,
-            example_inputs=example_inputs,
+            make_fx_graph=self.make_fx_graph,
         )
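The net effect of the constructor change is that example inputs are derived from the now-frozen layouts of the input nodes, the FX graph is traced from those inputs, and benchmarking runs on the real args, with shape/stride asserts guarding against drift between the two. A rough standalone sketch of that layout check, outside Inductor's IR types (the helper name and tensors below are hypothetical, not Inductor API), is:

import torch

# Hypothetical standalone helper mirroring the shape/stride sanity check the
# commit adds before benchmarking; not part of Inductor's API.
def assert_layouts_match(args, example_inputs):
    for arg, example in zip(args, example_inputs):
        if isinstance(arg, torch.Tensor):
            assert isinstance(example, torch.Tensor)
            assert arg.shape == example.shape, (arg.shape, example.shape)
            assert arg.stride() == example.stride(), (arg.stride(), example.stride())

# Passes: both tensors use the same padded layout.
args = [torch.empty_strided((4, 128), (160, 1))]
example_inputs = [torch.empty_strided((4, 128), (160, 1))]
assert_layouts_match(args, example_inputs)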

torch/_inductor/kernel/mm.py

+1-6
@@ -23,7 +23,7 @@
 from ..codegen.cuda.gemm_template import CUTLASS2xGemmTemplate, CUTLASS3xGemmTemplate
 from ..codegen.rocm.ck_universal_gemm_template import CKGemmTemplate
 from ..codegen.subgraph import SubgraphTemplate
-from ..ir import FlexibleLayout, ir_node_to_tensor, is_triton
+from ..ir import FlexibleLayout, is_triton
 from ..lowering import (
     add_layout_constraint,
     constrain_to_fx_strides,
@@ -698,15 +698,10 @@ def tuned_mm(mat1, mat2, *, layout=None):
             ),
         )
 
-        with V.fake_mode:
-            mat1_tensor = ir_node_to_tensor(mat1)
-            mat2_tensor = ir_node_to_tensor(mat2)
-
         decompose_k_subgraph_template.maybe_append_choice(
             choices,
             input_nodes=(mat1, mat2),
             layout=layout,
-            example_inputs=[mat1_tensor, mat2_tensor],
         )
 
     if is_nonzero and use_cutlass_template(layout, m, n, k):

0 commit comments