pytorch/pytorch · Commit 59700b7

[Cutlass] E2E Tests for EVT

ghstack-source-id: cea63c9
Pull Request resolved: #152815

1 parent: 0104ac0

File tree: 11 files changed, +379 −114 lines

test/inductor/test_cutlass_backend.py — 156 additions & 0 deletions
@@ -62,6 +62,36 @@ def _get_path_without_sccache() -> str:
     return ":".join(path_envs)
 
 
+un_ops_under_test = [torch.relu]
+bin_ops_under_test = [torch.add, torch.mul, torch.sub, torch.div]
+
+evt_all_ops = parametrize(
+    "op", un_ops_under_test + bin_ops_under_test, name_fn=lambda f: f.__name__
+)
+
+evt_bin_ops = parametrize("op", bin_ops_under_test, name_fn=lambda f: f.__name__)
+
+
+def gen_args(op, shape):
+    if op in bin_ops_under_test:
+        return (torch.rand(*shape, device="cuda:0").half(),)
+    else:
+        return ()
+
+
+use_evt_config = config.patch(
+    {
+        "max_autotune": True,
+        "max_autotune_gemm_backends": "CUTLASS",
+        "cuda.cutlass_max_profiling_configs": 1,
+        "autotune_fallback_to_aten": False,
+        "benchmark_epilogue_fusion": False,  # EVT doesn't support benchmark fusion yet
+        "cuda.cutlass_tma_only": True,
+        "cuda.cutlass_epilogue_fusion_enabled": True,
+    }
+)
+
+
 @instantiate_parametrized_tests
 class TestCutlassBackend(TestCase):
     def setUp(self):
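
Note: `evt_all_ops` and `evt_bin_ops` expand each decorated test into one instance per op, named via `name_fn`, while `gen_args` supplies the extra fp16 CUDA operand only for the binary ops, so a single test body can handle both arities. A quick illustration of the generated test-name suffixes (evaluates with just the definitions above):

    print([f.__name__ for f in un_ops_under_test + bin_ops_under_test])
    # ['relu', 'add', 'mul', 'sub', 'div']  -> test_..._relu, test_..._add, etc.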
@@ -91,6 +121,22 @@ def tearDown(self):
         super().tearDown()
         clear_inductor_caches()
 
+    def run_evt_test(self, model, op, shape, num_fusions=1):
+        M, N = shape
+        a = torch.ones(M, N).cuda().half()
+        b = torch.ones(N, N).cuda().half()
+        extra_args = gen_args(op, (M, N))
+        model = model.cuda()
+
+        result = torch.compile(model)(a, b, extra_args)
+        ref_result = model(a, b, extra_args)
+
+        self.assertEqual(
+            torch._dynamo.utils.counters["inductor"]["cuda_epilogue_fusion_counter"],
+            num_fusions,
+        )
+        torch.testing.assert_close(result, ref_result)
+
     @unittest.skipIf(not SM90OrLater, "need sm_90")
     @mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
     def test_max_autotune_cutlass_threshold(self):
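
With this helper, each fusion test only defines a module and delegates the compiled-vs-eager comparison and the `cuda_epilogue_fusion_counter` check, as in the tests added further down in this diff:

    self.run_evt_test(TestModel(), torch.add, (1024, 512), num_fusions=1)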
@@ -1316,6 +1362,33 @@ def forward(self, B):
         ):
             _ = torch.compile(model)(B)
 
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
+    @mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
+    @use_evt_config
+    def test_evt_flexible_layout(self):
+        class TestModel(torch.nn.Module):
+            def forward(self, B):
+                A = torch.zeros_like(B)
+                return (A @ B).relu()
+
+        M = 1024
+        B = torch.randn(M, M).cuda().half()
+        model = TestModel().cuda().half()
+
+        with config.patch(
+            {
+                "max_autotune": True,
+                "max_autotune_gemm_backends": "CUTLASS",
+                "cuda.cutlass_max_profiling_configs": 1,
+                "autotune_fallback_to_aten": False,
+            }
+        ):
+            _ = torch.compile(model)(B)
+
+        self.assertEqual(
+            torch._dynamo.utils.counters["inductor"]["cuda_epilogue_fusion_counter"], 1
+        )
+
     @unittest.skipIf(not SM90OrLater, "need sm_90")
     @mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
     def test_filtered_ops_cache(self):
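
The inner `config.patch` here layers on top of the patch already installed by `@use_evt_config`: Inductor config patches nest, so the EVT flags from `use_evt_config` stay in effect and this block only re-pins the autotune settings. A minimal sketch of the nesting semantics, using keys from this diff:

    from torch._inductor import config

    with config.patch({"cuda.cutlass_epilogue_fusion_enabled": True}):
        with config.patch({"max_autotune": True}):
            assert config.max_autotune
            assert config.cuda.cutlass_epilogue_fusion_enabled  # outer patch still visible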
@@ -1359,6 +1432,89 @@ def test_compilation_time(self):
         _ = torch.compile(torch.mm)(A, B)
         self.assertTrue(time.time() - start_time < 50)
 
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
+    @use_evt_config
+    @evt_all_ops
+    def test_evt_fusions_basic(self, op):
+        class TestModel(torch.nn.Module):
+            def forward(self, a, b, extra_args):
+                res = (a @ b).relu()  # add extra activation to not hit addmm path
+                return op(res, *extra_args)
+
+        self.run_evt_test(TestModel(), op, (1024, 512))
+
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
+    @use_evt_config
+    @evt_bin_ops
+    def test_evt_broadcasting(self, op):
+        class TestModel(torch.nn.Module):
+            def forward(self, a, b, extra_args):
+                acc = a @ b
+                return acc, op(acc.relu(), *extra_args)
+
+        M = 1024
+        N = 512
+        a = torch.ones(M, N).cuda().half()
+        b = torch.ones(N, N).cuda().half()
+        extra_args = gen_args(op, (M, N))
+        model = TestModel().cuda()
+
+        result = torch.compile(model)(a, b, extra_args)
+        ref_result = model(a, b, extra_args)
+
+        self.assertEqual(
+            torch._dynamo.utils.counters["inductor"]["cuda_epilogue_fusion_counter"], 1
+        )
+        torch.testing.assert_close(result, ref_result)
+
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
+    @use_evt_config
+    @evt_all_ops
+    def test_evt_mixed_dtypes(self, op):
+        pass
+
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
+    @use_evt_config
+    @evt_all_ops
+    def test_evt_multi_op(self, op):
+        class TestModel(torch.nn.Module):
+            def forward(self, a, b, extra_args):
+                acc = a @ b
+                return torch.add(op(acc.relu(), *extra_args).relu(), *extra_args)
+
+        self.run_evt_test(TestModel(), op, (1024, 512))
+
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
+    @use_evt_config
+    @evt_all_ops
+    def test_evt_multi_output(self, op):
+        pass
+
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
+    @use_evt_config
+    def test_evt_return_accumulator(self):
+        op = torch.add
+
+        class TestModel(torch.nn.Module):
+            def forward(self, a, b, extra_args):
+                acc = a @ b
+                return acc, op(acc.relu(), *extra_args)
+
+        M = 1024
+        N = 512
+        a = torch.ones(M, N).cuda().half()
+        b = torch.ones(N, N).cuda().half()
+        extra_args = gen_args(op, (M, N))
+        model = TestModel().cuda()
+
+        result = torch.compile(model)(a, b, extra_args)
+        ref_result = model(a, b, extra_args)
+
+        self.assertEqual(
+            torch._dynamo.utils.counters["inductor"]["cuda_epilogue_fusion_counter"], 1
+        )
+        torch.testing.assert_close(result, ref_result)
+
 
 if __name__ == "__main__":
     from torch._inductor.utils import is_big_gpu

test/inductor/test_cutlass_evt.py — 82 additions & 28 deletions
@@ -147,22 +147,25 @@ def inner_fn_buf4(index):
                 MockSchedulerNode(buf3),
                 MockSchedulerNode(buf4, last_usage=OrderedSet(["buf3"])),
             ],
+            OrderedSet([]),
         )
-        self.assertExpectedInline(reads, """['buf0', 'buf1', 'buf2']""")
+        self.assertExpectedInline(reads, """['buf1', 'buf2']""")
         self.assertExpectedInline(writes, """['buf0', 'buf3', 'buf4']""")
         self.assertExpectedInline(
-            renames, """{'buf0': 'accum', 'buf3': 'tmp_1', 'buf4': 'tmp_2'}"""
+            renames,
+            """{'accum': 'buf0', 'tmp_0': 'buf0', 'buf1': 'buf1', 'buf2': 'buf2', 'D': 'buf3', 'tmp_3': 'buf4'}""",
         )
         self.assertExpectedInline(
             code,
             """\
 def fn(accum, buf1, buf2):
-    D = accum # cutlass evt requirement
-    tmp_0 = accum * buf1
-    tmp_1 = tmp_0 + buf2
-    tmp_2 = accum + tmp_1
+    tmp_0 = accum
+    tmp_1 = tmp_0 * buf1
+    tmp_2 = tmp_1 + buf2
+    D = tmp_2 # cutlass evt requirement
+    tmp_3 = tmp_0 + D
 
-    return D, tmp_1, tmp_2""",
+    return tmp_0, D, tmp_3""",
         )
 
     @unittest.skipIf(not SM90OrLater, "need sm_90")
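
The new expected program threads the accumulator through `tmp_0` and returns it alongside `D`; since the generated EVT code is plain Python over tensors, the expectation can be sanity-checked standalone (illustrative values, not part of the test):

    import torch

    def fn(accum, buf1, buf2):
        tmp_0 = accum
        tmp_1 = tmp_0 * buf1
        tmp_2 = tmp_1 + buf2
        D = tmp_2  # cutlass evt requirement
        tmp_3 = tmp_0 + D
        return tmp_0, D, tmp_3

    accum, buf1, buf2 = torch.ones(2, 2), torch.full((2, 2), 3.0), torch.ones(2, 2)
    tmp_0, D, tmp_3 = fn(accum, buf1, buf2)
    assert torch.equal(D, accum * buf1 + buf2)  # 1*3 + 1 = 4
    assert torch.equal(tmp_3, accum + D)        # 1 + 4 = 5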
@@ -201,7 +204,9 @@ def inner_fn_buf4(index):
         result = None
         try:
             CutlassEVTCodegen.ir_to_evt_python_code(
-                "buf0", [MockSchedulerNode(buf3), MockSchedulerNode(buf4)]
+                "buf0",
+                [MockSchedulerNode(buf3), MockSchedulerNode(buf4)],
+                OrderedSet([]),
             )
         except NotImplementedError as e:
             result = e
@@ -251,23 +256,26 @@ def inner_fn_buf4(index):
                 MockSchedulerNode(buf3),
                 MockSchedulerNode(buf4, last_usage=OrderedSet(["buf0"])),
             ],
+            OrderedSet([]),
         )
-        self.assertExpectedInline(reads, """['buf0', 'buf1', 'buf2']""")
-        self.assertExpectedInline(writes, """['buf3', 'buf4']""")
+        self.assertExpectedInline(reads, """['buf1', 'buf2']""")
+        self.assertExpectedInline(writes, """['buf0', 'buf3', 'buf4']""")
         self.assertExpectedInline(
-            renames, """{'buf3': 'D', 'buf4': 'tmp_3', 'buf0': 'accum'}"""
+            renames,
+            """{'accum': 'buf0', 'tmp_0': 'buf0', 'buf1': 'buf1', 'buf2': 'buf2', 'D': 'buf3', 'tmp_4': 'buf4'}""",
         )
         self.assertExpectedInline(
             code,
             """\
 def fn(accum, buf1, buf2):
-    tmp_0 = accum * buf1
-    tmp_1 = tmp_0 + buf2
-    D = tmp_1 # cutlass evt requirement
-    tmp_2 = D * D
-    tmp_3 = accum + tmp_2
-
-    return D, tmp_3""",
+    tmp_0 = accum
+    tmp_1 = tmp_0 * buf1
+    tmp_2 = tmp_1 + buf2
+    D = tmp_2 # cutlass evt requirement
+    tmp_3 = D * D
+    tmp_4 = tmp_0 + tmp_3
+
+    return tmp_0, D, tmp_4""",
         )
 
     @unittest.skipIf(not SM90OrLater, "need sm_90")
@@ -305,13 +313,15 @@ def inner_fn_buf4(index):
             "buf0",
             [
                 MockSchedulerNode(buf3),
-                MockSchedulerNode(buf4, last_usage=OrderedSet(["buf0"])),
+                MockSchedulerNode(buf4),
             ],
+            OrderedSet(["buf0"]),
         )
-        self.assertExpectedInline(reads, """['buf0', 'buf1', 'buf2']""")
+        self.assertExpectedInline(reads, """['buf1', 'buf2']""")
         self.assertExpectedInline(writes, """['buf3', 'buf4']""")
         self.assertExpectedInline(
-            renames, """{'buf3': 'D', 'buf4': 'tmp_2', 'buf0': 'accum'}"""
+            renames,
+            """{'accum': 'buf0', 'buf1': 'buf1', 'buf2': 'buf2', 'D': 'buf3', 'tmp_2': 'buf4'}""",
         )
         self.assertExpectedInline(
             code,
@@ -338,13 +348,9 @@ def test_example_tensor_creation(self):
         col_major_buf1 = MockComputedBuffer(
             "buf1", None, torch.float32, (3, 2, 1), (1, 3, 0)
         )
-        read_names = ["buf0"]
-        write_names = ["buf1"]
-        buffer_renames = {"buf0": "acc"}
+        buffer_renames = {"buf0": "buf0", "buf1": "buf1", "acc": "buf0"}
         name_to_buffer = {"buf0": row_major_buf0, "buf1": col_major_buf1}
-        result = create_example_tensors(
-            read_names, write_names, buffer_renames, name_to_buffer
-        )
+        result = create_example_tensors(buffer_renames, name_to_buffer)
         self.assertEqual(result["acc"].shape, (3, 4, 1))
         self.assertEqual(result["acc"].stride, (4, 1, 0))
         self.assertEqual(
@@ -360,7 +366,10 @@ def test_example_tensor_creation(self):
     @unittest.skipIf(not SM90OrLater, "need sm_90")
     @unittest.skipIf(not try_import_cutlass(), "requires cutlass")
     def test_evt_argument_codegen(self):
-        epilogue_functor = _trace(BIAS_CODE, EXAMPLE_TENSORS)
+        from torch._inductor.codegen.cuda.cuda_env import get_cuda_arch
+
+        cuda_arch = int(get_cuda_arch())  # type: ignore[arg-type]
+        epilogue_functor = _trace(BIAS_CODE, EXAMPLE_TENSORS, cuda_arch)
 
         self.assertExpectedInline(
             _render_argument_type(
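
`get_cuda_arch()` returns the target compute capability as a string, or possibly None when it cannot be determined, hence the `int(...)` coercion and the `type: ignore[arg-type]`. For example:

    from torch._inductor.codegen.cuda.cuda_env import get_cuda_arch

    arch = get_cuda_arch()  # e.g. "90" on SM90; may be None
    if arch is not None:
        print(int(arch))    # 90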
@@ -388,6 +397,51 @@ def test_evt_argument_codegen(self):
 """,
         )
 
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
+    @unittest.skipIf(not try_import_cutlass(), "requires cutlass")
+    def test_evt_argument_codegen_return_accumulator(self):
+        from torch._inductor.codegen.cuda.cuda_env import get_cuda_arch
+
+        code = """
+def fn(accum, bias):
+    E = accum
+    D = E + bias
+    return D, E
+"""
+        example_tensors = {
+            "accum": CutlassTensor(
+                element=DataType.f32, shape=(M, N), layout_tag=LayoutType.RowMajor
+            ),
+            "bias": BIAS,
+            # "beta": 0.5, TODO: mlazos support scalars
+            # "alpha": 0.5, TODO: mlazos support scalars
+            "D": CutlassTensor(
+                element=DataType.f32, shape=(M, N), layout_tag=LayoutType.RowMajor
+            ),
+            "E": CutlassTensor(
+                element=DataType.f32, shape=(M, N), layout_tag=LayoutType.RowMajor
+            ),
+        }
+
+        cuda_arch = int(get_cuda_arch())  # type: ignore[arg-type]
+        epilogue_functor = _trace(code, example_tensors, cuda_arch)
+
+        self.assertExpectedInline(
+            _render_argument_type(
+                epilogue_functor, _create_mock_buffer_name_map(example_tensors)
+            ),
+            """\
+{ /* thread */
+  { /* E */
+    {}, /* accum */
+    {/* ptr_aux */ (float*) E, /* dAux */ {2048, _1{}, _0{}}}, /* E */
+  },
+  {/* ptr_col */ (float*) bias, /* null_default */ float(0), /* dCol */ {}}, /* bias */
+  {}, /* compute_0 */
+}
+""",
+        )
+
     @unittest.skipIf(not SM90OrLater, "need sm_90")
     @unittest.skipIf(not try_import_cutlass(), "requires cutlass")
     def test_evt_codegen(self):
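
In the rendered arguments, `dAux` `{2048, _1{}, _0{}}` reads as the (row, column, batch) strides of the auxiliary output `E`: a row-major (M, N) tensor with N = 2048 has strides (2048, 1), and the batch mode is statically zero. A quick check of that correspondence (illustrative M; only N is pinned by the expected output above):

    import torch

    E = torch.empty(16, 2048)       # any M; N = 2048 per the rendered dAux
    assert E.stride() == (2048, 1)  # matches {2048, _1{}, _0{}} up to the batch mode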
