[Cutlass] E2E Tests for EVT · pytorch/pytorch@88ed8eb · GitHub

Commit 88ed8eb
[Cutlass] E2E Tests for EVT
ghstack-source-id: 58d2daf
Pull Request resolved: #152815
1 parent 0104ac0 commit 88ed8eb

File tree: 11 files changed, +196 −32 lines changed

test/inductor/test_cutlass_backend.py

Lines changed: 111 additions & 0 deletions
@@ -62,6 +62,34 @@ def _get_path_without_sccache() -> str:
     return ":".join(path_envs)
 
 
+un_ops_under_test = [torch.relu, torch.sigmoid, torch.tanh]
+bin_ops_under_test = [torch.add, torch.mul, torch.sub, torch.div]
+
+evt_all_ops = parametrize(
+    "op", un_ops_under_test + bin_ops_under_test, name_fn=lambda f: f.__name__
+)
+
+
+def gen_args(op, shape):
+    if op in bin_ops_under_test:
+        return (torch.rand(*shape, device="cuda:0").half(),)
+    else:
+        return ()
+
+
+use_evt_config = config.patch(
+    {
+        "max_autotune": True,
+        "max_autotune_gemm_backends": "CUTLASS",
+        "cuda.cutlass_max_profiling_configs": 1,
+        "autotune_fallback_to_aten": False,
+        "benchmark_epilogue_fusion": False,  # EVT doesn't support benchmark fusion yet
+        "cuda.cutlass_tma_only": True,
+        "cuda.cutlass_epilogue_fusion_enabled": True,
+    }
+)
+
+
 @instantiate_parametrized_tests
 class TestCutlassBackend(TestCase):
     def setUp(self):
@@ -1316,6 +1344,35 @@ def forward(self, B):
         ):
             _ = torch.compile(model)(B)
 
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
+    @mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
+    @config.patch(
+        {"benchmark_epilogue_fusion": False, "cuda.cutlass_tma_only": True}
+    )  # EVT doesn't support benchmark fusion yet
+    def test_evt_flexible_layout(self):
+        class TestModel(torch.nn.Module):
+            def forward(self, B):
+                A = torch.zeros_like(B)
+                return (A @ B).relu()
+
+        M = 1024
+        B = torch.randn(M, M).cuda().half()
+        model = TestModel().cuda()
+
+        with config.patch(
+            {
+                "max_autotune": True,
+                "max_autotune_gemm_backends": "CUTLASS",
+                "cuda.cutlass_max_profiling_configs": 1,
+                "autotune_fallback_to_aten": False,
+            }
+        ):
+            _ = torch.compile(model)(B)
+
+        self.assertEqual(
+            torch._dynamo.utils.counters["inductor"]["cuda_epilogue_fusion_counter"], 1
+        )
+
     @unittest.skipIf(not SM90OrLater, "need sm_90")
     @mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
     def test_filtered_ops_cache(self):
@@ -1359,6 +1416,60 @@ def test_compilation_time(self):
         _ = torch.compile(torch.mm)(A, B)
         self.assertTrue(time.time() - start_time < 50)
 
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
+    @use_evt_config
+    @evt_all_ops
+    def test_evt_fusions_basic(self, op):
+        class TestModel(torch.nn.Module):
+            def forward(self, a, b, extra_args):
+                res = (a @ b).relu()  # extra activation to avoid hitting the addmm path
+                return op(res, *extra_args)
+
+        M = 16
+        N = 16
+        a = torch.ones(M, N).cuda().half()
+        b = torch.ones(N, N).cuda().half()
+        extra_args = gen_args(op, (M, N))
+        model = TestModel().cuda()
+
+        result = torch.compile(model)(a, b, extra_args)
+        ref_result = model(a, b, extra_args)
+
+        self.assertEqual(
+            torch._dynamo.utils.counters["inductor"]["cuda_epilogue_fusion_counter"], 1
+        )
+        torch.testing.assert_close(result, ref_result)
+
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
+    @use_evt_config
+    @evt_all_ops
+    def test_evt_broadcasting(self, op):
+        pass
+
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
+    @use_evt_config
+    @evt_all_ops
+    def test_evt_mixed_dtypes(self, op):
+        pass
+
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
+    @use_evt_config
+    @evt_all_ops
+    def test_evt_multi_op(self, op):
+        pass
+
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
+    @use_evt_config
+    @evt_all_ops
+    def test_evt_multi_output(self, op):
+        pass
+
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
+    @use_evt_config
+    @evt_all_ops
+    def test_evt_return_accumulator(self, op):
+        pass
+
 
 if __name__ == "__main__":
     from torch._inductor.utils import is_big_gpu
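
For readers reproducing these tests outside the suite, the pieces above compose into a small standalone script. This is a hedged sketch, assuming an SM90+ GPU and a CUTLASS-enabled inductor build; `fused_mm_relu_add` is a made-up function name, and the config keys are the ones set by `use_evt_config` above.

import torch
from torch._inductor import config

def fused_mm_relu_add(a, b, c):
    # GEMM with a relu + add epilogue that EVT can fuse into the kernel
    return torch.relu(a @ b) + c

a = torch.ones(16, 16, device="cuda", dtype=torch.half)
b = torch.ones(16, 16, device="cuda", dtype=torch.half)
c = torch.rand(16, 16, device="cuda", dtype=torch.half)

with config.patch(
    {
        "max_autotune": True,
        "max_autotune_gemm_backends": "CUTLASS",
        "cuda.cutlass_max_profiling_configs": 1,
        "benchmark_epilogue_fusion": False,
        "cuda.cutlass_tma_only": True,
        "cuda.cutlass_epilogue_fusion_enabled": True,
    }
):
    out = torch.compile(fused_mm_relu_add)(a, b, c)

# mirrors the counter assertion in the tests: one fused epilogue expected
print(torch._dynamo.utils.counters["inductor"]["cuda_epilogue_fusion_counter"])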

test/inductor/test_cutlass_evt.py

Lines changed: 4 additions & 1 deletion
@@ -360,7 +360,10 @@ def test_example_tensor_creation(self):
     @unittest.skipIf(not SM90OrLater, "need sm_90")
     @unittest.skipIf(not try_import_cutlass(), "requires cutlass")
     def test_evt_argument_codegen(self):
-        epilogue_functor = _trace(BIAS_CODE, EXAMPLE_TENSORS)
+        from torch._inductor.codegen.cuda.cuda_env import get_cuda_arch
+
+        cuda_arch = int(get_cuda_arch())  # type: ignore[arg-type]
+        epilogue_functor = _trace(BIAS_CODE, EXAMPLE_TENSORS, cuda_arch)
 
         self.assertExpectedInline(
             _render_argument_type(
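
The test now resolves the GPU's compute capability itself and passes it into `_trace` explicitly. A minimal sketch of that resolution step, under the assumption (consistent with the `type: ignore[arg-type]` above) that `get_cuda_arch` returns a string or None:

from torch._inductor.codegen.cuda.cuda_env import get_cuda_arch

arch = get_cuda_arch()                        # e.g. "90" on Hopper, None without CUDA
cuda_arch = int(arch) if arch is not None else 0
assert cuda_arch >= 90, "Only SM90+ is supported for EVT"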

torch/_inductor/codegen/cuda/cuda_cpp_scheduling.py

Lines changed: 3 additions & 1 deletion
@@ -201,7 +201,6 @@ def _can_fuse_epilogue_impl(
         - bool: True if the given node can be fused with the epilogue, False otherwise.
 
     """
-
     why = WhyNoFuseNames(cuda_template_buffer.get_name(), node_to_fuse.get_name())
 
     ir_node_to_fuse = node_to_fuse.node
@@ -244,6 +243,9 @@ def _can_fuse_epilogue_impl(
     ):
         why("cutlass epilogue fusion is not enabled")
         return False
+    elif not cuda_template_buffer.supports_epilogue_fusion:
+        why("epilogue fusion is only supported for TMA-enabled gemm ops")
+        return False
 
     try:
         from torch._inductor.codegen.cuda.cutlass_python_evt import (
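
The new branch consults a flag computed once at autotune time (see the `cuda_template.py` and `cuda_kernel.py` hunks below) rather than re-deriving TMA support during scheduling. A rough sketch of the shape of this check, with `WhyStub` a made-up stand-in for `WhyNoFuseNames`:

class WhyStub:
    def __call__(self, reason: str) -> None:
        print(f"cannot fuse: {reason}")  # the real class records why-no-fuse reasons

def can_fuse_epilogue_sketch(template_buffer, epilogue_enabled: bool) -> bool:
    why = WhyStub()
    if not epilogue_enabled:
        why("cutlass epilogue fusion is not enabled")
        return False
    elif not template_buffer.supports_epilogue_fusion:
        why("epilogue fusion is only supported for TMA-enabled gemm ops")
        return False
    return True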

torch/_inductor/codegen/cuda/cuda_kernel.py

Lines changed: 11 additions & 8 deletions
@@ -12,6 +12,7 @@
 from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu
 from torch._inductor.scheduler import BaseSchedulerNode
 from torch._inductor.utils import Placeholder
+from torch.utils._sympy.value_ranges import ValueRanges
 
 
 if TYPE_CHECKING:
@@ -30,6 +31,7 @@
 from ...utils import sympy_product
 from ...virtualized import V
 from ..common import (
+    CSEVariable,
     IndentedBuffer,
     Kernel,
     OpOverrides,
@@ -238,7 +240,6 @@ def def_kernel(
         inputs: list[IRNode],
         outputs: list[IRNode],
         epilogue_inputs: list[IRNode],
-        epilogue_outputs: list[IRNode],
         names_str: str = "",
         input_reorder: Optional[list[int]] = None,
     ) -> str:
@@ -285,13 +286,6 @@ def def_kernel(
             self.named_nodes[name] = node
             self.args.output_buffers[node.get_name()] = name
 
-        for epilogue_output in epilogue_outputs:
-            if epilogue_output is not None:
-                self.named_nodes[epilogue_output.get_name()] = epilogue_output
-                self.args.output_buffers[epilogue_output.get_name()] = (
-                    epilogue_output.get_name()
-                )
-
         arg_defs, *_ = self.args.cpp_argdefs()
 
         self.init_layout_args()
@@ -540,6 +534,12 @@ def row_or_column_stride(self, node: IRNode, default_value: int = 0) -> str:
                 f"At least 1 stride should be 1. Strides: {node.get_stride()=}"
             )
 
+    def load(self, name: str, index: Expr, mode: Any = None) -> CSEVariable:
+        """
+        Mock load function for memory planning to optimize allocations properly.
+        """
+        return self.create_cse_var(name, bounds=ValueRanges.unknown())
+
     def store(self, name: str, index: Expr, value: Any, mode: Any = None) -> None:
         """
         Mock store function for memory planning to optimize allocations properly.
@@ -570,6 +570,7 @@ def __init__(
             tuple[CUDATemplateKernel, functools.partial[str]],
         ],
         bmreq: CUDABenchmarkRequest,
+        supports_epilogue_fusion: bool,
         template: "CUDATemplate",  # type: ignore[name-defined]
         info_kwargs: Optional[
             dict[str, Union[PrimitiveInfoType, list[PrimitiveInfoType]]]
@@ -580,6 +581,7 @@ def __init__(
         self.category = category
         self.make_kernel_render = make_kernel_render
         self.bmreq = bmreq
+        self.supports_epilogue_fusion = supports_epilogue_fusion
         self.template = template
         self.info_kwargs = info_kwargs
 
@@ -636,6 +638,7 @@ def output_node(self) -> TensorBox:
                 inputs=self.input_nodes,
                 make_kernel_render=self.make_kernel_render,
                 workspace_size=self.bmreq.workspace_size,
+                supports_epilogue_fusion=self.supports_epilogue_fusion,
                 template=self.template,
             )
         )
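
The new `load` is the counterpart of the pre-existing mock `store`: during memory planning the epilogue body is replayed through the kernel's ops handler, and returning a CSE variable with unknown bounds lets buffer reads be tracked without emitting device code. A stripped-down sketch of that idea, with `PlanningHandler` invented for illustration:

from torch.utils._sympy.value_ranges import ValueRanges

class PlanningHandler:
    """Records which buffers an epilogue reads/writes; generates nothing."""

    def __init__(self):
        self.reads: set[str] = set()
        self.writes: set[str] = set()

    def load(self, name, index):
        self.reads.add(name)
        return (name, ValueRanges.unknown())  # value is opaque to planning

    def store(self, name, index, value):
        self.writes.add(name)  # only the allocation footprint matters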

torch/_inductor/codegen/cuda/cuda_template.py

Lines changed: 17 additions & 0 deletions
@@ -26,6 +26,7 @@
 else:
     BaseSchedulerNode = Any
 
+GemmOperation = Any
 
 autotuning_log = getArtifactLogger(__name__, "autotuning")
 
@@ -63,6 +64,10 @@ def __init__(
         self.input_reorder = input_reorder
         self.layout = layout
 
+    @staticmethod
+    def supports_epilogue_fusion(op: GemmOperation) -> bool:
+        return False
+
     def generate(  # type: ignore[override]
         self,
         description,
@@ -126,10 +131,21 @@ def generate(  # type: ignore[override]
             source_code=code,
         )
 
+        # kwargs has "op" argument in case of CUTLASSGemmTemplate
+        op = kwargs["op"]
+        if not op:
+            supports_epilogue_fusion = False
+        else:
+            # epilogue fusion is only supported for TMA kernels
+            supports_epilogue_fusion = self.supports_epilogue_fusion(op)
+
         def make_kernel_render(
             template_node: CUDATemplateBuffer,
             epilogue_nodes: Optional[list[BaseSchedulerNode]] = None,
         ) -> tuple[CUDATemplateKernel, functools.partial[str]]:
+            assert supports_epilogue_fusion or not epilogue_nodes, (
+                "epilogue fusion is not supported for this kernel"
+            )
             kernel = CUDATemplateKernel(
                 kernel_name=str(Placeholder.KERNEL_NAME),
                 runtime_arg_info=self.get_runtime_arg_info(),
@@ -151,6 +167,7 @@ def make_kernel_render(
             self.output_node.get_layout(),
             make_kernel_render,
             bmreq,
+            supports_epilogue_fusion,
             self,
             kwargs,
             description,
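
`CUDATemplate.supports_epilogue_fusion` defaults to False, so a concrete template must opt in per op. A hedged sketch of the override point; `MyTmaGemmTemplate` and its `uses_tma` predicate are made up, since the real TMA check lives in the gemm template subclass:

class MyTmaGemmTemplate(CUDATemplate):
    @staticmethod
    def supports_epilogue_fusion(op) -> bool:
        # assumption for illustration: only TMA-scheduled SM90 ops qualify
        return getattr(op, "uses_tma", False)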

torch/_inductor/codegen/cuda/cutlass_lib_extensions/evt_extensions.py

Lines changed: 2 additions & 2 deletions
@@ -119,7 +119,7 @@ def trace(
 ) -> tuple[str, str, str]:
     cuda_arch = int(cuda_env.get_cuda_arch())  # type: ignore[arg-type]
     assert cuda_arch >= 90, "Only SM90+ is supported for EVT"
-    epilogue_functor = _trace(fn_src, example_tensors, **kwargs)
+    epilogue_functor = _trace(fn_src, example_tensors, cuda_arch, **kwargs)
     visitor = EpilogueFunctorVisitor(cuda_arch, epilogue_functor)
     fusion_callbacks = FusionCallbacks(visitor.graph, cuda_arch, emit_CD=False)
     collective_epilogue = CollectiveEpilogue(
@@ -138,7 +138,7 @@ def trace(
 # This is modified to enable directly passing the source code of the epilogue vs getting it from a bona fide Python function.
 # The reason is that inspect.getsource does not work with functions defined at runtime via exec/eval.
 def _trace(
-    fn_src: str, example_tensors: dict[str, CutlassTensor], **kwargs: Any
+    fn_src: str, example_tensors: dict[str, CutlassTensor], cc: int, **kwargs: Any
 ) -> EpilogueFunctor:
     class EpilogueFunctor(PythonASTFrontend):
         def __init__(self, cc: int, **kwargs: Any):
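
Because `_trace` builds an `EpilogueFunctor` whose constructor needs the compute capability, `cc` now rides along as an explicit positional argument instead of being resolved inside. A tiny sketch of the plumbing, with `FrontendStub` standing in for cutlass's `PythonASTFrontend`:

class FrontendStub:
    def __init__(self, cc: int, **kwargs):
        self.cc = cc  # compute capability, e.g. 90

def _trace_sketch(fn_src: str, example_tensors: dict, cc: int, **kwargs):
    functor = FrontendStub(cc, **kwargs)
    # ... parse fn_src and trace it against example_tensors ...
    return functor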

torch/_inductor/codegen/cuda/cutlass_python_evt.py

Lines changed: 2 additions & 2 deletions
@@ -68,15 +68,15 @@ def relu(x0: str) -> str:
 
     @staticmethod
     def sigmoid(x0: str) -> str:
-        return CutlassEVTOpsMixIn._prefix_un_op("sigmoid", x0)
+        raise NotImplementedError("sigmoid is not supported in CUTLASS python evt")
 
     @staticmethod
     def sub(x0: str, x1: str) -> str:
         return CutlassEVTOpsMixIn._infix_bin_op("-", x0, x1)
 
     @staticmethod
     def tanh(x0: str) -> str:
-        return CutlassEVTOpsMixIn._prefix_un_op("tanh", x0)
+        raise NotImplementedError("tanh is not supported in CUTLASS python evt")
 
 
 class MockCutlassHandler(CutlassEVTOpsMixIn, WrapperHandler):
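
Raising `NotImplementedError` from an op handler makes unsupported ops fail loudly while fusibility is being decided; presumably the `try` block visible in the `cuda_cpp_scheduling.py` hunk above is what converts such a failure into a declined fusion rather than bad EVT code. A small sketch of that contract, with invented names:

class EvtOpsStub:
    @staticmethod
    def tanh(x0: str) -> str:
        raise NotImplementedError("tanh is not supported in CUTLASS python evt")

def op_is_fusible(op_name: str) -> bool:
    handler = getattr(EvtOpsStub, op_name, None)
    if handler is None:
        return False
    try:
        handler("accum")
        return True
    except NotImplementedError:
        return False  # fall back to an unfused epilogue

assert op_is_fusible("tanh") is False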
