from abc import ABC, abstractmethod
from typing import Any, Optional, Union

+import torch
+from torch._inductor.scheduler import BaseSchedulerNode
from torch._inductor.select_algorithm import create_inputs_key
from torch._inductor.utils import clear_on_fresh_inductor_cache

    Layout,
    ReinterpretView,
)
-from ...utils import is_dynamic
+from ...utils import is_dynamic, OrderedSet
from ...virtualized import V
from ..common import IndentedBuffer
from . import cutlass_utils
from .cuda_kernel import CUDATemplateKernel
from .cuda_template import CUTLASSTemplate
from .cutlass_presets import gen_cutlass_presets
+from .cutlass_python_evt import CutlassEVTCodegen
+from .cutlass_utils import torch_dtype_to_cutlass_type


+GemmOperation = Any
+
log = logging.getLogger(__name__)

# Jinja template for GEMM Kernel, used by the CUTLASSGemm3xTemplate class below.
GEMM_TEMPLATE_CUTLASS_3X = r"""
{{template.header().getvalue()}}
{{template.globals().getvalue()}}
+{{epilogue_visitor_tree}}
{{instance_definition}}
// When workspace_size is not a nullptr, populates requested workspace_size and returns.
// Otherwise, computes the Gemm kernel using the given workspace ptr.
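
The new {{epilogue_visitor_tree}} placeholder sits between the template globals and the instance definition, so when no epilogue is fused the rendered C++ simply gets an empty line there. A minimal Jinja sketch of that behaviour, using made-up placeholder values rather than the real GEMM template:

from jinja2 import Template

# Two of the real placeholder names, but toy values: this only shows how an
# empty vs. populated epilogue_visitor_tree renders.
snippet = Template("{{epilogue_visitor_tree}}\n{{instance_definition}}")

print(snippet.render(epilogue_visitor_tree="", instance_definition="struct GemmInstance {};"))
# -> a blank line followed by the instance definition (non-fused path)

print(snippet.render(
    epilogue_visitor_tree="using EVT = /* generated visitor tree */;",
    instance_definition="struct GemmInstance {};",
))
# -> the visitor-tree declarations are emitted ahead of the instance definition
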
@@ -495,7 +502,8 @@ def _set_bias_layout_and_alignment(
    @abstractmethod
    def _define_gemm_instance(
        self,
-        op: "cutlass_library.gemm_op.GemmOperation",  # type: ignore[name-defined]  # noqa: F821
+        op: GemmOperation,
+        evt_name: Optional[str] = None,
    ) -> tuple[str, str]:
        raise NotImplementedError

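Giving evt_name a None default keeps existing _define_gemm_instance(op) call sites valid while letting the fused-epilogue path thread the traced EVT name through to the emitter. A minimal sketch of that pattern with hypothetical stand-in classes (not the real emitters):

from abc import ABC, abstractmethod
from typing import Optional


class GemmTemplateBase(ABC):  # hypothetical stand-in for CUTLASSGemmTemplate
    @abstractmethod
    def _define_gemm_instance(
        self, op: object, evt_name: Optional[str] = None
    ) -> tuple[str, str]:
        raise NotImplementedError


class Gemm3x(GemmTemplateBase):  # hypothetical stand-in for CUTLASS3xGemmTemplate
    def _define_gemm_instance(self, op, evt_name=None):
        # Only the fused-epilogue path supplies evt_name; plain GEMMs omit it.
        suffix = f"  // uses EVT {evt_name}" if evt_name else ""
        return f"struct GemmInstance {{}};{suffix}", "GemmInstance"


print(Gemm3x()._define_gemm_instance(object()))            # non-fused path, call site unchanged
print(Gemm3x()._define_gemm_instance(object(), "my_evt"))   # fused path passes the EVT name
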
@@ -965,6 +973,7 @@ def render(  # type: ignore[override]
        kernel: CUDATemplateKernel,
        op: "cutlass_gemm_op.GemmOperation" = None,  # type: ignore[name-defined]  # noqa: F821
        template_buffer_node: Optional[CUDATemplateBuffer] = None,
+        epilogue_nodes: Optional[list[BaseSchedulerNode]] = None,
        **kwargs,
    ) -> str:
        """
@@ -995,6 +1004,11 @@ def render(  # type: ignore[override]
                "op argument is required and has to be an instance of GemmOperation"
            )

+        if epilogue_nodes and not self._has_tma_epilogue(op):
+            raise NotImplementedError(
+                "Non-TMA epilogue visitor tree is not supported in Cutlass."
+            )
+
        assert len(self.input_nodes) >= 2 and self.output_node is not None
        X, W = self.input_nodes[0], self.input_nodes[1]
        for input_node in self.input_nodes:
@@ -1017,15 +1031,7 @@ def render(  # type: ignore[override]
            input_reorder = self.input_reorder
        else:
            input_reorder = None
-        kernel_call_signature = kernel.def_kernel(
-            inputs=inputs,  # type: ignore[arg-type]
-            outputs=[Y],
-            names_str=names_str,
-            input_reorder=input_reorder,
-            epilogue_inputs=[],  # TODO mlazos: will be filled in in https://github.com/pytorch/pytorch/pull/150907
-            epilogue_outputs=[],  # TODO mlazos: will be filled in in https://github.com/pytorch/pytorch/pull/150907
-        )
-        test_call_statement = self.test_call_statement(kernel, inputs, names_str)
+
        # The layouts might have changed between autotuning and this call if they were FlexibleLayout
        # we need to adapt, which might lead to suboptimal performance.
        op = self.fix_op_layout(op, X, W, Bias, Y)
@@ -1040,7 +1046,6 @@ def render(  # type: ignore[override]

        argument_template, epilogue_template = self._get_template_args(op)
        should_swap_xw: bool = False
-        epilogue_args = f"{{ElementComputeEpilogue({self.alpha}), ElementComputeEpilogue({self.beta})}}"
        if Bias is not None and self._has_tma_epilogue(op):
            if (
                op.epilogue_schedule
@@ -1051,7 +1056,45 @@ def render(  # type: ignore[override]
                op = self.swap_XW(op)
                should_swap_xw = True

-        instance_definition, instance_type = self._define_gemm_instance(op)
+        if epilogue_nodes:
+            evt_read_names, evt_write_names, buffer_renames, evt_py_code = (
+                CutlassEVTCodegen.ir_to_evt_python_code(Y.get_name(), epilogue_nodes)
+            )
+            read_names = OrderedSet(evt_read_names) - OrderedSet(evt_write_names)
+            write_names = OrderedSet(evt_write_names)
+
+            name_to_buffer = V.graph.name_to_buffer | V.graph.graph_inputs
+            epilogue_inputs = [name_to_buffer[name] for name in read_names]
+            epilogue_outputs = [name_to_buffer[name] for name in write_names]
+
+            evt_name, evt_args, evt_code = self._render_evt(
+                op,
+                evt_py_code,
+                evt_read_names,
+                evt_write_names,
+                buffer_renames,
+                Y.get_layout().dtype,
+                W.get_layout().dtype,
+            )
+        else:
+            evt_name = None
+            epilogue_inputs = []
+            epilogue_outputs = []
+            evt_args = f"{{ElementComputeEpilogue({self.alpha}), ElementComputeEpilogue({self.beta})}}"
+            evt_code = ""
+
+        kernel_call_signature = kernel.def_kernel(
+            inputs=inputs,  # type: ignore[arg-type]
+            outputs=[Y],
+            epilogue_inputs=epilogue_inputs,
+            epilogue_outputs=epilogue_outputs,
+            names_str=names_str,
+            input_reorder=input_reorder,
+        )
+
+        test_call_statement = self.test_call_statement(kernel, inputs, names_str)
+
+        instance_definition, instance_type = self._define_gemm_instance(op, evt_name)

        options = dict(
            alpha=self.alpha,
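
The OrderedSet subtraction above is what keeps intermediate epilogue buffers out of the kernel signature: a buffer that the fused epilogue both writes and later reads stays internal to the visitor tree, and only the remaining reads become extra kernel inputs. A small sketch of that bookkeeping with hypothetical buffer names (plain lists stand in for the real IR buffers):

# Everything the fused epilogue loads / stores, in the order codegen saw them.
evt_read_names = ["buf0", "arg_bias", "buf2"]
evt_write_names = ["buf2", "buf3"]

# buf2 is produced and then consumed inside the epilogue, so it is not an
# external input and never reaches def_kernel's epilogue_inputs.
epilogue_input_names = [n for n in evt_read_names if n not in set(evt_write_names)]
epilogue_output_names = list(evt_write_names)

assert epilogue_input_names == ["buf0", "arg_bias"]
assert epilogue_output_names == ["buf2", "buf3"]
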
@@ -1069,9 +1112,10 @@ def render(  # type: ignore[override]
            instance_definition=instance_definition,
            instance_type=instance_type,
            input_reorder=self.input_reorder,
-            epilogue_args=epilogue_args,
+            epilogue_args=evt_args,
            test_call_statement=test_call_statement,
            op_conf_name=op.configuration_name(),
+            epilogue_visitor_tree=evt_code,
        )
        options.update(dict(zip(extra_names, extra_inputs)))
        res = self._template_from_string(self._get_template()).render(**options)
@@ -1106,8 +1150,25 @@ def test_call_statement(
        ]
        return f"{kernel.kernel_name}({', '.join(arguments)}, M, N, K, B, lda, ldb, ldc, ldd, swizzle, workspace_size_ptr, (uint8_t*)workspace_data.get(), 0);"  # noqa: B950

+    def _render_evt(
+        self,
+        op: GemmOperation,
+        evt_py_code: str,
+        read_names: list[str],
+        write_names: list[str],
+        buffer_renames: dict[str, str],
+        output_dtype: torch.dtype,
+        accumulator_dtype: torch.dtype,
+    ) -> tuple[str, str, str]:  # type: ignore[name-defined]  # noqa: F821
+        raise NotImplementedError("_render_evt in CUTLASSGemmTemplate not implemented")
+

class CUTLASS3xGemmTemplate(CUTLASSGemmTemplate):
+    """
+    CUTLASS 3x GEMM Template, which is used to generate CUTLASS GEMM kernels
+    including those which allow flexible fusions with epilogues.
+    """
+
    def __init__(
        self,
        input_nodes: list[Buffer],
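
For context, the "flexible fusions with epilogues" mentioned in the new docstring are GEMM-plus-pointwise patterns of roughly the following shape; whether a given graph actually takes this EVT path depends on the CUTLASS backend being chosen during autotuning. Illustrative example only, not code from this PR:

import torch


@torch.compile
def gemm_with_epilogue(x, w, bias):
    # The matmul maps onto the CUTLASS GEMM template; the bias add and relu are
    # the kind of pointwise tail an epilogue visitor tree can fuse into the kernel.
    return torch.relu(x @ w + bias)
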
@@ -1239,6 +1300,43 @@ def _are_inputs_layout_compatible(self, layouts: list[Layout]) -> bool:
                return False
        return True

+    def _render_evt(
+        self,
+        op: GemmOperation,
+        evt_py_code: str,
+        read_names: list[str],
+        write_names: list[str],
+        buffer_renames: dict[str, str],
+        output_dtype: torch.dtype,
+        accumulator_dtype: torch.dtype,
+    ) -> tuple[str, str, str]:  # type: ignore[name-defined]  # noqa: F821
+        from .cutlass_lib_extensions.evt_extensions import create_example_tensors, trace
+
+        name_to_buffer = V.graph.name_to_buffer | V.graph.graph_inputs
+
+        acc_dtype = torch_dtype_to_cutlass_type(accumulator_dtype)
+        output_dtype = torch_dtype_to_cutlass_type(output_dtype)
+        evt_name, evt_args, evt_code = trace(
+            evt_py_code,
+            create_example_tensors(
+                read_names,
+                write_names,
+                buffer_renames,
+                name_to_buffer,  # type: ignore[arg-type]
+            ),
+            acc_dtype,
+            output_dtype,
+            op.tile_description,  # type: ignore[attr-defined]
+            op.epilogue_schedule,  # type: ignore[attr-defined]
+            name_to_buffer,  # type: ignore[arg-type]
+        )
+
+        return (
+            evt_name,
+            evt_args,
+            evt_code,
+        )
+
    def _shape_match(
        self,
        op: "cutlass_library.gemm_op.GemmOperation",  # type: ignore[name-defined]  # noqa: F821
@@ -1282,7 +1380,8 @@ def _set_bias_layout_and_alignment(

    def _define_gemm_instance(
        self,
-        op: "cutlass_library.gemm_op.GemmOperation",  # type: ignore[name-defined]  # noqa: F821
+        op: GemmOperation,
+        evt_name: Optional[str] = None,
    ) -> tuple[str, str]:
        """Defines and renders the Cutlass / CUDA C++ code for a given GEMM operation instance.

@@ -1298,15 +1397,18 @@ def _define_gemm_instance(
        code (render) and the second part is the string that specifies the operation type.
        """
        assert cutlass_utils.try_import_cutlass()
-        import cutlass_library.gemm_operation as cutlass_gemm_op
        import cutlass_library.library as cutlass_lib

-        emitter = cutlass_gemm_op.EmitGemmUniversal3xInstance()
+        from .cutlass_lib_extensions import gemm_operation_extensions as gemm_extensions
+
+        emitter = gemm_extensions.EmitGemmUniversal3xInstanceWithEVT(evt_name=evt_name)
+
        if not hasattr(op, "epilogue_functor") or not isinstance(
            op.epilogue_functor, enum.Enum
        ):
            op = copy.deepcopy(op)
            op.epilogue_functor = cutlass_lib.EpilogueFunctor.LinearCombination
+
        op_def = emitter.emit(op)
        pattern = re.compile(r"\s*struct\s(.*?)\s:")
        decl = [line for line in op_def.split("\n") if "struct " in line][-1]
@@ -1318,6 +1420,7 @@ def _define_gemm_instance(
        if op.gemm_kind == cutlass_lib.GemmKind.Universal3x:
            op_def += f"\nusing {op_type}_device_type = cutlass::gemm::device::GemmUniversalAdapter<{op_type}>;\n"
            op_type = f"{op_type}_device_type"
+
        return op_def, op_type

    def _get_extra_inputs_and_names(
@@ -1564,7 +1667,8 @@ def _set_bias_layout_and_alignment(

    def _define_gemm_instance(
        self,
-        op: "cutlass_library.gemm_op.GemmOperation",  # type: ignore[name-defined]  # noqa: F821
+        op: GemmOperation,
+        evt_name: Optional[str] = None,
    ) -> tuple[str, str]:
        """Defines and renders the Cutlass / CUDA C++ code for a given GEMM operation instance.