[Cutlass] E2E Tests for EVT · pytorch/pytorch@434ca07 · GitHub


Commit 434ca07

[Cutlass] E2E Tests for EVT
ghstack-source-id: 8a37df6
Pull Request resolved: #152815
1 parent 97fc2ac commit 434ca07

File tree

7 files changed: +62 -8 lines changed

test/inductor/test_cutlass_backend.py

Lines changed: 25 additions & 0 deletions
@@ -1314,6 +1314,31 @@ def forward(self, B):
         ):
             _ = torch.compile(model)(B)
 
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
+    @mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()})
+    def test_evt_flexible_layout(self):
+        class TestModel(torch.nn.Module):
+            def forward(self, B):
+                A = torch.zeros_like(B)
+                return (A @ B).relu()
+
+        M = 1024
+        B = torch.randn(M, M).cuda().half()
+        model = TestModel().cuda()
+
+        with config.patch(
+            {
+                "max_autotune": True,
+                "benchmark_epilogue_fusion": False,  # does not support benchmark fusion yet
+                "max_autotune_gemm_backends": "CUTLASS",
+                "cuda.cutlass_max_profiling_configs": 20,
+                "autotune_fallback_to_aten": False,
+            }
+        ):
+            _ = torch.compile(model)(B)
+
+        self.assertEqual(torch._dynamo.utils.counters["inductor"]["cuda_epilogue_fusion_counter"], 1)
+
 
 if __name__ == "__main__":
     from torch._inductor.utils import is_big_gpu

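For reference, the new end-to-end check can be reproduced outside the unittest harness with a short script like the sketch below. It is not part of this commit; it assumes an SM90 GPU and a CUTLASS installation visible to Inductor, and it prints the same "cuda_epilogue_fusion_counter" that the test asserts equals 1.

# Standalone sketch mirroring test_evt_flexible_layout (not part of this commit).
# Assumes an SM90 GPU and CUTLASS available to Inductor's max-autotune backend.
import torch
from torch._inductor import config


class TestModel(torch.nn.Module):
    def forward(self, B):
        A = torch.zeros_like(B)
        return (A @ B).relu()


B = torch.randn(1024, 1024, device="cuda", dtype=torch.half)
model = TestModel().cuda()

with config.patch(
    {
        "max_autotune": True,
        "benchmark_epilogue_fusion": False,
        "max_autotune_gemm_backends": "CUTLASS",
        "cuda.cutlass_max_profiling_configs": 20,
        "autotune_fallback_to_aten": False,
    }
):
    _ = torch.compile(model)(B)

# The relu epilogue should have been fused into the CUTLASS GEMM exactly once.
print(torch._dynamo.utils.counters["inductor"]["cuda_epilogue_fusion_counter"])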
torch/_inductor/codegen/cuda/cuda_cpp_scheduling.py

Lines changed: 3 additions & 1 deletion
@@ -187,7 +187,6 @@ def _can_fuse_epilogue_impl(
         - bool: True if the given node can be fused with the epilogue, False otherwise.
 
         """
-
         why = WhyNoFuseNames(cuda_template_buffer.get_name(), node_to_fuse.get_name())
 
         ir_node_to_fuse = node_to_fuse.node
@@ -227,6 +226,9 @@ def _can_fuse_epilogue_impl(
         elif not config.epilogue_fusion:
             why("epilogue fusion is not enabled")
             return False
+        elif not cuda_template_buffer.supports_epilogue_fusion:
+            why("epilogue fusion is only supported for TMA-enabled gemm ops")
+            return False
 
         try:
             from torch._inductor.codegen.cuda.cutlass_python_evt import (

torch/_inductor/codegen/cuda/cuda_kernel.py

Lines changed: 3 additions & 0 deletions
@@ -563,6 +563,7 @@ def __init__(
             tuple[CUDATemplateKernel, functools.partial[str]],
         ],
         bmreq: CUDABenchmarkRequest,
+        supports_epilogue_fusion: bool,
         template: "CUDATemplate",  # type: ignore[name-defined]
         info_kwargs: Optional[
             dict[str, Union[PrimitiveInfoType, list[PrimitiveInfoType]]]
@@ -573,6 +574,7 @@ def __init__(
         self.category = category
         self.make_kernel_render = make_kernel_render
         self.bmreq = bmreq
+        self.supports_epilogue_fusion = supports_epilogue_fusion
         self.template = template
         self.info_kwargs = info_kwargs
@@ -629,6 +631,7 @@ def output_node(self) -> TensorBox:
                 inputs=self.input_nodes,
                 make_kernel_render=self.make_kernel_render,
                 workspace_size=self.bmreq.workspace_size,
+                supports_epilogue_fusion=self.supports_epilogue_fusion,
                 template=self.template,
             )
         )

torch/_inductor/codegen/cuda/cuda_template.py

Lines changed: 17 additions & 0 deletions
@@ -24,6 +24,7 @@
 else:
     BaseSchedulerNode = Any
 
+GemmOperation = Any
 
 autotuning_log = getArtifactLogger(__name__, "autotuning")
 
@@ -61,6 +62,10 @@ def __init__(
         self.input_reorder = input_reorder
         self.layout = layout
 
+    @staticmethod
+    def supports_epilogue_fusion(op: GemmOperation) -> bool:
+        return False
+
     def generate(  # type: ignore[override]
         self,
         description,
@@ -122,10 +127,21 @@ def generate(  # type: ignore[override]
             source_code=code,
         )
 
+        # kwargs has "op" argument in case of CUTLASSGemmTemplate
+        op = kwargs["op"]
+        if not op:
+            supports_epilogue_fusion = False
+        else:
+            # epilogue fusion is only supported for TMA kernels
+            supports_epilogue_fusion = self.supports_epilogue_fusion(op)
+
         def make_kernel_render(
             template_node: CUDATemplateBuffer,
             epilogue_nodes: Optional[list[BaseSchedulerNode]] = None,
         ) -> tuple[CUDATemplateKernel, functools.partial[str]]:
+            assert supports_epilogue_fusion or not epilogue_nodes, (
+                "epilogue fusion is not supported for this kernel"
+            )
             kernel = CUDATemplateKernel(
                 kernel_name="KERNEL_NAME",
                 runtime_arg_info=self.get_runtime_arg_info(),
@@ -147,6 +163,7 @@ def make_kernel_render(
             self.output_node.get_layout(),
             make_kernel_render,
             bmreq,
+            supports_epilogue_fusion,
             self,
             kwargs,
             description,

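The plumbing introduced above works as follows: generate() evaluates supports_epilogue_fusion(op) once per candidate, threads the flag through the CUDATemplateCaller into the CUDATemplateBuffer, and the make_kernel_render closure asserts that it never receives epilogue nodes when the flag is off. A minimal toy of that pattern is sketched below; the names are illustrative only and are not the actual Inductor classes.

# Toy sketch of the gating pattern added above; names are illustrative,
# not the real classes in torch/_inductor/codegen/cuda/.
from typing import Any, Optional


class Template:
    @staticmethod
    def supports_epilogue_fusion(op: Any) -> bool:
        # Conservative default: no epilogue fusion.
        return False

    def generate(self, op: Optional[Any]):
        # Decide once, when the candidate is generated, whether fusion is legal.
        supports_fusion = bool(op) and self.supports_epilogue_fusion(op)

        def make_kernel_render(epilogue_nodes: Optional[list] = None) -> str:
            # The scheduler must not hand this kernel epilogues it cannot fuse.
            assert supports_fusion or not epilogue_nodes, (
                "epilogue fusion is not supported for this kernel"
            )
            return f"render(op={op}, epilogues={epilogue_nodes or []})"

        return make_kernel_render, supports_fusion


class TmaTemplate(Template):
    @staticmethod
    def supports_epilogue_fusion(op: Any) -> bool:
        # Mirrors the CUTLASS 3.x override: only TMA epilogues qualify.
        return str(op).lower().startswith("tma")


render, ok = TmaTemplate().generate("TmaWarpSpecialized")
print(ok, render(["relu"]))  # True render(op=TmaWarpSpecialized, epilogues=['relu'])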
torch/_inductor/codegen/cuda/gemm_template.py

Lines changed: 8 additions & 7 deletions
@@ -826,9 +826,6 @@ def filter_op(
         ):
             return None
 
-        if not self._has_tma_epilogue(op):
-            return None
-
         # Filter ops by alignment.
         if not self._alignment_match(op):
             log.debug(
@@ -989,7 +986,6 @@ def render(  # type: ignore[override]
         All inputs and their corresponding buffer addresses and names take precedence over previously
         passed inputs to the template at construction time. However, they should be layout compatible.
         """
-
         assert cutlass_utils.try_import_cutlass()
         import cutlass_library.gemm_operation as cutlass_gemm_op
         import cutlass_library.library as cutlass_lib
@@ -1038,6 +1034,10 @@ def render(  # type: ignore[override]
         # operand
         op.C.element = op.A.element
 
+        assert op.C.element == op.D.element, (
+            f"Expect C and D to have the same dtype, found {op.C.element} and {op.D.element}"
+        )
+
         argument_template, epilogue_template = self._get_template_args(op)
         should_swap_xw: bool = False
         if Bias is not None and self._has_tma_epilogue(op):
@@ -1219,6 +1219,10 @@ def _has_tma_epilogue(  # noqa: F821 # type: ignore[arg-type,name-defined]
         result = epilogue_schedule_str.lower().startswith("tma")
         return result
 
+    @staticmethod
+    def supports_epilogue_fusion(op: GemmOperation) -> bool:
+        return CUTLASS3xGemmTemplate._has_tma_epilogue(op)
+
     def _are_inputs_layout_compatible(self, layouts: list[Layout]) -> bool:
         """
         Evaluates whether input layouts are compatible for General Matrix Multiply (GEMM).
@@ -1355,9 +1359,6 @@ def _set_bias_layout_and_alignment(
             op.C.element = cutlass_utils.torch_dtype_to_cutlass_type(
                 Bias.get_layout().dtype
             )
-            assert op.C.element == op.D.element, (
-                f"Expect C and D to have the same dtype, found {op.C.element} and {op.D.element}"
-            )
 
             # Bias layout
             bias_layout = CUTLASSGemmTemplate.cutlass_layout(Bias.get_layout())

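In the 3.x GEMM template, supports_epilogue_fusion now simply delegates to _has_tma_epilogue, which (per the context line kept in the hunk above) checks whether the op's epilogue schedule name starts with "tma". A rough, self-contained version of that check is sketched below; the op.epilogue_schedule attribute is an assumed stand-in for the cutlass_library field and may differ from the real one.

# Rough sketch of the TMA-epilogue predicate referenced above. Only the
# startswith("tma") check is taken from the diff; op.epilogue_schedule is
# an assumed stand-in for the cutlass_library GemmOperation field.
from types import SimpleNamespace


def has_tma_epilogue(op) -> bool:
    epilogue_schedule_str = str(op.epilogue_schedule).split(".")[-1]
    return epilogue_schedule_str.lower().startswith("tma")


op = SimpleNamespace(epilogue_schedule="EpilogueScheduleType.TmaWarpSpecialized")
print(has_tma_epilogue(op))  # True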
torch/_inductor/codegen/cuda_combined_scheduling.py

Lines changed: 4 additions & 0 deletions
@@ -60,6 +60,10 @@ def can_fuse_vertical(
     ) -> bool:
         if self._cuda_cpp_scheduling.can_fuse_vertical(node1, node2):
             return True
+        elif self._cuda_cpp_scheduling.is_cuda_cpp_template(
+            node1
+        ) or self._cuda_cpp_scheduling.is_cuda_cpp_template(node2):
+            return False
         return self._triton_scheduling.can_fuse_vertical(node1, node2)
 
     def can_fuse_horizontal(

torch/_inductor/ir.py

Lines changed: 2 additions & 0 deletions
@@ -4725,11 +4725,13 @@ def __init__(  # type: ignore[no-untyped-def]
         make_kernel_render,
         workspace_size: int,
         template: CUDATemplate,
+        supports_epilogue_fusion: bool,
     ) -> None:
         super().__init__(layout, inputs, make_kernel_render)
         # Global memory (in bytes) needed for this template.
         self.workspace_size = workspace_size
         self.template = template
+        self.supports_epilogue_fusion = supports_epilogue_fusion
 
     def get_workspace_size(self):  # type: ignore[no-untyped-def]
         return self.workspace_size if self.workspace_size is not None else 0

0 commit comments
