Commit 1f9858c

Update base for Update on "[Cutlass] Integrate EVT into CUDACPPScheduling"
Previously merged:
* #151713
* #151405
* #150905
* #152306
* #152305

Allow epilogue nodes in cuda combined scheduling

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy chenyang78 kadeng muchulee8 amjames chauhang aakhundov

[ghstack-poisoned]
1 parent 884ff34 commit 1f9858c

File tree

2 files changed: +70 −3 lines changed


test/inductor/test_cutlass_evt.py

Lines changed: 59 additions & 0 deletions
@@ -105,6 +105,7 @@ def __init__(self, name_to_buffer):
 
         self.sizevars = torch._inductor.sizevars.SizeVarAllocator()
         self.name_to_buffer = name_to_buffer
+        self.graph_inputs = dict()
         self.mutated_buffers = OrderedSet()
 
 
@@ -210,6 +211,64 @@ def inner_fn_buf4(index):
                """Unsupported indexing for buf0 with index 200*i0 + 60000*i1 + i2 and strides [200, 60000, 1]""",
            )
 
+    @unittest.skipIf(not SM90OrLater, "need sm_90")
+    @unittest.skipIf(not try_import_cutlass(), "requires cutlass")
+    def test_py_codegen_broadcasting(self):
+        from torch._inductor.codegen.cuda.cutlass_python_evt import CutlassEVTCodegen
+        from torch._inductor.virtualized import V
+
+        size = (100, 300, 200)
+        buf0 = MockComputedBuffer("buf0", None, torch.float32, size)
+        buf1 = MockComputedBuffer("buf1", None, torch.float32, size)
+        buf2 = MockComputedBuffer("buf2", None, torch.float32, size)
+
+        # buf0 is acc
+        # buf1 is external
+        def inner_fn_buf3(index):
+            tmp0 = buf0.make_loader()(index)
+            tmp1 = buf1.make_loader()(index)
+            tmp2 = buf2.make_loader()(index)
+            return tmp0 * tmp1 + tmp2
+
+        def inner_fn_buf4(index):
+            tmp0 = buf0.make_loader()(index)
+            tmp3 = buf3.make_loader()(index)
+            return tmp0 + tmp3 * tmp3
+
+        buf3 = MockComputedBuffer("buf3", inner_fn_buf3, torch.float32, size)
+        buf4 = MockComputedBuffer(
+            "buf4", inner_fn_buf4, torch.float32, (100, 300, 1)
+        )  # broadcast
+        with V.set_graph_handler(
+            MockGraphHandler(
+                {"buf0": buf0, "buf1": buf1, "buf2": buf2, "buf3": buf3, "buf4": buf4}
+            )
+        ):
+            reads, writes, renames, code = CutlassEVTCodegen.ir_to_evt_python_code(
+                "buf0",
+                [
+                    MockSchedulerNode(buf3),
+                    MockSchedulerNode(buf4, last_usage=OrderedSet(["buf0"])),
+                ],
+            )
+            self.assertExpectedInline(reads, """['buf0', 'buf1', 'buf2']""")
+            self.assertExpectedInline(writes, """['buf3', 'buf4']""")
+            self.assertExpectedInline(
+                renames, """{'buf3': 'D', 'buf4': 'tmp_3', 'buf0': 'accum'}"""
+            )
+            self.assertExpectedInline(
+                code,
+                """\
+def fn(accum, buf1, buf2):
+    tmp_0 = accum * buf1
+    tmp_1 = tmp_0 + buf2
+    D = tmp_1 # cutlass evt requirement
+    tmp_2 = D * D
+    tmp_3 = accum + tmp_2
+
+    return D, tmp_3""",
+            )
+
     @unittest.skipIf(not SM90OrLater, "need sm_90")
     @unittest.skipIf(not try_import_cutlass(), "requires cutlass")
     def test_py_codegen(self):
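
The new test_py_codegen_broadcasting test declares buf4 with shape (100, 300, 1), i.e. a buffer that is broadcast along the last dimension of the (100, 300, 200) iteration space. As a rough standalone illustration (plain torch tensors, not the inductor mocks used above): a size-1 dimension that participates in a larger iteration space shows up with stride 0, which is the kind of layout/index-stride mismatch the relaxed check added to torch/_inductor/codegen/cuda/cutlass_python_evt.py below is intended to accept.

import torch

# Sketch only: how a broadcast dimension appears as a zero stride.
t = torch.randn(100, 300, 1)
print(t.stride())                        # (300, 1, 1): contiguous layout of the (100, 300, 1) buffer
print(t.expand(100, 300, 200).stride())  # (300, 1, 0): the broadcast dimension has stride 0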

torch/_inductor/codegen/cuda/cutlass_python_evt.py

Lines changed: 11 additions & 3 deletions
@@ -69,6 +69,7 @@ def __init__(self, accumulator_node_name: str, last_usages: OrderedSet[str]):
         self.reads: OrderedSet[str] = OrderedSet()
         self.last_usages: OrderedSet[str] = OrderedSet()
         self.cur_node: Optional[ComputedBuffer] = None
+        self.name_to_buffer = V.graph.name_to_buffer | V.graph.graph_inputs
 
         if accumulator_node_name not in last_usages:
             self.store(accumulator_node_name, value=OpsValue(_ACCUMULATOR_ALIAS))
@@ -207,15 +208,22 @@ def _check_indexing(self, name: str, index: sympy.Expr) -> None:
         # We only support indexing that matches the layout today because
         # CUTLASS doesn't support arbitrary indexing
         buffer_name = self.accumulator_node_name if name == _ACCUMULATOR_ALIAS else name
-        buffer = V.graph.name_to_buffer[buffer_name]
+        buffer = self.name_to_buffer[buffer_name]
         index_strides = V.graph.sizevars.stride_vars(
            index, self._get_current_index_vars()
        )
-        if buffer.get_layout().stride != index_strides:
+        stride = buffer.get_layout().stride
+        if not self._stride_compatible(stride, index_strides):
             raise NotImplementedError(
-                f"Unsupported indexing for {name} with index {index} and strides {index_strides}"
+                f"Unsupported indexing for {name} with index {index}, index strides {index_strides}, and layout stride {stride}"
             )
 
+    def _stride_compatible(self, left, right):
+        return all(
+            sympy.Eq(l, r) or sympy.Eq(l, 0) or sympy.Eq(r, 0)
+            for l, r in (zip(left, right))
+        )
+
     def _render_input_signature(self) -> str:
         arguments = ", ".join(
             [_ACCUMULATOR_ALIAS]
0 commit comments
