[Cutlass] Implement EVT example tensor creation (#150904) · pytorch/pytorch@a936d59 · GitHub

Commit a936d59

mlazos authored and pytorchmergebot committed
[Cutlass] Implement EVT example tensor creation (#150904)
This PR implements a translation layer from inductor IR to "example tensors", the expected arguments of the EVT tracer. These tensors store the name, shape, stride, and dtype of the underlying buffer and allow an AST-based Python parser to generate the EVT C++.

Previously merged:
* #150903
* #150346
* #150345
* #150344

Pull Request resolved: #150904
Approved by: https://github.com/eellison
1 parent dda0c95 commit a936d59
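
At a high level, each inductor buffer's layout (shape, stride, dtype) is summarized as a cutlass example Tensor that the EVT tracer consumes. The snippet below is a rough, hedged sketch of that mapping only; it assumes the cutlass Python package is importable and simply mirrors the CutlassTensor(shape=..., layout_tag=..., element=...) construction added in this diff.

# Hedged sketch (not part of the commit): what one "example tensor" looks like.
# Assumes the cutlass Python package is installed.
from cutlass.backend.evt.ir.tensor import Tensor as CutlassTensor
from cutlass_library import DataType, LayoutType

# An inductor buffer of shape (3, 4, 1) with row-major strides (4, 1, 0) and
# dtype float32 is captured symbolically: shape + layout tag + element type, no data.
example = CutlassTensor(
    shape=(3, 4, 1),
    layout_tag=LayoutType.RowMajor,  # chosen by inspecting the strides
    element=DataType.f32,            # torch.float32 maps to DataType.f32
)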

File tree

3 files changed: +119 -19 lines changed


test/inductor/test_cutlass_evt.py

Lines changed: 50 additions & 16 deletions
@@ -3,7 +3,10 @@
 
 import torch
 from torch._dynamo.test_case import TestCase
-from torch._inductor.codegen.cuda.cutlass_utils import try_import_cutlass
+from torch._inductor.codegen.cuda.cutlass_utils import (
+    torch_dtype_to_cutlass_type,
+    try_import_cutlass,
+)
 from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA
 
 
@@ -55,33 +58,64 @@
 class MockTileDescription:
     threadblock_shape = (128, 128, 8)
 
-def _create_mock_buffer_name_map(example_tensors):
-    class MockNode:
-        def __init__(self, name, stride, dtype):
-            self.name = name
-            self.dtype = dtype
-            self.stride = stride
+class MockNode:
+    def __init__(self, name, shape, stride, dtype):
+        self.name = name
+        self.dtype = dtype
+        self.shape = shape
+        self.stride = stride
 
-        def get_layout(self):
-            class MockLayout:
-                def __init__(self, stride, dtype):
-                    self.dtype = dtype
-                    self.stride = stride
+    def get_layout(self):
+        class MockLayout:
+            def __init__(self, shape, stride, dtype):
+                self.size = shape
+                self.stride = stride
+                self.dtype = dtype
 
-            return MockLayout(self.stride, self.dtype)
+        return MockLayout(self.shape, self.stride, self.dtype)
 
-        def get_name(self):
-            return self.name
+    def get_name(self):
+        return self.name
 
+def _create_mock_buffer_name_map(example_tensors):
     name_to_buffer = {}
     for name, tensor in example_tensors.items():
         if isinstance(tensor, CutlassTensor):
-            name_to_buffer[name] = MockNode(name, tensor.stride, torch.float32)
+            name_to_buffer[name] = MockNode(
+                name, tensor.shape, tensor.stride, torch.float32
+            )
 
     return name_to_buffer
 
 
 class TestCutlassEVT(TestCase):
+    @unittest.skipIf(not try_import_cutlass(), "requires cutlass")
+    def test_example_tensor_creation(self):
+        from torch._inductor.codegen.cuda.cutlass_lib_extensions.evt_extensions import (
+            create_example_tensors,
+        )
+
+        row_major_buf0 = MockNode("buf0", (3, 4, 1), (4, 1, 0), torch.float32)
+        col_major_buf1 = MockNode("buf1", (3, 2, 1), (1, 3, 0), torch.float32)
+        read_names = ["buf0"]
+        write_names = ["buf1"]
+        buffer_renames = {"buf0": "acc"}
+        name_to_buffer = {"buf0": row_major_buf0, "buf1": col_major_buf1}
+        result = create_example_tensors(
+            read_names, write_names, buffer_renames, name_to_buffer
+        )
+        self.assertEqual(result["acc"].shape, (3, 4, 1))
+        self.assertEqual(result["acc"].stride, (4, 1, 0))
+        self.assertEqual(
+            result["acc"].element, torch_dtype_to_cutlass_type(torch.float32)
+        )
+
+        self.assertEqual(result["buf1"].shape, (3, 2, 1))
+        self.assertEqual(result["buf1"].stride, (1, 3, 0))
+        self.assertEqual(
+            result["buf1"].element, torch_dtype_to_cutlass_type(torch.float32)
+        )
+
     @unittest.skipIf(not try_import_cutlass(), "requires cutlass")
     def test_evt_argument_codegen(self):
         epilogue_functor = _trace(BIAS_CODE, EXAMPLE_TENSORS)
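
The stride tuples in test_example_tensor_creation are chosen so that buf0 is classified as row-major and buf1 as column-major. Below is a minimal standalone sketch of that classification; it only approximates torch._inductor.ir.is_contiguous_strides_for_shape (size-1 dimensions are skipped, since their stride never affects addressing).

# Hedged sketch (not the inductor implementation): the contiguity check used to
# pick RowMajor vs ColumnMajor in create_example_tensors.
def _is_contiguous(stride, shape):
    expected = 1
    for dim, s in zip(reversed(shape), reversed(stride)):
        if dim == 1:
            continue  # a size-1 dim's stride is irrelevant
        if s != expected:
            return False
        expected *= dim
    return True

# buf0: shape (3, 4, 1), stride (4, 1, 0) -> contiguous as-is, i.e. RowMajor
print(_is_contiguous((4, 1, 0), (3, 4, 1)))              # True
# buf1: shape (3, 2, 1), stride (1, 3, 0) -> contiguous when reversed, i.e. ColumnMajor
print(_is_contiguous((1, 3, 0)[::-1], (3, 2, 1)[::-1]))  # True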

torch/_inductor/codegen/cuda/cutlass_lib_extensions/__init__.py

Whitespace-only changes.

torch/_inductor/codegen/cuda/cutlass_lib_extensions/evt_extensions.py

Lines changed: 69 additions & 3 deletions
@@ -1,9 +1,13 @@
 from typing import Any, Union
 
-from torch._inductor.ir import ComputedBuffer, InputBuffer
+from torch._inductor.ir import (
+    ComputedBuffer,
+    InputBuffer,
+    is_contiguous_strides_for_shape,
+)
 from torch.utils._ordered_set import OrderedSet
 
-from ..cutlass_utils import try_import_cutlass
+from ..cutlass_utils import torch_dtype_to_cutlass_type, try_import_cutlass
 
 
 EpilogueFunctor = Any  # EpilogueFunctor local class defined in _trace
@@ -19,6 +23,7 @@
     import ast
     import ctypes
     import textwrap
+    from typing import Union
 
     from cutlass.backend.c_types import (  # type: ignore[import-untyped, import-not-found]
         EmptyByte,
@@ -41,13 +46,74 @@
     from cutlass.backend.evt.ir.tensor import (  # type: ignore[import-untyped, import-not-found]
         Tensor as CutlassTensor,
     )
-    from cutlass_library import DataType, EpilogueScheduleType, TileDescription
+    from cutlass_library import (
+        DataType,
+        EpilogueScheduleType,
+        LayoutType,
+        TileDescription,
+    )
 
+    import torch
     from torch._inductor.codegen.cuda import cuda_env
     from torch._inductor.utils import IndentedBuffer
 
     _CUTLASS_C_DTYPES = OrderedSet(dtype2ctype.values())  # type: ignore[var-annotated]
 
+    TORCH_TO_CUTLASS_DTYPE = {
+        torch.float32: DataType.f32,
+        torch.float16: DataType.f16,
+        torch.bfloat16: DataType.bf16,
+    }
+
+    def create_example_tensors(
+        read_names: list[str],
+        write_names: list[str],
+        buffer_renames: dict[str, str],
+        name_to_buffer: dict[str, Buffer],
+    ) -> dict[str, CutlassTensor]:
+        example_tensors = {}
+
+        def cutlass_tensor_from_buffer(buffer: Buffer) -> CutlassTensor:
+            shape = buffer.get_layout().size
+            stride = buffer.get_layout().stride
+            assert all(isinstance(x, int) for x in shape), (
+                f"{buffer.get_name()}'s shape {shape} contains symints which aren't supported for cutlass EVT"
+            )
+            assert all(isinstance(x, int) for x in stride), (
+                f"{buffer.get_name()}'s stride {stride} contains symints which aren't supported for cutlass EVT"
+            )
+            shape = tuple(int(x) for x in shape)
+            stride = tuple(int(x) for x in stride)
+
+            is_row_major = is_contiguous_strides_for_shape(stride, shape)
+            is_column_major = is_contiguous_strides_for_shape(stride[::-1], shape[::-1])
+
+            if not is_row_major and not is_column_major:
+                raise RuntimeError(
+                    f"Cannot create example tensor for {buffer.get_name()} with "
+                    f"non-contiguous layout, received stride: {stride} and shape: {shape}"
+                )
+
+            return CutlassTensor(
+                shape=shape,
+                layout_tag=LayoutType.RowMajor if is_row_major else LayoutType.ColumnMajor,
+                element=torch_dtype_to_cutlass_type(buffer.get_layout().dtype),
+            )
+
+        for name in read_names + write_names:
+            key = name
+
+            if name in buffer_renames:
+                # Need to rewrite some special args (e.g. acc is a required arg name)
+                key = buffer_renames[name]
+
+            example_tensors[key] = cutlass_tensor_from_buffer(name_to_buffer[name])
+
+        return example_tensors
+
     def trace(
         fn_src: str,
         example_tensors: dict[str, CutlassTensor],
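
A hedged usage sketch of the new create_example_tensors helper, mirroring the test added above. The mock classes below are illustrative stand-ins for inductor ComputedBuffer/InputBuffer nodes (only get_name() and get_layout() are exercised), and running this requires the cutlass package, matching the skipIf guard in the test.

import torch
from torch._inductor.codegen.cuda.cutlass_lib_extensions.evt_extensions import (
    create_example_tensors,
)

# Illustrative mocks only: they expose the minimal buffer surface the helper reads.
class MockLayout:
    def __init__(self, size, stride, dtype):
        self.size, self.stride, self.dtype = size, stride, dtype

class MockBuffer:
    def __init__(self, name, size, stride, dtype):
        self._name = name
        self._layout = MockLayout(size, stride, dtype)

    def get_name(self):
        return self._name

    def get_layout(self):
        return self._layout

buf0 = MockBuffer("buf0", (3, 4, 1), (4, 1, 0), torch.float32)  # row-major read
buf1 = MockBuffer("buf1", (3, 2, 1), (1, 3, 0), torch.float32)  # column-major write

# "buf0" is renamed to "acc" because the EVT tracer expects the accumulator
# argument under that name.
tensors = create_example_tensors(
    read_names=["buf0"],
    write_names=["buf1"],
    buffer_renames={"buf0": "acc"},
    name_to_buffer={"buf0": buf0, "buf1": buf1},
)
print(tensors["acc"].shape, tensors["buf1"].shape)  # (3, 4, 1) (3, 2, 1)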
