pytorch
diff --git a/test/inductor/test_torchinductor_strided_blocks.py b/test/inductor/test_torchinductor_strided_blocks.py
Lines changed: 29 additions & 0 deletions
diff --git a/torch/_inductor/codegen/simd.py b/torch/_inductor/codegen/simd.py
Lines changed: 2 additions & 2 deletions
@@ -20,6 +20,7 @@
 from torch.testing._internal.inductor_utils import (
     GPU_TYPE,
     HAS_GPU,
+    requires_gpu,
     skip_windows_ci,
     TRITON_HAS_CPU,
 )
@@ -895,6 +896,34 @@ def func(x, y):
         )
         self.assertTrue("Min" not in code[0])
 
+    @requires_gpu()  # FIXME: this test fails on Triton-CPU, so it is restricted to GPU
+    def test_3d_permute_tiling(self):
+        """
+        Expect one 3D-tiled kernel when permuted and unpermuted operands are mixed.
+        """
+
+        def foo(x, y, z):
+            dims = [0, 2, 1]  # swap the last two dimensions
+            a = x.permute(dims=dims) + y
+            b = (z + y).permute(dims=dims)
+            return a + b
+
+        inps = (torch.rand((51, 51, 51), device=self.device, dtype=torch.float32),) * 3
+        result, (code,) = run_and_compare(
+            self,
+            foo,
+            *inps,
+            expected_num_triton_kernels=1,
+            expected_num_block_pointers=3,
+            config_patches={
+                "triton.max_tiles": 3,
+                "triton.prefer_nd_tiling": True,
+            },
+        )
+
+        # 3D tiling is detected by the presence of "ZBLOCK" in the generated code.
+        self.assertIn("ZBLOCK", code)
+
 
 @unittest.skipIf(not TRITON_HAS_CPU, "requires triton CPU backend")
 @config.patch(cpu_backend="triton")
 
@@ -2011,8 +2011,8 @@ def select_tiling(
             def convert_tiling_to_3d(
                 tiling0: dict[str, sympy.Expr], tiling1: dict[str, sympy.Expr]
             ) -> Optional[dict[str, sympy.Expr]]:
-                a0, a1 = tiling0["x"], tiling0["y"]
-                b0, b1 = tiling1["x"], tiling1["y"]
+                a0, a1 = tiling0["x"], tiling0.get("y", 1)
+                b0, b1 = tiling1["x"], tiling1.get("y", 1)
                 if V.graph.sizevars.size_hint(a1 - b1) == 0:
                     return None
                 if V.graph.sizevars.size_hint(a1 - b1) < 0: