[SDPA] Add testing to ensure stride order exactly matches · pytorch/pytorch@07600f7 · GitHub

Commit 07600f7

[SDPA] Add testing to ensure stride order exactly matches

ghstack-source-id: 721d10e
Pull Request resolved: #152894

1 parent ac792a0 commit 07600f7

5 files changed: +188 −21 lines changed

aten/src/ATen/native/transformers/cuda/attention.cu

Lines changed: 38 additions & 10 deletions
@@ -118,6 +118,37 @@ at::cuda::philox::unpack_cudnn<<<1, 1, 0, stream>>>(arg, seed_ptr, offset_ptr);
 namespace native {
 
 namespace {
+// Create output tensor with strides matching query layout
+at::Tensor create_output_with_matching_layout(
+    const at::Tensor& query,
+    at::IntArrayRef output_shape,
+    at::TensorOptions options
+) {
+  // Get the "fill order" - an argsort on the strides of the query tensor
+  const int dims = query.dim();
+  std::vector<int64_t> fill_order(dims);
+  std::iota(fill_order.begin(), fill_order.end(), 0);
+
+  const auto query_strides = query.strides();
+  std::stable_sort(
+      fill_order.begin(),
+      fill_order.end(),
+      [&query_strides](int64_t idx1, int64_t idx2) {
+        return query_strides[idx1] < query_strides[idx2];
+      });
+
+  // Construct new strides that preserve the same layout ordering
+  std::vector<int64_t> new_strides(dims);
+  int64_t current_stride = 1;
+  for (const int64_t dim_idx : fill_order) {
+    new_strides[dim_idx] = current_stride;
+    current_stride *= output_shape[dim_idx];
+  }
+
+  // Create tensor with the constructed strides
+  return at::empty(output_shape, options)
+      .as_strided(output_shape, new_strides, 0);
+}
 
 
 static constexpr int TRANSFORM_BIAS_RESCALE_VEC = 4;

@@ -1433,11 +1464,8 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt> _efficient_
   }
   kernel_launched = true;
 
-  res = at::empty(
-      {B, M, num_heads, Kv},
-      query.options().dtype(
-          CutlassToAtenDtype<typename Kernel::output_t>::atScalarType()));
-
+  auto opts = query.options().dtype(CutlassToAtenDtype<typename Kernel::output_t>::atScalarType());
+  res = create_output_with_matching_layout(query, {B, M, num_heads, Kv}, opts);
   // NOTE: Should be aligned (by padding) in case M is
   // not a good number for loading during backward
   constexpr decltype(M) kAlignLSE = Kernel::kAlignLSE;

@@ -1455,11 +1483,8 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt> _efficient_
       : nullptr;
   at::Tensor output_accum;
   if (Kernel::kNeedsOutputAccumulatorBuffer) {
-    output_accum = at::empty(
-        {B, M, num_heads, Kv},
-        query.options().dtype(
-            CutlassToAtenDtype<
-                typename Kernel::output_accum_t>::atScalarType()));
+    auto opts = query.options().dtype(CutlassToAtenDtype<typename Kernel::output_t>::atScalarType());
+    output_accum = create_output_with_matching_layout(query, {B, M, num_heads, Kv}, opts);
     p.output_accum_ptr =
         (typename Kernel::output_accum_t*)output_accum.data_ptr();
   } else {

@@ -1494,12 +1519,15 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, c10::SymInt, c10::SymInt> _efficient_
   ASSIGN_CHECK_OVERFLOW(p.q_strideB, query.stride(0));
   ASSIGN_CHECK_OVERFLOW(p.k_strideB, key.stride(0));
   ASSIGN_CHECK_OVERFLOW(p.v_strideB, value.stride(0));
+
   ASSIGN_CHECK_OVERFLOW(p.q_strideM, query.stride(1));
   ASSIGN_CHECK_OVERFLOW(p.k_strideM, key.stride(1));
   ASSIGN_CHECK_OVERFLOW(p.v_strideM, value.stride(1));
+
   ASSIGN_CHECK_OVERFLOW(p.q_strideH, query.stride(2));
   ASSIGN_CHECK_OVERFLOW(p.k_strideH, key.stride(2));
   ASSIGN_CHECK_OVERFLOW(p.v_strideH, value.stride(2));
+
   ASSIGN_CHECK_OVERFLOW(p.o_strideM, res.stride(1));
 
   if (bias.has_value()) {
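
The helper's logic is easier to see outside of C++. Below is a minimal Python sketch of the same fill-order computation (not part of the commit; output_strides_matching is a name invented for illustration): argsort the query strides ascending, then hand out strides innermost-first so the output shares the query's memory-layout order.

import torch

def output_strides_matching(query, output_shape):
    # Argsort of the query strides, smallest (innermost) stride first
    fill_order = sorted(range(query.dim()), key=lambda d: query.stride(d))
    strides = [0] * len(output_shape)
    current = 1
    for dim in fill_order:
        strides[dim] = current
        current *= output_shape[dim]
    return strides

# A (B, H, M, K) tensor permuted to (B, M, H, K):
q = torch.empty(2, 4, 128, 16).permute(0, 2, 1, 3)  # strides (8192, 16, 2048, 1)
print(output_strides_matching(q, list(q.shape)))    # [8192, 16, 2048, 1]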

aten/src/ATen/native/transformers/cuda/mem_eff_attention/kernel_forward.h

Lines changed: 6 additions & 5 deletions
@@ -237,10 +237,12 @@ struct AttentionKernel {
     query_ptr += batch_id * q_strideB;
     key_ptr += batch_id * k_strideB;
     value_ptr += batch_id * v_strideB;
-    output_ptr += int64_t(batch_id * num_queries) * o_strideM;
+    output_ptr += batch_id * q_strideB;
+
+    // Reuse q_strides since we want to guarantee exact match w/ input
     if (output_accum_ptr != nullptr) {
       output_accum_ptr +=
-          int64_t(batch_id * num_queries) * (head_dim_value * num_heads);
+          int64_t(batch_id * q_strideB);
     }
     q_start = 0;
     k_start = 0;

@@ -252,15 +254,14 @@
 
     value_ptr += k_start * v_strideM + head_id * v_strideH;
     output_ptr +=
-        int64_t(q_start + query_start) * o_strideM + head_id * head_dim_value;
+        int64_t(q_start + query_start) * o_strideM + head_id * q_strideH;
 
     if (kSupportsBias && attn_bias_ptr != nullptr) {
       attn_bias_ptr += (batch_id * bias_strideB) + (head_id * bias_strideH);
     }
     if (output_accum_ptr != nullptr) {
       output_accum_ptr +=
-          int64_t(q_start + query_start) * (head_dim_value * num_heads) +
-          head_id * head_dim_value;
+          int64_t(q_start + query_start) * q_strideM + head_id * q_strideH;
     } else {
       // Accumulate directly in the destination buffer (eg for f32)
       output_accum_ptr = (accum_t*)output_ptr;
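
To make the pointer arithmetic above concrete, here is a small Python rendering of the offsets the kernel now computes (editorial; the function name is invented): the batch, query-row, and head offsets into the output are all taken from the query's strides, which is what guarantees the exact layout match.

def output_element_offset(batch_id, query_idx, head_id,
                          q_strideB, o_strideM, q_strideH):
    # Mirrors: output_ptr += batch_id * q_strideB, then
    #          output_ptr += (q_start + query_start) * o_strideM + head_id * q_strideH
    return batch_id * q_strideB + query_idx * o_strideM + head_id * q_strideH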

test/inductor/test_flex_attention.py

Lines changed: 7 additions & 4 deletions
@@ -2937,12 +2937,15 @@ def test_flex_attention_backward_stride_ordering(
         )
         out = func(query, key, value)
         grad_output = torch.randn_like(out)
-        out.backward(grad_output)
+
+        grad_query, grad_key, grad_value = torch.autograd.grad(
+            out, [query, key, value], grad_output
+        )
 
         for leaf, grad, name in [
-            (query, query.grad, "query"),
-            (key, key.grad, "key"),
-            (value, value.grad, "value"),
+            (query, grad_query, "query"),
+            (key, grad_key, "key"),
+            (value, grad_value, "value"),
         ]:
             input_stride_order = get_stride_order(grad.stride())
             orig_stride_order = get_stride_order(leaf.stride())
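
A likely motivation for this switch (an editorial note, not stated in the commit): torch.autograd.grad returns the gradient tensors the engine just produced, without accumulating into .grad, so the test inspects exactly the strides autograd emitted. A minimal illustration:

import torch

x = torch.randn(4, 3, requires_grad=True)
y = (x * 2).sum()
(gx,) = torch.autograd.grad(y, [x])  # gradient returned directly as a tensor
assert x.grad is None                # .grad is not populated, unlike y.backward()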

test/test_transformers.py

Lines changed: 86 additions & 0 deletions
@@ -21,6 +21,7 @@
 from typing import Optional
 import torch.utils.cpp_extension
 from torch.testing._internal.common_nn import NNTestCase
+from torch._inductor.test_case import TestCase as InductorTestCase
 from torch.testing._internal.common_utils import (
     TEST_WITH_ROCM,
     skipIfRocm,

@@ -2469,6 +2470,7 @@ def test_cudnn_attention_different_dk_dv(self, device):
 
         self.assertEqual(actual.contiguous(), math_ref.contiguous().to(dtype), atol=1e-3, rtol=1e-2)
 
+
     @skipIfRocm  # No cuDNN Attention
     @unittest.skipIf(not PLATFORM_SUPPORTS_CUDNN_ATTENTION, "cuDNN Attention is not supported on this system")
     def test_cudnn_attention_gqa(self, device):

@@ -4285,6 +4287,89 @@ def test_is_causal_and_mask_fails(self, device):
         with self.assertRaisesRegex(ValueError, "CausalBias should not be used with causal=True"):
             scaled_dot_product_attention(query, key, value, attn_mask=attn_bias, is_causal=True, dropout_p=0.0)
 
+
+class TestSDPACompile(InductorTestCase):
+
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FUSED_ATTENTION, "Fused SDPA was not built for this system")
+    @parametrize("backend", PLATFORM_SPECIFIC_SDPA, name_fn=lambda x: x.name)
+    @parametrize("compile_mode", ["eager", "inductor"])
+    @parametrize(
+        "permute_order",
+        [perm + (3,) for perm in itertools.permutations([0, 1, 2])],
+    )
+    @parametrize("shape", [(2, 4, 128, 16), (4, 2, 64, 32)])
+    def test_sdpa_stride_ordering_and_backward(self, device, backend, compile_mode, permute_order, shape):
+        from torch._inductor.ir import get_stride_order
+        make_tensor = partial(
+            torch.randn,
+            shape,
+            device=device,
+            dtype=torch.float16,
+            requires_grad=False,
+        )
+
+        # Create and permute tensors
+        query, key, value = make_tensor(), make_tensor(), make_tensor()
+        query = query.permute(permute_order)
+        key = key.permute(permute_order)
+        value = value.permute(permute_order)
+
+        # Create leaves
+        query.requires_grad_()
+        key.requires_grad_()
+        value.requires_grad_()
+
+        def run_sdpa(q, k, v):
+            return torch.nn.functional.scaled_dot_product_attention(q, k, v)
+
+        if compile_mode == "inductor":
+            run_sdpa = torch.compile(run_sdpa, backend="inductor", fullgraph=True)
+        else:
+            original_run_sdpa = run_sdpa
+
+            def run_sdpa(q, k, v):
+                with torch._subclasses.CrossRefFakeMode():
+                    return original_run_sdpa(q, k, v)
+
+        with sdpa_kernel(backends=[backend]):
+            out = run_sdpa(query, key, value)
+
+        # Check out and query
+        out_stride_order = get_stride_order(out.stride())
+        query_stride_order = get_stride_order(query.stride())
+
+        self.assertEqual(
+            out_stride_order,
+            query_stride_order,
+            f"Compile mode: {compile_mode}, Backend: {backend}, "
+            f"Forward: out {out_stride_order}, query {query_stride_order}",
+        )
+
+        grad_output = torch.randn_like(out)
+        cm = torch._subclasses.CrossRefFakeMode() if compile_mode == "eager" else contextlib.nullcontext()
+
+        with cm:
+            grad_query, grad_key, grad_value = torch.autograd.grad(out, [query, key, value], grad_output)
+
+        # Check that gradient stride orders match input stride orders
+        for leaf, grad, name in [
+            (query, grad_query, "query"),
+            (key, grad_key, "key"),
+            (value, grad_value, "value"),
+        ]:
+            grad_stride_order = get_stride_order(grad.stride())
+            input_stride_order = get_stride_order(leaf.stride())
+            self.assertEqual(
+                grad_stride_order,
+                input_stride_order,
+                f"Compile mode: {compile_mode}, Backend: {backend}, "
+                f"Backward for {name}: grad {grad_stride_order}, input {input_stride_order}",
+            )
+
+
 if NOTEST_CPU:
     device_types = ("cuda", )
 else:

@@ -4297,6 +4382,7 @@ def test_is_causal_and_mask_fails(self, device):
 instantiate_device_type_tests(TestSDPACpuOnly, globals(), only_for=("cpu"))
 instantiate_device_type_tests(TestAttnBias, globals(), only_for=device_types)
 instantiate_device_type_tests(TestSDPAXpuOnly, globals(), only_for="xpu", allow_xpu=True)
+instantiate_device_type_tests(TestSDPACompile, globals(), only_for=("cuda"))
 
 if __name__ == '__main__':
     run_tests()
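
A side note on the parametrization (editorial): permute_order keeps the last (head-dim) axis fixed and permutes the first three, so each backend/compile-mode combination is exercised over six distinct memory layouts:

import itertools
orders = [perm + (3,) for perm in itertools.permutations([0, 1, 2])]
print(orders)
# [(0, 1, 2, 3), (0, 2, 1, 3), (1, 0, 2, 3),
#  (1, 2, 0, 3), (2, 0, 1, 3), (2, 1, 0, 3)]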

torch/_meta_registrations.py

Lines changed: 51 additions & 2 deletions
@@ -101,6 +101,51 @@ def check_inplace_broadcast(self_shape, *args_shape):
     )
 
 
+def _construct_strides(
+    sizes: Sequence[int],
+    fill_order: Sequence[int],
+) -> Sequence[int]:
+    """From a list of sizes and a fill order, construct the strides of the permuted tensor."""
+    # Initialize strides
+    assert len(sizes) == len(fill_order), (
+        "Length of sizes must match the length of the fill order"
+    )
+    strides = [0] * len(sizes)
+
+    # Start with stride 1 for the innermost dimension
+    current_stride = 1
+
+    # Iterate through the fill order populating strides
+    for dim in fill_order:
+        strides[dim] = current_stride
+        current_stride *= sizes[dim]
+
+    return strides
+
+
+def _permute_strides(out: torch.Tensor, query_strides: tuple[int, ...]) -> torch.Tensor:
+    """
+    Create a new tensor with the same data and shape as the input,
+    but with strides permuted based on the input tensor's stride order.
+
+    Args:
+        out (torch.Tensor): The output tensor of attention.
+        query_strides (List[int]): The stride order of the input query tensor
+
+    Returns:
+        torch.Tensor: A new tensor with same shape and data as the input,
+        but with strides permuted based on the query tensor's stride order.
+    """
+    from torch._inductor.ir import get_fill_order
+
+    fill_order = get_fill_order(query_strides)
+    assert out.storage_offset() == 0, "Only support storage_offset == 0"
+    out_strides = _construct_strides(out.shape, fill_order)
+    new_out = out.new_empty(out.shape).as_strided(out.shape, out_strides)
+    new_out.copy_(out)
+    return new_out
+
+
 @register_meta([aten.linspace, aten.logspace])
 @out_wrapper()
 def meta_linspace_logspace(

@@ -5878,7 +5923,9 @@ def meta__scaled_dot_product_efficient_attention(
     num_heads = query.size(-2)
     Kv = value.size(-1)
 
-    res = torch.empty(B, M, num_heads, Kv, dtype=query.dtype, device=query.device)
+    out_shape = (B, M, num_heads, Kv)
+    res = query.new_empty(out_shape)
+    res = _permute_strides(res, query.stride())
 
     if torch.version.hip and torch.cuda.is_available():
         """Please see: https://github.com/pytorch/pytorch/issues/146848

@@ -6131,7 +6178,9 @@ def meta__efficient_attention_forward(
     num_heads = query.size(-2)
     Kv = value.size(-1)
 
-    res = torch.empty(B, M, num_heads, Kv, dtype=query.dtype, device=query.device)
+    out_shape = (B, M, num_heads, Kv)
+    res = query.new_empty(out_shape)
+    res = _permute_strides(res, query.stride())
 
     logsumexp_batch_dim = cu_seqlens_q.size(0) - 1 if (cu_seqlens_q is not None) else B
     actual_max_seqlen_q = M
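
To see the two meta helpers in action, a short worked example (editorial, assuming _construct_strides as defined in the diff above, and assuming get_fill_order returns an ascending argsort of the strides, innermost dimension first):

import torch

# A (B, num_heads, M, K) query transposed to (B, M, num_heads, K):
query = torch.empty(2, 4, 128, 16, dtype=torch.float16).transpose(1, 2)
print(query.stride())  # (8192, 16, 2048, 1)

# Under the assumption above, get_fill_order(query.stride()) gives (3, 1, 2, 0).
fill_order = (3, 1, 2, 0)
print(_construct_strides((2, 128, 4, 16), fill_order))  # [8192, 16, 2048, 1]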

0 commit comments