bug fix: ensure 4d input in _scaled_dot_product_attention_math_mps (#… · pytorch/pytorch@b9a22b3 · GitHub

Commit b9a22b3

hellopahe, Skylion007, and malfet authored and committed
bug fix: ensure 4d input in _scaled_dot_product_attention_math_mps (#146623)
This PR addresses an issue in the MPS backend's `_scaled_dot_product_attention_math_mps`: a 3D input of shape (num_heads, seq_len, query_dim) is not automatically treated as (1, num_heads, seq_len, query_dim), even though CPU and CUDA infer the missing batch dimension. The fix adds a small utility function that ensures a 4D shape before building the graph and squeezes the result back to 3D afterwards.

The issue was found in hiyouga/LLaMA-Factory#6835: in [transformers qwen2_vl](https://github.com/huggingface/transformers/blob/1590c664306766f32ba68c50e67f14d61b16925d/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L373C14-L373C93), 3D q/k/v tensors are passed into the SDPA function, which leads to an error on MPS. Since this pattern may appear elsewhere in the transformers codebase, it makes more sense for consistency to keep the same behavior across all platforms.

---

reproduce code:

```
import torch
import torch.nn.functional as F

head_num, seq_len, embed_dim = 16, 16, 80
bsz = 1

q = torch.randn(head_num, seq_len, embed_dim)
k = torch.randn(head_num, seq_len, embed_dim)
v = torch.randn(head_num, seq_len, embed_dim)
attention_mask = torch.ones(1, seq_len, seq_len)

oo_cpu = F.scaled_dot_product_attention(
    q.to("cpu"), k.to("cpu"), v.to("cpu"), attention_mask.to("cpu"), dropout_p=0.0
)

if torch.backends.mps.is_available():
    oo_mps = F.scaled_dot_product_attention(
        q.to("mps"), k.to("mps"), v.to("mps"), attention_mask.to("mps"), dropout_p=0.0
    )
    assert torch.allclose(oo_cpu, oo_mps.to("cpu"), atol=1e-5)
```

error outputs:

```
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniconda/base/envs/torch-dev/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-2-5169b8d2c5dd>", line 21, in <module>
    oo_mps = F.scaled_dot_product_attention(
IndexError: Dimension out of range (expected to be in range of [-3, 2], but got 3)
```

hardware and envs:

```
torch        2.6.0
apple m3 max
```

---

Pull Request resolved: #146623
Approved by: https://github.com/malfet

Co-authored-by: Aaron Gokaslan <aaronGokaslan@gmail.com>
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
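The repro relies on the convention, already followed by CPU and CUDA, that 3D q/k/v are treated as if a leading batch dimension of 1 were present. A quick sanity check of that convention on CPU (illustrative sketch, not part of the commit):

```python
import torch
import torch.nn.functional as F

# 3D q/k/v: (num_heads, seq_len, head_dim)
q, k, v = (torch.randn(16, 16, 80) for _ in range(3))

# CPU computes the same result whether or not the batch dim of 1 is explicit.
out_3d = F.scaled_dot_product_attention(q, k, v)
out_4d = F.scaled_dot_product_attention(
    q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0)
).squeeze(0)
print(torch.allclose(out_3d, out_4d, atol=1e-6))  # expected: True
```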
1 parent 17a8085 commit b9a22b3

File tree

2 files changed (+109, -54 lines)

aten/src/ATen/native/mps/operations/Attention.mm

Lines changed: 76 additions & 54 deletions
```diff
@@ -19,6 +19,15 @@
 namespace at {
 namespace native {
 
+// expand potential 3d to 4d tensor
+static inline std::tuple<Tensor, bool> ensure_4d(const Tensor& x) {
+  if (x.dim() == 3) {
+    return {x.unsqueeze(0), true};
+  } else {
+    return {x, false};
+  }
+}
+
 std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math_mps(const Tensor& query,
                                                                   const Tensor& key,
                                                                   const Tensor& value,
@@ -39,6 +48,11 @@
   TORCH_CHECK(!query.is_nested() && !key.is_nested() && !value.is_nested(),
               "_scaled_dot_product_attention_math_for_mps: query, key, and value must not be nested");
 
+  // Ensure 4D tensors
+  auto [q_, sq] = ensure_4d(query);
+  auto [k_, sk] = ensure_4d(key);
+  auto [v_, sv] = ensure_4d(value);
+
   using namespace mps;
   struct CachedGraph : public MPSCachedGraph {
     CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
@@ -49,67 +63,70 @@
     MPSGraphTensor* outputTensor = nil;
     MPSGraphTensor* attnTensor = nil;
   };
-  int64_t batchSize = query.size(0);
-  int64_t num_head = query.size(1);
-  int64_t qSize = query.size(2);
-  int64_t headSize = query.size(3);
-  int64_t maxSeqLength = key.size(2);
+  int64_t batchSize = q_.size(0);
+  int64_t num_head = q_.size(1);
+  int64_t qSize = q_.size(2);
+  int64_t headSize = q_.size(3);
+  int64_t maxSeqLength = k_.size(2);
   auto out = at::empty({batchSize, num_head, qSize, headSize}, query.options());
   auto attn = at::empty({batchSize, num_head, qSize, maxSeqLength}, query.options());
   auto scale_factor = sdp::calculate_scale(query, scale).expect_float();
   @autoreleasepool {
-    auto mkey = __func__ + getTensorsStringKey({query, key, value}) + ":" + std::to_string(is_causal) + ":" +
+    auto mkey = __func__ + getTensorsStringKey({q_, k_, v_}) + ":" + std::to_string(is_causal) + ":" +
         std::to_string(attn_mask.has_value());
-    auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(mkey, [&](auto mpsGraph, auto graph) {
-      auto qTensor = mpsGraphRankedPlaceHolder(mpsGraph, query);
-      auto kTensor = mpsGraphRankedPlaceHolder(mpsGraph, key);
-      auto vTensor = mpsGraphRankedPlaceHolder(mpsGraph, value);
-      auto kT = [mpsGraph transposeTensor:kTensor dimension:2 withDimension:3 name:nil];
-      auto scaleTensor = [mpsGraph constantWithScalar:scale_factor shape:getMPSShape({1}) dataType:MPSDataTypeFloat32];
+    auto cachedGraph =
+        LookUpOrCreateCachedGraph<CachedGraph>(mkey, [&, q_ = q_, k_ = k_, v_ = v_](auto mpsGraph, auto graph) {
+          auto qTensor = mpsGraphRankedPlaceHolder(mpsGraph, q_);
+          auto kTensor = mpsGraphRankedPlaceHolder(mpsGraph, k_);
+          auto vTensor = mpsGraphRankedPlaceHolder(mpsGraph, v_);
+          auto kT = [mpsGraph transposeTensor:kTensor dimension:2 withDimension:3 name:nil];
+          auto scaleTensor = [mpsGraph constantWithScalar:scale_factor
+                                                    shape:getMPSShape({1})
+                                                 dataType:MPSDataTypeFloat32];
 
-      auto maskedMM = [mpsGraph matrixMultiplicationWithPrimaryTensor:qTensor secondaryTensor:kT name:nil];
+          auto maskedMM = [mpsGraph matrixMultiplicationWithPrimaryTensor:qTensor secondaryTensor:kT name:nil];
 
-      if (macOS15_0_plus && [maskedMM dataType] == MPSDataTypeFloat32) {
-        // TODO: In MacOS15 beta, there is a MPSGraph issue when the SDPA sequence gets remapped to use
-        // an improved kernel for the computation, causing NaNs in the result. This identity prevents the remapping.
-        // Limit the availability check once a fix lands.
-        maskedMM = [mpsGraph identityWithTensor:maskedMM name:nil];
-      }
+          if (macOS15_0_plus && [maskedMM dataType] == MPSDataTypeFloat32) {
+            // TODO: In MacOS15 beta, there is a MPSGraph issue when the SDPA sequence gets remapped to use
+            // an improved kernel for the computation, causing NaNs in the result. This identity prevents the remapping.
+            // Limit the availability check once a fix lands.
+            maskedMM = [mpsGraph identityWithTensor:maskedMM name:nil];
+          }
 
-      // upcasting to float32 if needed to improve precision when multiplying by the scale factor
-      if ([maskedMM dataType] != MPSDataTypeFloat32) {
-        maskedMM = [mpsGraph castTensor:maskedMM toType:MPSDataTypeFloat32 name:nil];
-      }
-      maskedMM = [mpsGraph multiplicationWithPrimaryTensor:maskedMM secondaryTensor:scaleTensor name:nil];
-      if ([maskedMM dataType] != qTensor.dataType) {
-        maskedMM = [mpsGraph castTensor:maskedMM toType:qTensor.dataType name:nil];
-      }
+          // upcasting to float32 if needed to improve precision when multiplying by the scale factor
+          if ([maskedMM dataType] != MPSDataTypeFloat32) {
+            maskedMM = [mpsGraph castTensor:maskedMM toType:MPSDataTypeFloat32 name:nil];
+          }
+          maskedMM = [mpsGraph multiplicationWithPrimaryTensor:maskedMM secondaryTensor:scaleTensor name:nil];
+          if ([maskedMM dataType] != qTensor.dataType) {
+            maskedMM = [mpsGraph castTensor:maskedMM toType:qTensor.dataType name:nil];
+          }
 
-      if (is_causal) {
-        auto causalMask = [mpsGraph constantWithScalar:1.0f
-                                                 shape:getMPSShape({qSize, maxSeqLength})
-                                              dataType:MPSDataTypeBool];
-        causalMask = [mpsGraph bandPartWithTensor:causalMask numLower:-1 numUpper:0 name:nil];
-        auto minusInf = [mpsGraph constantWithScalar:-1e20 shape:maskedMM.shape dataType:maskedMM.dataType];
-        maskedMM = [mpsGraph selectWithPredicateTensor:causalMask
-                                   truePredicateTensor:maskedMM
-                                  falsePredicateTensor:minusInf
-                                                  name:nil];
-      } else if (attn_mask) {
-        graph->maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, *attn_mask);
-        maskedMM = [mpsGraph additionWithPrimaryTensor:maskedMM secondaryTensor:graph->maskTensor name:nil];
-      }
-      auto sm = [mpsGraph softMaxWithTensor:maskedMM axis:3 name:nil];
-      auto output = [mpsGraph matrixMultiplicationWithPrimaryTensor:sm secondaryTensor:vTensor name:nil];
-      graph->qTensor = qTensor;
-      graph->kTensor = kTensor;
-      graph->vTensor = vTensor;
-      graph->outputTensor = output;
-      graph->attnTensor = sm;
-    });
-    auto qPlaceholder = Placeholder(cachedGraph->qTensor, query);
-    auto kPlaceholder = Placeholder(cachedGraph->kTensor, key);
-    auto vPlaceholder = Placeholder(cachedGraph->vTensor, value);
+          if (is_causal) {
+            auto causalMask = [mpsGraph constantWithScalar:1.0f
+                                                     shape:getMPSShape({qSize, maxSeqLength})
+                                                  dataType:MPSDataTypeBool];
+            causalMask = [mpsGraph bandPartWithTensor:causalMask numLower:-1 numUpper:0 name:nil];
+            auto minusInf = [mpsGraph constantWithScalar:-1e20 shape:maskedMM.shape dataType:maskedMM.dataType];
+            maskedMM = [mpsGraph selectWithPredicateTensor:causalMask
+                                       truePredicateTensor:maskedMM
+                                      falsePredicateTensor:minusInf
+                                                      name:nil];
+          } else if (attn_mask) {
+            graph->maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, *attn_mask);
+            maskedMM = [mpsGraph additionWithPrimaryTensor:maskedMM secondaryTensor:graph->maskTensor name:nil];
+          }
+          auto sm = [mpsGraph softMaxWithTensor:maskedMM axis:3 name:nil];
+          auto output = [mpsGraph matrixMultiplicationWithPrimaryTensor:sm secondaryTensor:vTensor name:nil];
+          graph->qTensor = qTensor;
+          graph->kTensor = kTensor;
+          graph->vTensor = vTensor;
+          graph->outputTensor = output;
+          graph->attnTensor = sm;
+        });
+    auto qPlaceholder = Placeholder(cachedGraph->qTensor, q_);
+    auto kPlaceholder = Placeholder(cachedGraph->kTensor, k_);
+    auto vPlaceholder = Placeholder(cachedGraph->vTensor, v_);
     auto outputPlaceholder = Placeholder(cachedGraph->outputTensor, out);
     auto attnPlaceholder = Placeholder(cachedGraph->attnTensor, attn);
     NSDictionary* feeds = nil;
@@ -122,8 +139,13 @@
     NSDictionary* outs = dictionaryFromPlaceholders(outputPlaceholder, attnPlaceholder);
     runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outs);
   }
-  return {out, attn};
+
+  // Squeeze back to 3D
+  auto final_out = (sq ? out.squeeze(0) : out);
+  auto final_attn = (sq ? attn.squeeze(0) : attn);
+
+  return {std::move(final_out), std::move(final_attn)};
 }
 
 } // namespace native
-} // namespace at
+} // namespace at
```
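In Python terms, the change above is equivalent to unsqueezing 3D inputs to 4D before running the kernel and squeezing the result back afterwards. A minimal user-side sketch of that same round trip, which can also serve as a workaround on torch builds that predate this fix (the function name `sdpa_mps_3d` is illustrative, not part of the patch):

```python
import torch
import torch.nn.functional as F

def sdpa_mps_3d(q, k, v, attn_mask=None, dropout_p=0.0):
    # Mirror of the new ensure_4d helper: add a batch dim of 1 so the MPS math
    # backend sees 4D inputs, then drop it again to preserve the 3D layout.
    out = F.scaled_dot_product_attention(
        q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0),
        attn_mask=attn_mask, dropout_p=dropout_p,
    )
    return out.squeeze(0)

if torch.backends.mps.is_available():
    q = torch.randn(16, 16, 80, device="mps")
    k = torch.randn(16, 16, 80, device="mps")
    v = torch.randn(16, 16, 80, device="mps")
    print(sdpa_mps_3d(q, k, v).shape)  # torch.Size([16, 16, 80])
```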

test/test_mps.py

Lines changed: 33 additions & 0 deletions
```diff
@@ -9915,6 +9915,39 @@ def test_sdpa_mask_fp16_L6(self):
     def test_sdpa_mask_fp16_L6_S17_NH23_HS121(self):
         self._test_sdpa_mask(torch.float16, 7, 17, 23, 121)
 
+    def _test_sdpa_3d_input(self, dtype):
+        head_num, seq_len, embed_dim = 16, 16, 80
+
+        q = torch.randn(head_num, seq_len, embed_dim, dtype=dtype)
+        k = torch.randn(head_num, seq_len, embed_dim, dtype=dtype)
+        v = torch.randn(head_num, seq_len, embed_dim, dtype=dtype)
+        attention_mask = torch.ones(1, seq_len, seq_len, dtype=dtype)
+
+        with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]):
+            y = F.scaled_dot_product_attention(
+                q.to("mps"),
+                k.to("mps"),
+                v.to("mps"),
+                attention_mask.to("mps"),
+                dropout_p=0.0
+            )
+
+            y_ref = F.scaled_dot_product_attention(
+                q.to("cpu"),
+                k.to("cpu"),
+                v.to("cpu"),
+                attention_mask.to("cpu"),
+                dropout_p=0.0
+            )
+
+            self._compare_tensors(y.cpu(), y_ref)
+
+    def test_sdpa_3d_input_fp32(self):
+        self._test_sdpa_3d_input(torch.float32)
+
+    def test_sdpa_3d_input_fp16(self):
+        self._test_sdpa_3d_input(torch.float16)
+
 
 class TestGatherScatter(TestCaseMPS):
     def test_slicing_with_step(self):
```
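For readers who want to verify the fix without running the full test_mps.py harness, a self-contained variant of the new test is sketched below (tolerances are illustrative; the suite's `_compare_tensors` helper applies its own):

```python
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

if torch.backends.mps.is_available():
    q, k, v = (torch.randn(16, 16, 80) for _ in range(3))
    mask = torch.ones(1, 16, 16)

    # Pin the math backend, as the new test does, and compare MPS against CPU.
    with sdpa_kernel([SDPBackend.MATH]):
        y_mps = F.scaled_dot_product_attention(
            q.to("mps"), k.to("mps"), v.to("mps"), mask.to("mps"), dropout_p=0.0)
        y_cpu = F.scaled_dot_product_attention(q, k, v, mask, dropout_p=0.0)

    torch.testing.assert_close(y_mps.cpu(), y_cpu, atol=1e-5, rtol=1e-5)
    print("MPS and CPU agree for 3D SDPA inputs")
```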
