[MPS] fix attention enable_gqa crash on mps (#150067) · pytorch/pytorch@9b4f085 · GitHub
Commit 9b4f085
pytorchbot and Isalia20 authored
[MPS] fix attention enable_gqa crash on mps (#150067)
[MPS] fix attention enable_gqa crash on mps (#149147)

Fixes #149132

Pull Request resolved: #149147
Approved by: https://github.com/malfet

(cherry picked from commit dd6e9df)

Co-authored-by: Isalia20 <irakli.salia854@gmail.com>
1 parent d29e4c8 commit 9b4f085
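For context, here is a minimal, self-contained sketch of the user-facing behavior this commit fixes. It is illustrative, not part of the change: it assumes an MPS-capable macOS machine and a PyTorch build with enable_gqa support, and borrows its shapes from the new test below.

import torch
import torch.nn.functional as F

# 32 query heads vs. 16 key/value heads, as in the new test.
q = torch.randn(2, 32, 7, 23, device="mps")
k = torch.randn(2, 16, 17, 23, device="mps")
v = torch.randn(2, 16, 17, 23, device="mps")

# Before this commit this call crashed on MPS; with the fix it matches
# the CPU reference.
y = F.scaled_dot_product_attention(q, k, v, enable_gqa=True)
y_ref = F.scaled_dot_product_attention(q.cpu(), k.cpu(), v.cpu(), enable_gqa=True)
torch.testing.assert_close(y.cpu(), y_ref, rtol=1e-4, atol=1e-4)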

File tree

3 files changed: +48 −1 lines changed

aten/src/ATen/native/mps/operations/Attention.mm

Lines changed: 3 additions & 1 deletion
@@ -44,7 +44,8 @@
     TORCH_CHECK(!attn_mask.has_value(),
                 "_scaled_dot_product_attention: Explicit attn_mask should not be set when is_causal=True");
   }
-
+  TORCH_CHECK(query.size(-3) == key.size(-3) && key.size(-3) == value.size(-3),
+              "number of heads in query/key/value should match");
   TORCH_CHECK(dropout_p == 0.0, "_scaled_dot_product_attention_math_for_mps: dropout_p != 0.0 is not supported");
   TORCH_CHECK(macOS15_0_plus || (query.is_contiguous() && key.is_contiguous() && value.is_contiguous()),
               "_scaled_dot_product_attention_math_for_mps: query, key, and value must be contiguous");
@@ -55,6 +56,7 @@
   auto [q_, sq] = ensure_4d(query);
   auto [k_, sk] = ensure_4d(key);
   auto [v_, sv] = ensure_4d(value);
+
   std::optional<Tensor> mask_;
   if (attn_mask) {
     auto maskExpandedDims = query.sizes().vec();
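The new TORCH_CHECK means the low-level MPS math kernel now assumes head counts already match; the GQA expansion happens one level up, in attention.cpp below. As a rough illustration (assuming the internal aten op is directly callable via torch.ops.aten, which bypasses that expansion), a mismatched call now fails with a readable error instead of crashing:

import torch

q = torch.randn(2, 32, 7, 23, device="mps")   # 32 heads
k = torch.randn(2, 16, 17, 23, device="mps")  # 16 heads
v = torch.randn(2, 16, 17, 23, device="mps")

try:
    # Direct call to the internal kernel, skipping the GQA expansion.
    torch.ops.aten._scaled_dot_product_attention_math_for_mps(q, k, v)
except RuntimeError as e:
    print(e)  # "number of heads in query/key/value should match"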

aten/src/ATen/native/transformers/attention.cpp

Lines changed: 22 additions & 0 deletions
@@ -759,6 +759,28 @@ Tensor scaled_dot_product_attention(
         && !(GradMode::is_enabled() && any_inputs_require_grad)
         && (all_contiguous || mps::is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_15_0_PLUS))
         && !any_nested) {
+      if (enable_gqa) {
+        int64_t q_heads = query_.size(-3);
+        int64_t k_heads = key.size(-3);
+        int64_t repeat_factor = q_heads / k_heads;
+
+        if (repeat_factor > 1) {
+          TORCH_CHECK(q_heads % k_heads == 0,
+                      "For GQA, the query tensor's head dimension (" + std::to_string(q_heads) +
+                      ") must be divisible by the key tensor's head dimension (" + std::to_string(k_heads) + ").");
+          auto repeated_key = key.repeat_interleave(repeat_factor, /*dim=*/-3);
+          auto repeated_value = value.repeat_interleave(repeat_factor, /*dim=*/-3);
+          return std::get<0>(at::_scaled_dot_product_attention_math_for_mps(
+              query_,
+              repeated_key,
+              repeated_value,
+              attn_mask,
+              dropout_p,
+              is_causal,
+              std::nullopt, /*dropout_mask*/
+              scale));
+        }
+      }
       return std::get<0>(at::_scaled_dot_product_attention_math_for_mps(
           query_,
           key,
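The fix reduces GQA to ordinary multi-head attention: each key/value head is repeated q_heads / k_heads times along the head dimension (-3) before dispatching to the existing MPS math kernel. A device-independent sketch of that equivalence (illustrative shapes; runs on CPU):

import torch
import torch.nn.functional as F

q = torch.randn(2, 32, 7, 23)   # [batch, q_heads, L, head_dim]
k = torch.randn(2, 16, 17, 23)  # [batch, k_heads, S, head_dim]
v = torch.randn(2, 16, 17, 23)

repeat_factor = q.size(-3) // k.size(-3)  # 2 query heads per kv head
# repeat_interleave duplicates each kv head in place, so query head i
# attends against kv head i // repeat_factor, which is the GQA pairing.
k_rep = k.repeat_interleave(repeat_factor, dim=-3)
v_rep = v.repeat_interleave(repeat_factor, dim=-3)

y_gqa = F.scaled_dot_product_attention(q, k, v, enable_gqa=True)
y_mha = F.scaled_dot_product_attention(q, k_rep, v_rep)
torch.testing.assert_close(y_gqa, y_mha)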

test/test_mps.py

Lines changed: 23 additions & 0 deletions
@@ -9909,6 +9909,29 @@ def test_sdpa_mask_5d(
         y_ref = F.scaled_dot_product_attention(q.cpu(), k.cpu(), v.cpu(), attn_mask=mask.cpu(), dropout_p=0.0, is_causal=False)
         self._compare_tensors(y.cpu(), y_ref)
 
+    @parametrize("dtype", [torch.float16, torch.float32])
+    @parametrize("is_causal", [True, False])
+    def test_sdpa_enable_gqa(self, dtype, is_causal):
+        q_heads = 32
+        key_heads = 16
+        L = 7
+        S = 17
+        HS = 23
+
+        q = torch.randn([2, q_heads, L, HS], dtype=dtype, device="mps")
+        k = torch.randn([2, key_heads, S, HS], dtype=dtype, device="mps")
+        v = torch.randn([2, key_heads, S, HS], dtype=dtype, device="mps")
+
+        y_ref = F.scaled_dot_product_attention(
+            q.cpu(), k.cpu(), v.cpu(), dropout_p=0.0, is_causal=is_causal, enable_gqa=True,
+        )
+
+        with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]):
+            y = F.scaled_dot_product_attention(
+                q, k, v, dropout_p=0.0, is_causal=is_causal, enable_gqa=True,
+            )
+        self._compare_tensors(y.cpu(), y_ref)
+
 
 class TestGatherScatter(TestCaseMPS):
     def test_slicing_with_step(self):
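The test pins the MATH backend via sdpa_kernel so the fixed MPS math path is the one exercised, then compares against the CPU reference. One common way to run just this test locally (an MPS device is required; the -k filter is the usual PyTorch test-runner mechanism):

python test/test_mps.py -k test_sdpa_enable_gqa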

0 commit comments