[cuDNN][SDPA] Bail out of cuDNN SDPA for seqlen 1 inputs (#138531) · rahulsingh-intel/pytorch@cc59e91 · GitHub
Commit cc59e91

eqy authored and malfet committed
[cuDNN][SDPA] Bail out of cuDNN SDPA for seqlen 1 inputs (pytorch#138531)
Forwarded pytorch#138529 to the cuDNN team, but for now we want to avoid dispatching to unsupported cases.

Pull Request resolved: pytorch#138531
Approved by: https://github.com/malfet
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
1 parent 611a307 commit cc59e91
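
For context, a minimal sketch (not part of this commit) of the kind of input being routed away from cuDNN, assuming a CUDA build with cuDNN attention available and the (batch, num_heads, seq_len, head_dim) layout that scaled_dot_product_attention expects:

import torch
import torch.nn.functional as F

# A query with sequence length 1 -- the kind of input this change keeps
# away from the cuDNN SDPA backend.
q = torch.randn(2, 8, 1, 64, device="cuda", dtype=torch.bfloat16)
k = torch.randn(2, 8, 2, 64, device="cuda", dtype=torch.bfloat16)
v = torch.randn(2, 8, 2, 64, device="cuda", dtype=torch.bfloat16)

# With the new guard, check_cudnn_tensor_shapes() returns false for
# seq_len == 1, so default dispatch falls through to another backend
# (flash / memory-efficient / math) instead of the unsupported kernel.
out = F.scaled_dot_product_attention(q, k, v, attn_mask=None,
                                     dropout_p=0.0, is_causal=False)
print(out.shape)  # torch.Size([2, 8, 1, 64])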

2 files changed, +31 -0 lines changed

aten/src/ATen/native/transformers/cuda/sdp_utils.cpp

Lines changed: 6 additions & 0 deletions
@@ -409,6 +409,12 @@ bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) {
       return false;
     }
   }
+  if (s_q == 1 || s_k == 1) {
+    if (debug) {
+      TORCH_WARN_ONCE("cudnn SDPA does not support sequence length 1.");
+    }
+    return false;
+  }
   return true;
 }
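
As a rough sketch of how the debug branch above surfaces to Python users (an assumption based on the usual SDPA dispatch behavior of re-running the checks with debug output once no backend qualifies; not part of this commit):

import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

q = torch.randn(2, 8, 1, 64, device="cuda", dtype=torch.float16)  # seq_len 1
k = torch.randn(2, 8, 8, 64, device="cuda", dtype=torch.float16)
v = torch.randn(2, 8, 8, 64, device="cuda", dtype=torch.float16)

# Restrict dispatch to the cuDNN backend only. Since the new guard rejects
# seq_len == 1, no backend qualifies; the warning added above should then
# be emitted along with the "no available kernel" RuntimeError.
with sdpa_kernel(backends=[SDPBackend.CUDNN_ATTENTION]):
    try:
        F.scaled_dot_product_attention(q, k, v)
    except RuntimeError as e:
        print(e)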

test/test_transformers.py

Lines changed: 25 additions & 0 deletions
@@ -2463,6 +2463,31 @@ def test_cudnn_attention_different_dk_dv(self, device):
 
         self.assertEqual(actual.contiguous(), math_ref.contiguous().to(dtype), atol=1e-3, rtol=1e-2)
 
+    @skipIfRocm  # No cuDNN Attention
+    @unittest.skipIf(not PLATFORM_SUPPORTS_CUDNN_ATTENTION, "cuDNN Attention is not supported on this system")
+    def test_fused_attention_different_dk_dv(self, device):
+        dtype = torch.bfloat16
+        make_tensor = partial(torch.rand, device=device, dtype=dtype, requires_grad=True)
+        batch, num_heads, head_dim_k, head_dim_v = 32, 16, 128, 64
+        seq_len = 640
+        q_shape = SdpaShape(batch, num_heads, 1, head_dim_k)
+        k_shape = SdpaShape(batch, num_heads, 2, head_dim_k)
+        v_shape = SdpaShape(batch, num_heads, 2, head_dim_v)
+        query, key, value = make_tensor(q_shape), make_tensor(k_shape), make_tensor(v_shape)
+
+        # test that we do not dispatch to cuDNN for an unsupported case
+        actual = torch.nn.functional.scaled_dot_product_attention(
+            query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False)
+        with sdpa_kernel(backends=[SDPBackend.MATH]):
+            math_ref = torch.nn.functional.scaled_dot_product_attention(
+                query.contiguous().to(torch.float32),
+                key.contiguous().to(torch.float32),
+                value.contiguous().to(torch.float32),
+                attn_mask=None, dropout_p=0.0, is_causal=False)
+
+        self.assertEqual(actual.contiguous(), math_ref.contiguous().to(dtype), atol=1e-3, rtol=1e-2)
+
+
     @skipIfRocm  # No cuDNN Attention
     @unittest.skipIf(not PLATFORM_SUPPORTS_CUDNN_ATTENTION, "cuDNN Attention is not supported on this system")
     def test_cudnn_attention_fail_d128(self, device):
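
As a usage note (not part of the diff): the new test should be runnable on its own with something like python test/test_transformers.py -k test_fused_attention_different_dk_dv, on a CUDA build where PLATFORM_SUPPORTS_CUDNN_ATTENTION holds; on ROCm it is skipped via skipIfRocm, matching the decorators above.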
