[SDPA-CUDNN] Make CuDNN Attention Opt in

drisspg · drisspg · commit df8058f16d3f · 2024-10-21T16:10:11.000-07:00
ghstack-source-id: 58f70e7 Pull Request resolved: #138522
diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
@@ -64,20 +64,25 @@ bool check_prefer_cudnn_attention() {
 #endif
 }
 
+// Previously: static const bool prefer_cudnn = check_prefer_cudnn_attention();
+//             return prefer_cudnn ? cudnn_order : default_order;
+// Now priority_order() always returns default_order; the old cudnn_order was:
+// constexpr std::array<SDPBackend, num_backends> cudnn_order{
+//     SDPBackend::cudnn_attention,
+//     SDPBackend::flash_attention,
+//     SDPBackend::efficient_attention,
+//     SDPBackend::math,
+//     };
+
 // flash_attention V2 is universally faster than efficient_attention and Math
 std::array<SDPBackend, num_backends> priority_order(sdp_params const& params) {
   constexpr std::array<SDPBackend, num_backends> default_order{
       SDPBackend::flash_attention,
-      SDPBackend::cudnn_attention,
       SDPBackend::efficient_attention,
-      SDPBackend::math};
-  constexpr std::array<SDPBackend, num_backends> cudnn_order{
+      SDPBackend::math, // cuDNN is opt-in, so it is ranked last (below)
       SDPBackend::cudnn_attention,
-      SDPBackend::flash_attention,
-      SDPBackend::efficient_attention,
-      SDPBackend::math};
-  static const bool prefer_cudnn = check_prefer_cudnn_attention();
-  return prefer_cudnn ? cudnn_order : default_order;
+      };
+  return default_order; // fixed order; params is not consulted here
 }
 
 bool use_tensor_cores(sdp_params const& params, cudaDeviceProp* dprops, bool is_half) {