@@ -64,20 +64,25 @@ bool check_prefer_cudnn_attention() {
64
64
#endif
65
65
}
66
66
67
+ // static const bool prefer_cudnn = check_prefer_cudnn_attention();
68
+ // return prefer_cudnn ? cudnn_order : default_order;
69
+ // return default_order;
70
+ // constexpr std::array<SDPBackend, num_backends> cudnn_order{
71
+ // SDPBackend::cudnn_attention,
72
+ // SDPBackend::flash_attention,
73
+ // SDPBackend::efficient_attention,
74
+ // SDPBackend::math,
75
+ // };
76
+
67
77
// flash_attention V2 is universally faster than efficient_attention and Math
68
78
// Returns the fixed backend ordering used for scaled-dot-product attention.
//
// The returned array lists, first to last:
//   flash_attention, efficient_attention, math, cudnn_attention.
// Presumably callers try the backends in this order and pick the first one
// that is usable -- TODO confirm against the call site.
//
// `params` is accepted to keep the existing signature but is not read here:
// the ordering is unconditional, and check_prefer_cudnn_attention() is no
// longer consulted, so cudnn_attention is always listed last.
std::array<SDPBackend, num_backends> priority_order(sdp_params const& params) {
  constexpr std::array<SDPBackend, num_backends> default_order{
      SDPBackend::flash_attention,
      SDPBackend::efficient_attention,
      SDPBackend::math,
      SDPBackend::cudnn_attention,
  };
  return default_order;
}
82
87
83
88
bool use_tensor_cores (sdp_params const & params, cudaDeviceProp* dprops, bool is_half) {
0 commit comments