fallback SDPA implicit causal mask to explicit for fp32 for performance · pytorch/pytorch@469b77c · GitHub

Commit 469b77c

fallback SDPA implicit causal mask to explicit for fp32 for performance
1 parent 817b225 commit 469b77c

File tree

1 file changed (+21, -0)

aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp

Lines changed: 21 additions & 0 deletions
@@ -329,6 +329,27 @@ void gpu_float_sdpa(
       {c10::kXPU, c10::xpu::current_device()});
   auto strm = GpuStreamManager::Instance().get_stream();
 
+  const auto get_tril_mask = [&]() {
+    auto opts = query.options();
+    auto bool_tril =
+        at::ones_symint(
+            {query.sym_size(-2), key.sym_size(-2)}, opts.dtype(at::kBool))
+            .tril();
+    return at::where(
+        bool_tril,
+        0.f,
+        at::scalar_tensor(-std::numeric_limits<float>::infinity(), opts));
+  };
+
+  // OneDNN doesn't support an fp32 ukernel for the implicit causal mask,
+  // and the reference implementation is worse than aten math + explicit
+  // causal mask. Fall back to an explicit causal mask until OneDNN v3.9,
+  // which has an fp32 ukernel for the implicit causal mask.
+  if (is_causal && query.dtype() == at::kFloat) {
+    attn_mask = get_tril_mask();
+    is_causal = false;
+  }
+
   std::vector<dnnl::graph::logical_tensor> l_inputs, l_outputs;
   std::optional<dnnl::graph::compiled_partition> compiled_partition;

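For context on why the two paths are interchangeable: is_causal=true asks the backend to mask the upper triangle of the attention scores internally, while the fallback materializes the same mask as an additive tensor, 0 on and below the diagonal and -inf above it (for a 3x3 case: [[0, -inf, -inf], [0, 0, -inf], [0, 0, 0]]). Below is a minimal standalone libtorch sketch, not part of this commit, that checks the equivalence through the public scaled_dot_product_attention op; the shapes, CPU device, and tolerances are illustrative assumptions.

// sdpa_mask_equivalence.cpp -- sketch only; build against libtorch.
#include <torch/torch.h>

#include <iostream>
#include <limits>

int main() {
  const int64_t L = 4;  // query (target) sequence length
  const int64_t S = 4;  // key/value (source) sequence length
  const int64_t E = 8;  // head dimension
  auto q = torch::randn({1, 1, L, E});
  auto k = torch::randn({1, 1, S, E});
  auto v = torch::randn({1, 1, S, E});

  // Explicit causal mask, built the same way as get_tril_mask() above:
  // 0 where attention is allowed (lower triangle), -inf elsewhere.
  auto bool_tril = torch::ones({L, S}, torch::kBool).tril();
  auto mask = torch::where(
      bool_tril,
      torch::zeros({}, q.options()),
      torch::full({}, -std::numeric_limits<float>::infinity(), q.options()));

  // Implicit path: the backend applies causal masking internally.
  auto implicit_out = at::scaled_dot_product_attention(
      q, k, v, /*attn_mask=*/{}, /*dropout_p=*/0.0, /*is_causal=*/true);

  // Explicit path: same math, with the mask added to the attention scores.
  auto explicit_out = at::scaled_dot_product_attention(
      q, k, v, mask, /*dropout_p=*/0.0, /*is_causal=*/false);

  std::cout << std::boolalpha << "outputs match: "
            << torch::allclose(implicit_out, explicit_out, 1e-5, 1e-5)
            << std::endl;
}

Both calls compute softmax over identically masked scores, so they agree up to floating-point tolerance; the commit trades the implicit flag for the materialized mask only where OneDNN lacks the fp32 ukernel.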
0 commit comments
