[Intel GPU] allow_tf32 for oneDNN backend - XPU part by ZhiweiYan-96 · Pull Request #137570 · pytorch/pytorch · GitHub

[Intel GPU] allow_tf32 for oneDNN backend - XPU part #137570


Status: Closed (wants to merge 40 commits)

Commits (40)
7cb460c  Update  ZhiweiYan-96  Oct 9, 2024
aa6eb63  Update  ZhiweiYan-96  Oct 12, 2024
ac3000e  Update  ZhiweiYan-96  Oct 15, 2024
7c7f5bd  Update  ZhiweiYan-96  Oct 16, 2024
a7ecf30  Update  ZhiweiYan-96  Oct 17, 2024
e14a980  Update  ZhiweiYan-96  Oct 17, 2024
48d0530  Update  ZhiweiYan-96  Oct 17, 2024
d9ab777  Update  ZhiweiYan-96  Oct 21, 2024
b6e6e4e  Update  ZhiweiYan-96  Oct 26, 2024
5a6f59a  Update  ZhiweiYan-96  Oct 28, 2024
60fb9f7  Update  ZhiweiYan-96  Oct 29, 2024
3bea915  Update  ZhiweiYan-96  Nov 13, 2024
06d1680  Update  ZhiweiYan-96  Nov 13, 2024
bf3c110  Update  ZhiweiYan-96  Nov 13, 2024
8e74b7c  Update  ZhiweiYan-96  Nov 14, 2024
2e0f344  Update  ZhiweiYan-96  Nov 20, 2024
4bc77f8  Update  ZhiweiYan-96  Nov 27, 2024
2c667ac  Update  ZhiweiYan-96  Nov 27, 2024
bc921a5  Update  ZhiweiYan-96  Nov 27, 2024
f0d657e  Update  ZhiweiYan-96  Nov 28, 2024
ab586cd  Update  ZhiweiYan-96  Dec 4, 2024
b91414a  Update  ZhiweiYan-96  Dec 5, 2024
6ce003d  Update  ZhiweiYan-96  Dec 5, 2024
c159f6b  Update  ZhiweiYan-96  Dec 5, 2024
0112cb7  Update  ZhiweiYan-96  Dec 6, 2024
4a6ac2d  Update  ZhiweiYan-96  Dec 11, 2024
ea9e69b  Update  ZhiweiYan-96  Dec 12, 2024
6c87ae6  Update  ZhiweiYan-96  Dec 17, 2024
198cc01  Update  ZhiweiYan-96  Dec 18, 2024
ea08036  Update  ZhiweiYan-96  Dec 20, 2024
913e675  Update  ZhiweiYan-96  Dec 23, 2024
5c6fac3  Update  ZhiweiYan-96  Dec 25, 2024
f95ada4  Update  ZhiweiYan-96  Dec 30, 2024
096fd09  Update  ZhiweiYan-96  Jan 6, 2025
689dadf  Update  ZhiweiYan-96  Jan 7, 2025
5f7d0e7  Update  ZhiweiYan-96  Jan 9, 2025
fb2a6c4  Update  ZhiweiYan-96  Jan 20, 2025
80ea731  Update  ZhiweiYan-96  Feb 12, 2025
bbad26b  Update  guangyey  Feb 15, 2025
cc3eefe  Update  ZhiweiYan-96  Feb 16, 2025

Changes from all commits

12 changes: 12 additions & 0 deletions aten/src/ATen/Context.cpp
@@ -137,6 +137,18 @@ std::array<at::SDPBackend, at::num_sdp_backends> Context::sDPPriorityOrder() {
return sdp_priority_order;
}

bool Context::allowTF32OneDNN() const {
return allow_tf32_onednn;
}

void Context::setAllowTF32OneDNN(bool b) {
#ifdef USE_XPU
allow_tf32_onednn = b;
#else
TORCH_WARN("TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU support.");
#endif
}

bool Context::userEnabledFlashSDP() const {
return enabled_flashSDP;
}
3 changes: 3 additions & 0 deletions aten/src/ATen/Context.h
@@ -333,6 +333,8 @@ class TORCH_API Context {
void setFloat32MatmulPrecision(const std::string& s);
bool allowTF32CuDNN() const;
void setAllowTF32CuDNN(bool);
bool allowTF32OneDNN() const;
void setAllowTF32OneDNN(bool);
bool allowTF32CuBLAS() const;
void setAllowTF32CuBLAS(bool);
Float32MatmulPrecision float32MatmulPrecision() const;
@@ -422,6 +424,7 @@ class TORCH_API Context {
bool allow_bf16_reduction_cublas = true;
bool allow_fp16_accumulation_cublas = false;
bool enabled_mkldnn = true;
bool allow_tf32_onednn = false;
bool enabled_nnpack = true;
at::LinalgBackend linalg_preferred_backend =
c10::utils::check_env("TORCH_LINALG_PREFER_CUSOLVER") == true
7 changes: 7 additions & 0 deletions aten/src/ATen/native/mkldnn/xpu/detail/Conv.cpp
@@ -120,6 +120,8 @@ sycl::event convolution(
}
#endif

at::native::onednn::apply_tf32_if_allowed(pattr);

auto conv_fwd_pd = dnnl::convolution_forward::primitive_desc(
engine,
dnnl::prop_kind::forward,
@@ -211,6 +213,8 @@ sycl::event convolution_backward_weights(
}
#endif

at::native::onednn::apply_tf32_if_allowed(pattr);

pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
auto conv_fwd_pd = dnnl::convolution_forward::primitive_desc(
engine,
@@ -319,6 +323,9 @@ sycl::event convolution_backward_data(
dnnl::memory::dims _padding_back_bottom_right =
padding_back_bottom_right.vec();
dnnl::memory::dims _dilation = compatible_dilation(dilation);

at::native::onednn::apply_tf32_if_allowed(pattr);

auto conv_forward_pd = dnnl::convolution_forward::primitive_desc(
engine,
dnnl::prop_kind::forward,
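With apply_tf32_if_allowed setting dnnl::fpmath_mode::tf32 on the primitive attributes, fp32 convolutions may execute with TF32 math on supported Intel GPUs. A minimal end-to-end sketch of opting in from Python, assuming a PyTorch build with XPU support and an available "xpu" device (on other builds the setter only warns and the flag stays False):

import torch
import torch.nn.functional as F

# Assumes an XPU-enabled build with an "xpu" device available.
x = torch.randn(2, 3, 32, 32, device="xpu")
w = torch.randn(8, 3, 3, 3, device="xpu")

# allow_tf32=True lets oneDNN downconvert fp32 convolution math to TF32
# inside this region; None leaves the other two flags untouched.
with torch.backends.mkldnn.flags(enabled=None, deterministic=None, allow_tf32=True):
    out = F.conv2d(x, w)  # forward conv; the backward paths apply the same attribute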
11 changes: 11 additions & 0 deletions aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp
@@ -1,5 +1,8 @@
#include <ATen/Context.h>
#include <ATen/native/ConvUtils.h>
#include <ATen/native/mkldnn/xpu/detail/Utils.h>
#include <dnnl.hpp>
#include <dnnl_common.hpp>
Review thread on the dnnl_common.hpp include:

Collaborator: @ZhiweiYan-96 this file, dnnl_common.hpp, is in the oneapi/dnnl subdirectory of intel/oneapi/dnnl/latest/include/, which is not on the default oneAPI include paths. On Windows this is causing a problem. Can we update the path here to include the oneapi/dnnl prefix? Do you want me to file an issue?

EikanWang (Collaborator), Feb 26, 2025: @alexbaden, XPU does use the oneAPI oneDNN; XPU builds its oneDNN from source code. Do you mean it may conflict with the oneAPI bundle, or that it cannot pass the Windows build?

Collaborator: It cannot pass the Windows build. We think this is because the include is not recursive. It could be that the proper path needs to be added when oneDNN is built from source for XPU.

namespace at::native::onednn {

@@ -487,4 +490,12 @@ dnnl::memory::format_tag conv_weight_fmt(
}
}

void apply_tf32_if_allowed(dnnl::primitive_attr& pattr) {
auto& ctx = at::globalContext();
bool allow_tf32 = ctx.allowTF32OneDNN();
if (allow_tf32) {
pattr.set_fpmath_mode(dnnl::fpmath_mode::tf32);
}
}

} // namespace at::native::onednn
2 changes: 2 additions & 0 deletions aten/src/ATen/native/mkldnn/xpu/detail/Utils.h
@@ -49,6 +49,8 @@ bool is_broadcast_from_other_to_self(

at::MemoryFormat get_cl_tag_by_ndim(const int64_t ndim);

void apply_tf32_if_allowed(dnnl::primitive_attr& primitive_attr);

bool binary_valid(
const at::Tensor& self,
const at::Tensor& other,
11 changes: 11 additions & 0 deletions test/xpu/test_conv.py
@@ -1258,6 +1258,17 @@ def test_channels_last_ouput_stride(self, device, dtype):
# input NHWC, output NHWC
assert_size_stride(out, (2, 512, 7, 7), (25088, 1, 3584, 512))

@onlyXPU
def test_onednn_allow_tf32_get_set(self):
with torch.backends.mkldnn.flags(
enabled=None, deterministic=None, allow_tf32=False
):
self.assertFalse(torch.backends.mkldnn.allow_tf32)
with torch.backends.mkldnn.flags(
enabled=None, deterministic=None, allow_tf32=True
):
self.assertTrue(torch.backends.mkldnn.allow_tf32)


instantiate_device_type_tests(
TestConvolutionNNDeviceType, globals(), only_for="xpu", allow_xpu=True
2 changes: 2 additions & 0 deletions torch/_C/__init__.pyi.in
@@ -1181,6 +1181,8 @@ def _get_cudnn_deterministic() -> _bool: ... # THPModule_deterministicCuDNN
def _set_cudnn_deterministic(arg: _bool) -> None: ... # THPModule_setDeterministicCuDNN
def _get_mkldnn_deterministic() -> _bool: ... # THPModule_deterministicMkldnn
def _set_mkldnn_deterministic(arg: _bool) -> None: ... # THPModule_setDeterministicMkldnn
def _get_onednn_allow_tf32() -> _bool: ... # THPModule_allowTF32OneDNN
def _set_onednn_allow_tf32(arg: _bool) -> None: ... # THPModule_setAllowTF32OneDNN
def _get_deterministic_algorithms() -> _bool: ... # THPModule_deterministicAlgorithms
def _get_deterministic_algorithms_warn_only() -> _bool: ... # THPModule_deterministicAlgorithmsWarnOnly
def _set_deterministic_algorithms(
21 changes: 16 additions & 5 deletions torch/backends/mkldnn/__init__.py
@@ -64,18 +64,25 @@ def __exit__(self, exc_type, exc_val, exc_tb):
return False


def set_flags(_enabled, _deterministic=None):
orig_flags = (torch._C._get_mkldnn_enabled(), torch._C._get_mkldnn_deterministic())
torch._C._set_mkldnn_enabled(_enabled)
def set_flags(_enabled=None, _deterministic=None, _allow_tf32=None):
orig_flags = (
torch._C._get_mkldnn_enabled(),
torch._C._get_mkldnn_deterministic(),
torch._C._get_onednn_allow_tf32(),
)
if _enabled is not None:
torch._C._set_mkldnn_enabled(_enabled)
if _deterministic is not None:
torch._C._set_mkldnn_deterministic(_deterministic)
if _allow_tf32 is not None:
torch._C._set_onednn_allow_tf32(_allow_tf32)
return orig_flags


@contextmanager
def flags(enabled=False, deterministic=False):
def flags(enabled=False, deterministic=False, allow_tf32=True):
with __allow_nonbracketed_mutation():
orig_flags = set_flags(enabled, deterministic)
orig_flags = set_flags(enabled, deterministic, allow_tf32)
try:
yield
finally:
@@ -91,10 +98,14 @@ def __init__(self, m, name):
deterministic = ContextProp(
torch._C._get_mkldnn_deterministic, torch._C._set_mkldnn_deterministic
)
allow_tf32 = ContextProp(
torch._C._get_onednn_allow_tf32, torch._C._set_onednn_allow_tf32
)


if TYPE_CHECKING:
enabled: ContextProp
deterministic: ContextProp
allow_tf32: ContextProp

sys.modules[__name__] = MkldnnModule(sys.modules[__name__], __name__)
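
Taken together, set_flags only touches flags passed as non-None values, and the flags context manager snapshots all three flags on entry and restores them on exit. A small sketch of the resulting semantics, assuming an XPU build (on other builds the TF32 setter warns and the flag stays False):

import torch

before = torch.backends.mkldnn.allow_tf32

# Flip only the TF32 flag; enabled/deterministic are left untouched via None.
with torch.backends.mkldnn.flags(enabled=None, deterministic=None, allow_tf32=not before):
    assert torch.backends.mkldnn.allow_tf32 == (not before)

# On exit, set_flags(*orig_flags) restores the snapshot taken on entry.
assert torch.backends.mkldnn.allow_tf32 == before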
25 changes: 25 additions & 0 deletions torch/csrc/Module.cpp
@@ -947,6 +947,29 @@ static PyObject* THPModule_setDeterministicAlgorithms(
END_HANDLE_TH_ERRORS
}

static PyObject* THPModule_setAllowTF32OneDNN(
PyObject* _unused,
PyObject* arg) {
HANDLE_TH_ERRORS
TORCH_CHECK(
PyBool_Check(arg),
"_set_onednn_allow_tf32 expects a bool, "
"but got ",
THPUtils_typename(arg));
at::globalContext().setAllowTF32OneDNN(arg == Py_True);
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}

static PyObject* THPModule_allowTF32OneDNN(
PyObject* _unused,
PyObject* noargs) {
if (at::globalContext().allowTF32OneDNN())
Py_RETURN_TRUE;
else
Py_RETURN_FALSE;
}

static PyObject* THPModule_deterministicAlgorithms(
PyObject* _unused,
PyObject* noargs) {
@@ -1527,6 +1550,8 @@ static std::initializer_list<PyMethodDef> TorchMethods = {
{"_set_mkldnn_enabled", THPModule_setUserEnabledMkldnn, METH_O, nullptr},
{"_get_cudnn_allow_tf32", THPModule_allowTF32CuDNN, METH_NOARGS, nullptr},
{"_set_cudnn_allow_tf32", THPModule_setAllowTF32CuDNN, METH_O, nullptr},
{"_get_onednn_allow_tf32", THPModule_allowTF32OneDNN, METH_NOARGS, nullptr},
{"_set_onednn_allow_tf32", THPModule_setAllowTF32OneDNN, METH_O, nullptr},
{"_get_cudnn_benchmark", THPModule_benchmarkCuDNN, METH_NOARGS, nullptr},
{"_set_cudnn_benchmark", THPModule_setBenchmarkCuDNN, METH_O, nullptr},
{"_get_cudnn_deterministic",
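These are the raw private bindings that the torch.backends.mkldnn.allow_tf32 property wraps. A quick sketch of their behavior, including the bool type check enforced by the TORCH_CHECK above (TORCH_CHECK failures surface in Python as RuntimeError):

import torch

torch._C._set_onednn_allow_tf32(True)  # warns and is a no-op on non-XPU builds
print(torch._C._get_onednn_allow_tf32())

try:
    torch._C._set_onednn_allow_tf32(1)  # an int, not a bool: rejected
except RuntimeError as err:
    print(err)  # e.g. "_set_onednn_allow_tf32 expects a bool, but got int"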