 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
-#include <ATen/core/Tensor.h>
 #include <ATen/Config.h>
 #include <ATen/Context.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/Tensor.h>
 #include <ATen/native/mkldnn/Matmul.h>

 #if !AT_MKLDNN_ENABLED()
@@ -53,7 +54,7 @@ bool mkldnn_fp16_gemm(
     c10::Half *c, int64_t ldc) {
   return false;
 }
-bool mkldnn_bf32_gemm(
+bool mkldnn_reduced_f32_gemm(
     TransposeType transa, TransposeType transb,
     int64_t m, int64_t n, int64_t k,
     float alpha,
@@ -85,6 +86,13 @@ void mkldnn_matmul_i8i8i32(
   TORCH_INTERNAL_ASSERT(false, __func__, ": ATen not compiled with MKLDNN support");
 }

+bool use_mkldnn_tf32_matmul(
+    const Tensor& mat1,
+    const Tensor& mat2,
+    const Tensor& result) {
+  return false;
+}
+
 } // namespace at::native


@@ -107,6 +115,10 @@ static bool use_mkldnn_bf32_matmul() {
   return use_mkldnn_bf16_matmul() && at::globalContext().float32Precision("mkldnn", "matmul") == "bf16";
 }

+static bool use_mkldnn_tf32_matmul() {
+  return cpuinfo_has_x86_amx_fp16() && at::globalContext().float32Precision("mkldnn", "matmul") == "tf32";
+}
+
 // returns an ideep::tensor
 // - dims: shape e.g: {M,N}
 // - idtype: ideep data type e.g: (f32, bf16, f16)
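Note: the new tf32 gate above only fires when the CPU reports AMX-FP16 support and the MKLDNN matmul fp32 precision knob is set to "tf32". Below is a minimal usage sketch of how a caller might flip that knob from C++; the setter name setFloat32Precision() and its argument order are assumptions inferred from the float32Precision() getter used in this hunk, not something this diff confirms.

    #include <ATen/Context.h>

    // Hypothetical sketch: route fp32 MKLDNN matmuls through the TF32 path.
    // Assumes a setter symmetric to Context::float32Precision(backend, op);
    // verify the exact signature in ATen/Context.h before relying on it.
    void enable_mkldnn_tf32_matmul() {
      at::globalContext().setFloat32Precision("mkldnn", "matmul", "tf32");
    }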
@@ -144,7 +156,8 @@ mkldnn_gemm(
   bool bf16_usable = std::is_same_v<scalar_t, c10::BFloat16> && use_mkldnn_bf16_matmul();
   bool fp16_usable = std::is_same_v<scalar_t, c10::Half> && use_mkldnn_fp16_matmul();
   bool bf32_usable = std::is_same_v<scalar_t, float> && use_mkldnn_bf32_matmul();
-  if ( !(bf16_usable || fp16_usable || bf32_usable) ||
+  bool tf32_usable = std::is_same_v<scalar_t, float> && use_mkldnn_tf32_matmul();
+  if ( !(bf16_usable || fp16_usable || bf32_usable || tf32_usable) ||
       (m * n * k <= 16 * 16 * 16) || (alpha == 0.0f)) {
     return false;
   }
@@ -155,6 +168,7 @@ mkldnn_gemm(
     op_attr = ideep::attr_t::fuse_sum();
   }
   if (bf32_usable) op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16); // bf32 path
+  if (tf32_usable) op_attr.set_fpmath_mode(dnnl_fpmath_mode_tf32); // tf32 path

   // NOTE: View as c-contiguous to avoid extra reordering in mkldnn
   // Use identity: C = AB <=> C^T = B^T A^T
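Note: the two set_fpmath_mode() calls above cannot both fire in the same call, since float32Precision("mkldnn", "matmul") returns a single value, so bf32_usable and tf32_usable are mutually exclusive. A condensed sketch of the attribute setup under that assumption is below; the helper name and its signature are illustrative, not the file's actual code, and it assumes the ideep/oneDNN headers already pulled in by Matmul.cpp.

    #include <ideep.hpp>

    // Sketch: build matmul primitive attributes for the selected fp32 math mode.
    // bf32_usable / tf32_usable stand in for the gates computed in this hunk.
    inline ideep::attr_t make_matmul_attr(bool bf32_usable, bool tf32_usable, float beta) {
      ideep::attr_t op_attr;
      if (beta != 0.0f) {
        op_attr = ideep::attr_t::fuse_sum();             // fused add for the beta term
      }
      if (bf32_usable) {
        op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);  // fp32 data, bf16 compute ("bf32")
      } else if (tf32_usable) {
        op_attr.set_fpmath_mode(dnnl_fpmath_mode_tf32);  // fp32 data, tf32 compute
      }
      return op_attr;
    }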
@@ -281,7 +295,7 @@ bool mkldnn_fp16_gemm(
   return mkldnn_gemm<c10::Half>(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }

-bool mkldnn_bf32_gemm(
+bool mkldnn_reduced_f32_gemm(
     TransposeType transa, TransposeType transb,
     int64_t m, int64_t n, int64_t k,
     float alpha,
@@ -339,13 +353,15 @@ void mkldnn_matmul(
   auto mat2_unsqueezed = mat2.dim() == 1 ? mat2.unsqueeze(1) : mat2;
   auto result_unsqueezed = result.dim() == 1 ? result.unsqueeze(1) : result;
   bool bf32_usable = mat1.scalar_type() == at::kFloat && use_mkldnn_bf32_matmul();
+  bool tf32_usable = mat1.scalar_type() == at::kFloat && use_mkldnn_tf32_matmul();

   ideep::attr_t op_attr;
   // "addmm", "addbmm" "baddbmm" in pytorch allow bias to be 2-D or 3-D tensor
   // but mkldnn matmul primitive only support bias be 1-D tensors
   // to address their differences, we use mkldnn post ops to perform a fused "add" after matrix multiplication is over
   if (beta != 0.0f) op_attr = ideep::attr_t::fuse_sum();
   if (bf32_usable) op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16); // bf32 path
+  if (tf32_usable) op_attr.set_fpmath_mode(dnnl_fpmath_mode_tf32); // tf32 path
   // If alpha = 0, dose not need actually do gemm computation
   if (alpha == 0)
     return;
@@ -412,70 +428,56 @@ static inline bool checksize(const Tensor& mat1, const Tensor& mat2){
   }
 }

-bool use_mkldnn_bf16_matmul(
+template <typename T>
+bool use_mkldnn_typed_matmul(
     const Tensor& mat1,
     const Tensor& mat2,
     const Tensor& result) {
+  bool dtype_check = false;
+  if constexpr (std::is_same_v<T, c10::BFloat16>) {
 #if defined(__aarch64__)
-  if (mkldnn_bf16_device_check_arm()) {
-    // onednn fastmath mode can leverage bf16 HW even for the fp32 input, e.g. Arm Neoverse V1
-    // so, don't restrict the mkldnn_matmul only for bf16 inputs, allow it for float as well
-    return (
-        use_mkldnn_bf16_matmul() &&
-        (mat1.scalar_type() == mat2.scalar_type()) && (!result.defined() || (mat1.scalar_type() == result.scalar_type())) &&
-        ((mat1.scalar_type() == kFloat) || (mat1.scalar_type() == kBFloat16)) &&
-        mat1.numel() != 0 &&
-        mat2.numel() != 0 &&
-        checksize(mat1, mat2));
-  } else
+    if (mkldnn_bf16_device_check_arm()) {
+      // onednn fastmath mode can leverage bf16 HW even for the fp32 input, e.g.
+      // Arm Neoverse V1 so, don't restrict the mkldnn_matmul only for bf16
+      // inputs, allow it for float as well
+      dtype_check = use_mkldnn_bf16_matmul() &&
+          ((mat1.scalar_type() == kFloat) || (mat1.scalar_type() == kBFloat16));
+    }
+#else
+    dtype_check = use_mkldnn_bf16_matmul() && (mat1.scalar_type() == kBFloat16);
 #endif
-  {
-    return (
-        use_mkldnn_bf16_matmul() &&
-        mat1.scalar_type() == kBFloat16 &&
-        mat2.scalar_type() == kBFloat16 &&
-        (!result.defined() || result.scalar_type() == kBFloat16) &&
-        mat1.numel() != 0 &&
-        mat2.numel() != 0 &&
-        checksize(mat1, mat2));
+  } else if constexpr (std::is_same_v<T, c10::Half>) {
+    dtype_check = use_mkldnn_fp16_matmul() && (mat1.scalar_type() == kHalf);
+  } else if constexpr (std::is_same_v<T, float>) {
+    dtype_check = (use_mkldnn_bf32_matmul() || use_mkldnn_tf32_matmul()) &&
+        (mat1.scalar_type() == kFloat);
   }
-}
-
-bool use_mkldnn_fp16_matmul(
-    const Tensor& mat1,
-    const Tensor& mat2,
-    const Tensor& result) {
-
-  return (
-      use_mkldnn_fp16_matmul() &&
-      mat1.scalar_type() == kHalf &&
-      mat2.scalar_type() == kHalf &&
-      (!result.defined() || result.scalar_type() == kHalf) &&
-      mat1.numel() != 0 &&
-      mat2.numel() != 0 &&
-      checksize(mat1, mat2));
-}
-
-bool use_mkldnn_bf32_matmul(
-    const Tensor& mat1,
-    const Tensor& mat2,
-    const Tensor& result) {
-
-  return (
-      use_mkldnn_bf32_matmul() &&
-      mat1.scalar_type() == kFloat &&
-      mat2.scalar_type() == kFloat &&
-      (!result.defined() || result.scalar_type() == kFloat) &&
-      mat1.numel() != 0 &&
-      mat2.numel() != 0 &&
-      checksize(mat1, mat2));
+  if (!dtype_check) {
+    return false;
+  }
+  bool size_check =
+      mat1.numel() != 0 && mat2.numel() != 0 && checksize(mat1, mat2);
+  dtype_check = (mat1.scalar_type() == mat2.scalar_type()) &&
+      (!result.defined() || result.scalar_type() == mat1.scalar_type());
+  return dtype_check && size_check;
 }

 bool use_mkldnn_matmul(
     const Tensor& mat1,
     const Tensor& mat2,
     const Tensor& result) {
-  return (use_mkldnn_bf16_matmul(mat1, mat2, result) || use_mkldnn_fp16_matmul(mat1, mat2, result) || use_mkldnn_bf32_matmul(mat1, mat2, result));
+  auto mat1_type = mat1.scalar_type();
+  if (mat1_type != kBFloat16 && mat1_type != kHalf && mat1_type != kFloat) {
+    return false;
+  }
+  return AT_DISPATCH_FLOATING_TYPES_AND2(
+      kBFloat16, kHalf, mat1.scalar_type(), "use_mkldnn_matmul", [&] {
+        return use_mkldnn_typed_matmul<scalar_t>(mat1, mat2, result);
+      });
 }

 static void _mkldnn_matmul_i8i8i32_with_primitive(