[inductor] enable bf32 for mkldnn linear pointwise/binary in inductor · pytorch/pytorch@595d83b · GitHub


Commit 595d83b

zhuhaozhe authored and yanbing-j committed
[inductor] enable bf32 for mkldnn linear pointwise/binary in inductor
ghstack-source-id: 56c90f0 Pull Request resolved: #127294
1 parent bf32ec5 commit 595d83b

File tree

3 files changed: +68 -13 lines changed

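This commit wires the bf32 path (fp32 tensors, bf16 math inside oneDNN) into the inductor mkldnn fusion for linear pointwise/binary ops, gated on torch.backends.mkldnn.matmul.fp32_precision == "bf16". A minimal usage sketch, assuming the standard torch.compile CPU flow; the module and shapes are illustrative and not part of this commit:

    import torch
    import torch.nn as nn

    # Ask oneDNN to use bf16 math for fp32 matmuls (the "bf32" mode this commit
    # enables for the fused linear ops in inductor).
    torch.backends.mkldnn.matmul.fp32_precision = "bf16"

    mod = nn.Sequential(nn.Linear(64, 128), nn.ReLU()).eval()
    x = torch.randn(32, 64)  # plain fp32 input

    with torch.no_grad():
        out = torch.compile(mod)(x)  # linear+relu can now fuse through mkldnn with bf16 math

    print(out.dtype)  # torch.float32: inputs/outputs stay fp32, only the compute is bf16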

aten/src/ATen/native/mkldnn/Linear.cpp (+11 -1)

@@ -56,6 +56,10 @@ std::tuple<Tensor, Tensor, Tensor> mkldnn_linear_backward(
 
 namespace at::native {
 
+static bool use_mkldnn_bf32_linear() {
+  return at::globalContext().float32Precision("mkldnn", "matmul") == "bf16";
+}
+
 Tensor mkldnn_linear(
     const Tensor& self,
     const Tensor& weight_t, const std::optional<Tensor>& bias_opt) {
@@ -231,7 +235,9 @@ Tensor mkldnn_linear_pointwise(
        it != fusion_unary_attr_map().end(), "Fusion behavior undefined.");
     op_attr = it->second(scalars, algorithm);
   }
-
+  if (use_mkldnn_bf32_linear() && input_t.scalar_type() == at::kFloat){
+    op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
+  }
   if (mkldnn_bias.has_value()) {
     ideep::inner_product_forward::compute</*reorder_src=*/false, /*reorder_weight=*/false>(
         mkldnn_input,
@@ -318,6 +324,10 @@ Tensor mkldnn_linear_pointwise_binary(
   auto other_desc = mkldnn_other.get_desc();
   auto op_attr = ideep::attr_t::fuse_binary(it_binary->second, other_desc);
 
+  if (use_mkldnn_bf32_linear() && input_t.scalar_type() == at::kFloat){
+    op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
+  }
+
   if (mkldnn_bias.has_value()) {
     ideep::inner_product_forward::compute_binary</*reorder_src=*/false, /*reorder_weight=*/false>(
         mkldnn_input,
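The helper use_mkldnn_bf32_linear() mirrors the Python-side setting through at::globalContext().float32Precision("mkldnn", "matmul"), and for fp32 inputs it sets the oneDNN fpmath mode to bf16 on the fused op attribute, so accumulation happens in bf16 while tensors stay fp32 at the boundaries. A hedged numerical sketch of that tradeoff; the tolerances are illustrative, and saving/restoring the precision flag is just defensive hygiene:

    import torch

    lin = torch.nn.Linear(256, 256).eval()
    x = torch.randn(8, 256)

    with torch.no_grad():
        ref = lin(x)  # strict fp32 reference

    prev = torch.backends.mkldnn.matmul.fp32_precision
    try:
        torch.backends.mkldnn.matmul.fp32_precision = "bf16"
        with torch.no_grad():
            bf32_out = torch.compile(lin)(x)  # fp32 in/out, bf16 math inside oneDNN
    finally:
        torch.backends.mkldnn.matmul.fp32_precision = prev

    # bf32 trades a little accuracy for speed, so compare with a loose tolerance.
    print(torch.allclose(bf32_out, ref, atol=1e-2, rtol=1e-2))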

test/inductor/test_mkldnn_pattern_matcher.py (+30 -5)

@@ -323,6 +323,7 @@ def test_conv2d_unary_cpu(self):
     def test_conv3d_unary_cpu(self):
         self._test_conv_unary_cpu_base(dim=5)
 
+    @bf32_on_and_off()
     def test_linear_unary(self):
         class M(torch.nn.Module):
             def __init__(
@@ -351,6 +352,8 @@ def forward(self, x):
            dtypes.append(torch.bfloat16)
         if torch.ops.mkldnn._is_mkldnn_fp16_supported():
             dtypes.append(torch.float16)
+        if torch.backends.mkldnn.matmul.fp32_precision == "bf16":
+            dtypes.append(torch.float32)
         options = itertools.product(unary_list, [True, False], dtypes)
         for unary_fn, bias, dtype in options:
             metrics.reset()
@@ -361,7 +364,7 @@ def forward(self, x):
 
             def matcher_check_fn():
                 match_nodes = unary_list[unary_fn]
-                if self._check_unary_is_decomposed(unary_fn):
+                if dtype != torch.float32 and self._check_unary_is_decomposed(unary_fn):
                     # Has extra dtype conversion nodes for autocast.
                     match_nodes += 2
                 self.assertEqual(
@@ -373,9 +376,15 @@ def matcher_check_fn():
                 )
 
             self._test_common(mod, (v,), matcher_check_fn, check_autocast=dtype)
-            # only generated 1 kernel for "to"
-            self.assertEqual(metrics.generated_kernel_count, 2 if TEST_ACL else 1)
+            expected_kernel_count = 1
+            if TEST_ACL:
+                expected_kernel_count = 2
+            elif dtype == torch.float32:
+                expected_kernel_count = 0
+            # only generated 1 kernel for "to_dtype"
+            self.assertEqual(metrics.generated_kernel_count, expected_kernel_count)
 
+    @bf32_on_and_off()
     @unittest.skipIf(not TEST_MKL, "Test requires MKL")
     def test_linear_fp32(self):
         class M(torch.nn.Module):
@@ -793,6 +802,7 @@ def test_conv2d_binary_broadcast_shapes_cpu(self):
     def test_conv3d_binary_broadcast_shapes_cpu(self):
         self._test_conv_binary_broadcast_shapes_base(dim=5)
 
+    @bf32_on_and_off()
     def test_linear_binary(self):
         class M(torch.nn.Module):
             def __init__(self, binary_fn, in_channels, out_channels, bias, **kwargs):
@@ -812,6 +822,8 @@ def forward(self, x, y):
            dtypes.append(torch.bfloat16)
         if torch.ops.mkldnn._is_mkldnn_fp16_supported():
             dtypes.append(torch.float16)
+        if torch.backends.mkldnn.matmul.fp32_precision == "bf16":
+            dtypes.append(torch.float32)
         options = itertools.product(
             binary_list, [[2, 3, 10], [2, 10]], [True, False], dtypes
         )
@@ -848,7 +860,13 @@ def matcher_check_fn():
                 matcher_check_fn,
                 check_autocast=dtype,
             )
-            self.assertEqual(metrics.generated_kernel_count, 2 if TEST_ACL else 1)
+            expected_kernel_count = 1
+            if TEST_ACL:
+                expected_kernel_count = 2
+            elif dtype == torch.float32:
+                expected_kernel_count = 0
+            # only generated 1 kernel for "to_dtype"
+            self.assertEqual(metrics.generated_kernel_count, expected_kernel_count)
 
     def test_linear_binary_broadcast_shapes_cpu(self):
         class M(torch.nn.Module):
@@ -911,7 +929,13 @@ def matcher_check_fn():
                 matcher_check_fn,
                 check_autocast=dtype,
             )
-            self.assertEqual(metrics.generated_kernel_count, 2 if TEST_ACL else 1)
+            expected_kernel_count = 1
+            if TEST_ACL:
+                expected_kernel_count = 2
+            elif dtype == torch.float32:
+                expected_kernel_count = 0
+            # only generated 1 kernel for "to_dtype"
+            self.assertEqual(metrics.generated_kernel_count, expected_kernel_count)
 
     @skipIfNoDynamoSupport
     @skipIfNoONEDNN
@@ -944,6 +968,7 @@ def matcher_check_fn():
 
         self._test_common(mod, (x1, x2), matcher_check_fn)
 
+    @bf32_on_and_off()
     def test_multi_linear_share_same_input(self):
         # llama pattern.
         class M(torch.nn.Module):
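The tests opt in through a bf32_on_and_off() decorator and only add torch.float32 to the dtype sweep when the mkldnn matmul precision is actually "bf16"; in that case there is no autocast "to_dtype" conversion to emit, so the expected generated-kernel count drops to 0 for fp32. The decorator itself is not part of this diff; the sketch below is a hypothetical stand-in, assuming it behaves like the existing tf32_on_and_off test helper (run once with the default precision, once with "bf16", then restore):

    import functools
    import torch

    def bf32_on_and_off_sketch():
        # Hypothetical stand-in for the decorator used in the tests; the real helper
        # lives in torch.testing internals and may differ in its details.
        def decorator(test_fn):
            @functools.wraps(test_fn)
            def wrapper(*args, **kwargs):
                prev = torch.backends.mkldnn.matmul.fp32_precision
                try:
                    for precision in (prev, "bf16"):
                        torch.backends.mkldnn.matmul.fp32_precision = precision
                        test_fn(*args, **kwargs)
                finally:
                    torch.backends.mkldnn.matmul.fp32_precision = prev
            return wrapper
        return decorator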

torch/_inductor/fx_passes/mkldnn_fusion.py (+27 -7)

@@ -935,10 +935,20 @@ def is_linear_add_bias(match):
     bias_meta = add_node.args[1].meta.get("val")
     if weight_meta is None or bias_meta is None:
         return False
-    assert weight_meta.dtype in (
-        torch.bfloat16,
-        torch.float16,
+
+    bf32_matmul_enabled = torch.backends.mkldnn.matmul.fp32_precision == "bf16"  # type: ignore[attr-defined]
+    use_bf16_for_fp32_weight = (
+        bf32_matmul_enabled and weight_meta.dtype == torch.float32
+    )
+    assert (
+        weight_meta.dtype
+        in (
+            torch.bfloat16,
+            torch.float16,
+        )
+        or use_bf16_for_fp32_weight
     )
+
     if bias_meta.dtype != weight_meta.dtype:
         return False
     return (
@@ -1098,10 +1108,15 @@ def is_const_or_cat_by_const(weight):
            torch.bfloat16,
            torch.float16,
        )
+        bf32_matmul_enabled = torch.backends.mkldnn.matmul.fp32_precision == "bf16"  # type: ignore[attr-defined]
+        use_bf16_for_fp32_weight = (
+            bf32_matmul_enabled and weight_meta_value.dtype == torch.float32
+        )
+        compute_with_lp = is_lp_weight or use_bf16_for_fp32_weight
         # on x86, for fp32, mkl should be enabled and batch_size should not be a free symbol.
         # on aarch64, use mkldnn op for fp32 as well if acl is enabled
         if (
-            not is_lp_weight
+            not compute_with_lp
             and not mkldnn._is_mkldnn_acl_supported()
             and ((not torch._C.has_mkl) or has_free_symbols(batch_size))
         ):
@@ -1308,9 +1323,14 @@ def linear(match, *args, **kwargs):
                torch.bfloat16,
                torch.float16,
            )
+            bf32_matmul_enabled = torch.backends.mkldnn.matmul.fp32_precision == "bf16"  # type: ignore[attr-defined]
+            use_bf16_for_fp32_weight = (
+                bf32_matmul_enabled and weight_dtype == torch.float32
+            )
+            compute_with_lp = is_lp_weight or use_bf16_for_fp32_weight
             batch_size = input.meta.get("val").shape[0]
             if has_free_symbols(batch_size):
-                assert is_lp_weight or mkldnn._is_mkldnn_acl_supported(), (
+                assert compute_with_lp or mkldnn._is_mkldnn_acl_supported(), (
                     f"only bf16/fp16 weight prepacking supports dynamic shape inputs but got {weight_dtype}"
                 )
             # For bfloat16 dynamic shape path, using input size hint to pack weight for a better performance.
@@ -1328,7 +1348,7 @@ def linear(match, *args, **kwargs):
             packed_weight_op = (
                 mkldnn._reorder_linear_weight
                 if (
-                    is_lp_weight
+                    compute_with_lp
                     or mkldnn._is_mkldnn_acl_supported()
                     or V.aot_compilation
                 )
@@ -1340,7 +1360,7 @@ def linear(match, *args, **kwargs):
 
             packed_linear_inputs: tuple[Any, ...] = (input, packed_weight_node)
             if (
-                is_lp_weight
+                compute_with_lp
                 or mkldnn._is_mkldnn_acl_supported()
                 or V.aot_compilation
             ):
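In the fusion pass the change boils down to one predicate: compute_with_lp is true for genuinely low-precision weights, or for fp32 weights when bf32 is enabled, and it replaces is_lp_weight at each decision point (weight prepacking, the dynamic-shape assert, mkldnn op selection). A standalone sketch of that predicate, with weight_dtype standing in for the weight metadata the pass actually inspects:

    import torch

    def use_low_precision_linear_path(weight_dtype: torch.dtype) -> bool:
        # bf16/fp16 weights always take the low-precision mkldnn path.
        is_lp_weight = weight_dtype in (torch.bfloat16, torch.float16)
        # fp32 weights take it only when mkldnn matmul is set to bf16 math (bf32).
        bf32_matmul_enabled = torch.backends.mkldnn.matmul.fp32_precision == "bf16"
        use_bf16_for_fp32_weight = bf32_matmul_enabled and weight_dtype == torch.float32
        return is_lp_weight or use_bf16_for_fp32_weight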
