8000 [Intel GPU] qlinear.pointwise with mixed dtype support by ZhiweiYan-96 · Pull Request #136753 · pytorch/pytorch · GitHub

[Intel GPU] qlinear.pointwise with mixed dtype support #136753


Closed. This pull request wants to merge 58 commits.

Changes from 1 commit (58 commits total):
b1eb10e  Update  ZhiweiYan-96  Sep 26, 2024
dae8179  Update  ZhiweiYan-96  Oct 9, 2024
9c0eda1  Update  ZhiweiYan-96  Oct 9, 2024
db98241  Update  ZhiweiYan-96  Oct 17, 2024
833de1e  Update  ZhiweiYan-96  Oct 21, 2024
21ee3c3  Update  ZhiweiYan-96  Oct 23, 2024
8f1ee44  Update  ZhiweiYan-96  Oct 23, 2024
80e557a  Update  ZhiweiYan-96  Oct 24, 2024
73179b4  Update  ZhiweiYan-96  Oct 24, 2024
2ef0fdf  Update  ZhiweiYan-96  Oct 26, 2024
5d50dbc  Update  ZhiweiYan-96  Oct 27, 2024
7b1979b  Update  ZhiweiYan-96  Oct 29, 2024
cb5ac81  Update  ZhiweiYan-96  Oct 29, 2024
72e233c  Update  ZhiweiYan-96  Oct 29, 2024
8ae02a4  Update  ZhiweiYan-96  Oct 30, 2024
6eb4147  Update  ZhiweiYan-96  Oct 31, 2024
f276e7a  Update  ZhiweiYan-96  Oct 31, 2024
4b079ec  Update  ZhiweiYan-96  Oct 31, 2024
b9693ad  Update  ZhiweiYan-96  Nov 2, 2024
4039680  Update  ZhiweiYan-96  Nov 3, 2024
c81073c  Update  ZhiweiYan-96  Nov 4, 2024
9cf8dc8  Update  ZhiweiYan-96  Nov 4, 2024
36f84ff  Update  ZhiweiYan-96  Nov 4, 2024
8c6b9a4  Update  ZhiweiYan-96  Nov 4, 2024
e597d92  Update  ZhiweiYan-96  Nov 4, 2024
0671273  Update  ZhiweiYan-96  Nov 4, 2024
62a73eb  Update  ZhiweiYan-96  Nov 4, 2024
a7d21f9  Update  ZhiweiYan-96  Nov 5, 2024
d1e60e8  Update  ZhiweiYan-96  Nov 5, 2024
ac8e729  Update  ZhiweiYan-96  Nov 21, 2024
f6c2f09  Update  ZhiweiYan-96  Nov 28, 2024
64d364c  Update  ZhiweiYan-96  Dec 30, 2024
525e0e5  Update  ZhiweiYan-96  Jan 2, 2025
2638b20  Update  ZhiweiYan-96  Jan 2, 2025
2bd304f  Update  ZhiweiYan-96  Jan 3, 2025
3277e09  Update  ZhiweiYan-96  Jan 6, 2025
0aca22a  Update  ZhiweiYan-96  Jan 6, 2025
9bc38e6  Update  ZhiweiYan-96  Jan 7, 2025
a57e04b  Update  ZhiweiYan-96  Jan 7, 2025
d50865a  Update  ZhiweiYan-96  Jan 7, 2025
3636740  Update  ZhiweiYan-96  Jan 8, 2025
628487a  Update  ZhiweiYan-96  Jan 8, 2025
3fafa15  Update  ZhiweiYan-96  Jan 9, 2025
1db9b8d  Update  ZhiweiYan-96  Jan 10, 2025
9da2d41  Update  ZhiweiYan-96  Jan 16, 2025
1f1bd79  Update  ZhiweiYan-96  Jan 17, 2025
9aaa7cc  Update  ZhiweiYan-96  Jan 17, 2025
09ba9bf  Update  ZhiweiYan-96  Jan 20, 2025
eea72ec  Update  ZhiweiYan-96  Jan 20, 2025
a2410dd  Update  ZhiweiYan-96  Jan 22, 2025
9fa7182  Update  ZhiweiYan-96  Jan 23, 2025
dc1149b  Update  ZhiweiYan-96  Feb 10, 2025
384279a  Update  guangyey  Feb 10, 2025
e51af94  Update  guangyey  Feb 11, 2025
cef0193  Update  ZhiweiYan-96  Feb 11, 2025
56fa0cf  Update  ZhiweiYan-96  Feb 11, 2025
7fbe418  Update  ZhiweiYan-96  Feb 12, 2025
41c7207  Update  ZhiweiYan-96  Feb 17, 2025
Commit message: Update [ghstack-poisoned]
ZhiweiYan-96 committed Feb 12, 2025
commit 7fbe418f6b099f4cfa1aa32cc7a88d40b589f77f
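In brief, this commit factors the output-dtype selection that was repeated across the four qlinear entry points in aten/src/ATen/native/mkldnn/xpu/qlinear.cpp into a single qlinear_decide_out_dtype helper, renames the test helper _qlinear_dequant_promotion_cpu_test_helper to _qlinear_dequant_promotion_test_helper now that it serves both the CPU and XPU cases, and adds a @skipIfNoDynamoSupport decorator to test_qlinear_relu_int8_mixed_bf16.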
40 changes: 16 additions & 24 deletions aten/src/ATen/native/mkldnn/xpu/qlinear.cpp
@@ -7,6 +7,18 @@ using namespace at::native::onednn;
 
 namespace at::native::xpu {
 
+static inline c10::ScalarType qlinear_decide_out_dtype(
+    const at::Tensor& act,
+    const std::optional<c10::ScalarType> output_dtype) {
+  bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat);
+  bool bfloat16_output =
+      output_dtype.has_value() && (output_dtype == c10::kBFloat16);
+  auto dst_dtype = fp32_output
+      ? c10::kFloat
+      : (bfloat16_output ? c10::kBFloat16 : act.scalar_type());
+  return dst_dtype;
+}
+
 Tensor q_linear_pointwise(
     Tensor act,
     double act_scale,
@@ -38,12 +50,7 @@ Tensor q_linear_pointwise(
   std::vector<int64_t> src_dims = {M, K};
   std::vector<int64_t> dst_dims = {M, N};
 
-  bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat);
-  bool bfloat16_output =
-      output_dtype.has_value() && (output_dtype == c10::kBFloat16);
-  auto dst_dtype = fp32_output
-      ? c10::kFloat
-      : (bfloat16_output ? c10::kBFloat16 : act.scalar_type());
+  auto dst_dtype = qlinear_decide_out_dtype(act, output_dtype);
   Tensor qout = at::empty(dst_dims, act.options().dtype(dst_dtype));
 
   quantized_matmul(
@@ -102,12 +109,7 @@ Tensor q_linear_pointwise_tensor(
   std::vector<int64_t> src_dims = {M, K};
   std::vector<int64_t> dst_dims = {M, N};
 
-  bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat);
-  bool bfloat16_output =
-      output_dtype.has_value() && (output_dtype == c10::kBFloat16);
-  auto dst_dtype = fp32_output
-      ? c10::kFloat
-      : (bfloat16_output ? c10::kBFloat16 : act.scalar_type());
+  auto dst_dtype = qlinear_decide_out_dtype(act, output_dtype);
   Tensor qout = at::empty(dst_dims, act.options().dtype(dst_dtype));
 
   quantized_matmul(
@@ -169,12 +171,7 @@ Tensor q_linear_pointwise_binary(
 
   std::vector<int64_t> src_dims = {M, K};
   std::vector<int64_t> dst_dims = {M, N};
-  bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat);
-  bool bfloat16_output =
-      output_dtype.has_value() && (output_dtype == c10::kBFloat16);
-  auto dst_dtype = fp32_output
-      ? c10::kFloat
-      : (bfloat16_output ? c10::kBFloat16 : act.scalar_type());
+  auto dst_dtype = qlinear_decide_out_dtype(act, output_dtype);
   Tensor qout = at::empty(dst_dims, act.options().dtype(dst_dtype));
 
   quantized_matmul(
@@ -236,12 +233,7 @@ Tensor q_linear_pointwise_binary_tensor(
 
   std::vector<int64_t> src_dims = {M, K};
   std::vector<int64_t> dst_dims = {M, N};
-  bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat);
-  bool bfloat16_output =
-      output_dtype.has_value() && (output_dtype == c10::kBFloat16);
-  auto dst_dtype = fp32_output
-      ? c10::kFloat
-      : (bfloat16_output ? c10::kBFloat16 : act.scalar_type());
+  auto dst_dtype = qlinear_decide_out_dtype(act, output_dtype);
   Tensor qout = at::empty(dst_dims, act.options().dtype(dst_dtype));
 
   quantized_matmul(
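The new helper centralizes the output-dtype choice that each of the four qlinear entry points previously spelled out inline: an explicit fp32 or bf16 request wins, and anything else falls back to the activation's dtype. A minimal Python sketch of the same decision logic (hypothetical function name; the real helper is the C++ above):

import torch

# Sketch of the decision mirrored from qlinear_decide_out_dtype.
def decide_out_dtype(act: torch.Tensor, output_dtype=None) -> torch.dtype:
    # An explicit float32 request wins...
    if output_dtype == torch.float32:
        return torch.float32
    # ...then an explicit bfloat16 request (the mixed-dtype case)...
    if output_dtype == torch.bfloat16:
        return torch.bfloat16
    # ...otherwise fall back to the activation's own dtype.
    return act.dtype

# Example: an int8 activation keeps int8 output unless bf16/fp32 is requested.
act = torch.zeros(2, 4, dtype=torch.int8)
assert decide_out_dtype(act) == torch.int8
assert decide_out_dtype(act, torch.bfloat16) == torch.bfloat16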
21 changes: 11 additions & 10 deletions test/inductor/test_mkldnn_pattern_matcher.py
@@ -2461,6 +2461,7 @@ def test_qlinear_relu_xpu(self):
             (torch.randn((2, 4)).to(device="xpu"),), device="xpu"
         )
 
+    @skipIfNoDynamoSupport
     @skipIfNoONEDNNBF16
     @skipIfNoONEDNN
     def test_qlinear_relu_int8_mixed_bf16(self):
@@ -2782,7 +2783,7 @@ def test_qlinear_add_int8_mixed_bf16_xpu(self, use_relu, is_qat, is_dynamic):
             is_dynamic=is_dynamic,
         )
 
-    def _qlinear_dequant_promotion_cpu_test_helper(
+    def _qlinear_dequant_promotion_test_helper(
         self,
         inputs,
         device="cpu",
@@ -2848,7 +2849,7 @@ def test_qlinear_dequant_promotion_cpu(self):
              |
              Y
         """
-        self._qlinear_dequant_promotion_cpu_test_helper((torch.randn((2, 4)),))
+        self._qlinear_dequant_promotion_test_helper((torch.randn((2, 4)),))
 
     @skipIfNoDynamoSupport
     @skipIfNoONEDNN
@@ -2866,7 +2867,7 @@ def test_qlinear_dequant_promotion_xpu(self):
              |
              Y
         """
-        self._qlinear_dequant_promotion_cpu_test_helper(
+        self._qlinear_dequant_promotion_test_helper(
             (torch.randn((2, 4)).to(device="xpu"),), device="xpu"
         )
 
@@ -2887,7 +2888,7 @@ def test_qlinear_dequant_promotion_int8_mixed_bf16(self):
              |
              Y
         """
-        self._qlinear_dequant_promotion_cpu_test_helper(
+        self._qlinear_dequant_promotion_test_helper(
             (torch.randn((2, 4)),), int8_mixed_bf16=True
         )
 
@@ -2909,7 +2910,7 @@ def test_qlinear_dequant_promotion_int8_mixed_bf16_xpu(self):
              |
              Y
         """
-        self._qlinear_dequant_promotion_cpu_test_helper(
+        self._qlinear_dequant_promotion_test_helper(
             (torch.randn((2, 4)).to(device="xpu"),), device="xpu", int8_mixed_bf16=True
         )
 
@@ -2928,7 +2929,7 @@ def test_qlinear_dequant_promotion_cpu_input_dim_exceeds_2(self):
              |
              Y
         """
-        self._qlinear_dequant_promotion_cpu_test_helper((torch.randn((2, 3, 4)),))
+        self._qlinear_dequant_promotion_test_helper((torch.randn((2, 3, 4)),))
 
     @skipIfNoDynamoSupport
     @skipIfNoONEDNN
@@ -2946,7 +2947,7 @@ def test_qlinear_dequant_promotion_input_dim_exceeds_2_xpu(self):
              |
              Y
         """
-        self._qlinear_dequant_promotion_cpu_test_helper(
+        self._qlinear_dequant_promotion_test_helper(
             (torch.randn((2, 3, 4)).to(device="xpu"),), device="xpu"
         )
 
@@ -2967,7 +2968,7 @@ def test_qlinear_dequant_promotion_int8_mixed_bf16_input_dim_exceeds_2(self):
              |
              Y
         """
-        self._qlinear_dequant_promotion_cpu_test_helper(
+        self._qlinear_dequant_promotion_test_helper(
             (torch.randn((2, 3, 4)),), int8_mixed_bf16=True
         )
 
@@ -2989,7 +2990,7 @@ def test_qlinear_dequant_promotion_int8_mixed_bf16_input_dim_exceeds_2_xpu(self)
              |
              Y
         """
-        self._qlinear_dequant_promotion_cpu_test_helper(
+        self._qlinear_dequant_promotion_test_helper(
             (torch.randn((2, 3, 4)).to(device="xpu"),),
             device="xpu",
             int8_mixed_bf16=True,
@@ -3019,7 +3020,7 @@ def matcher_check_fn():
             counters["inductor"]["qlinear_weight_prepack_matcher_count"], 3
         )
 
-        self._qlinear_dequant_promotion_cpu_test_helper(
+        self._qlinear_dequant_promotion_test_helper(
            (torch.randn((2, 4)),),
            matcher_check_fn=matcher_check_fn,
            is_dynamic=True,
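For context, the dequant-promotion tests renamed above all exercise the graph shape sketched in their docstrings: one linear output feeding two downstream linear branches, so a single dequantize node gains multiple users and must be duplicated (promoted) before each consumer can fuse into its own qlinear. A toy module with that shape might look like the following sketch (hypothetical names, not the PR's test code):

import torch

class SharedDequantToyModel(torch.nn.Module):
    # Hypothetical toy module: the output of linear1 is consumed by two
    # branches, so after quantization one dequantize node has two users,
    # which is the shape the dequant-promotion pass must handle.
    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(4, 4)
        self.linear2 = torch.nn.Linear(4, 4)
        self.linear3 = torch.nn.Linear(4, 4)

    def forward(self, x):
        t = self.linear1(x)
        # Two consumers of t: the dequant feeding them must be duplicated
        # so each branch can fuse into its own qlinear.
        return self.linear2(t) + self.linear3(t)

Three linears in such a module would also line up with the qlinear_weight_prepack_matcher_count of 3 asserted in the dynamic-shapes test above.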