[Intel GPU] qconv.pointwise with mixed dtype XPU support · pytorch/pytorch@6682aa0 · GitHub

Commit 6682aa0

[Intel GPU] qconv.pointwise with mixed dtype XPU support
ghstack-source-id: 26a8e4a
Pull Request resolved: #135465
1 parent c11bd4f commit 6682aa0

3 files changed: +40 -24 lines changed

aten/src/ATen/native/mkldnn/xpu/qconv_pt2e.cpp

Lines changed: 5 additions & 4 deletions
@@ -4,6 +4,7 @@
 #include <torch/library.h>
 
 #include <iostream>
+#include "c10/core/ScalarType.h"
 
 using namespace at::native::onednn;
 namespace at {
@@ -164,11 +165,11 @@ class QConvoneDNNXPU final {
         stride.vec(),
         dilation.vec());
 
-    // TODO: handle difference of this dtype with argument dtype
-    // auto dtype =
-    //     (act.scalar_type() == c10::ScalarType::Byte) ? c10::kByte : c10::kChar;
+    bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat);
+    bool bfloat16_output = output_dtype.has_value() && (output_dtype == c10::kBFloat16);
+    auto dst_dtype = fp32_output ? c10::kFloat : (bfloat16_output ? c10::kBFloat16 : c10::kByte);
     Tensor output = at::empty(
-        dst_tz, device(c10::kXPU).dtype(output_dtype).memory_format(mfmt));
+        dst_tz, device(c10::kXPU).dtype(dst_dtype).memory_format(mfmt));
 
     return quantized_convolution_pt2(
         act,
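For readers less familiar with the ATen side, the destination-dtype selection introduced above can be paraphrased in Python. This is only an illustrative sketch of the decision the C++ code makes; the helper name select_dst_dtype is made up, and torch.uint8 stands in for c10::kByte:

    import torch
    from typing import Optional

    def select_dst_dtype(output_dtype: Optional[torch.dtype]) -> torch.dtype:
        # Mirror of the fp32_output / bfloat16_output flags in qconv_pt2e.cpp:
        # default to an int8 (uint8) output, but honor an explicit request for
        # fp32 or bf16 output in the mixed-dtype case.
        fp32_output = output_dtype == torch.float32
        bfloat16_output = output_dtype == torch.bfloat16
        return torch.float32 if fp32_output else (torch.bfloat16 if bfloat16_output else torch.uint8)

    assert select_dst_dtype(None) == torch.uint8
    assert select_dst_dtype(torch.bfloat16) == torch.bfloat16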

test/inductor/test_mkldnn_pattern_matcher.py

Lines changed: 34 additions & 19 deletions
@@ -143,20 +143,31 @@ def _test_common(
     ):
         counters.clear()
         torch._dynamo.reset()
+        is_xpu = False
+        for input in inputs:
+            is_xpu = is_xpu or (input.device.type == "xpu")
         assert matcher_check_fn is not None or (
             matcher_count is not None and matcher_nodes is not None
         )
         if (
-            check_autocast == torch.bfloat16
-            and torch.ops.mkldnn._is_mkldnn_bf16_supported()
+            (check_autocast == torch.bfloat16
+            and torch.ops.mkldnn._is_mkldnn_bf16_supported()) or
+            is_xpu
         ):
-            maybe_autocast = torch.cpu.amp.autocast(dtype=torch.bfloat16)
+            if is_xpu:
+                maybe_autocast = torch.amp.autocast(device_type="xpu", dtype=torch.bfloat16)
+            else:
+                maybe_autocast = torch.cpu.amp.autocast(dtype=torch.bfloat16)
             atol, rtol = 1e-2, 1e-2
         elif (
-            check_autocast == torch.float16
-            and torch.ops.mkldnn._is_mkldnn_fp16_supported()
+            (check_autocast == torch.float16
+            and torch.ops.mkldnn._is_mkldnn_fp16_supported()) or
+            is_xpu
         ):
-            maybe_autocast = torch.cpu.amp.autocast(dtype=torch.float16)
+            if is_xpu:
+                maybe_autocast = torch.amp.autocast(device_type="xpu", dtype=torch.float16)
+            else:
+                maybe_autocast = torch.cpu.amp.autocast(dtype=torch.float16)
             atol, rtol = 1e-2, 1e-2
         else:
             assert check_autocast == torch.float32
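The key behavioral change in this hunk is that bf16/fp16 autocast is now entered on the device the inputs live on rather than unconditionally on CPU. A minimal standalone sketch of that selection (pick_autocast is a hypothetical name; the test uses the equivalent inline logic above):

    import contextlib
    import torch

    def pick_autocast(check_autocast, inputs):
        # XPU inputs use the generic torch.amp.autocast with device_type="xpu";
        # CPU inputs keep the original torch.cpu.amp.autocast path.
        is_xpu = any(t.device.type == "xpu" for t in inputs)
        if check_autocast in (torch.bfloat16, torch.float16):
            if is_xpu:
                return torch.amp.autocast(device_type="xpu", dtype=check_autocast)
            return torch.cpu.amp.autocast(dtype=check_autocast)
        return contextlib.nullcontext()

    # CPU inputs select the CPU autocast context; XPU tensors would select the XPU path.
    ctx = pick_autocast(torch.bfloat16, [torch.randn(8)])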
@@ -196,6 +207,7 @@ def _test_common(
             )
             if matcher_check_fn is not None:
                 matcher_check_fn()
+            print("===== Finish one test =====")
 
     def _test_code_common(
         self,
@@ -713,11 +725,11 @@ def test_qconv2d_mkldnn(self, device):
     @skipIfNoDynamoSupport
     @skipIfNoONEDNNBF16
     @skipIfNoONEDNN
-    def test_qconv2d_int8_mixed_bf16(self):
+    def test_qconv2d_int8_mixed_bf16(self, device="cpu"):
         r"""
         This testcase will quantize a single Conv2d module with int8_mixed_bf16 quantization.
         """
-        self._qconv2d_cpu_test_helper(int8_mixed_bf16=True)
+        self._qconv2d_cpu_test_helper(device=device, int8_mixed_bf16=True)
 
     def _qconv2d_unary_cpu_test_helper(
         self,
@@ -776,15 +788,15 @@ def test_qconv2d_relu_mkldnn(self, device):
     @skipIfNoDynamoSupport
     @skipIfNoONEDNNBF16
     @skipIfNoONEDNN
-    def test_qconv2d_relu_int8_mixed_bf16(self):
+    def test_qconv2d_relu_int8_mixed_bf16(self, device="cpu"):
         r"""
         This testcase will quantize Conv2d->ReLU pattern with int8_mixed_bf16 quantization.
         """
-        self._qconv2d_unary_cpu_test_helper(int8_mixed_bf16=True)
+        self._qconv2d_unary_cpu_test_helper(device=device, int8_mixed_bf16=True)
 
     @skipIfNoDynamoSupport
     @skipIfNoONEDNN
-    def test_qconv2d_relu6_cpu(self, device):
+    def test_qconv2d_relu6_mkldnn(self, device):
         r"""
         This testcase will quantize Conv2d->ReLU6 pattern.
         """
@@ -801,14 +813,15 @@ def test_qconv2d_hardtanh_mkldnn(self, device):
     @skipIfNoDynamoSupport
     @skipIfNoONEDNNBF16
     @skipIfNoONEDNN
-    def test_qconv2d_hardtanh_int8_mixed_bf16_cpu(self):
+    def test_qconv2d_hardtanh_int8_mixed_bf16_mkldnn(self, device="cpu"):
         r"""
         This testcase will quantize Conv2d->Hardtanh pattern.
         Match.nodes:
             [qconv2d_pointwise_default, convert_element_type, clamp_min, clamp_max, convert_element_type, quantize_per_tensor]
             [qconv2d_pointwise_default, convert_element_type, clamp_min, clamp_max, convert_element_type]
         """
         self._qconv2d_unary_cpu_test_helper(
+            device=device,
             unary_op=torch.nn.Hardtanh(),
             int8_mixed_bf16=True,
             qconv2d_unary_matcher_nodes=11,
@@ -825,7 +838,7 @@ def test_qconv2d_hardswish_mkldnn(self, device):
     @skipIfNoDynamoSupport
     @skipIfNoONEDNNBF16
     @skipIfNoONEDNN
-    def test_qconv2d_hardswish_int8_mixed_bf16_cpu(self):
+    def test_qconv2d_hardswish_int8_mixed_bf16_mkldnn(self, device="cpu"):
         r"""
         This testcase will quantize Conv2d->Hardswish pattern.
         Match.nodes:
@@ -834,6 +847,7 @@ def test_qconv2d_hardswish_int8_mixed_bf16_cpu(self):
             [qconv2d_pointwise_default, convert_element_type, add, clamp_min, clamp_max, mul, div, convert_element_type]
         """
         self._qconv2d_unary_cpu_test_helper(
+            device=device,
             unary_op=torch.nn.Hardswish(),
             int8_mixed_bf16=True,
             qconv2d_unary_matcher_nodes=17,
@@ -850,7 +864,7 @@ def test_qconv2d_silu_mkldnn(self, device):
     @skipIfNoDynamoSupport
     @skipIfNoONEDNNBF16
     @skipIfNoONEDNN
-    def test_qconv2d_silu_int8_mixed_bf16_cpu(self):
+    def test_qconv2d_silu_int8_mixed_bf16_mkldnn(self, device="cpu"):
         r"""
         This testcase will quantize Conv2d->SiLU pattern.
         Match.nodes:
@@ -859,6 +873,7 @@ def test_qconv2d_silu_int8_mixed_bf16_cpu(self):
             [qconv2d_pointwise_default, convert_element_type, sigmoid, mul, convert_element_type]
         """
         self._qconv2d_unary_cpu_test_helper(
+            device=device,
             unary_op=torch.nn.SiLU(),
             int8_mixed_bf16=True,
             qconv2d_unary_matcher_nodes=11,
@@ -941,8 +956,8 @@ def test_qconv2d_add_mkldnn(self, device="cpu"):
     @skipIfNoDynamoSupport
     @skipIfNoONEDNNBF16
     @skipIfNoONEDNN
-    def test_qconv2d_add_int8_mixed_bf16(self):
-        self._qconv2d_add_cpu_test_helper(int8_mixed_bf16=True)
+    def test_qconv2d_add_int8_mixed_bf16(self, device="cpu"):
+        self._qconv2d_add_cpu_test_helper(device=device, int8_mixed_bf16=True)
 
     @skipIfNoDynamoSupport
     @skipIfNoONEDNN
@@ -952,8 +967,8 @@ def test_qconv2d_add_relu_mkldnn(self, device="cpu"):
     @skipIfNoDynamoSupport
     @skipIfNoONEDNNBF16
     @skipIfNoONEDNN
-    def test_qconv2d_add_relu_int8_mixed_bf16(self):
-        self._qconv2d_add_cpu_test_helper(use_relu=True, int8_mixed_bf16=True)
+    def test_qconv2d_add_relu_int8_mixed_bf16(self, device="cpu"):
+        self._qconv2d_add_cpu_test_helper(device=device, use_relu=True, int8_mixed_bf16=True)
 
     @skipIfNoDynamoSupport
     @skipIfNoONEDNN
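All of the renamed *_mkldnn tests and the new device="cpu" defaults funnel the target device into the shared helpers, so one test body can drive both backends. A rough, self-contained sketch of that plumbing pattern (the helper below is hypothetical, not the file's actual _qconv2d_unary_cpu_test_helper):

    import contextlib
    import torch

    def _qconv2d_test_helper(device="cpu", int8_mixed_bf16=False):
        # Build the module and example input on the requested device so the same
        # helper serves both the CPU and the XPU instantiations of the test.
        mod = torch.nn.Conv2d(3, 16, kernel_size=3).to(device).eval()
        x = torch.randn(1, 3, 32, 32, device=device)
        ctx = (
            torch.amp.autocast(device_type=device, dtype=torch.bfloat16)
            if int8_mixed_bf16
            else contextlib.nullcontext()
        )
        with ctx, torch.no_grad():
            return mod(x)

    out = _qconv2d_test_helper()  # fp32 CPU path by default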
@@ -2826,7 +2841,7 @@ def matcher_check_fn():
             quantizer=quantizer,
         )
 
-device_types = ("xpu")
+device_types = ("xpu", "cpu")
 instantiate_device_type_tests(TestPatternMatcher, globals(), only_for=device_types, allow_xpu=True)
 
 if __name__ == "__main__":
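Adding "cpu" back to device_types means instantiate_device_type_tests once again generates a per-device copy of the test class, so each test above runs as both a CPU and an XPU variant (when an XPU build and device are available). A toy example of the mechanism, with only the framework call itself taken from the diff:

    import torch
    from torch.testing._internal.common_device_type import instantiate_device_type_tests
    from torch.testing._internal.common_utils import TestCase, run_tests

    class ToyPatternTests(TestCase):
        def test_conv_shape(self, device):
            # `device` is injected by the framework as "cpu" or "xpu".
            x = torch.randn(1, 3, 8, 8, device=device)
            w = torch.randn(4, 3, 3, 3, device=device)
            self.assertEqual(torch.nn.functional.conv2d(x, w).shape, (1, 4, 6, 6))

    # Produces ToyPatternTestsCPU and, on XPU builds, ToyPatternTestsXPU.
    instantiate_device_type_tests(
        ToyPatternTests, globals(), only_for=("xpu", "cpu"), allow_xpu=True
    )

    if __name__ == "__main__":
        run_tests()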

torch/_inductor/mkldnn_ir.py

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ def _original_deconv_weight_size(
         else:
             output_stride = make_channels_last_strides_for(output_size)
 
-        assert x.get_device().type in ["xpu", "xpu"] and weight.get_device().type in ["cpu", "xpu"]
+        assert x.get_device().type in ["xpu", "cpu"] and weight.get_device().type in ["cpu", "xpu"]
         inputs = [x, weight]
 
         kernel_layout = FixedLayout(
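The one-line change above fixes an evident typo: the previous check ["xpu", "xpu"] could never accept an activation on CPU, even though the weight check already allowed both backends. A trivial illustration of the corrected invariant (devices chosen arbitrarily):

    import torch

    # Activation and weight may each live on CPU or XPU, independently.
    x_dev = torch.device("cpu")
    w_dev = torch.device("xpu")
    assert x_dev.type in ["xpu", "cpu"] and w_dev.type in ["cpu", "xpu"]
    # The pre-fix check `x_dev.type in ["xpu", "xpu"]` would reject this CPU activation.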
