[Intel GPU] qlinear at XPU backend by ZhiweiYan-96 · Pull Request #133307 · pytorch/pytorch · GitHub

[Intel GPU] qlinear at XPU backend #133307


Closed · wants to merge 52 commits

Changes from 1 commit

Commits (52)
bed8720 Update (ZhiweiYan-96, Aug 13, 2024)
06bf0f2 Update (ZhiweiYan-96, Aug 19, 2024)
f2136d8 Update (ZhiweiYan-96, Aug 21, 2024)
a3c07aa Update (ZhiweiYan-96, Aug 27, 2024)
0dc3ab2 Update (ZhiweiYan-96, Sep 3, 2024)
eacaafd Update (ZhiweiYan-96, Sep 5, 2024)
9d02aa4 Update (ZhiweiYan-96, Sep 5, 2024)
b458071 Update (ZhiweiYan-96, Oct 9, 2024)
8718de2 Update (ZhiweiYan-96, Oct 17, 2024)
d6c7879 Update (ZhiweiYan-96, Oct 21, 2024)
c75fcad Update (ZhiweiYan-96, Oct 23, 2024)
3a5307c Update (ZhiweiYan-96, Oct 23, 2024)
e479ff3 Update (ZhiweiYan-96, Oct 24, 2024)
5b32861 Update (ZhiweiYan-96, Oct 24, 2024)
2732057 Update (ZhiweiYan-96, Oct 26, 2024)
fc42187 Update (ZhiweiYan-96, Oct 27, 2024)
dfacc8e Update (ZhiweiYan-96, Oct 29, 2024)
969d5c6 Update (ZhiweiYan-96, Oct 29, 2024)
9b64424 Update (ZhiweiYan-96, Oct 29, 2024)
7fd8bcc Update (ZhiweiYan-96, Oct 30, 2024)
3431fd4 Update (ZhiweiYan-96, Nov 2, 2024)
db2eca3 Update (ZhiweiYan-96, Nov 3, 2024)
3103c58 Update (ZhiweiYan-96, Nov 4, 2024)
88dd4a8 Update (ZhiweiYan-96, Nov 4, 2024)
c77d447 Update (ZhiweiYan-96, Nov 4, 2024)
d048068 Update (ZhiweiYan-96, Nov 4, 2024)
a8bf0f0 Update (ZhiweiYan-96, Nov 4, 2024)
39dcc4e Update (ZhiweiYan-96, Nov 4, 2024)
67c054a Update (ZhiweiYan-96, Nov 4, 2024)
03b6bba Update (ZhiweiYan-96, Nov 5, 2024)
a25333e Update (ZhiweiYan-96, Nov 5, 2024)
fe9039e Update (ZhiweiYan-96, Nov 21, 2024)
4b39daa Update (ZhiweiYan-96, Nov 28, 2024)
7890880 Update (ZhiweiYan-96, Dec 30, 2024)
c9f37be Update (ZhiweiYan-96, Dec 31, 2024)
b7e1794 Update (ZhiweiYan-96, Jan 2, 2025)
a9500f5 Update (ZhiweiYan-96, Jan 7, 2025)
8573301 Update (ZhiweiYan-96, Jan 7, 2025)
00d6d6f Update (ZhiweiYan-96, Jan 8, 2025)
1c5645c Update (ZhiweiYan-96, Jan 8, 2025)
4593a94 Update (ZhiweiYan-96, Jan 9, 2025)
d2716e9 Update (ZhiweiYan-96, Jan 10, 2025)
94a2ed9 Update (ZhiweiYan-96, Jan 16, 2025)
30de529 Update (ZhiweiYan-96, Jan 17, 2025)
1f5ed5a Update (ZhiweiYan-96, Jan 20, 2025)
9ebd45e Update (ZhiweiYan-96, Jan 20, 2025)
e577def Update (ZhiweiYan-96, Jan 22, 2025)
9f8ca53 Update (ZhiweiYan-96, Jan 23, 2025)
263f371 Update (ZhiweiYan-96, Feb 10, 2025)
f5aadd5 Update (guangyey, Feb 10, 2025)
8d7e570 Update (guangyey, Feb 11, 2025)
341eed1 Update (ZhiweiYan-96, Feb 17, 2025)
Viewing commit eacaafd8a1bb0a7731723fa842a129ab5fa11e89 (committed by ZhiweiYan-96 on Sep 5, 2024)
Commit message: Update
[ghstack-poisoned]
52 changes: 38 additions & 14 deletions aten/src/ATen/native/mkldnn/xpu/detail/QMatmul_pt2e.cpp
Collaborator (review comment): Does the file name align with other similar file names? If yes, please paste the path.

ZhiweiYan-96 (author): All the file names have been changed. Thanks for the reminder.

@@ -10,18 +10,42 @@
namespace at::native::onednn {

void quantized_matmul_pt2(
at::Tensor& result,
const at::Tensor& mat1,
const at::Tensor& mat2,
const at::Tensor& b_raw,
bool m2_trans,
double input_scale,
int64_t input_zero_point,
at::Tensor& weight_scales,
at::Tensor& weight_zero_points,
double output_scale,
int64_t output_zero_point,
Attr attr) {
at::Tensor mat1, // act
double input_scale,
int64_t input_zero_point,
at::Tensor mat2, // weight
at::Tensor& weight_scales,
at::Tensor& weight_zero_points,
at::Tensor& b_raw,
at::Tensor result, // output
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::optional<at::Tensor> other, // extra input for binary-post-op
double other_scale,
int64_t other_zero_point,
const c10::string_view& binary_post_op,
double binary_alpha,
const c10::string_view& unary_post_op,
torch::List<std::optional<at::Scalar>>& unary_post_op_args,
c10::string_view unary_post_op_algorithm){

bool m2_trans = true;

auto attr = Attr(output_scale, output_zero_point);

construct_attr_by_post_op(
binary_post_op,
binary_alpha,
input_scale,
input_zero_point,
unary_post_op,
unary_post_op_args,
unary_post_op_algorithm,
attr
);


size_t dims = result.dim();
at::Device curDevice = at::Device(at::kXPU, c10::xpu::current_device());
auto engine = GpuEngineManager::Instance().get_engine(curDevice);
@@ -63,7 +87,7 @@ void quantized_matmul_pt2(
if (b.dim() == 1) {
TORCH_CHECK(
b.size(0) == n || b.size(0) == 1,
"matmul supports [n] or [1] when bias dim is 1 ...");
"matmul supports [n] or [1] when bias dim is 1, but b.size() is:", b.size(0));
if (b.size(0) == 0) {
with_bias = false;
} else if (m1.dim() == 3) {
@@ -157,7 +181,7 @@ void quantized_matmul_pt2(
std::unordered_map<int, dnnl::memory> args;

dnnl::post_ops po;
// attr.extract_post_ops(dst, true);
attr.extract_post_ops(dst, true);
bool m1_need_zp = (input_zero_point != 0);
// wgh should never have zero point
bool wgh_is_per_channel = weight_scales.numel() > 1;
32 changes: 20 additions & 12 deletions aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h
@@ -134,17 +134,25 @@ at::Tensor quantized_convolution_pt2(
c10::optional<c10::string_view> unary_algorithm);

void quantized_matmul_pt2(
at::Tensor& result,
const at::Tensor& mat1,
const at::Tensor& mat2,
const at::Tensor& b_raw,
bool m2_trans,
double input_scale,
int64_t input_zero_point,
at::Tensor& weight_scales,
at::Tensor& weight_zero_points,
double output_scale,
int64_t output_zero_point,
Attr attr);
at::Tensor mat1, // act
double input_scale,
int64_t input_zero_point,
at::Tensor mat2, // weight
at::Tensor& weight_scales,
at::Tensor& weight_zero_points,
at::Tensor& b_raw,
at::Tensor result, // output
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::optional<at::Tensor> other, // extra input for binary-post-op
double other_scale,
int64_t other_zero_point,
const c10::string_view& binary_post_op,
double binary_alpha,
const c10::string_view& unary_post_op,
torch::List<std::optional<at::Scalar>>& unary_post_op_args,
c10::string_view unary_post_op_algorithm);


} // namespace at::native::onednn
93 changes: 79 additions & 14 deletions aten/src/ATen/native/mkldnn/xpu/qlinear_pt2e.cpp
@@ -1,6 +1,7 @@
#include <torch/library.h>

#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
#include "c10/core/ScalarType.h"

using namespace at::native::onednn;

@@ -22,6 +23,8 @@ Tensor q_linear_pointwise(
c10::string_view post_op_name,
torch::List<std::optional<at::Scalar>> post_op_args,
c10::string_view post_op_algorithm) {


Tensor b_raw = bias.has_value() ? bias.value() : at::Tensor();

const int64_t dim = act.dim();
@@ -34,21 +37,79 @@ Tensor q_linear_pointwise(
std::vector<int64_t> dst_dims = {M, N};
Tensor qout = at::empty(dst_dims, device(c10::kXPU).dtype(c10::kByte));

Attr attr = Attr();
quantized_matmul_pt2(
act.contiguous(),
act_scale,
act_zero_point,
weight.contiguous(),
weight_scales,
weight_zero_points,
b_raw,
qout,
output_scale,
output_zero_point,
output_dtype,
/*other*/ std::nullopt,
/*other scale*/ 1.0,
/*other zp*/0,
/*binary post op*/ "none",
/*binary alpha*/1.0,
post_op_name,
post_op_args,
post_op_algorithm
);

return qout;
}

Tensor q_linear_pointwise_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
c10::string_view post_op_name,
torch::List<std::optional<at::Scalar>> post_op_args,
c10::string_view post_op_algorithm
){
Tensor b_raw = bias.has_value() ? bias.value() : at::Tensor();

const int64_t dim = act.dim();
int64_t K = act.size(dim - 1);
int64_t M = act.numel() / K;
// [M, K] x [K, N]
int64_t N = weight.size(1);

std::vector<int64_t> src_dims = {M, K};
std::vector<int64_t> dst_dims = {M, N};
Tensor qout = at::empty(dst_dims, device(c10::kXPU).dtype(c10::kByte));

quantized_matmul_pt2(
qout,
act,
weight,
b_raw,
/*m2_trans=*/false,
act_scale,
act_zero_point,
weight_scales,
weight_zero_points,
output_scale,
output_zero_point,
attr);
act.contiguous(),
act_scale.item().toDouble(),
act_zero_point.item().toLong(),
weight.contiguous(),
weight_scales,
weight_zero_points,
b_raw,
qout,
output_scale,
output_zero_point,
output_dtype,
/*other*/ std::nullopt,
/*other scale*/ 1.0,
/*other zp*/0,
/*binary post op*/ "none",
/*binary alpha*/1.0,
post_op_name,
post_op_args,
post_op_algorithm
);

return qout;
}
@@ -57,14 +118,18 @@ Tensor q_linear_pointwise(
at::Tensor q_linear_prepack_onednn(
at::Tensor weight,
c10::optional<torch::List<int64_t>> input_shape) {
Collaborator (review comment): Is input_shape useless?

ZhiweiYan-96 (author, Jan 7, 2025): Yes, but we need to keep this argument because input_shape is defined in the op schema; the op cannot be registered if the argument is removed.

Why input_shape appears in the argument list: x86InductorQuantizer uses this shape to pick a suitable blocked-format weight. That is not required on the XPU side; all we need is to transpose the weight (see the sketch just below).

return weight;
at::Tensor weight_transposed = weight.transpose(0, 1);
return weight_transposed;
}
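For context on the author's reply above: on the XPU path the prepack step amounts to the single transpose shown in this hunk. Below is a minimal, hypothetical Python sketch of that behavior (the shape, dtype, and variable names are illustrative and not taken from the PR; it only mirrors the `weight.transpose(0, 1)` in the diff):

```python
import torch

# Hypothetical quantized linear weight of shape [N, K] (out_features x in_features).
N, K = 16, 4
weight = torch.randint(-128, 127, (N, K), dtype=torch.int8)

# The XPU prepack in this PR only transposes the weight to [K, N]; it does not
# build a blocked/opaque layout (the x86 path may, guided by input_shape).
packed_weight = weight.transpose(0, 1)
assert packed_weight.shape == (K, N)
```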


TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise"),
TORCH_FN(q_linear_pointwise));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.tensor"),
TORCH_FN(q_linear_pointwise_tensor));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_prepack"),
TORCH_FN(q_linear_prepack_onednn));
20 changes: 10 additions & 10 deletions test/inductor/test_mkldnn_pattern_matcher.py
@@ -1419,6 +1419,7 @@ def matcher_check_fn():
def _qlinear_cpu_test_helper(
self,
inputs,
device="cpu",
int8_mixed_bf16=False,
do_permute=False,
matcher_check_fn=None,
Expand All @@ -1438,7 +1439,7 @@ def forward(self, x):
x = torch.reshape(torch.permute(x, (0, 2, 3, 1)), (2, 12, 4))
return self.linear2(self.linear(x))

mod = M(bias, do_permute=do_permute).eval().xpu()
mod = M(bias, do_permute=do_permute).eval().to(device=device)
Collaborator (review comment), suggested change:

mod = M(bias, do_permute=do_permute).eval().to(device=device)
assert isinstance(inputs, Tuple)
def __convert_tensor_to_device(input: Any, device: str):
    return input.to(device=device) if isinstance(input, Tensor) else input
inputs = tuple(__convert_tensor_to_device(input, device) for input in inputs)

ZhiweiYan-96 (author, Feb 10, 2025): Modified, thanks.
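As an aside, here is a self-contained, runnable version of the conversion helper suggested in the review comment above (the function name and the example inputs are illustrative; only the tensor/non-tensor dispatch mirrors the suggestion):

```python
from typing import Any

import torch
from torch import Tensor


def convert_tensor_to_device(value: Any, device: str) -> Any:
    # Move only Tensors; pass non-tensor inputs (flags, shapes, ...) through unchanged.
    return value.to(device=device) if isinstance(value, Tensor) else value


# A mixed tuple of tensor and non-tensor inputs, as the test helpers receive.
inputs = (torch.randn(2, 4), True)
inputs = tuple(convert_tensor_to_device(i, "cpu") for i in inputs)
```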


def _default_matcher_check_fn():
self.assertEqual(
@@ -1459,12 +1460,12 @@

@skipIfNoDynamoSupport
@skipIfNoONEDNN
def test_qlinear_cpu(self):
def test_qlinear_mkldnn(self, device="cpu"):
r"""
This testcase will quantize a single Linear Module.
"""
for bias in [True, False]:
self._qlinear_cpu_test_helper((torch.randn((2, 4)).xpu(),), bias=bias)
self._qlinear_cpu_test_helper((torch.randn((2, 4)).to(device=device),), device=device, bias=bias)

@skipIfNoDynamoSupport
@skipIfNoONEDNN
@@ -1587,7 +1588,7 @@ def matcher_check_fn():
)

def _qlinear_unary_cpu_test_helper(
self, inputs, unary_op=torch.nn.ReLU(), int8_mixed_bf16=False
self, inputs, unary_op=torch.nn.ReLU(), device="cpu", int8_mixed_bf16=False
):
class M(torch.nn.Module):
def __init__(self, use_bias):
Expand All @@ -1603,7 +1604,7 @@ def forward(self, x):

bias_list = [True, False]
for bias in bias_list:
mod = M(bias).eval()
mod = M(bias).eval().to(device=device)

def matcher_check_fn():
# 1. dequant-linear pattern matched in quantization weight prepack
@@ -1623,11 +1624,11 @@ def matcher_check_fn():

@skipIfNoDynamoSupport
@skipIfNoONEDNN
def test_qlinear_relu_cpu(self):
def test_qlinear_relu_mkldnn(self, device="cpu"):
r"""
This testcase will quantize a Linear->ReLU pattern.
"""
self._qlinear_unary_cpu_test_helper((torch.randn((2, 4)),))
self._qlinear_unary_cpu_test_helper((torch.randn((2, 4)).to(device=device),), device=device)

@skipIfNoDynamoSupport
@skipIfNoONEDNNBF16
@@ -1661,12 +1662,12 @@ def test_qlinear_relu_int8_mixed_bf16_input_dim_exceeds_2(self):

@skipIfNoDynamoSupport
@skipIfNoONEDNN
def test_qlinear_gelu_cpu(self):
def test_qlinear_gelu_mkldnn(self, device="cpu"):
r"""
This testcase will quantize a Linear->GELU pattern.
"""
for gelu in [torch.nn.GELU("none"), torch.nn.GELU("tanh")]:
self._qlinear_unary_cpu_test_helper((torch.randn((2, 4)),), gelu)
self._qlinear_unary_cpu_test_helper((torch.randn((2, 4)).to(device=device),), gelu, device=device)

@skipIfNoDynamoSupport
@skipIfNoONEDNNBF16
@@ -2831,4 +2832,3 @@ def matcher_check_fn():
if __name__ == "__main__":
if IS_LINUX and HAS_CPU and torch.backends.mkldnn.is_available():
run_tests()