[Intel GPU] Avoid unnecessary copy when the dst of Matmul is non-contiguous · pytorch/pytorch@91e7c79 · GitHub

Commit 91e7c79

jianyizh authored and pytorchmergebot committed
[Intel GPU] Avoid unnecessary copy when the dst of Matmul is non-contiguous (#144759)
We should not always call contiguous() on the dst of matmul; the copy of the matmul inputs was already removed in #143784. This change also fixes an accuracy issue by using the oneDNN sum post-op instead of binary add in the in-place case, which avoids a UT failure.

Pull Request resolved: #144759
Approved by: https://github.com/EikanWang
1 parent 8ee84aa commit 91e7c79
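
Why the sum post-op matters for the in-place case: when `result` aliases `self`, a binary-add post-op would re-read the output buffer after the matmul has already overwritten it. A minimal single-element sketch of the hazard (plain C++ with hypothetical variable names; this models one output element, not oneDNN itself):

```cpp
#include <cassert>

int main() {
  float beta = 1.f;
  float mm = 3.f;       // stands in for one element of matmul(mat1, mat2)
  float self_val = 7.f; // one element of self, which aliases result in-place

  // Binary add as a separate read of "self": the shared buffer is already
  // clobbered by the matmul store, so the post-op adds the wrong value.
  float buf = self_val;
  buf = mm;                            // matmul result stored into the buffer
  float binary_add = buf + beta * buf; // 6.f -- wrong

  // oneDNN's sum post-op accumulates inside the same primitive:
  // dst = matmul + beta * dst_old, with dst_old read before the store.
  float sum_post_op = mm + beta * self_val; // 10.f -- correct
  assert(binary_add != sum_post_op);
  return 0;
}
```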

File tree: 6 files changed, +78 -59 lines

aten/src/ATen/native/mkldnn/xpu/Blas.cpp

Lines changed: 38 additions & 37 deletions
```diff
@@ -54,8 +54,6 @@ Tensor& addmm_out(
   TORCH_CHECK(
       !mat1.is_complex(), "Complex datatype matmul is not supported in oneDNN");
 
-  bool is_inplace = result.is_same(self);
-
   std::vector<int64_t> result_shape = {mat1.size(0), mat2.size(1)};
   result.resize_(result_shape);
 
@@ -86,34 +84,36 @@ Tensor& addmm_out(
   Tensor bias = Tensor();
   onednn::Attr attr;
   float beta_ = beta.to<float>();
+  float alpha_ = beta_ == 0.f ? alpha.to<float>() : alpha.to<float>() / beta_;
   if (beta_ == 0.f) {
-    if (alpha.to<float>() != 1.f) {
+    attr.append_post_eltwise(1.f, alpha_, 0.f, attr.kind_with_linear);
+  } else if (alpha_ == 1.f && beta_ == 1.f && !result.is_same(self)) {
+    // if result and self are the same tensor, we use post op sum.
+    bias = self;
+  } else {
+    Tensor binary = self.dim() == 1 ? self.unsqueeze(0) : self;
+    bool inplace = binary.is_same(result);
+    if (inplace) {
       attr.append_post_eltwise(
           1.f, alpha.to<float>(), 0.f, attr.kind_with_linear);
-    }
-  } else {
-    // We use post_binary here for adding self matrix.
-    // To avoid wrong write, here we clone self for inplace case.
-    if (alpha.to<float>() == 1.f && beta_ == 1.f) {
-      bias = is_inplace ? self.clone() : self;
+      attr.append_post_sum(beta_);
     } else {
-      Tensor binary;
-      // unsqueeze(0) here is to handle mv cases.
-      if (is_inplace)
-        binary = self.dim() == 1 ? self.unsqueeze(0).clone() : self.clone();
-      else
-        binary = self.dim() == 1 ? self.unsqueeze(0) : self;
+      if (at::native::onednn::is_broadcast(binary)) {
+        at::native::onednn::undo_broadcast(binary);
+      }
+      // in test_addmv_rowmajor_colmajor_incx_incy_lda, binary is a tensor with
+      // shape (5, 1) but stride(2, 2)
+      binary = at::native::onednn::is_onednn_matmul_strides(binary)
+          ? binary
+          : binary.contiguous();
       // Tensor binary = self.expand_as(result);
       // For post-binary-add, onednn needs binary scale=1.f
       // Thus we need the following transformation
       // alpha * matmul(mat1, mat2) + beta * binary
       // beta * (alpha/beta * matmul(src, wei) + binary)
-      float alpha_ = alpha.to<float>() / beta_;
-      if (alpha_ != 1.f)
-        attr.append_post_eltwise(1.f, alpha_, 0.f, attr.kind_with_linear);
-      attr.append_post_binary(attr.kind_with_binary_add, binary);
-      if (beta_ != 1.f)
-        attr.append_post_eltwise(1.f, beta_, 0.f, attr.kind_with_linear);
+      attr.append_post_eltwise(1.f, alpha_, 0.f, attr.kind_with_linear);
+      attr.append_post_binary<true>(attr.kind_with_binary_add, binary);
+      attr.append_post_eltwise(1.f, beta_, 0.f, attr.kind_with_linear);
     }
   }
   onednn::matmul(result, mat1, mat2, bias, true, attr);
@@ -185,8 +185,6 @@ Tensor& baddbmm_out(
   TORCH_CHECK(batch1.dim() == 3, "expected 3D tensor");
   TORCH_CHECK(batch2.dim() == 3, "expected 3D tensor");
 
-  bool is_inplace = result.is_same(input);
-
   std::vector<int64_t> result_shape = {
       batch1.size(0), batch1.size(1), batch2.size(2)};
   result.resize_(result_shape);
@@ -216,27 +214,30 @@ Tensor& baddbmm_out(
   // general case
   onednn::Attr attr;
   float beta_ = beta.to<float>();
+  float alpha_ = beta_ == 0.f ? alpha.to<float>() : alpha.to<float>() / beta_;
   Tensor binary;
   if (beta_ == 0.f) {
-    if (alpha.to<float>() != 1.f) {
-      attr.append_post_eltwise(
-          1.f, alpha.to<float>(), 0.f, attr.kind_with_linear);
-    }
+    attr.append_post_eltwise(1.f, alpha_, 0.f, attr.kind_with_linear);
   } else {
-    // We use post_binary here for adding self matrix.
-    // To avoid wrong write, here we clone input for inplace case.
-    if (is_inplace)
-      binary = input.dim() < 3 ? input.unsqueeze(0).clone() : input.clone();
-    else
-      binary = input.dim() < 3 ? input.unsqueeze(0) : input;
+    binary = input.dim() < 3 ? input.unsqueeze(0) : input;
     // If input is a 1d tensor need be broadcasted, we need unsqueeze twice.
     binary = binary.dim() < 3 ? binary.unsqueeze_(0) : binary;
-    float alpha_ = alpha.to<float>() / beta_;
-    if (alpha_ != 1.f)
+    bool inplace = binary.is_same(result);
+    if (inplace) {
+      attr.append_post_eltwise(
+          1.f, alpha.to<float>(), 0.f, attr.kind_with_linear);
+      attr.append_post_sum(beta_);
+    } else {
+      if (at::native::onednn::is_broadcast(binary)) {
+        at::native::onednn::undo_broadcast(binary);
+      }
+      binary = at::native::onednn::is_onednn_matmul_strides(binary)
+          ? binary
+          : binary.contiguous();
       attr.append_post_eltwise(1.f, alpha_, 0.f, attr.kind_with_linear);
-    attr.append_post_binary(attr.kind_with_binary_add, binary);
-    if (beta_ != 1.f)
+      attr.append_post_binary<true>(attr.kind_with_binary_add, binary);
       attr.append_post_eltwise(1.f, beta_, 0.f, attr.kind_with_linear);
+    }
   }
   onednn::matmul(result, batch1, batch2, at::Tensor(), true, attr);
   return result;
```
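
The linear post-op folding in the hunks above can be sanity-checked with scalar arithmetic; this sketch only restates the identity from the diff comments, `alpha * matmul + beta * binary == beta * (alpha/beta * matmul + binary)`:

```cpp
#include <cassert>
#include <cmath>

int main() {
  float alpha = 2.5f, beta = 0.5f;
  float mm = 3.0f;  // one element of matmul(mat1, mat2)
  float bin = 7.0f; // one element of the binary (self) operand

  float direct = alpha * mm + beta * bin;
  float alpha_ = alpha / beta;              // folded into kind_with_linear
  float fused = beta * (alpha_ * mm + bin); // linear -> binary_add -> linear
  assert(std::fabs(direct - fused) < 1e-5f);
  return 0;
}
```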

aten/src/ATen/native/mkldnn/xpu/detail/Attr.h

Lines changed: 11 additions & 3 deletions
```diff
@@ -193,18 +193,26 @@ class Attr {
   }
 
   // append binary post op
+  template <bool is_matmul = false>
   Attr& append_post_binary(dnnl::algorithm algo, const at::Tensor& binary) {
     auto binary_ = binary.is_quantized() ? at::dequantize(binary) : binary;
     bool binary_is_channels_last =
         (binary_.suggest_memory_format() == at::MemoryFormat::ChannelsLast ||
          binary_.suggest_memory_format() == at::MemoryFormat::ChannelsLast3d);
 
-    binary_ = binary_is_channels_last ? binary_ : binary_.contiguous();
+    if constexpr (!is_matmul) {
+      binary_ = binary_is_channels_last ? binary_ : binary_.contiguous();
+    }
     dnnl::memory::desc md = get_onednn_md(binary_);
     auto expected_md = dnnl::memory::desc(
         md.get_dims(), md.get_data_type(), dnnl::memory::format_tag::any);
-    ops_params_.push_back(
-        PostOpParam(binary_, md, expected_md, algo, kind_t::binary));
+    if constexpr (is_matmul) {
+      ops_params_.push_back(PostOpParam(binary_, md, md, algo, kind_t::binary));
+    } else {
+      ops_params_.push_back(
+          PostOpParam(binary_, md, expected_md, algo, kind_t::binary));
+    }
+
     return *this;
   }
 
```
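The effect of the new `is_matmul` branch is that the binary operand's real (possibly strided) descriptor is passed through instead of a `format_tag::any` placeholder, so oneDNN is not given license to pick a layout that would force a copy. A standalone sketch of the two descriptor kinds (assumes oneDNN headers are available; illustrative, not the PyTorch build):

```cpp
#include <oneapi/dnnl/dnnl.hpp>

int main() {
  using namespace dnnl;
  memory::dims dims = {5, 1};

  // Strided descriptor: records the tensor's actual strides, e.g. the
  // shape-(5, 1)/stride-(2, 2) case mentioned in Blas.cpp above.
  memory::desc real_md(dims, memory::data_type::f32, memory::dims{2, 2});

  // "any" descriptor: leaves the layout to oneDNN (the expected_md used on
  // the non-matmul path), which may not match the user-visible strides.
  memory::desc any_md(dims, memory::data_type::f32, memory::format_tag::any);
  return 0;
}
```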
aten/src/ATen/native/mkldnn/xpu/detail/Matmul.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -42,7 +42,7 @@ sycl::event matmul(
   m1 = is_onednn_matmul_strides(m1) ? m1 : m1.contiguous();
   m2 = is_onednn_matmul_strides(m2) ? m2 : m2.contiguous();
   at::Tensor dst =
-      is_onednn_matmul_strides(result, true) ? result : result.contiguous();
+      is_onednn_matmul_strides(result) ? result : result.contiguous();
 
   int64_t m = dst.size(-2);
   int64_t n = dst.size(-1);
```
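
With `is_onednn_matmul_strides` now applied to the destination as well, a transposed `out` tensor whose strides oneDNN can express is written directly rather than through a `contiguous()` round-trip. A hedged usage sketch (assumes a PyTorch build with XPU support; the copy behavior, not the numerical result, is what changes):

```cpp
#include <ATen/ATen.h>

int main() {
  auto opts = at::TensorOptions().device(at::kXPU);
  at::Tensor a = at::randn({64, 32}, opts);
  at::Tensor b = at::randn({32, 48}, opts);

  // Column-major destination: sizes (64, 48), strides (1, 64).
  at::Tensor out = at::empty({48, 64}, opts).t();

  // Previously the kernel always materialized a contiguous dst and copied
  // back; now the non-contiguous dst is handed to oneDNN directly.
  at::mm_out(out, a, b);
  return 0;
}
```

The identical one-line change in QMatmul.cpp below applies the same reasoning to the quantized path.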

aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -132,7 +132,7 @@ void quantized_matmul(
   at::Tensor m1 = is_onednn_matmul_strides(mat1) ? mat1 : mat1.contiguous();
   at::Tensor m2 = is_onednn_matmul_strides(mat2) ? mat2 : mat2.contiguous();
   at::Tensor dst =
-      is_onednn_matmul_strides(result, true) ? result : result.contiguous();
+      is_onednn_matmul_strides(result) ? result : result.contiguous();
 
   int64_t m = dst.size(-2);
   int64_t n = dst.size(-1);
```

aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp

Lines changed: 25 additions & 16 deletions
```diff
@@ -257,14 +257,30 @@ void undo_broadcast_on_batch(at::Tensor& m1, at::Tensor& m2) {
             {tensor.stride(dim_m), tensor.stride(dim_n)})
             .unsqueeze(dim_b);
   }
+}
+
+void undo_broadcast(at::Tensor& tensor) {
+  // pytorch use stride = 0 for the dim to be broadcasted, but oneDNN only
+  // support shape(dim) = 1 to implicitly indicate the broadcast dim.
+  std::vector<int64_t> new_shape;
+  std::vector<int64_t> new_strides;
+  std::vector<int64_t> unsqueeze_dims;
+  for (int i = 0; i < tensor.dim(); i++) {
+    if (tensor.stride(i) == 0) {
+      unsqueeze_dims.push_back(i);
+    } else {
+      new_shape.push_back(tensor.size(i));
+      new_strides.push_back(tensor.stride(i));
+    }
+  }
+  tensor = tensor.as_strided(new_shape, new_strides);
+  for (size_t i = 0; i < unsqueeze_dims.size(); i++) {
+    tensor = tensor.unsqueeze(unsqueeze_dims[i]);
+  }
   return;
 }
 
-bool is_onednn_matmul_strides(const at::Tensor& tensor, bool is_dst) {
-  // TODO: We always call contiguous on dst.
-  // delete it after fix the case that dst is transposed on batch and m dim.
-  if (is_dst)
-    return false;
+bool is_onednn_matmul_strides(const at::Tensor& tensor) {
   // https://oneapi-src.github.io/oneDNN/dev_guide_matmul.html
   // oneDNN matmul only support 2-dim and 3-dim
   // 2D src(Mxk), wei(KxN), dst(MxN)
@@ -289,17 +305,10 @@ bool is_onednn_matmul_strides(const at::Tensor& tensor, bool is_dst) {
   if (is_broadcast(tensor)) {
     return false;
   }
-  if (is_dst) {
-    // The memory format of the destination tensor should always be plain
-    // with n axis contiguous
-    if (strides[tensor_dim - 1] != 1)
-      return false;
-  } else {
-    // the src and weight must have at least one of the axes
-    // m or k and n or k contiguous (i.e., stride=1) respectively.
-    if (strides[tensor_dim - 1] != 1 && strides[tensor_dim - 2] != 1)
-      return false;
-  }
+  // the src and weight must have at least one of the axes
+  // m or k and n or k contiguous (i.e., stride=1) respectively.
+  if (strides[tensor_dim - 1] != 1 && strides[tensor_dim - 2] != 1)
+    return false;
 
   if (!onednn_strides_check(tensor))
     return false;
```
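
`undo_broadcast` rewrites PyTorch's stride-0 broadcast dims into size-1 dims, which is the form oneDNN understands. A small CPU-side illustration of the same transformation (standalone sketch, not the library code path):

```cpp
#include <ATen/ATen.h>
#include <iostream>

int main() {
  // expand() marks the broadcast dim with stride 0: sizes (5, 4), strides (1, 0).
  at::Tensor t = at::randn({5, 1}).expand({5, 4});

  // undo_broadcast drops the stride-0 dim, then re-inserts it with size 1,
  // yielding well-defined strides that oneDNN can broadcast implicitly.
  at::Tensor u = t.as_strided({5}, {1}).unsqueeze(1);

  std::cout << u.sizes() << " / " << u.strides() << std::endl; // [5, 1] / [1, 1]
  return 0;
}
```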

aten/src/ATen/native/mkldnn/xpu/detail/Utils.h

Lines changed: 2 additions & 1 deletion
```diff
@@ -42,8 +42,9 @@ dnnl::memory::desc get_onednn_md(const at::Tensor& tensor);
 bool onednn_strides_check(const at::Tensor& src);
 bool is_broadcast(const at::Tensor& t);
 void undo_broadcast_on_batch(at::Tensor& m1, at::Tensor& m2);
+void undo_broadcast(at::Tensor& tensor);
 
-bool is_onednn_matmul_strides(const at::Tensor& tensor, bool is_dst = false);
+bool is_onednn_matmul_strides(const at::Tensor& tensor);
 
 bool is_broadcast_from_other_to_self(
     const at::Tensor& self,
```

0 commit comments