[Intel GPU] qlinear at XPU backend by ZhiweiYan-96 · Pull Request #133307 · pytorch/pytorch · GitHub

[Intel GPU] qlinear at XPU backend #133307


Closed · wants to merge 52 commits

Changes from 1 commit

Commits (52)
bed8720 Update (ZhiweiYan-96, Aug 13, 2024)
06bf0f2 Update (ZhiweiYan-96, Aug 19, 2024)
f2136d8 Update (ZhiweiYan-96, Aug 21, 2024)
a3c07aa Update (ZhiweiYan-96, Aug 27, 2024)
0dc3ab2 Update (ZhiweiYan-96, Sep 3, 2024)
eacaafd Update (ZhiweiYan-96, Sep 5, 2024)
9d02aa4 Update (ZhiweiYan-96, Sep 5, 2024)
b458071 Update (ZhiweiYan-96, Oct 9, 2024)
8718de2 Update (ZhiweiYan-96, Oct 17, 2024)
d6c7879 Update (ZhiweiYan-96, Oct 21, 2024)
c75fcad Update (ZhiweiYan-96, Oct 23, 2024)
3a5307c Update (ZhiweiYan-96, Oct 23, 2024)
e479ff3 Update (ZhiweiYan-96, Oct 24, 2024)
5b32861 Update (ZhiweiYan-96, Oct 24, 2024)
2732057 Update (ZhiweiYan-96, Oct 26, 2024)
fc42187 Update (ZhiweiYan-96, Oct 27, 2024)
dfacc8e Update (ZhiweiYan-96, Oct 29, 2024)
969d5c6 Update (ZhiweiYan-96, Oct 29, 2024)
9b64424 Update (ZhiweiYan-96, Oct 29, 2024)
7fd8bcc Update (ZhiweiYan-96, Oct 30, 2024)
3431fd4 Update (ZhiweiYan-96, Nov 2, 2024)
db2eca3 Update (ZhiweiYan-96, Nov 3, 2024)
3103c58 Update (ZhiweiYan-96, Nov 4, 2024)
88dd4a8 Update (ZhiweiYan-96, Nov 4, 2024)
c77d447 Update (ZhiweiYan-96, Nov 4, 2024)
d048068 Update (ZhiweiYan-96, Nov 4, 2024)
a8bf0f0 Update (ZhiweiYan-96, Nov 4, 2024)
39dcc4e Update (ZhiweiYan-96, Nov 4, 2024)
67c054a Update (ZhiweiYan-96, Nov 4, 2024)
03b6bba Update (ZhiweiYan-96, Nov 5, 2024)
a25333e Update (ZhiweiYan-96, Nov 5, 2024)
fe9039e Update (ZhiweiYan-96, Nov 21, 2024)
4b39daa Update (ZhiweiYan-96, Nov 28, 2024)
7890880 Update (ZhiweiYan-96, Dec 30, 2024)
c9f37be Update (ZhiweiYan-96, Dec 31, 2024)
b7e1794 Update (ZhiweiYan-96, Jan 2, 2025)
a9500f5 Update (ZhiweiYan-96, Jan 7, 2025)
8573301 Update (ZhiweiYan-96, Jan 7, 2025)
00d6d6f Update (ZhiweiYan-96, Jan 8, 2025)
1c5645c Update (ZhiweiYan-96, Jan 8, 2025)
4593a94 Update (ZhiweiYan-96, Jan 9, 2025)
d2716e9 Update (ZhiweiYan-96, Jan 10, 2025)
94a2ed9 Update (ZhiweiYan-96, Jan 16, 2025)
30de529 Update (ZhiweiYan-96, Jan 17, 2025)
1f5ed5a Update (ZhiweiYan-96, Jan 20, 2025)
9ebd45e Update (ZhiweiYan-96, Jan 20, 2025)
e577def Update (ZhiweiYan-96, Jan 22, 2025)
9f8ca53 Update (ZhiweiYan-96, Jan 23, 2025)
263f371 Update (ZhiweiYan-96, Feb 10, 2025)
f5aadd5 Update (guangyey, Feb 10, 2025)
8d7e570 Update (guangyey, Feb 11, 2025)
341eed1 Update (ZhiweiYan-96, Feb 17, 2025)
Viewing commit eacaafd8a1bb0a7731723fa842a129ab5fa11e89 (committed by ZhiweiYan-96 on Sep 5, 2024)
Commit message: Update
[ghstack-poisoned]
52 changes: 38 additions & 14 deletions aten/src/ATen/native/mkldnn/xpu/detail/QMatmul_pt2e.cpp
Collaborator (review comment): Does the file name align with other similar file names? If yes, please paste the path.

ZhiweiYan-96 (author): All the file names have been changed. Thanks for the reminder.

@@ -10,18 +10,42 @@
namespace at::native::onednn {

void quantized_matmul_pt2(
at::Tensor& result,
const at::Tensor& mat1,
const at::Tensor& mat2,
const at::Tensor& b_raw,
bool m2_trans,
double input_scale,
int64_t input_zero_point,
at::Tensor& weight_scales,
at::Tensor& weight_zero_points,
double output_scale,
int64_t output_zero_point,
Attr attr) {
at::Tensor mat1, // act
double input_scale,
int64_t input_zero_point,
at::Tensor mat2, // weight
at::Tensor& weight_scales,
at::Tensor& weight_zero_points,
at::Tensor& b_raw,
at::Tensor result, // output
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::optional<at::Tensor> other, // extra input for binary-post-op
double other_scale,
int64_t other_zero_point,
const c10::string_view& binary_post_op,
double binary_alpha,
const c10::string_view& unary_post_op,
torch::List<std::optional<at::Scalar>>& unary_post_op_args,
c10::string_view unary_post_op_algorithm){

bool m2_trans = true;

auto attr = Attr(output_scale, output_zero_point);

construct_attr_by_post_op(
binary_post_op,
binary_alpha,
input_scale,
input_zero_point,
unary_post_op,
unary_post_op_args,
unary_post_op_algorithm,
attr
);


size_t dims = result.dim();
at::Device curDevice = at::Device(at::kXPU, c10::xpu::current_device());
auto engine = GpuEngineManager::Instance().get_engine(curDevice);
@@ -63,7 +87,7 @@ void quantized_matmul_pt2(
if (b.dim() == 1) {
TORCH_CHECK(
b.size(0) == n || b.size(0) == 1,
"matmul supports [n] or [1] when bias dim is 1 ...");
"matmul supports [n] or [1] when bias dim is 1, but b.size() is:", b.size(0));
if (b.size(0) == 0) {
with_bias = false;
} else if (m1.dim() == 3) {
@@ -157,7 +181,7 @@ void quantized_matmul_pt2(
std::unordered_map<int, dnnl::memory> args;

dnnl::post_ops po;
// attr.extract_post_ops(dst, true);
attr.extract_post_ops(dst, true);
bool m1_need_zp = (input_zero_point != 0);
// wgh should never have zero point
bool wgh_is_per_channel = weight_scales.numel() > 1;
32 changes: 20 additions & 12 deletions aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h
@@ -134,17 +134,25 @@ at::Tensor quantized_convolution_pt2(
c10::optional<c10::string_view> unary_algorithm);

void quantized_matmul_pt2(
at::Tensor& result,
const at::Tensor& mat1,
const at::Tensor& mat2,
const at::Tensor& b_raw,
bool m2_trans,
double input_scale,
int64_t input_zero_point,
at::Tensor& weight_scales,
at::Tensor& weight_zero_points,
double output_scale,
int64_t output_zero_point,
Attr attr);
at::Tensor mat1, // act
double input_scale,
int64_t input_zero_point,
at::Tensor mat2, // weight
at::Tensor& weight_scales,
at::Tensor& weight_zero_points,
at::Tensor& b_raw,
at::Tensor result, // output
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::optional<at::Tensor> other, // extra input for binary-post-op
double other_scale,
int64_t other_zero_point,
const c10::string_view& binary_post_op,
double binary_alpha,
const c10::string_view& unary_post_op,
torch::List<std::optional<at::Scalar>>& unary_post_op_args,
c10::string_view unary_post_op_algorithm);


} // namespace at::native::onednn
93 changes: 79 additions & 14 deletions aten/src/ATen/native/mkldnn/xpu/qlinear_pt2e.cpp
@@ -1,6 +1,7 @@
#include <torch/library.h>

#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
#include "c10/core/ScalarType.h"

using namespace at::native::onednn;

@@ -22,6 +23,8 @@ Tensor q_linear_pointwise(
c10::string_view post_op_name,
torch::List<std::optional<at::Scalar>> post_op_args,
c10::string_view post_op_algorithm) {


Tensor b_raw = bias.has_value() ? bias.value() : at::Tensor();

const int64_t dim = act.dim();
@@ -34,21 +37,79 @@ Tensor q_linear_pointwise(
std::vector<int64_t> dst_dims = {M, N};
Tensor qout = at::empty(dst_dims, device(c10::kXPU).dtype(c10::kByte));

Attr attr = Attr();
quantized_matmul_pt2(
act.contiguous(),
act_scale,
act_zero_point,
weight.contiguous(),
weight_scales,
weight_zero_points,
b_raw,
qout,
output_scale,
output_zero_point,
output_dtype,
/*other*/ std::nullopt,
/*other scale*/ 1.0,
/*other zp*/0,
/*binary post op*/ "none",
/*binary alpha*/1.0,
post_op_name,
post_op_args,
post_op_algorithm
);

return qout;
}

Tensor q_linear_pointwise_tensor(
Tensor act,
Tensor act_scale,
Tensor act_zero_point,
Tensor weight,
Tensor weight_scales,
Tensor weight_zero_points,
std::optional<Tensor> bias,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
c10::string_view post_op_name,
torch::List<std::optional<at::Scalar>> post_op_args,
c10::string_view post_op_algorithm
){
Tensor b_raw = bias.has_value() ? bias.value() : at::Tensor();

const int64_t dim = act.dim();
int64_t K = act.size(dim - 1);
int64_t M = act.numel() / K;
// [M, K] x [K, N]
int64_t N = weight.size(1);

std::vector<int64_t> src_dims = {M, K};
std::vector<int64_t> dst_dims = {M, N};
Tensor qout = at::empty(dst_dims, device(c10::kXPU).dtype(c10::kByte));

quantized_matmul_pt2(
qout,
act,
weight,
b_raw,
/*m2_trans=*/false,
act_scale,
act_zero_point,
weight_scales,
weight_zero_points,
output_scale,
output_zero_point,
attr);
act.contiguous(),
act_scale.item().toDouble(),
act_zero_point.item().toLong(),
weight.contiguous(),
weight_scales,
weight_zero_points,
b_raw,
qout,
output_scale,
output_zero_point,
output_dtype,
/*other*/ std::nullopt,
/*other scale*/ 1.0,
/*other zp*/0,
/*binary post op*/ "none",
/*binary alpha*/1.0,
post_op_name,
post_op_args,
post_op_algorithm
);

return qout;
}
@@ -57,14 +118,18 @@ Tensor q_linear_pointwise(
at::Tensor q_linear_prepack_onednn(
at::Tensor weight,
c10::optional<torch::List<int64_t>> input_shape) {
Collaborator (review comment): Is input_shape useless?

ZhiweiYan-96 (author, Jan 7, 2025): Yes, but we need to keep this argument because input_shape is defined in the op schema; the op cannot be registered if the argument is removed.

Why input_shape appears in the argument list: x86InductorQuantizer uses this shape to pick a suitable blocked-format weight. That is not required on the XPU side; all we need is to transpose the weight (see the sketch just below).

return weight;
at::Tensor weight_transposed = weight.transpose(0, 1);
return weight_transposed;
}
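For context on the author's reply above: on the XPU path the prepack step amounts to the single transpose shown in this hunk. Below is a minimal, hypothetical Python sketch of that behavior (the shape, dtype, and variable names are illustrative and not taken from the PR; it only mirrors the `weight.transpose(0, 1)` in the diff):

```python
import torch

# Hypothetical quantized linear weight of shape [N, K] (out_features x in_features).
N, K = 16, 4
weight = torch.randint(-128, 127, (N, K), dtype=torch.int8)

# The XPU prepack in this PR only transposes the weight to [K, N]; it does not
# build a blocked/opaque layout (the x86 path may, guided by input_shape).
packed_weight = weight.transpose(0, 1)
assert packed_weight.shape == (K, N)
```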


TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise"),
TORCH_FN(q_linear_pointwise));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.tensor"),
TORCH_FN(q_linear_pointwise_tensor));
m.impl(
TORCH_SELECTIVE_NAME("onednn::qlinear_prepack"),
TORCH_FN(q_linear_prepack_onednn));
20 changes: 10 additions & 10 deletions test/inductor/test_mkldnn_pattern_matcher.py
@@ -1419,6 +1419,7 @@ def matcher_check_fn():
def _qlinear_cpu_test_helper(
self,
inputs,
device="cpu",
int8_mixed_bf16=False,
do_permute=False,
matcher_check_fn=None,
Expand All @@ -1438,7 +1439,7 @@ def forward(self, x):
x = torch.reshape(torch.permute(x, (0, 2, 3, 1)), (2, 12, 4))
return self.linear2(self.linear(x))

mod = M(bias, do_permute=do_permute).eval().xpu()
mod = M(bias, do_permute=do_permute).eval().to(device=device)
Collaborator (review comment), suggested change:

mod = M(bias, do_permute=do_permute).eval().to(device=device)
assert isinstance(inputs, Tuple)
def __convert_tensor_to_device(input: Any, device: str):
    return input.to(device=device) if isinstance(input, Tensor) else input
inputs = tuple(__convert_tensor_to_device(input, device) for input in inputs)

ZhiweiYan-96 (author, Feb 10, 2025): Modified, thanks.
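As an aside, here is a self-contained, runnable version of the conversion helper suggested in the review comment above (the function name and the example inputs are illustrative; only the tensor/non-tensor dispatch mirrors the suggestion):

```python
from typing import Any

import torch
from torch import Tensor


def convert_tensor_to_device(value: Any, device: str) -> Any:
    # Move only Tensors; pass non-tensor inputs (flags, shapes, ...) through unchanged.
    return value.to(device=device) if isinstance(value, Tensor) else value


# A mixed tuple of tensor and non-tensor inputs, as the test helpers receive.
inputs = (torch.randn(2, 4), True)
inputs = tuple(convert_tensor_to_device(i, "cpu") for i in inputs)
```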


def _default_matcher_check_fn():
self.assertEqual(
@@ -1459,12 +1460,12 @@

@skipIfNoDynamoSupport
@skipIfNoONEDNN
def test_qlinear_cpu(self):
def test_qlinear_mkldnn(self, device="cpu"):
r"""
This testcase will quantize a single Linear Module.
"""
for bias in [True, False]:
self._qlinear_cpu_test_helper((torch.randn((2, 4)).xpu(),), bias=bias)
self._qlinear_cpu_test_helper((torch.randn((2, 4)).to(device=device),), device=device, bias=bias)

@skipIfNoDynamoSupport
@skipIfNoONEDNN
@@ -1587,7 +1588,7 @@ def matcher_check_fn():
)

def _qlinear_unary_cpu_test_helper(
self, inputs, unary_op=torch.nn.ReLU(), int8_mixed_bf16=False
self, inputs, unary_op=torch.nn.ReLU(), device="cpu", int8_mixed_bf16=False
):
class M(torch.nn.Module):
def __init__(self, use_bias):
Expand All @@ -1603,7 +1604,7 @@ def forward(self, x):

bias_list = [True, False]
for bias in bias_list:
mod = M(bias).eval()
mod = M(bias).eval().to(device=device)

def matcher_check_fn():
# 1. dequant-linear pattern matched in quantization weight prepack
@@ -1623,11 +1624,11 @@ def matcher_check_fn():

@skipIfNoDynamoSupport
@skipIfNoONEDNN
def test_qlinear_relu_cpu(self):
def test_qlinear_relu_mkldnn(self, device="cpu"):
r"""
This testcase will quantize a Linear->ReLU pattern.
"""
self._qlinear_unary_cpu_test_helper((torch.randn((2, 4)),))
self._qlinear_unary_cpu_test_helper((torch.randn((2, 4)).to(device=device),), device=device)

@skipIfNoDynamoSupport
@skipIfNoONEDNNBF16
@@ -1661,12 +1662,12 @@ def test_qlinear_relu_int8_mixed_bf16_input_dim_exceeds_2(self):

@skipIfNoDynamoSupport
@skipIfNoONEDNN
def test_qlinear_gelu_cpu(self):
def test_qlinear_gelu_mkldnn(self, device="cpu"):
r"""
This testcase will quantize a Linear->GELU pattern.
"""
for gelu in [torch.nn.GELU("none"), torch.nn.GELU("tanh")]:
self._qlinear_unary_cpu_test_helper((torch.randn((2, 4)),), gelu)
self._qlinear_unary_cpu_test_helper((torch.randn((2, 4)).to(device=device),), gelu, device=device)

@skipIfNoDynamoSupport
@skipIfNoONEDNNBF16
@@ -2831,4 +2832,3 @@ def matcher_check_fn():
if __name__ == "__main__":
if IS_LINUX and HAS_CPU and torch.backends.mkldnn.is_available():
run_tests()