[inductor] weight prepack for _convolution_transpose_pointwise by chunyuan-w · Pull Request #90266 · pytorch/pytorch · GitHub

[inductor] weight prepack for _convolution_transpose_pointwise #90266


Closed
wants to merge 19 commits into from
Changes from 1 commit

Commits (19)
3a3fed6
[inductor] weight prepack for _convolution_transpose_pointwise
chunyuan-w Dec 6, 2022
4b3185c
Update on "[inductor] weight prepack for _convolution_transpose_point…
chunyuan-w Dec 6, 2022
0b90c29
Update on "[inductor] weight prepack for _convolution_transpose_point…
chunyuan-w Dec 6, 2022
a215137
Update on "[inductor] weight prepack for _convolution_transpose_point…
chunyuan-w Dec 6, 2022
6082d2e
Update on "[inductor] weight prepack for _convolution_transpose_point…
chunyuan-w Dec 8, 2022
99b142d
Update on "[inductor] weight prepack for _convolution_transpose_point…
chunyuan-w Dec 8, 2022
0335b8a
Update on "[inductor] weight prepack for _convolution_transpose_point…
chunyuan-w Dec 9, 2022
f7785a1
Update on "[inductor] weight prepack for _convolution_transpose_point…
chunyuan-w Dec 9, 2022
cb2baa5
Update on "[inductor] weight prepack for _convolution_transpose_point…
chunyuan-w Dec 12, 2022
455c405
Update on "[inductor] weight prepack for _convolution_transpose_point…
chunyuan-w Dec 12, 2022
6332bf9
Update on "[inductor] weight prepack for _convolution_transpose_point…
chunyuan-w Dec 13, 2022
29f422b
Update on "[inductor] weight prepack for _convolution_transpose_point…
chunyuan-w Dec 13, 2022
2af66a4
Update on "[inductor] weight prepack for _convolution_transpose_point…
chunyuan-w Dec 15, 2022
762de3e
Update on "[inductor] weight prepack for _convolution_transpose_point…
chunyuan-w Dec 15, 2022
84fe700
Update on "[inductor] weight prepac 8000 k for _convolution_transpose_point…
chunyuan-w Dec 15, 2022
2600a85
Update on "[inductor] weight prepack for _convolution_transpose_point…
chunyuan-w Dec 15, 2022
c63866c
Update on "[inductor] weight prepack for _convolution_transpose_point…
chunyuan-w Dec 15, 2022
1e6b9bb
Update on "[inductor] weight prepack for _convolution_transpose_point…
chunyuan-w Dec 19, 2022
7ccfae3
Update on "[inductor] weight prepack for _convolution_transpose_point…
chunyuan-w Dec 19, 2022
Update on "[inductor] weight prepack for _convolution_transpose_point…
…wise"

cc VitalyFedyunin jgong5 mingfeima XiaobingSuper sanchitintel ashokei jingxu10 mlazos soumith voznesenskym yanboliang penguinwu anijain2305 EikanWang Guobing-Chen zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx peterbell10 desertfire

[ghstack-poisoned]
chunyuan-w committed Dec 9, 2022
commit 0335b8acfb9f45b40c031e51dc77f0c4947229c3
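
For context before reading the diff: below is a minimal, hypothetical sketch (not part of this PR) of the kind of module the optimization targets — a ConvTranspose2d followed by a pointwise op, compiled with the inductor backend on CPU so that the transposed-convolution weight can be prepacked into oneDNN's blocked format at compile time. Whether prepacking and fusion actually apply depends on the build (oneDNN CPU support) and the inductor configuration.

import torch

class DeconvReLU(torch.nn.Module):
    # ConvTranspose2d + pointwise op (ReLU): the "_convolution_transpose_pointwise"
    # pattern that this PR adds weight prepacking for.
    def __init__(self):
        super().__init__()
        self.deconv = torch.nn.ConvTranspose2d(3, 16, kernel_size=3, stride=2)

    def forward(self, x):
        return torch.relu(self.deconv(x))

mod = DeconvReLU().eval()
x = torch.randn(1, 3, 8, 8)
with torch.no_grad():
    compiled = torch.compile(mod, backend="inductor")
    print(compiled(x).shape)  # torch.Size([1, 16, 17, 17])
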
111 changes: 37 additions & 74 deletions torch/_inductor/ir.py
@@ -3349,71 +3349,6 @@ def get_template_tiling(self):
)


def _prepare_convolution_fusion_create(
cls,
x: "TensorBox",
weight: "TensorBox",
bias: "TensorBox",
padding_: List[int],
stride_: List[int],
dilation_: List[int],
groups: int,
transposed: bool = False,
output_padding_: List[int] = None,
):
"""
This function is a helper function to prepare inputs, layout and constant args
for convolution post-op fusion's create function, including deciding the output
layout (channels first or channels last), realizing inputs and make them etc. The
function only supports the CPU device since conv post-op fusion kernel is only
supported on CPU right now.
"""
stride = tuple(stride_)
padding = tuple(padding_)
dilation = tuple(dilation_)
assert isinstance(groups, int)
output_padding = tuple(output_padding_) if output_padding_ else (0, 0)
with torch._subclasses.FakeTensorMode():
x_fake = ir_node_to_tensor(x, guard_shape=True)
weight_fake = ir_node_to_tensor(weight, guard_shape=True)
bias_fake = (
ir_node_to_tensor(bias, guard_shape=True) if bias is not None else bias
)
output = torch.ops.aten.convolution(
x_fake,
weight_fake,
bias_fake,
stride,
padding,
dilation,
False,
[0, 0],
groups,
)
output_size = output.size()
req_stride_order = [0] + list(reversed(range(1, len(stride) + 1)))
req_stride_order = [len(req_stride_order)] + req_stride_order
output_stride = make_channels_last_strides_for(output_size)

x = cls.require_stride_order(x, req_stride_order)
assert x.get_device().type == "cpu" and weight.get_device().type == "cpu"
inputs = [x, weight]

kernel_layout = FixedLayout(
x.get_device(),
x.get_dtype(),
output.size(),
output_stride,
)
constant_args = [padding, stride, dilation, groups]

if bias is not None:
inputs.append(bias)
else:
constant_args.insert(0, bias)
return inputs, constant_args, kernel_layout, req_stride_order


# Port from aten/src/ATen/native/ConvUtils.h: _conv_input_size
def _conv_input_size(
output_size, weight_size, padding, output_padding, stride, dilation, groups
@@ -3461,7 +3396,7 @@ def _original_deconv_weight_size(
return weight_size


def _prepare_convolution_transpose_fusion_create(
def _prepare_convolution_fusion_create(
cls,
x: "TensorBox",
weight: "TensorBox",
@@ -3470,11 +3405,12 @@ def _prepare_convolution_transpose_fusion_create(
stride_: List[int],
dilation_: List[int],
groups: int,
output_padding_: List[int],
transposed: bool = False,
output_padding_: List[int] = None,
):
"""
This function is a helper function to prepare inputs, layout and constant args
for convolution_transpose post-op fusion's create function, including deciding the output
for convolution post-op fusion's create function, including deciding the output
layout (channels first or channels last), realizing inputs and make them etc. The
function only supports the CPU device since conv post-op fusion kernel is only
supported on CPU right now.
@@ -3483,20 +3419,47 @@
padding = tuple(padding_)
dilation = tuple(dilation_)
assert isinstance(groups, int)
output_padding = tuple(output_padding_)
output_padding = tuple(output_padding_) if output_padding_ else (0, 0)
with torch._subclasses.FakeTensorMode():
x_fake = ir_node_to_tensor(x, guard_shape=True)
weight_fake = ir_node_to_tensor(weight, guard_shape=True)

weight_size = _original_deconv_weight_size(weight_fake, groups)
input_size = x_fake.size()
output_size = _conv_input_size(
input_size, weight_size, padding, output_padding, stride, dilation, groups
)
if transposed:
# When transposed, the size of the prepacked oneDNN weight is different
# from the PyTorch weight. We're not able to run aten conv with such
# size. We infer the output size from the input params here:
weight_size = _original_deconv_weight_size(weight_fake, groups)
input_size = x_fake.size()
output_size = _conv_input_size(
input_size,
weight_size,
padding,
output_padding,
stride,
dilation,
groups,
)
else:
bias_fake = (
ir_node_to_tensor(bias, guard_shape=True) if bias is not None else bias
)
output = torch.ops.aten.convolution(
x_fake,
weight_fake,
bias_fake,
stride,
padding,
dilation,
transposed,
output_padding,
groups,
)
output_size = output.size()

req_stride_order = [0] + list(reversed(range(1, len(stride) + 1)))
req_stride_order = [len(req_stride_order)] + req_stride_order
output_stride = make_channels_last_strides_for(output_size)

x = cls.require_stride_order(x, req_stride_order)
assert x.get_device().type == "cpu" and weight.get_device().type == "cpu"
inputs = [x, weight]
You are viewing a condensed version of this merge commit.
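
A note on the transposed branch added above: the output size cannot be obtained by running aten.convolution on fake tensors, because the prepacked oneDNN weight has a different shape than the PyTorch weight (as the inline comment explains), so the size is inferred from the input parameters instead. The sketch below is a standalone illustration (my own naming, not the PR's code) of the computation `_conv_input_size` performs, following the formula ported from aten/src/ATen/native/ConvUtils.h; it assumes the weight is in the original PyTorch deconv layout [C_in, C_out // groups, kH, kW] as recovered by `_original_deconv_weight_size`.

from typing import List

def conv_transpose_output_size(
    input_size: List[int],    # deconv input, e.g. [N, C_in, H, W]
    weight_size: List[int],   # original deconv weight, [C_in, C_out // groups, kH, kW]
    padding: List[int],
    output_padding: List[int],
    stride: List[int],
    dilation: List[int],
    groups: int,
) -> List[int]:
    dim = len(input_size)
    output_size = [0] * dim
    output_size[0] = input_size[0]            # batch size is unchanged
    output_size[1] = weight_size[1] * groups  # deconv output channels
    for d in range(2, dim):
        kernel = dilation[d - 2] * (weight_size[d] - 1) + 1
        output_size[d] = (
            (input_size[d] - 1) * stride[d - 2]
            - 2 * padding[d - 2]
            + kernel
            + output_padding[d - 2]
        )
    return output_size

# [1, 3, 8, 8] input, 3 -> 16 channels, 3x3 kernel, stride 2 -> [1, 16, 17, 17]
print(conv_transpose_output_size([1, 3, 8, 8], [3, 16, 3, 3], [0, 0], [0, 0], [2, 2], [1, 1], 1))

The output size of a transposed convolution equals the input size of the matching forward convolution, which is why the helper ported from ConvUtils.h is called `_conv_input_size`.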
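
The `req_stride_order` computed at the end of the helper encodes a channels-last ordering. A small illustration of what the two expressions evaluate to for a 2-D convolution, assuming inductor's convention that entry i gives the rank of dimension i's stride, from smallest to largest:

stride = (1, 1)  # any 2-D conv stride; only its length matters here
req_stride_order = [0] + list(reversed(range(1, len(stride) + 1)))  # [0, 2, 1]
req_stride_order = [len(req_stride_order)] + req_stride_order       # [3, 0, 2, 1]
print(req_stride_order)

Here [3, 0, 2, 1] means dim 0 (N) has the largest stride and dim 1 (C) the smallest, i.e. NHWC/channels-last layout, consistent with the output strides produced by `make_channels_last_strides_for`.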