Update on "[Cutlass] Implement Epilogue Argument emitter" · pytorch/pytorch@b34ea02 · GitHub

Commit b34ea02
Update on "[Cutlass] Implement Epilogue Argument emitter"
This implements epilogue visitor tree (EVT) argument generation (example argument type [here](https://github.com/NVIDIA/cutlass/blob/3fe62887d8dd75700fdaf57f9c181878701b0802/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp#L332)).

Details: The codegen task is to generate a tree of C++ structs, extracting the correct properties from Inductor buffers and writing them to the correct locations in the generated struct. To implement this with a minimum of code, I generate the cutlass DAGIR (the EVT internal representation), which has a dedicated pass, [pass_argument_type.py](https://github.com/NVIDIA/cutlass/blob/5e497243f7ad13a2aa842143f9b10bbb23d98292/python/cutlass/backend/evt/passes/pass_argument_type.py#L4), that generates a nested tree of custom argument types for each node in the DAGIR. This nested tree of constructors is then called with kwargs to fill in the proper values, where each node's name selects its value from the kwarg dictionary. This is non-customizable, however: the nested tree of EVT args is a nested tree of ctypes that expects *actual values*, so that the resulting object can be passed directly to the cutlass-python C++ runner. Inductor, on the other hand, needs to fill this struct with C++ string expressions representing the values (or extracting them from kernel launcher args). `_render_argument_type` implements this: it iterates over the tree of types created by pass_argument_type.py and generates a string representing the nested structs, filling in C++ expressions for the individual fields.

Long term plan: I will ask NVIDIA to provide an overridable [visitor_factory](https://github.com/NVIDIA/cutlass/blob/5e497243f7ad13a2aa842143f9b10bbb23d98292/python/cutlass/backend/evt/passes/pass_argument_type.py#L82), which would let us override the behavior of pass_argument_type.py and generate the desired string during DAGIR generation.

Previously merged:
* #150346
* #150345
* #150344

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy chenyang78 kadeng muchulee8 amjames chauhang aakhundov

[ghstack-poisoned]
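For illustration, here is a minimal, hypothetical sketch of the emission strategy (the function name, tree shape, and example structs below are illustrative, not the actual Inductor or cutlass code): recursively walk a nested ctypes argument type of the kind pass_argument_type.py produces and emit a braced C++ aggregate initializer, substituting C++ expression strings at the leaves instead of real values.

```python
import ctypes

def render_argument_type(arg_ty, cpp_exprs, prefix=""):
    """Render `arg_ty` as a C++ initializer string.

    `cpp_exprs` maps a leaf field's dotted name to the C++ expression
    that should appear in the generated struct (e.g. a kernel launcher
    argument or a literal).
    """
    if isinstance(arg_ty, type) and issubclass(arg_ty, ctypes.Structure):
        parts = []
        for field_name, field_ty in arg_ty._fields_:
            dotted = f"{prefix}.{field_name}" if prefix else field_name
            parts.append(
                f"/* {field_name} */ "
                + render_argument_type(field_ty, cpp_exprs, dotted)
            )
        return "{" + ", ".join(parts) + "}"
    # Leaf field: emit the caller-provided C++ expression, defaulting to {}.
    return cpp_exprs.get(prefix, "{}")


# Hypothetical EVT argument struct for `alpha * acc + beta * C`.
class AlphaArgs(ctypes.Structure):
    _fields_ = [("alpha", ctypes.c_float)]

class EpilogueArgs(ctypes.Structure):
    _fields_ = [("alpha_args", AlphaArgs), ("beta", ctypes.c_float)]

print(render_argument_type(
    EpilogueArgs,
    {"alpha_args.alpha": "kernel_args.alpha", "beta": "1.0f"},
))
# -> {/* alpha_args */ {/* alpha */ kernel_args.alpha}, /* beta */ 1.0f}
```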
2 parents 750b7f5 + 119573c commit b34ea02

File tree

93 files changed: +1937, -1162 lines
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-4022ff142a5392aa5197e05f4dfe85d356f742bf
+17cbef50fd4ac8488632367a864aa01a2c0019ef

.ci/docker/requirements-docs.txt

Lines changed: 8 additions & 2 deletions
@@ -1,15 +1,20 @@
 sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 5.3.0
--e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@c49afc2aff734d40813b0ca182bb49b611d7a30c#egg=pytorch_sphinx_theme2
 
 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought is probably
 # something related to Docker setup. We can investigate this later
+
 sphinxcontrib.katex==0.8.6
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 0.8.6
 
+sphinxext-opengraph==0.9.1
+#Description: This is used to generate PyTorch docs
+#Pinned versions: 0.9.1
+
 matplotlib==3.5.3
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 3.5.3
@@ -46,5 +51,6 @@ myst-nb==0.17.2
 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
 python-etcd==0.4.5
 sphinx-copybutton==0.5.0
-sphinx-panels==0.4.1
+sphinx-design==0.4.0
+sphinxcontrib-mermaid==1.0.0
 myst-parser==0.18.1

.ci/pytorch/macos-test.sh

Lines changed: 19 additions & 17 deletions
@@ -221,25 +221,27 @@ test_torchbench_smoketest() {
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
   mkdir -p "$TEST_REPORTS_DIR"
 
-  local backend=eager
   local dtype=notset
   local device=mps
-
-  touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
-  touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
-
-  echo "Setup complete, launching torchbench training performance run"
-  for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do
-    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-      --performance --only "$model" --backend "$backend" --training --devices "$device" \
-      --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
-  done
-
-  echo "Launching torchbench inference performance run"
-  for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do
-    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-      --performance --only "$model" --backend "$backend" --inference --devices "$device" \
-      --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
+  local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152)
+
+  for backend in eager inductor; do
+    touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
+    touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
+
+    echo "Launching torchbench training performance run for backend ${backend}"
+    for model in "${models[@]}"; do
+      PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
+        --performance --only "$model" --backend "$backend" --training --devices "$device" \
+        --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv" || true
+    done
+
+    echo "Launching torchbench inference performance run for backend ${backend}"
+    for model in "${models[@]}"; do
+      PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
+        --performance --only "$model" --backend "$backend" --inference --devices "$device" \
+        --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" || true
+    done
   done
 
   echo "Pytorch benchmark on mps device completed"

.ci/pytorch/python_doc_push_script.sh

Lines changed: 0 additions & 6 deletions
@@ -119,12 +119,6 @@ popd
 git rm -rf "$install_path" || true
 mv "$pt_checkout/docs/build/html" "$install_path"
 
-# Prevent Google from indexing $install_path/_modules. This folder contains
-# generated source files.
-# NB: the following only works on gnu sed. The sed shipped with mac os is different.
-# One can `brew install gnu-sed` on a mac and then use "gsed" instead of "sed".
-find "$install_path/_modules" -name "*.html" -print0 | xargs -0 sed -i '/<head>/a \ \ <meta name="robots" content="noindex">'
-
 git add "$install_path" || true
 git status
 git config user.email "soumith+bot@pytorch.org"
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+name: Binary docker build
+
+description: Build docker image for binary builds
+
+inputs:
+  docker-image-name:
+    description: Docker image name for PR builds
+    required: true
+  docker-build-dir:
+    description: Location of the build.sh relative to .ci/docker
+    required: true
+  custom-tag-prefix:
+    description: Custom tag prefix for the docker image
+    required: false
+  DOCKER_TOKEN:
+    description: Docker token for authentication
+    required: true
+  DOCKER_ID:
+    description: Docker ID for authentication
+    required: true
+
+runs:
+  using: composite
+  steps:
+    - name: Checkout PyTorch
+      uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+
+    - name: Calculate docker image
+      id: calculate-docker-image
+      uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+      with:
+        docker-image-name: ${{ inputs.docker-image-name }}
+        docker-build-dir: .ci/docker
+        custom-tag-prefix: ${{ inputs.custom-tag-prefix }}
+        docker-build-script: ${{ inputs.docker-build-dir }}/build.sh
+        always-rebuild: true
+        push: true
+
+    - name: Tag and (if WITH_PUSH) push docker image to docker.io
+      env:
+        DOCKER_TOKEN: ${{ inputs.DOCKER_TOKEN }}
+        DOCKER_ID: ${{ inputs.DOCKER_ID }}
+        DOCKER_IMAGE_NAME: ${{ inputs.docker-image-name }}
+        DOCKER_IMAGE_PREFIX: ${{ inputs.custom-tag-prefix }}
+        CREATED_FULL_DOCKER_IMAGE_NAME: ${{ steps.calculate-docker-image.outputs.docker-image }}
+      shell: bash
+      run: |
+        set -euox pipefail
+        GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
+        GIT_BRANCH_NAME=${GITHUB_REF##*/}
+        GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+        CI_FOLDER_SHA=$(git rev-parse HEAD:.ci/docker)
+
+        DOCKER_IMAGE_NAME_PREFIX=docker.io/pytorch/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_PREFIX}
+
+        docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}
+        docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}
+        docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}
+
+        # Pretty sure Github will mask tokens and I'm not sure if it will even be
+        # printed due to pipe, but just in case
+        set +x
+        if [[ ${WITH_PUSH:-false} == "true" ]]; then
+          echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          docker push ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}
+          docker push ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}
+          docker push ${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}
+        fi

.github/workflows/inductor-perf-test-nightly-macos.yml

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,4 @@
-name: perf-nightly-macos
-# Technically not an inductor test, but uses it as a template for tracking macos performance
+name: inductor-perf-nightly-macos
 
 on:
   schedule:
@@ -24,6 +23,7 @@ on:
   pull_request:
     paths:
       - .github/workflows/inductor-perf-test-nightly-macos.yml
+      - .ci/pytorch/macos-test.sh
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}

aten/src/ATen/core/CachingHostAllocator.h

Lines changed: 11 additions & 2 deletions
@@ -620,12 +620,17 @@ struct CachingHostAllocatorImpl {
   alignas(64) HostStatsStaged stats_;
 };
 
-template <typename T>
+template <typename T, c10::DeleterFnPtr deleteFunc>
 struct CachingHostAllocatorInterface : public at::Allocator {
   CachingHostAllocatorInterface() : impl_(std::make_unique<T>()) {}
 
   at::DataPtr allocate(size_t size) override {
-    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for allocate");
+    auto ptr_and_ctx = impl_->allocate(size);
+    return {
+        ptr_and_ctx.first,
+        ptr_and_ctx.second,
+        deleteFunc, // Use the template parameter deleter function
+        at::DeviceType::CPU};
   }
 
   void free(void* ctx) {
@@ -661,5 +666,9 @@ struct CachingHostAllocatorInterface : public at::Allocator {
   std::unique_ptr<T> impl_;
 };
 
+#define DECLARE_HOST_ALLOCATOR(name, impl, deleter) \
+  struct name final                                 \
+      : public at::CachingHostAllocatorInterface<impl, deleter> {};
+
 } // namespace at
 C10_DIAGNOSTIC_POP()

aten/src/ATen/cuda/CachingHostAllocator.cpp

Lines changed: 1 addition & 11 deletions
@@ -251,17 +251,7 @@ struct CUDACachingHostAllocatorImpl
 
 void raw_local_deleter(void* ptr);
 
-struct CUDACachingHostAllocator final
-    : public CachingHostAllocatorInterface<CUDACachingHostAllocatorImpl> {
-  at::DataPtr allocate(size_t size) override {
-    auto ptr_and_ctx = impl_->allocate(size);
-    return {
-        ptr_and_ctx.first,
-        ptr_and_ctx.second,
-        &raw_local_deleter,
-        at::DeviceType::CPU};
-  }
-};
+DECLARE_HOST_ALLOCATOR(CUDACachingHostAllocator, CUDACachingHostAllocatorImpl, raw_local_deleter);
 
 CUDACachingHostAllocator caching_host_allocator;
aten/src/ATen/native/layer_norm.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <ATen/ops/empty_like.h>
1717
#include <ATen/ops/empty_like_native.h>
1818
#include <ATen/ops/layer_norm_native.h>
19+
#include <ATen/ops/_fused_rms_norm.h>
1920
#include <ATen/ops/native_batch_norm.h>
2021
#include <ATen/ops/native_layer_norm.h>
2122
#include <ATen/ops/native_layer_norm_backward_native.h>
@@ -27,7 +28,6 @@
2728
#endif
2829

2930
#ifdef USE_MPS
30-
#include <ATen/native/mps/operations/RMSNorm.h>
3131
#include <c10/core/GradMode.h>
3232
#endif
3333

@@ -281,7 +281,7 @@ Tensor rms_norm_symint(
281281

282282
if (!(GradMode::is_enabled() && any_inputs_require_grad) && !any_nested && is_input_fp && is_weight_fp) {
283283
auto eps_val = eps.value_or(std::numeric_limits<double>::epsilon());
284-
return mps::rms_norm_mps_kernel(input.contiguous(), normalized_shape, weight.contiguous(), eps_val);
284+
return at::_fused_rms_norm(input.contiguous(), normalized_shape.size(), weight.contiguous(), eps_val);
285285
}
286286
}
287287
#endif

aten/src/ATen/native/mps/kernels/UnaryKernel.metal

Lines changed: 20 additions & 0 deletions
@@ -67,9 +67,29 @@ struct sqrt_functor {
   }
 };
 
+struct bitwise_not_functor {
+  template <typename T>
+  inline enable_if_t<!is_same_v<T, bool> && is_scalar_integral_v<T>, T>
+  operator()(const T x) {
+    return ~x;
+  }
+
+  template <typename T>
+  inline enable_if_t<is_same_v<T, bool>, T> operator()(const T x) {
+    return !x;
+  }
+};
+
 DEFINE_UNARY_FLOATING_FUNCTOR(erfinv);
 DEFINE_UNARY_FLOATING_FUNCTOR(sinc);
 
+REGISTER_UNARY_OP(bitwise_not, int, int);
+REGISTER_UNARY_OP(bitwise_not, long, long);
+REGISTER_UNARY_OP(bitwise_not, short, short);
+REGISTER_UNARY_OP(bitwise_not, char, char);
+REGISTER_UNARY_OP(bitwise_not, uchar, uchar);
+REGISTER_UNARY_OP(bitwise_not, bool, bool);
+
 #define INSTANTIATE_UNARY_KERNELS2(DTYPE0, DTYPE1) \
   REGISTER_UNARY_OP(erfinv, DTYPE1, DTYPE0);       \
   REGISTER_UNARY_OP(exp, DTYPE1, DTYPE0);          \
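A hedged check of the new path (illustrative, not part of this commit): `torch.bitwise_not` on MPS is now backed by the `bitwise_not_functor` above, with integer dtypes using `~x` and bool using logical negation, matching the functor's two overloads.

```python
import torch

# Requires a Mac with an MPS-enabled PyTorch build.
if torch.backends.mps.is_available():
    ints = torch.tensor([0, 1, 255], dtype=torch.uint8, device="mps")
    bools = torch.tensor([True, False], device="mps")
    print(torch.bitwise_not(ints).cpu())   # tensor([255, 254,   0], dtype=torch.uint8)
    print(torch.bitwise_not(bools).cpu())  # tensor([False,  True])
```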

aten/src/ATen/native/mps/operations/BitwiseOps.mm

Lines changed: 0 additions & 58 deletions
@@ -5,7 +5,6 @@
 #include <ATen/native/Resize.h>
 #include <ATen/native/mps/OperationUtils.h>
 #include <ATen/ops/bitwise_and_native.h>
-#include <ATen/ops/bitwise_not_native.h>
 #include <ATen/ops/bitwise_or_native.h>
 #include <ATen/ops/bitwise_xor_native.h>
 #include <ATen/ops/logical_not_native.h>
@@ -100,11 +99,6 @@ kernel void bitwise_rshift_scalar_tensor(device {0} *out [[buffer(0)]],
   out[offset] = static_cast<{0}>(a) >> b[offset];
 }}
 
-kernel void bitwise_not(device {0} *out [[buffer(0)]],
-                        constant {1} *a [[buffer(1)]],
-                        uint offset [[thread_position_in_grid]]) {{
-  out[offset] = ~a[offset];
-}}
 )METAL",
     3);
 
@@ -200,54 +194,6 @@ static void _bitwise_op_out_mps(const Tensor& self,
   return;
 }
 
-static void _bitwise_not_out_mps(const Tensor& self, const Tensor& output_) {
-  // Handle boolean tensor using logical not
-  if (self.scalar_type() == c10::ScalarType::Bool) {
-    logical_not_out_mps(self, const_cast<Tensor&>(output_));
-    return;
-  }
-
-  Tensor output = output_;
-  bool needs_output_copy = false;
-
-  resize_output(output, self.sizes());
-  if (needsGather(output)) {
-    output = output.contiguous();
-    needs_output_copy = true;
-  }
-  if (self.dim() == 0) {
-    if (self.scalar_type() == c10::ScalarType::Byte) {
-      // Unsigned types need a special handling to keep result of operation in 0..255 output
-      output.fill_(c10::Scalar(static_cast<uint8_t>(~self.item<uint8_t>())));
-    } else {
-      output.fill_(c10::Scalar(~self.item<int64_t>()));
-    }
-    return;
-  }
-  uint32_t length = output.numel();
-  if (length == 0) {
-    return;
-  }
-  using namespace at::mps;
-  MPSStream* stream = getCurrentMPSStream();
-  auto cplState = getCPLState(output, self, self, "bitwise_not");
-  dispatch_sync(stream->queue(), ^() {
-    getMPSProfiler().beginProfileKernel(cplState, "bitwise_not", {self});
-
-    id<MTLComputeCommandEncoder> commandEncoder = stream->commandEncoder();
-
-    [commandEncoder pushDebugGroup:@"Dispatch bitwise_not kernel"];
-    [commandEncoder setComputePipelineState:cplState];
-    mtl_setArgs(commandEncoder, output, self);
-    mtl_dispatch1DJob(commandEncoder, cplState, length);
-
-    getMPSProfiler().endProfileKernel(cplState);
-  });
-  if (needs_output_copy) {
-    output_.copy_(output);
-  }
-}
-
 } // namespace mps
 namespace {
 void lshift_kernel_mps(TensorIteratorBase& iter) {
@@ -272,10 +218,6 @@ void rshift_kernel_mps(TensorIteratorBase& iter) {
   mps::_bitwise_op_out_mps(self, other, output, "xor");
 }
 
-TORCH_IMPL_FUNC(bitwise_not_out_mps)(const Tensor& self, const Tensor& output) {
-  mps::_bitwise_not_out_mps(self, output);
-}
-
 REGISTER_MPS_DISPATCH(lshift_stub, &lshift_kernel_mps)
 REGISTER_MPS_DISPATCH(rshift_stub, &rshift_kernel_mps)

aten/src/ATen/native/mps/operations/RMSNorm.h

Lines changed: 0 additions & 14 deletions
This file was deleted.

0 commit comments