Commit 6121330

Update on "[AOTAutograd] tweak min-cut partitioner to avoid saving softmax output"
Right now the linear + cross entropy loss operation (usually the last part of a transformer model) does the following:

1. Run the matmul to produce softmax_input.
2. Load softmax_input to compute the max per row.
3. Load softmax_input to compute the sum per row.
4. Load softmax_input, normalize it, and save the result to softmax_output.

Step 4 is inefficient because:

a. In the forward pass, only a small slice of the softmax_output tensor is needed to compute NLLLoss; materializing the whole tensor is overkill.
b. In the backward pass we need the whole softmax_output, but it can be recomputed from softmax_input (see the sketch below).

If we skip saving softmax_output, we get a perf win, since this is the largest tensor in the network. For llm.c, its size is batch_size * sequence_length * vocab_size * item_size ~= 32 * 1024 * 50257 * 2 ~= 3GB. Simply reading or writing a tensor that large takes ~2ms on an A100. Recomputing softmax_output instead saves one load of softmax_input and one store of softmax_output, which works out to a ~4ms saving.

To avoid saving softmax_output, the min-cut partitioner must decide to recompute it from softmax_input and the max/sum tensors (which are small) computed in steps 2 and 3. That does not happen today because the min-cut partitioner over-estimates the cost of recomputation. The fix, suggested by Chillee, is to let `dist_from_bw` play a less important role.

[ghstack-poisoned]
2 parents c667a9d + 490da62 commit 6121330
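To make steps 1-4 and point (b) concrete, here is a minimal eager-mode sketch (toy shapes, plain PyTorch ops rather than the fused kernels Inductor actually generates) showing that the full softmax_output can be rebuilt exactly from softmax_input plus the small per-row max and sum, and checking the ~3GB size arithmetic from the message:

import torch

# Toy stand-ins for the real [batch*seq, vocab] logits.
softmax_input = torch.randn(8, 16)                                   # step 1: matmul output
row_max = softmax_input.amax(dim=-1, keepdim=True)                   # step 2: per-row max (small)
row_sum = (softmax_input - row_max).exp().sum(dim=-1, keepdim=True)  # step 3: per-row sum (small)
softmax_output = (softmax_input - row_max).exp() / row_sum           # step 4: the large tensor

# Backward-time recomputation from softmax_input plus the two small
# per-row statistics reproduces the large tensor exactly:
recomputed = (softmax_input - row_max).exp() / row_sum
torch.testing.assert_close(recomputed, softmax_output)

# Size arithmetic from the message (fp16/bf16 item_size = 2 bytes):
print(32 * 1024 * 50257 * 2 / 1e9)  # ~3.29, i.e. ~3GB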
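To observe the partitioner's decision directly, one option is a sketch like the following, using the public functorch.compile wrappers (the shapes here are made up, and the printed graphs vary by PyTorch version): compile the linear + cross entropy tail with the min-cut partitioner and dump both graphs, then check whether the full normalized tensor or only softmax_input plus the small statistics appears among the saved values.

import torch
from functorch.compile import aot_function, min_cut_rematerialization_partition

def print_graph(name):
    # A "compiler" that just prints the FX graph and runs it unmodified.
    def compiler(gm, example_inputs):
        print(f"=== {name} graph ===")
        gm.print_readable()
        return gm
    return compiler

def tail(hidden, weight, targets):
    softmax_input = hidden @ weight  # step 1: the final linear's matmul
    return torch.nn.functional.cross_entropy(softmax_input, targets)

hidden = torch.randn(64, 128, requires_grad=True)    # toy batch*seq x hidden
weight = torch.randn(128, 1000, requires_grad=True)  # toy hidden x vocab
targets = torch.randint(0, 1000, (64,))

compiled = aot_function(
    tail,
    fw_compiler=print_graph("forward"),
    bw_compiler=print_graph("backward"),
    partition_fn=min_cut_rematerialization_partition,
)
compiled(hidden, weight, targets).backward()

The extra outputs of the forward graph are the tensors AOTAutograd saves for backward; with this change they should be softmax_input and the per-row statistics rather than the full softmax output.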

File tree

266 files changed: +10019 -7169 lines changed

.ci/docker/build.sh
Lines changed: 15 additions & 0 deletions

@@ -149,6 +149,21 @@ case "$image" in
     TRITON=yes
     INDUCTOR_BENCHMARKS=yes
     ;;
+  pytorch-linux-focal-cuda12.1-cudnn8-py3.12-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.1.1
+    CUDNN_VERSION=8
+    ANACONDA_PYTHON_VERSION=3.12
+    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
+    TRITON=yes
+    INDUCTOR_BENCHMARKS=yes
+    ;;
   pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9)
     CUDA_VERSION=11.8.0
     CUDNN_VERSION=8

.ci/docker/common/install_acl.sh
Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 set -euo pipefail
 
-readonly version=v23.08
+readonly version=v24.04
 readonly src_host=https://review.mlplatform.org/ml
 readonly src_repo=ComputeLibrary

.github/workflows/docker-builds.yml
Lines changed: 1 addition & 0 deletions

@@ -42,6 +42,7 @@ jobs:
           pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9-inductor-benchmarks,
           pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9,
           pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks,
+          pytorch-linux-focal-cuda12.1-cudnn8-py3.12-gcc9-inductor-benchmarks,
           pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9,
           pytorch-linux-focal-py3.8-clang10,
           pytorch-linux-focal-py3.11-clang10,

.github/workflows/inductor.yml
Lines changed: 21 additions & 0 deletions

@@ -107,6 +107,27 @@ jobs:
     secrets:
       HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
 
+  linux-focal-cuda12_1-py3_12-gcc9-inductor-build:
+    name: cuda12.1-py3.12-gcc9-sm86
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-focal-cuda12.1-py3.12-gcc9-sm86
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3.12-gcc9-inductor-benchmarks
+      cuda-arch-list: '8.6'
+      test-matrix: |
+        { include: [
+          { config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+        ]}
+
+  linux-focal-cuda12_1-py3_12-gcc9-inductor-test:
+    name: cuda12.1-py3.12-gcc9-sm86
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-focal-cuda12_1-py3_12-gcc9-inductor-build
+    with:
+      build-environment: linux-focal-cuda12.1-py3.12-gcc9-sm86
+      docker-image: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.test-matrix }}
+
   linux-jammy-cpu-py3_8-gcc11-inductor-build:
     name: linux-jammy-cpu-py3.8-gcc11-inductor
     uses: ./.github/workflows/_linux-build.yml

.github/workflows/trunk.yml
Lines changed: 2 additions & 1 deletion

@@ -194,6 +194,7 @@ jobs:
        { include: [
          { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
          { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
+         { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
        ]}
 
   linux-focal-rocm6_1-py3_8-test:
@@ -209,4 +210,4 @@ jobs:
       build-environment: linux-focal-rocm6.1-py3.8
       docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }}
       test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }}
-      tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
+      tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
Lines changed: 43 additions & 0 deletions

@@ -0,0 +1,43 @@
+name: Upload test stats intermediate
+
+on:
+  workflow_dispatch:
+    inputs:
+      workflow_id:
+        description: workflow_id of the run
+        required: true
+      workflow_run_attempt:
+        description: workflow_run_attempt of the run
+        required: true
+
+jobs:
+  intermediate_upload_test_stats:
+    name: Intermediate upload test stats for ${{ inputs.workflow_id }}
+    runs-on: ubuntu-22.04
+    environment: upload-stats
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          fetch-depth: 1
+          submodules: false
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+          cache: pip
+
+      - run: |
+          pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
+
+      - name: Upload test stats
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          WORKFLOW_RUN_ID: ${{ inputs.workflow_id }}
+          WORKFLOW_RUN_ATTEMPT: ${{ inputs.workflow_run_attempt }}
+        run: |
+          python3 -m tools.stats.upload_test_stats_intermediate \
+            --workflow-run-id "${WORKFLOW_RUN_ID}" \
+            --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" \

.gitmodules
Lines changed: 4 additions & 12 deletions

@@ -2,10 +2,6 @@
 	ignore = dirty
 	path = third_party/pybind11
 	url = https://github.com/pybind/pybind11.git
-[submodule "third_party/cub"]
-	ignore = dirty
-	path = third_party/cub
-	url = https://github.com/NVlabs/cub.git
 [submodule "third_party/eigen"]
 	ignore = dirty
 	path = third_party/eigen
@@ -50,10 +46,6 @@
 	ignore = dirty
 	path = third_party/psimd
 	url = https://github.com/Maratyszcza/psimd.git
-[submodule "third_party/zstd"]
-	ignore = dirty
-	path = third_party/zstd
-	url = https://github.com/facebook/zstd.git
 [submodule "third_party/cpuinfo"]
 	ignore = dirty
 	path = third_party/cpuinfo
@@ -66,10 +58,6 @@
 	ignore = dirty
 	path = third_party/onnx
 	url = https://github.com/onnx/onnx.git
-[submodule "third_party/onnx-tensorrt"]
-	ignore = dirty
-	path = third_party/onnx-tensorrt
-	url = https://github.com/onnx/onnx-tensorrt
 [submodule "third_party/sleef"]
 	ignore = dirty
 	path = third_party/sleef
@@ -152,3 +140,7 @@
 [submodule "third_party/opentelemetry-cpp"]
 	path = third_party/opentelemetry-cpp
 	url = https://github.com/open-telemetry/opentelemetry-cpp.git
+[submodule "third_party/cpp-httplib"]
+	path = third_party/cpp-httplib
+	url = https://github.com/yhirose/cpp-httplib.git
+	branch = v0.15.3

.lintrunner.toml
Lines changed: 0 additions & 2 deletions

@@ -1929,8 +1929,6 @@ exclude_patterns = [
     'torch/utils/_mode_utils.py',
     'torch/utils/_python_dispatch.py',
     'torch/utils/_stats.py',
-    'torch/utils/_sympy/__init__.py',
-    'torch/utils/_sympy/functions.py',
     'torch/utils/_traceback.py',
     'torch/utils/_zip.py',
     'torch/utils/backcompat/__init__.py',

BUILD.bazel
Lines changed: 1 addition & 1 deletion

@@ -772,7 +772,7 @@ cc_library(
     [
         "torch/*.h",
         "torch/csrc/**/*.h",
-        "torch/csrc/distributed/c10d/*.hpp",
+        "torch/csrc/distributed/c10d/**/*.hpp",
         "torch/lib/libshm/*.h",
     ],
     exclude = [

CMakeLists.txt
Lines changed: 3 additions & 6 deletions

@@ -265,7 +265,6 @@ option(USE_PYTORCH_QNNPACK "Use ATen/QNNPACK (quantized 8-bit operators)" ON)
 option(USE_SNPE "Use Qualcomm's SNPE library" OFF)
 option(USE_SYSTEM_EIGEN_INSTALL
     "Use system Eigen instead of the one under third_party" OFF)
-option(USE_TENSORRT "Using Nvidia TensorRT library" OFF)
 cmake_dependent_option(
     USE_VALGRIND "Use Valgrind. Only available on Linux." ON
     "LINUX" OFF)
@@ -279,11 +278,13 @@ endif()
 option(USE_SLEEF_FOR_ARM_VEC256 "Use sleef for arm" OFF)
 option(USE_SOURCE_DEBUG_ON_MOBILE "Enable" ON)
 option(USE_LITE_INTERPRETER_PROFILER "Enable" ON)
+cmake_dependent_option(
+    USE_LITE_AOTI "Include AOTI sources" OFF
+    "BUILD_LITE_INTERPRETER" OFF)
 option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF)
 option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF)
 # option USE_XNNPACK: try to enable xnnpack by default.
 option(USE_XNNPACK "Use XNNPACK" ON)
-option(USE_ZSTD "Use ZSTD" OFF)
 option(USE_ROCM_KERNEL_ASSERT "Use Kernel Assert for ROCm" OFF)
 # Ensure that an ITT build is the default for x86 CPUs
 cmake_dependent_option(
@@ -413,7 +414,6 @@ option(USE_SYSTEM_FXDIV "Use system-provided fxdiv." OFF)
 option(USE_SYSTEM_BENCHMARK "Use system-provided google benchmark." OFF)
 option(USE_SYSTEM_ONNX "Use system-provided onnx." OFF)
 option(USE_SYSTEM_XNNPACK "Use system-provided xnnpack." OFF)
-option(USE_SYSTEM_ZSTD "Use system-provided zstd." OFF)
 option(USE_GOLD_LINKER "Use ld.gold to link" OFF)
 if(USE_SYSTEM_LIBS)
   set(USE_SYSTEM_CPUINFO ON)
@@ -435,9 +435,6 @@ if(USE_SYSTEM_LIBS)
   if(USE_TBB)
     set(USE_SYSTEM_TBB ON)
   endif()
-  if(USE_ZSTD)
-    set(USE_SYSTEM_ZSTD ON)
-  endif()
 endif()
 
 # Used when building Caffe2 through setup.py
