[CI] Build sm89 with more procs experiment (#147487) · pytorch/pytorch@995b125

Commit 995b125

clee2000 authored and pytorchmergebot committed
[CI] Build sm89 with more procs experiment (#147487)
Add a build that uses 4 of the 8 processes available on a linux.2xlarge/c5.2xlarge. Currently the job count is set to 2 because the build would otherwise OOM, but I'm curious how often people's builds OOM. I can't test this on my own because of caching, so it has to run on pull requests. This might result in a failing job on many people's PRs, and I'm not sure how to get around it. I named the job "unstable" so it automatically gets sorted into the unstable group for Dr. CI, but it will still show up.

Pull Request resolved: #147487
Approved by: https://github.com/huydhn
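For reference, the arithmetic behind those numbers on an 8-vCPU c5.2xlarge. The snippet below is an illustrative sketch, not part of the patch; it assumes nproc reports 8 on that instance type:

# Current behavior: the FlashAttention guard in .ci/pytorch/build.sh sets
#   MAX_JOBS = (nproc - 2) / 3, i.e. (8 - 2) / 3 = 2 jobs on a c5.2xlarge.
echo "$(( $(nproc --ignore=2) / 3 ))"   # -> 2 on an 8-vCPU machine

# Experimental behavior: the new max-jobs workflow input pins the count instead.
MAX_JOBS_OVERRIDE=4                     # "max-jobs: 4" in pull.yml
echo "${MAX_JOBS_OVERRIDE}"             # -> 4, i.e. 4 of the 8 vCPUs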
1 parent 7c8c82c commit 995b125

File tree: 3 files changed (+39, -4 lines)


.ci/pytorch/build.sh

Lines changed: 5 additions & 3 deletions
@@ -191,7 +191,7 @@ fi
 
 # We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
 # memory to build and will OOM
-if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then
+if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]] && [ -z "$MAX_JOBS_OVERRIDE" ]; then
   echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM"
   echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage"
   export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
@@ -377,8 +377,10 @@ else
   # This is an attempt to mitigate flaky libtorch build OOM error. By default, the build parallelization
   # is set to be the number of CPU minus 2. So, let's try a more conservative value here. A 4xlarge has
   # 16 CPUs
-  MAX_JOBS=$(nproc --ignore=4)
-  export MAX_JOBS
+  if [ -z "$MAX_JOBS_OVERRIDE" ]; then
+    MAX_JOBS=$(nproc --ignore=4)
+    export MAX_JOBS
+  fi
 
   # NB: Install outside of source directory (at the same level as the root
   # pytorch folder) so that it doesn't get cleaned away prior to docker push.
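The guard above can be exercised outside CI. Here is a minimal standalone sketch of the precedence it introduces; the script is hypothetical, and only the variable names come from the patch:

#!/usr/bin/env bash
# Sketch: an explicit MAX_JOBS_OVERRIDE wins; otherwise fall back to the
# memory-saving FlashAttention default of (nproc - 2) / 3.
set -euo pipefail

if [ -z "${MAX_JOBS_OVERRIDE:-}" ]; then
  export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
else
  export MAX_JOBS="${MAX_JOBS_OVERRIDE}"
fi
echo "building with MAX_JOBS=${MAX_JOBS}"

Running it as MAX_JOBS_OVERRIDE=4 ./sketch.sh prints "building with MAX_JOBS=4"; with the variable unset or empty, it falls back to the divided default.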

.github/workflows/_linux-build.yml

Lines changed: 14 additions & 1 deletion
@@ -76,6 +76,11 @@ on:
         required: false
         type: boolean
         default: false
+      max-jobs:
+        description: |
+          Overwrite the number of jobs to use for the build
+        required: false
+        type: string
 
     secrets:
       HUGGING_FACE_HUB_TOKEN:
@@ -211,6 +216,7 @@ jobs:
           HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
           SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
           USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
+          MAX_JOBS_OVERRIDE: ${{ inputs.max-jobs }}
         run: |
           START_TIME=$(date +%s)
           if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
@@ -230,6 +236,12 @@ jobs:
             DOCKER_SHELL_CMD=
           fi
 
+          if [[ ${MAX_JOBS_OVERRIDE} == "" ]]; then
+            MAX_JOBS="$(nproc --ignore=2)"
+          else
+            MAX_JOBS="${MAX_JOBS_OVERRIDE}"
+          fi
+
           # Leaving 1GB for the runner and other things
           TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
           # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
@@ -241,7 +253,8 @@ jobs:
           # shellcheck disable=SC2086
           container_name=$(docker run \
             -e BUILD_ENVIRONMENT \
-            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e MAX_JOBS=${MAX_JOBS} \
+            -e MAX_JOBS_OVERRIDE \
             -e AWS_DEFAULT_REGION \
             -e PR_NUMBER \
             -e SHA1 \
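Incidentally, the if/else added to the run step above is equivalent to a single bash parameter expansion, since ${var:-word} substitutes when var is unset or empty. A hedged alternative, not what the patch uses:

# Equivalent one-liner: an empty or unset MAX_JOBS_OVERRIDE falls back to the
# nproc-based default; otherwise the override wins.
MAX_JOBS="${MAX_JOBS_OVERRIDE:-$(nproc --ignore=2)}"

The explicit branch arguably reads more clearly in the workflow file next to the docker run invocation it feeds.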

.github/workflows/pull.yml

Lines changed: 20 additions & 0 deletions
@@ -449,6 +449,26 @@ jobs:
       ]}
     secrets: inherit
 
+  unstable-linux-focal-cuda12_4-py3_10-gcc9-sm89-build-xfail:
+    # A version of the build that sets a larger number of jobs for a build. May
+    # OOM
+    name: unstable-linux-focal-cuda12.4-py3.10-gcc9-sm89-xfail
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm89
+      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
+      cuda-arch-list: 8.9
+      max-jobs: 4
+      # Doesn't actually run tests, but need this in order to prevent the build
+      # from being skipped
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+        ]}
+    secrets: inherit
+
   linux-focal-cuda12_4-py3_10-gcc9-sm89-test:
     name: linux-focal-cuda12.4-py3.10-gcc9-sm89
     uses: ./.github/workflows/_linux-test.yml

0 commit comments