[CI] Build sm89 with more procs experiment (#147487) · pytorch/pytorch@995b125

Commit 995b125

clee2000 authored and pytorchmergebot committed
[CI] Build sm89 with more procs experiment (#147487)
Add a build that uses 4 of the 8 processes available on a linux.2xlarge/c5.2xlarge. Currently the job count is set to 2 because the build would otherwise OOM, but I'm curious how often people's builds OOM. I can't test this on my own because of caching, so it has to run on pull requests. This might result in a failing job on many people's PRs, and I'm not sure how to get around it. I named the job "unstable" so it automatically gets sorted into the unstable group for Dr. CI, but it will still show up.

Pull Request resolved: #147487
Approved by: https://github.com/huydhn
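For reference, the arithmetic behind those numbers on an 8-vCPU c5.2xlarge. The snippet below is an illustrative sketch, not part of the patch; it assumes nproc reports 8 on that instance type:

# Current behavior: the FlashAttention guard in .ci/pytorch/build.sh sets
#   MAX_JOBS = (nproc - 2) / 3, i.e. (8 - 2) / 3 = 2 jobs on a c5.2xlarge.
echo "$(( $(nproc --ignore=2) / 3 ))"   # -> 2 on an 8-vCPU machine

# Experimental behavior: the new max-jobs workflow input pins the count instead.
MAX_JOBS_OVERRIDE=4                     # "max-jobs: 4" in pull.yml
echo "${MAX_JOBS_OVERRIDE}"             # -> 4, i.e. 4 of the 8 vCPUs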
1 parent 7c8c82c commit 995b125

File tree: 3 files changed (+39, -4 lines)


.ci/pytorch/build.sh

Lines changed: 5 additions & 3 deletions
@@ -191,7 +191,7 @@ fi
 
 # We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
 # memory to build and will OOM
-if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then
+if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]] && [ -z "$MAX_JOBS_OVERRIDE" ]; then
   echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM"
   echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage"
   export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
@@ -377,8 +377,10 @@ else
   # This is an attempt to mitigate flaky libtorch build OOM error. By default, the build parallelization
   # is set to be the number of CPU minus 2. So, let's try a more conservative value here. A 4xlarge has
   # 16 CPUs
-  MAX_JOBS=$(nproc --ignore=4)
-  export MAX_JOBS
+  if [ -z "$MAX_JOBS_OVERRIDE" ]; then
+    MAX_JOBS=$(nproc --ignore=4)
+    export MAX_JOBS
+  fi
 
   # NB: Install outside of source directory (at the same level as the root
   # pytorch folder) so that it doesn't get cleaned away prior to docker push.
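The guard above can be exercised outside CI. Here is a minimal standalone sketch of the precedence it introduces; the script is hypothetical, and only the variable names come from the patch:

#!/usr/bin/env bash
# Sketch: an explicit MAX_JOBS_OVERRIDE wins; otherwise fall back to the
# memory-saving FlashAttention default of (nproc - 2) / 3.
set -euo pipefail

if [ -z "${MAX_JOBS_OVERRIDE:-}" ]; then
  export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
else
  export MAX_JOBS="${MAX_JOBS_OVERRIDE}"
fi
echo "building with MAX_JOBS=${MAX_JOBS}"

Running it as MAX_JOBS_OVERRIDE=4 ./sketch.sh prints "building with MAX_JOBS=4"; with the variable unset or empty, it falls back to the divided default.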

.github/workflows/_linux-build.yml

Lines changed: 14 additions & 1 deletion
@@ -76,6 +76,11 @@ on:
         required: false
         type: boolean
         default: false
+      max-jobs:
+        description: |
+          Overwrite the number of jobs to use for the build
+        required: false
+        type: string
 
     secrets:
       HUGGING_FACE_HUB_TOKEN:
@@ -211,6 +216,7 @@ jobs:
           HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
           SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
           USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
+          MAX_JOBS_OVERRIDE: ${{ inputs.max-jobs }}
         run: |
           START_TIME=$(date +%s)
           if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
@@ -230,6 +236,12 @@ jobs:
             DOCKER_SHELL_CMD=
           fi
 
+          if [[ ${MAX_JOBS_OVERRIDE} == "" ]]; then
+            MAX_JOBS="$(nproc --ignore=2)"
+          else
+            MAX_JOBS="${MAX_JOBS_OVERRIDE}"
+          fi
+
           # Leaving 1GB for the runner and other things
           TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
           # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
@@ -241,7 +253,8 @@ jobs:
           # shellcheck disable=SC2086
           container_name=$(docker run \
             -e BUILD_ENVIRONMENT \
-            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e MAX_JOBS=${MAX_JOBS} \
+            -e MAX_JOBS_OVERRIDE \
             -e AWS_DEFAULT_REGION \
             -e PR_NUMBER \
             -e SHA1 \
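Incidentally, the if/else added to the run step above is equivalent to a single bash parameter expansion, since ${var:-word} substitutes when var is unset or empty. A hedged alternative, not what the patch uses:

# Equivalent one-liner: an empty or unset MAX_JOBS_OVERRIDE falls back to the
# nproc-based default; otherwise the override wins.
MAX_JOBS="${MAX_JOBS_OVERRIDE:-$(nproc --ignore=2)}"

The explicit branch arguably reads more clearly in the workflow file next to the docker run invocation it feeds.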

.github/workflows/pull.yml

Lines changed: 20 additions & 0 deletions
@@ -449,6 +449,26 @@ jobs:
       ]}
     secrets: inherit
 
+  unstable-linux-focal-cuda12_4-py3_10-gcc9-sm89-build-xfail:
+    # A version of the build that sets a larger number of jobs for a build. May
+    # OOM
+    name: unstable-linux-focal-cuda12.4-py3.10-gcc9-sm89-xfail
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm89
+      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
+      cuda-arch-list: 8.9
+      max-jobs: 4
+      # Doesn't actually run tests, but need this in order to prevent the build
+      # from being skipped
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+        ]}
+    secrets: inherit
+
   linux-focal-cuda12_4-py3_10-gcc9-sm89-test:
     name: linux-focal-cuda12.4-py3.10-gcc9-sm89
     uses: ./.github/workflows/_linux-test.yml

0 commit comments