Commit 6121330

Update on "[AOTAutograd] tweak min-cut partitioner to avoid saving softmax output"
Right now the linear + cross entropy loss operation (usually the last part of a transformer model) does the following:

1. Run the matmul to produce softmax_input.
2. Load softmax_input to compute the max per row.
3. Load softmax_input to compute the sum per row.
4. Load softmax_input, normalize it, and save the result to softmax_output.

Step 4 is inefficient because:

a. In the forward pass, only a small slice of the softmax_output tensor is needed to compute NLLLoss; materializing the whole tensor is overkill.
b. In the backward pass we need the whole softmax_output, but it can be recomputed from softmax_input (see the sketch below).

If we skip saving softmax_output, we get a perf win, since this is the largest tensor in the network. For llm.c, its size is batch_size * sequence_length * vocab_size * item_size ~= 32 * 1024 * 50257 * 2 ~= 3GB. Simply reading or writing a tensor that large takes ~2ms on an A100. Recomputing softmax_output instead saves one load of softmax_input and one store of softmax_output, which works out to a ~4ms saving.

To avoid saving softmax_output, the min-cut partitioner must decide to recompute it from softmax_input and the max/sum tensors (which are small) computed in steps 2 and 3. That does not happen today because the min-cut partitioner over-estimates the cost of recomputation. The fix, suggested by Chillee, is to let `dist_from_bw` play a less important role.

[ghstack-poisoned]
2 parents c667a9d + 490da62 commit 6121330
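To make steps 1-4 and point (b) concrete, here is a minimal eager-mode sketch (toy shapes, plain PyTorch ops rather than the fused kernels Inductor actually generates) showing that the full softmax_output can be rebuilt exactly from softmax_input plus the small per-row max and sum, and checking the ~3GB size arithmetic from the message:

import torch

# Toy stand-ins for the real [batch*seq, vocab] logits.
softmax_input = torch.randn(8, 16)                                   # step 1: matmul output
row_max = softmax_input.amax(dim=-1, keepdim=True)                   # step 2: per-row max (small)
row_sum = (softmax_input - row_max).exp().sum(dim=-1, keepdim=True)  # step 3: per-row sum (small)
softmax_output = (softmax_input - row_max).exp() / row_sum           # step 4: the large tensor

# Backward-time recomputation from softmax_input plus the two small
# per-row statistics reproduces the large tensor exactly:
recomputed = (softmax_input - row_max).exp() / row_sum
torch.testing.assert_close(recomputed, softmax_output)

# Size arithmetic from the message (fp16/bf16 item_size = 2 bytes):
print(32 * 1024 * 50257 * 2 / 1e9)  # ~3.29, i.e. ~3GB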
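To observe the partitioner's decision directly, one option is a sketch like the following, using the public functorch.compile wrappers (the shapes here are made up, and the printed graphs vary by PyTorch version): compile the linear + cross entropy tail with the min-cut partitioner and dump both graphs, then check whether the full normalized tensor or only softmax_input plus the small statistics appears among the saved values.

import torch
from functorch.compile import aot_function, min_cut_rematerialization_partition

def print_graph(name):
    # A "compiler" that just prints the FX graph and runs it unmodified.
    def compiler(gm, example_inputs):
        print(f"=== {name} graph ===")
        gm.print_readable()
        return gm
    return compiler

def tail(hidden, weight, targets):
    softmax_input = hidden @ weight  # step 1: the final linear's matmul
    return torch.nn.functional.cross_entropy(softmax_input, targets)

hidden = torch.randn(64, 128, requires_grad=True)    # toy batch*seq x hidden
weight = torch.randn(128, 1000, requires_grad=True)  # toy hidden x vocab
targets = torch.randint(0, 1000, (64,))

compiled = aot_function(
    tail,
    fw_compiler=print_graph("forward"),
    bw_compiler=print_graph("backward"),
    partition_fn=min_cut_rematerialization_partition,
)
compiled(hidden, weight, targets).backward()

The extra outputs of the forward graph are the tensors AOTAutograd saves for backward; with this change they should be softmax_input and the per-row statistics rather than the full softmax output.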

File tree

266 files changed: +10019 -7169 lines changed

.ci/docker/build.sh
Lines changed: 15 additions & 0 deletions

@@ -149,6 +149,21 @@ case "$image" in
     TRITON=yes
     INDUCTOR_BENCHMARKS=yes
     ;;
+  pytorch-linux-focal-cuda12.1-cudnn8-py3.12-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.1.1
+    CUDNN_VERSION=8
+    ANACONDA_PYTHON_VERSION=3.12
+    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
+    TRITON=yes
+    INDUCTOR_BENCHMARKS=yes
+    ;;
   pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9)
     CUDA_VERSION=11.8.0
     CUDNN_VERSION=8

.ci/docker/common/install_acl.sh
Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 set -euo pipefail
 
-readonly version=v23.08
+readonly version=v24.04
 readonly src_host=https://review.mlplatform.org/ml
 readonly src_repo=ComputeLibrary

.github/workflows/docker-builds.yml
Lines changed: 1 addition & 0 deletions

@@ -42,6 +42,7 @@ jobs:
           pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9-inductor-benchmarks,
           pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9,
           pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks,
+          pytorch-linux-focal-cuda12.1-cudnn8-py3.12-gcc9-inductor-benchmarks,
           pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9,
           pytorch-linux-focal-py3.8-clang10,
           pytorch-linux-focal-py3.11-clang10,

.github/workflows/inductor.yml
Lines changed: 21 additions & 0 deletions

@@ -107,6 +107,27 @@ jobs:
     secrets:
       HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
 
+  linux-focal-cuda12_1-py3_12-gcc9-inductor-build:
+    name: cuda12.1-py3.12-gcc9-sm86
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-focal-cuda12.1-py3.12-gcc9-sm86
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3.12-gcc9-inductor-benchmarks
+      cuda-arch-list: '8.6'
+      test-matrix: |
+        { include: [
+          { config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+        ]}
+
+  linux-focal-cuda12_1-py3_12-gcc9-inductor-test:
+    name: cuda12.1-py3.12-gcc9-sm86
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-focal-cuda12_1-py3_12-gcc9-inductor-build
+    with:
+      build-environment: linux-focal-cuda12.1-py3.12-gcc9-sm86
+      docker-image: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.test-matrix }}
+
   linux-jammy-cpu-py3_8-gcc11-inductor-build:
     name: linux-jammy-cpu-py3.8-gcc11-inductor
     uses: ./.github/workflows/_linux-build.yml

.github/workflows/trunk.yml
Lines changed: 2 additions & 1 deletion

@@ -194,6 +194,7 @@ jobs:
        { include: [
          { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
          { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
+         { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
        ]}
 
   linux-focal-rocm6_1-py3_8-test:
@@ -209,4 +210,4 @@ jobs:
       build-environment: linux-focal-rocm6.1-py3.8
       docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }}
       test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }}
-      tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
+      tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
Lines changed: 43 additions & 0 deletions

@@ -0,0 +1,43 @@
+name: Upload test stats intermediate
+
+on:
+  workflow_dispatch:
+    inputs:
+      workflow_id:
+        description: workflow_id of the run
+        required: true
+      workflow_run_attempt:
+        description: workflow_run_attempt of the run
+        required: true
+
+jobs:
+  intermediate_upload_test_stats:
+    name: Intermediate upload test stats for ${{ inputs.workflow_id }}
+    runs-on: ubuntu-22.04
+    environment: upload-stats
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          fetch-depth: 1
+          submodules: false
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+          cache: pip
+
+      - run: |
+          pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
+
+      - name: Upload test stats
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          WORKFLOW_RUN_ID: ${{ inputs.workflow_id }}
+          WORKFLOW_RUN_ATTEMPT: ${{ inputs.workflow_run_attempt }}
+        run: |
+          python3 -m tools.stats.upload_test_stats_intermediate \
+            --workflow-run-id "${WORKFLOW_RUN_ID}" \
+            --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" \

.gitmodules
Lines changed: 4 additions & 12 deletions

@@ -2,10 +2,6 @@
 	ignore = dirty
 	path = third_party/pybind11
 	url = https://github.com/pybind/pybind11.git
-[submodule "third_party/cub"]
-	ignore = dirty
-	path = third_party/cub
-	url = https://github.com/NVlabs/cub.git
 [submodule "third_party/eigen"]
 	ignore = dirty
 	path = third_party/eigen
@@ -50,10 +46,6 @@
 	ignore = dirty
 	path = third_party/psimd
 	url = https://github.com/Maratyszcza/psimd.git
-[submodule "third_party/zstd"]
-	ignore = dirty
-	path = third_party/zstd
-	url = https://github.com/facebook/zstd.git
 [submodule "third_party/cpuinfo"]
 	ignore = dirty
 	path = third_party/cpuinfo
@@ -66,10 +58,6 @@
 	ignore = dirty
 	path = third_party/onnx
 	url = https://github.com/onnx/onnx.git
-[submodule "third_party/onnx-tensorrt"]
-	ignore = dirty
-	path = third_party/onnx-tensorrt
-	url = https://github.com/onnx/onnx-tensorrt
 [submodule "third_party/sleef"]
 	ignore = dirty
 	path = third_party/sleef
@@ -152,3 +140,7 @@
 [submodule "third_party/opentelemetry-cpp"]
 	path = third_party/opentelemetry-cpp
 	url = https://github.com/open-telemetry/opentelemetry-cpp.git
+[submodule "third_party/cpp-httplib"]
+	path = third_party/cpp-httplib
+	url = https://github.com/yhirose/cpp-httplib.git
+	branch = v0.15.3

.lintrunner.toml
Lines changed: 0 additions & 2 deletions

@@ -1929,8 +1929,6 @@ exclude_patterns = [
     'torch/utils/_mode_utils.py',
     'torch/utils/_python_dispatch.py',
     'torch/utils/_stats.py',
-    'torch/utils/_sympy/__init__.py',
-    'torch/utils/_sympy/functions.py',
     'torch/utils/_traceback.py',
     'torch/utils/_zip.py',
     'torch/utils/backcompat/__init__.py',

BUILD.bazel
Lines changed: 1 addition & 1 deletion

@@ -772,7 +772,7 @@ cc_library(
     [
         "torch/*.h",
         "torch/csrc/**/*.h",
-        "torch/csrc/distributed/c10d/*.hpp",
+        "torch/csrc/distributed/c10d/**/*.hpp",
         "torch/lib/libshm/*.h",
     ],
     exclude = [

CMakeLists.txt
Lines changed: 3 additions & 6 deletions

@@ -265,7 +265,6 @@ option(USE_PYTORCH_QNNPACK "Use ATen/QNNPACK (quantized 8-bit operators)" ON)
 option(USE_SNPE "Use Qualcomm's SNPE library" OFF)
 option(USE_SYSTEM_EIGEN_INSTALL
     "Use system Eigen instead of the one under third_party" OFF)
-option(USE_TENSORRT "Using Nvidia TensorRT library" OFF)
 cmake_dependent_option(
     USE_VALGRIND "Use Valgrind. Only available on Linux." ON
     "LINUX" OFF)
@@ -279,11 +278,13 @@ endif()
 option(USE_SLEEF_FOR_ARM_VEC256 "Use sleef for arm" OFF)
 option(USE_SOURCE_DEBUG_ON_MOBILE "Enable" ON)
 option(USE_LITE_INTERPRETER_PROFILER "Enable" ON)
+cmake_dependent_option(
+    USE_LITE_AOTI "Include AOTI sources" OFF
+    "BUILD_LITE_INTERPRETER" OFF)
 option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF)
 option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF)
 # option USE_XNNPACK: try to enable xnnpack by default.
 option(USE_XNNPACK "Use XNNPACK" ON)
-option(USE_ZSTD "Use ZSTD" OFF)
 option(USE_ROCM_KERNEL_ASSERT "Use Kernel Assert for ROCm" OFF)
 # Ensure that an ITT build is the default for x86 CPUs
 cmake_dependent_option(
@@ -413,7 +414,6 @@ option(USE_SYSTEM_FXDIV "Use system-provided fxdiv." OFF)
 option(USE_SYSTEM_BENCHMARK "Use system-provided google benchmark." OFF)
 option(USE_SYSTEM_ONNX "Use system-provided onnx." OFF)
 option(USE_SYSTEM_XNNPACK "Use system-provided xnnpack." OFF)
-option(USE_SYSTEM_ZSTD "Use system-provided zstd." OFF)
 option(USE_GOLD_LINKER "Use ld.gold to link" OFF)
 if(USE_SYSTEM_LIBS)
   set(USE_SYSTEM_CPUINFO ON)
@@ -435,9 +435,6 @@ if(USE_SYSTEM_LIBS)
   if(USE_TBB)
     set(USE_SYSTEM_TBB ON)
   endif()
-  if(USE_ZSTD)
-    set(USE_SYSTEM_ZSTD ON)
-  endif()
 endif()
 
 # Used when building Caffe2 through setup.py
