Update on "[Cutlass] Implement Epilogue Argument emitter" · pytorch/pytorch@b34ea02 · GitHub

Commit b34ea02
Update on "[Cutlass] Implement Epilogue Argument emitter"
This implements epilogue visitor tree (EVT) argument generation (example argument type [here](https://github.com/NVIDIA/cutlass/blob/3fe62887d8dd75700fdaf57f9c181878701b0802/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp#L332)).

Details: The codegen task is to generate a tree of C++ structs, extracting the correct properties from Inductor buffers and writing them to the correct locations in the generated struct. To implement this with a minimum of code, I generate the cutlass DAGIR (the EVT internal representation), which has a dedicated pass, [pass_argument_type.py](https://github.com/NVIDIA/cutlass/blob/5e497243f7ad13a2aa842143f9b10bbb23d98292/python/cutlass/backend/evt/passes/pass_argument_type.py#L4), that generates a nested tree of custom argument types for each node in the DAGIR. This nested tree of constructors is then called with kwargs to fill in the proper values, where each node's name selects its value from the kwarg dictionary. This is non-customizable, however: the nested tree of EVT args is a nested tree of ctypes that expects *actual values*, so that the resulting object can be passed directly to the cutlass-python C++ runner. Inductor, on the other hand, needs to fill this struct with C++ string expressions representing the values (or extracting them from kernel launcher args). `_render_argument_type` implements this: it iterates over the tree of types created by pass_argument_type.py and generates a string representing the nested structs, filling in C++ expressions for the individual fields.

Long term plan: I will ask NVIDIA to provide an overridable [visitor_factory](https://github.com/NVIDIA/cutlass/blob/5e497243f7ad13a2aa842143f9b10bbb23d98292/python/cutlass/backend/evt/passes/pass_argument_type.py#L82), which would let us override the behavior of pass_argument_type.py and generate the desired string during DAGIR generation.

Previously merged:
* #150346
* #150345
* #150344

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy chenyang78 kadeng muchulee8 amjames chauhang aakhundov

[ghstack-poisoned]
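For illustration, here is a minimal, hypothetical sketch of the emission strategy (the function name, tree shape, and example structs below are illustrative, not the actual Inductor or cutlass code): recursively walk a nested ctypes argument type of the kind pass_argument_type.py produces and emit a braced C++ aggregate initializer, substituting C++ expression strings at the leaves instead of real values.

```python
import ctypes

def render_argument_type(arg_ty, cpp_exprs, prefix=""):
    """Render `arg_ty` as a C++ initializer string.

    `cpp_exprs` maps a leaf field's dotted name to the C++ expression
    that should appear in the generated struct (e.g. a kernel launcher
    argument or a literal).
    """
    if isinstance(arg_ty, type) and issubclass(arg_ty, ctypes.Structure):
        parts = []
        for field_name, field_ty in arg_ty._fields_:
            dotted = f"{prefix}.{field_name}" if prefix else field_name
            parts.append(
                f"/* {field_name} */ "
                + render_argument_type(field_ty, cpp_exprs, dotted)
            )
        return "{" + ", ".join(parts) + "}"
    # Leaf field: emit the caller-provided C++ expression, defaulting to {}.
    return cpp_exprs.get(prefix, "{}")


# Hypothetical EVT argument struct for `alpha * acc + beta * C`.
class AlphaArgs(ctypes.Structure):
    _fields_ = [("alpha", ctypes.c_float)]

class EpilogueArgs(ctypes.Structure):
    _fields_ = [("alpha_args", AlphaArgs), ("beta", ctypes.c_float)]

print(render_argument_type(
    EpilogueArgs,
    {"alpha_args.alpha": "kernel_args.alpha", "beta": "1.0f"},
))
# -> {/* alpha_args */ {/* alpha */ kernel_args.alpha}, /* beta */ 1.0f}
```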
2 parents 750b7f5 + 119573c commit b34ea02

File tree

93 files changed: +1937, -1162 lines
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-4022ff142a5392aa5197e05f4dfe85d356f742bf
+17cbef50fd4ac8488632367a864aa01a2c0019ef

.ci/docker/requirements-docs.txt

Lines changed: 8 additions & 2 deletions
@@ -1,15 +1,20 @@
 sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 5.3.0
--e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@c49afc2aff734d40813b0ca182bb49b611d7a30c#egg=pytorch_sphinx_theme2
 
 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought is probably
 # something related to Docker setup. We can investigate this later
+
 sphinxcontrib.katex==0.8.6
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 0.8.6
 
+sphinxext-opengraph==0.9.1
+#Description: This is used to generate PyTorch docs
+#Pinned versions: 0.9.1
+
 matplotlib==3.5.3
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 3.5.3
@@ -46,5 +51,6 @@ myst-nb==0.17.2
 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
 python-etcd==0.4.5
 sphinx-copybutton==0.5.0
-sphinx-panels==0.4.1
+sphinx-design==0.4.0
+sphinxcontrib-mermaid==1.0.0
 myst-parser==0.18.1

.ci/pytorch/macos-test.sh

Lines changed: 19 additions & 17 deletions
@@ -221,25 +221,27 @@ test_torchbench_smoketest() {
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
   mkdir -p "$TEST_REPORTS_DIR"
 
-  local backend=eager
   local dtype=notset
   local device=mps
-
-  touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
-  touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
-
-  echo "Setup complete, launching torchbench training performance run"
-  for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do
-    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-      --performance --only "$model" --backend "$backend" --training --devices "$device" \
-      --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
-  done
-
-  echo "Launching torchbench inference performance run"
-  for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do
-    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-      --performance --only "$model" --backend "$backend" --inference --devices "$device" \
-      --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
+  local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152)
+
+  for backend in eager inductor; do
+    touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
+    touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
+
+    echo "Launching torchbench training performance run for backend ${backend}"
+    for model in "${models[@]}"; do
+      PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
+        --performance --only "$model" --backend "$backend" --training --devices "$device" \
+        --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv" || true
+    done
+
+    echo "Launching torchbench inference performance run for backend ${backend}"
+    for model in "${models[@]}"; do
+      PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
+        --performance --only "$model" --backend "$backend" --inference --devices "$device" \
+        --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" || true
+    done
   done
 
   echo "Pytorch benchmark on mps device completed"

.ci/pytorch/python_doc_push_script.sh

Lines changed: 0 additions & 6 deletions
@@ -119,12 +119,6 @@ popd
 git rm -rf "$install_path" || true
 mv "$pt_checkout/docs/build/html" "$install_path"
 
-# Prevent Google from indexing $install_path/_modules. This folder contains
-# generated source files.
-# NB: the following only works on gnu sed. The sed shipped with mac os is different.
-# One can `brew install gnu-sed` on a mac and then use "gsed" instead of "sed".
-find "$install_path/_modules" -name "*.html" -print0 | xargs -0 sed -i '/<head>/a \ \ <meta name="robots" content="noindex">'
-
 git add "$install_path" || true
 git status
 git config user.email "soumith+bot@pytorch.org"
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+name: Binary docker build
+
+description: Build docker image for binary builds
+
+inputs:
+  docker-image-name:
+    description: Docker image name for PR builds
+    required: true
+  docker-build-dir:
+    description: Location of the build.sh relative to .ci/docker
+    required: true
+  custom-tag-prefix:
+    description: Custom tag prefix for the docker image
+    required: false
+  DOCKER_TOKEN:
+    description: Docker token for authentication
+    required: true
+  DOCKER_ID:
+    description: Docker ID for authentication
+    required: true
+
+runs:
+  using: composite
+  steps:
+    - name: Checkout PyTorch
+      uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+
+    - name: Calculate docker image
+      id: calculate-docker-image
+      uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+      with:
+        docker-image-name: ${{ inputs.docker-image-name }}
+        docker-build-dir: .ci/docker
+        custom-tag-prefix: ${{ inputs.custom-tag-prefix }}
+        docker-build-script: ${{ inputs.docker-build-dir }}/build.sh
+        always-rebuild: true
+        push: true
+
+    - name: Tag and (if WITH_PUSH) push docker image to docker.io
+      env:
+        DOCKER_TOKEN: ${{ inputs.DOCKER_TOKEN }}
+        DOCKER_ID: ${{ inputs.DOCKER_ID }}
+        DOCKER_IMAGE_NAME: ${{ inputs.docker-image-name }}
+        DOCKER_IMAGE_PREFIX: ${{ inputs.custom-tag-prefix }}
+        CREATED_FULL_DOCKER_IMAGE_NAME: ${{ steps.calculate-docker-image.outputs.docker-image }}
+      shell: bash
+      run: |
+        set -euox pipefail
+        GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
+        GIT_BRANCH_NAME=${GITHUB_REF##*/}
+        GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+        CI_FOLDER_SHA=$(git rev-parse HEAD:.ci/docker)
+
+        DOCKER_IMAGE_NAME_PREFIX=docker.io/pytorch/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_PREFIX}
+
+        docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}
+        docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}
+        docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}
+
+        # Pretty sure Github will mask tokens and I'm not sure if it will even be
+        # printed due to pipe, but just in case
+        set +x
+        if [[ ${WITH_PUSH:-false} == "true" ]]; then
+          echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          docker push ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}
+          docker push ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}
+          docker push ${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}
+        fi

.github/workflows/inductor-perf-test-nightly-macos.yml

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,4 @@
-name: perf-nightly-macos
-# Technically not an inductor test, but uses it as a template for tracking macos performance
+name: inductor-perf-nightly-macos
 
 on:
   schedule:
@@ -24,6 +23,7 @@ on:
   pull_request:
     paths:
       - .github/workflows/inductor-perf-test-nightly-macos.yml
+      - .ci/pytorch/macos-test.sh
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}

aten/src/ATen/core/CachingHostAllocator.h

Lines changed: 11 additions & 2 deletions
@@ -620,12 +620,17 @@ struct CachingHostAllocatorImpl {
   alignas(64) HostStatsStaged stats_;
 };
 
-template <typename T>
+template <typename T, c10::DeleterFnPtr deleteFunc>
 struct CachingHostAllocatorInterface : public at::Allocator {
   CachingHostAllocatorInterface() : impl_(std::make_unique<T>()) {}
 
   at::DataPtr allocate(size_t size) override {
-    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for allocate");
+    auto ptr_and_ctx = impl_->allocate(size);
+    return {
+        ptr_and_ctx.first,
+        ptr_and_ctx.second,
+        deleteFunc, // Use the template parameter deleter function
+        at::DeviceType::CPU};
   }
 
   void free(void* ctx) {
@@ -661,5 +666,9 @@ struct CachingHostAllocatorInterface : public at::Allocator {
   std::unique_ptr<T> impl_;
 };
 
+#define DECLARE_HOST_ALLOCATOR(name, impl, deleter) \
+  struct name final                                 \
+      : public at::CachingHostAllocatorInterface<impl, deleter> {};
+
 } // namespace at
 C10_DIAGNOSTIC_POP()

aten/src/ATen/cuda/CachingHostAllocator.cpp

Lines changed: 1 addition & 11 deletions
@@ -251,17 +251,7 @@ struct CUDACachingHostAllocatorImpl
 
 void raw_local_deleter(void* ptr);
 
-struct CUDACachingHostAllocator final
-    : public CachingHostAllocatorInterface<CUDACachingHostAllocatorImpl> {
-  at::DataPtr allocate(size_t size) override {
-    auto ptr_and_ctx = impl_->allocate(size);
-    return {
-        ptr_and_ctx.first,
-        ptr_and_ctx.second,
-        &raw_local_deleter,
-        at::DeviceType::CPU};
-  }
-};
+DECLARE_HOST_ALLOCATOR(CUDACachingHostAllocator, CUDACachingHostAllocatorImpl, raw_local_deleter);
 
 CUDACachingHostAllocator caching_host_allocator;
aten/src/ATen/native/layer_norm.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <ATen/ops/empty_like.h>
1717
#include <ATen/ops/empty_like_native.h>
1818
#include <ATen/ops/layer_norm_native.h>
19+
#include <ATen/ops/_fused_rms_norm.h>
1920
#include <ATen/ops/native_batch_norm.h>
2021
#include <ATen/ops/native_layer_norm.h>
2122
#include <ATen/ops/native_layer_norm_backward_native.h>
@@ -27,7 +28,6 @@
2728
#endif
2829

2930
#ifdef USE_MPS
30-
#include <ATen/native/mps/operations/RMSNorm.h>
3131
#include <c10/core/GradMode.h>
3232
#endif
3333

@@ -281,7 +281,7 @@ Tensor rms_norm_symint(
281281

282282
if (!(GradMode::is_enabled() && any_inputs_require_grad) && !any_nested && is_input_fp && is_weight_fp) {
283283
auto eps_val = eps.value_or(std::numeric_limits<double>::epsilon());
284-
return mps::rms_norm_mps_kernel(input.contiguous(), normalized_shape, weight.contiguous(), eps_val);
284+
return at::_fused_rms_norm(input.contiguous(), normalized_shape.size(), weight.contiguous(), eps_val);
285285
}
286286
}
287287
#endif

aten/src/ATen/native/mps/kernels/UnaryKernel.metal

Lines changed: 20 additions & 0 deletions
@@ -67,9 +67,29 @@ struct sqrt_functor {
   }
 };
 
+struct bitwise_not_functor {
+  template <typename T>
+  inline enable_if_t<!is_same_v<T, bool> && is_scalar_integral_v<T>, T>
+  operator()(const T x) {
+    return ~x;
+  }
+
+  template <typename T>
+  inline enable_if_t<is_same_v<T, bool>, T> operator()(const T x) {
+    return !x;
+  }
+};
+
 DEFINE_UNARY_FLOATING_FUNCTOR(erfinv);
 DEFINE_UNARY_FLOATING_FUNCTOR(sinc);
 
+REGISTER_UNARY_OP(bitwise_not, int, int);
+REGISTER_UNARY_OP(bitwise_not, long, long);
+REGISTER_UNARY_OP(bitwise_not, short, short);
+REGISTER_UNARY_OP(bitwise_not, char, char);
+REGISTER_UNARY_OP(bitwise_not, uchar, uchar);
+REGISTER_UNARY_OP(bitwise_not, bool, bool);
+
 #define INSTANTIATE_UNARY_KERNELS2(DTYPE0, DTYPE1) \
   REGISTER_UNARY_OP(erfinv, DTYPE1, DTYPE0);       \
   REGISTER_UNARY_OP(exp, DTYPE1, DTYPE0);          \
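A hedged check of the new path (illustrative, not part of this commit): `torch.bitwise_not` on MPS is now backed by the `bitwise_not_functor` above, with integer dtypes using `~x` and bool using logical negation, matching the functor's two overloads.

```python
import torch

# Requires a Mac with an MPS-enabled PyTorch build.
if torch.backends.mps.is_available():
    ints = torch.tensor([0, 1, 255], dtype=torch.uint8, device="mps")
    bools = torch.tensor([True, False], device="mps")
    print(torch.bitwise_not(ints).cpu())   # tensor([255, 254,   0], dtype=torch.uint8)
    print(torch.bitwise_not(bools).cpu())  # tensor([False,  True])
```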

aten/src/ATen/native/mps/operations/BitwiseOps.mm

Lines changed: 0 additions & 58 deletions
@@ -5,7 +5,6 @@
 #include <ATen/native/Resize.h>
 #include <ATen/native/mps/OperationUtils.h>
 #include <ATen/ops/bitwise_and_native.h>
-#include <ATen/ops/bitwise_not_native.h>
 #include <ATen/ops/bitwise_or_native.h>
 #include <ATen/ops/bitwise_xor_native.h>
 #include <ATen/ops/logical_not_native.h>
@@ -100,11 +99,6 @@ kernel void bitwise_rshift_scalar_tensor(device {0} *out [[buffer(0)]],
   out[offset] = static_cast<{0}>(a) >> b[offset];
 }}
 
-kernel void bitwise_not(device {0} *out [[buffer(0)]],
-                        constant {1} *a [[buffer(1)]],
-                        uint offset [[thread_position_in_grid]]) {{
-  out[offset] = ~a[offset];
-}}
 )METAL",
     3);
 
@@ -200,54 +194,6 @@ static void _bitwise_op_out_mps(const Tensor& self,
   return;
 }
 
-static void _bitwise_not_out_mps(const Tensor& self, const Tensor& output_) {
-  // Handle boolean tensor using logical not
-  if (self.scalar_type() == c10::ScalarType::Bool) {
-    logical_not_out_mps(self, const_cast<Tensor&>(output_));
-    return;
-  }
-
-  Tensor output = output_;
-  bool needs_output_copy = false;
-
-  resize_output(output, self.sizes());
-  if (needsGather(output)) {
-    output = output.contiguous();
-    needs_output_copy = true;
-  }
-  if (self.dim() == 0) {
-    if (self.scalar_type() == c10::ScalarType::Byte) {
-      // Unsigned types need a special handling to keep result of operation in 0..255 output
-      output.fill_(c10::Scalar(static_cast<uint8_t>(~self.item<uint8_t>())));
-    } else {
-      output.fill_(c10::Scalar(~self.item<int64_t>()));
-    }
-    return;
-  }
-  uint32_t length = output.numel();
-  if (length == 0) {
-    return;
-  }
-  using namespace at::mps;
-  MPSStream* stream = getCurrentMPSStream();
-  auto cplState = getCPLState(output, self, self, "bitwise_not");
-  dispatch_sync(stream->queue(), ^() {
-    getMPSProfiler().beginProfileKernel(cplState, "bitwise_not", {self});
-
-    id<MTLComputeCommandEncoder> commandEncoder = stream->commandEncoder();
-
-    [commandEncoder pushDebugGroup:@"Dispatch bitwise_not kernel"];
-    [commandEncoder setComputePipelineState:cplState];
-    mtl_setArgs(commandEncoder, output, self);
-    mtl_dispatch1DJob(commandEncoder, cplState, length);
-
-    getMPSProfiler().endProfileKernel(cplState);
-  });
-  if (needs_output_copy) {
-    output_.copy_(output);
-  }
-}
-
 } // namespace mps
 namespace {
 void lshift_kernel_mps(TensorIteratorBase& iter) {
@@ -272,10 +218,6 @@ void rshift_kernel_mps(TensorIteratorBase& iter) {
   mps::_bitwise_op_out_mps(self, other, output, "xor");
 }
 
-TORCH_IMPL_FUNC(bitwise_not_out_mps)(const Tensor& self, const Tensor& output) {
-  mps::_bitwise_not_out_mps(self, output);
-}
-
 REGISTER_MPS_DISPATCH(lshift_stub, &lshift_kernel_mps)
 REGISTER_MPS_DISPATCH(rshift_stub, &rshift_kernel_mps)

aten/src/ATen/native/mps/operations/RMSNorm.h

Lines changed: 0 additions & 14 deletions
This file was deleted.

0 commit comments