Commit 8592fa9 (pytorch/pytorch)

Merge branch 'master' into mps-binops-dtype-precedence

2 parents: f86d924 + 089203f

31 files changed: +657 −263 lines

aten/src/ATen/native/BinaryOps.cpp

Lines changed: 27 additions & 18 deletions

@@ -102,6 +102,10 @@ TORCH_META_FUNC(special_hermite_polynomial_h) (const Tensor& self, const Tensor&
   build_borrowing_binary_float_op(maybe_get_output(), self, n);
 }
 
+TORCH_META_FUNC(special_hermite_polynomial_he) (const Tensor& self, const Tensor& n) {
+  build_borrowing_binary_float_op(maybe_get_output(), self, n);
+}
+
 TORCH_META_FUNC2(copysign, Tensor) (
   const Tensor& self, const Tensor& other
 ) {

@@ -291,6 +295,7 @@ DEFINE_DISPATCH(zeta_stub);
 DEFINE_DISPATCH(chebyshev_polynomial_t_stub);
 DEFINE_DISPATCH(chebyshev_polynomial_u_stub);
 DEFINE_DISPATCH(hermite_polynomial_h_stub);
+DEFINE_DISPATCH(hermite_polynomial_he_stub);
 
 TORCH_IMPL_FUNC(sub_out) (
   const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& result

@@ -349,6 +354,10 @@ TORCH_IMPL_FUNC(special_hermite_polynomial_h_out) (const Tensor& self, const Ten
   hermite_polynomial_h_stub(device_type(), *this);
 }
 
+TORCH_IMPL_FUNC(special_hermite_polynomial_he_out) (const Tensor& self, const Tensor& n, const Tensor& result) {
+  hermite_polynomial_he_stub(device_type(), *this);
+}
+
 TORCH_IMPL_FUNC(tanh_backward_out) (const Tensor& grad_output, const Tensor& output, const Tensor& result) {
   tanh_backward_stub(device_type(), *this);
 }

@@ -457,6 +466,22 @@ Tensor& special_hermite_polynomial_h_out(const Tensor& self, const Scalar& n, Te
   return at::special_hermite_polynomial_h_out(result, self, wrapped_scalar_tensor(n));
 }
 
+Tensor special_hermite_polynomial_he(const Scalar& x, const Tensor& n) {
+  return at::special_hermite_polynomial_he(wrapped_scalar_tensor(x), n);
+}
+
+Tensor special_hermite_polynomial_he(const Tensor& x, const Scalar& n) {
+  return at::special_hermite_polynomial_he(x, wrapped_scalar_tensor(n));
+}
+
+Tensor& special_hermite_polynomial_he_out(const Scalar& self, const Tensor& n, Tensor& result) {
+  return at::special_hermite_polynomial_he_out(result, wrapped_scalar_tensor(self), n);
+}
+
+Tensor& special_hermite_polynomial_he_out(const Tensor& self, const Scalar& n, Tensor& result) {
+  return at::special_hermite_polynomial_he_out(result, self, wrapped_scalar_tensor(n));
+}
+
 Tensor& special_gammainc_out(const Tensor& self, const Tensor& other, Tensor& result) {
   return at::igamma_out(result, self, other);
 }

@@ -649,34 +674,18 @@ Tensor& true_divide_(Tensor& self, const Scalar& divisor) {
 }
 
 Tensor& floor_divide_out(const Tensor& self, const Tensor& other, Tensor& result) {
-  TORCH_WARN_ONCE(
-    "floor_divide is deprecated, and will be removed in a future version of pytorch. "
-    "It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). "
-    "This results in incorrect rounding for negative values.\n"
-    "To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), "
-    "or for actual floor division, use torch.div(a, b, rounding_mode='floor')."
-  );
-  // FIXME: Not actually doing floor division (#43874)
   auto iter = TensorIterator::binary_op(result, self, other);
-  div_trunc_stub(iter.device_type(), iter);
+  div_floor_stub(iter.device_type(), iter);
   if (!result.defined()) {
     result = iter.output();
   }
   return result;
 }
 
 Tensor floor_divide(const Tensor& self, const Tensor& other) {
-  TORCH_WARN_ONCE(
-    "floor_divide is deprecated, and will be removed in a future version of pytorch. "
-    "It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). "
-    "This results in incorrect rounding for negative values.\n"
-    "To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), "
-    "or for actual floor division, use torch.div(a, b, rounding_mode='floor')."
-  );
-  // FIXME: Not actually doing floor division (#43874)
   Tensor result;
   auto iter = TensorIterator::binary_op(result, self, other);
-  div_trunc_stub(iter.device_type(), iter);
+  div_floor_stub(iter.device_type(), iter);
   return iter.output();
 }
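The functional change above is the switch from div_trunc_stub to div_floor_stub: floor_divide now performs true floor division instead of truncation, which only differs when the quotient is negative and fractional. A minimal standalone C++ sketch of the two rounding modes (illustrative arithmetic only, not PyTorch code):

#include <cmath>
#include <cstdio>

int main() {
  double a = -7.0, b = 2.0;              // a / b == -3.5
  double trunc_div = std::trunc(a / b);  // old behavior: round toward zero      -> -3
  double floor_div = std::floor(a / b);  // new behavior: round toward -infinity -> -4
  std::printf("trunc: %g, floor: %g\n", trunc_div, floor_div);
  return 0;
}

For non-negative operands the two modes agree, so only negative inputs observe the change.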

aten/src/ATen/native/BinaryOps.h

Lines changed: 1 addition & 0 deletions

@@ -104,5 +104,6 @@ DECLARE_DISPATCH(structured_binary_fn, zeta_stub);
 DECLARE_DISPATCH(structured_binary_fn, chebyshev_polynomial_t_stub);
 DECLARE_DISPATCH(structured_binary_fn, chebyshev_polynomial_u_stub);
 DECLARE_DISPATCH(structured_binary_fn, hermite_polynomial_h_stub);
+DECLARE_DISPATCH(structured_binary_fn, hermite_polynomial_he_stub);
 
 }} // namespace at::native
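The DECLARE_DISPATCH / DEFINE_DISPATCH / REGISTER_DISPATCH trio connects the device-agnostic operator to per-backend kernels. Very loosely, and as a hypothetical analogue rather than ATen's actual machinery, the pattern is a global slot that each backend fills in during static initialization and that the operator calls through at run time:

#include <cstdio>
#include <stdexcept>

// Hypothetical stand-in for a dispatch stub: a callable slot per backend.
struct BinaryStub {
  void (*cpu_impl)() = nullptr;
  void operator()() const {
    if (cpu_impl == nullptr) {
      throw std::runtime_error("no kernel registered for this stub");
    }
    cpu_impl();
  }
};

// DECLARE_DISPATCH / DEFINE_DISPATCH analogue: one stub object per operator.
BinaryStub hermite_polynomial_he_stub_analogue;

// REGISTER_DISPATCH analogue: the CPU backend installs its kernel at startup.
void hermite_polynomial_he_kernel_analogue() {
  std::puts("CPU kernel runs");
}
struct RegisterCpuKernel {
  RegisterCpuKernel() {
    hermite_polynomial_he_stub_analogue.cpu_impl = &hermite_polynomial_he_kernel_analogue;
  }
} register_cpu_kernel;

int main() {
  hermite_polynomial_he_stub_analogue();  // dispatches to whatever was registered
  return 0;
}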

aten/src/ATen/native/Math.h

Lines changed: 32 additions & 0 deletions

@@ -2301,4 +2301,36 @@ static inline C10_HOST_DEVICE T hermite_polynomial_h_forward(T x, T n) {
   return hermite_polynomial_h_forward(x, static_cast<int64_t>(n));
 } // hermite_polynomial_h_forward(T x, T n)
 
+template<typename T>
+static inline C10_HOST_DEVICE T hermite_polynomial_he_forward(T x, int64_t n) {
+  if (n < 0) {
+    return T(0.0);
+  }
+
+  if (n == 0) {
+    return T(1.0);
+  }
+
+  if (n == 1) {
+    return x;
+  }
+
+  T p = T(1.0);
+  T q = x;
+  T r;
+
+  for (int64_t k = 1; k < n; k++) {
+    r = x * q - k * p;
+    p = q;
+    q = r;
+  }
+
+  return r;
+} // hermite_polynomial_he_forward(T x, int64_t n)
+
+template<typename T, bool is_cuda=false>
+static inline C10_HOST_DEVICE T hermite_polynomial_he_forward(T x, T n) {
+  return hermite_polynomial_he_forward(x, static_cast<std::int64_t>(n));
+} // hermite_polynomial_he_forward(T x, T n)
+
 C10_CLANG_DIAGNOSTIC_POP()
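The loop above is the three-term recurrence for the probabilists' Hermite polynomials, He_{k+1}(x) = x·He_k(x) − k·He_{k−1}(x), starting from He_0(x) = 1 and He_1(x) = x. A small standalone C++ check of the same recurrence against the closed forms He_2(x) = x^2 − 1 and He_3(x) = x^3 − 3x (illustrative only, independent of the ATen template):

#include <cassert>
#include <cmath>
#include <cstdint>

// Same recurrence as hermite_polynomial_he_forward, written as a free function.
double hermite_he(double x, std::int64_t n) {
  if (n < 0) return 0.0;
  if (n == 0) return 1.0;
  if (n == 1) return x;
  double p = 1.0;  // He_{k-1}
  double q = x;    // He_k
  double r = 0.0;  // He_{k+1}
  for (std::int64_t k = 1; k < n; k++) {
    r = x * q - static_cast<double>(k) * p;
    p = q;
    q = r;
  }
  return r;
}

int main() {
  const double x = 1.5;
  assert(std::abs(hermite_he(x, 2) - (x * x - 1.0)) < 1e-12);          // He_2(x) = x^2 - 1
  assert(std::abs(hermite_he(x, 3) - (x * x * x - 3.0 * x)) < 1e-12);  // He_3(x) = x^3 - 3x
  return 0;
}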

aten/src/ATen/native/cpu/BinaryOpsKernel.cpp

Lines changed: 9 additions & 0 deletions

@@ -1134,6 +1134,14 @@ void hermite_polynomial_h_kernel(TensorIteratorBase& iterator) {
   });
 } // hermite_polynomial_h_kernel(TensorIteratorBase& iterator)
 
+void hermite_polynomial_he_kernel(TensorIteratorBase& iterator) {
+  AT_DISPATCH_FLOATING_TYPES(iterator.common_dtype(), "hermite_polynomial_he_cpu", [&]() {
+    cpu_kernel(iterator, [](scalar_t x, scalar_t n) -> scalar_t {
+      return hermite_polynomial_he_forward(x, n);
+    });
+  });
+} // hermite_polynomial_he_kernel
+
 } // namespace
 
 REGISTER_DISPATCH(add_clamp_stub, &add_clamp_kernel);

@@ -1184,6 +1192,7 @@ REGISTER_DISPATCH(zeta_stub, &zeta_kernel);
 REGISTER_DISPATCH(chebyshev_polynomial_t_stub, &chebyshev_polynomial_t_kernel);
 REGISTER_DISPATCH(chebyshev_polynomial_u_stub, &chebyshev_polynomial_u_kernel);
 REGISTER_DISPATCH(hermite_polynomial_h_stub, &hermite_polynomial_h_kernel);
+REGISTER_DISPATCH(hermite_polynomial_he_stub, &hermite_polynomial_he_kernel);
 
 } // namespace native
 } // namespace at
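AT_DISPATCH_FLOATING_TYPES picks scalar_t (float or double) from the iterator's common dtype, and cpu_kernel then applies the lambda to every pair of input elements. As a simplified, hypothetical stand-in for the TensorIterator machinery (broadcasting, strides, vectorization and type promotion are all omitted), the CPU path reduces to an elementwise loop:

#include <cstddef>
#include <vector>

// Hypothetical stand-in: apply a binary scalar op over two equally sized,
// contiguous inputs. The real cpu_kernel works through TensorIterator instead.
template <typename scalar_t, typename Op>
std::vector<scalar_t> elementwise_binary(const std::vector<scalar_t>& x,
                                         const std::vector<scalar_t>& n,
                                         Op op) {
  std::vector<scalar_t> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    out[i] = op(x[i], n[i]);
  }
  return out;
}

int main() {
  std::vector<double> x = {0.5, 1.0, 2.0};
  std::vector<double> n = {2.0, 2.0, 2.0};
  // In the real kernel the lambda would call hermite_polynomial_he_forward(x, n).
  auto out = elementwise_binary<double>(x, n, [](double a, double b) { return a * b; });
  (void)out;
  return 0;
}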

aten/src/ATen/native/cuda/Math.cuh

Lines changed: 34 additions & 0 deletions

@@ -1395,6 +1395,40 @@ const auto hermite_polynomial_h_string = jiterator_stringify(
   } // hermite_polynomial_h_forward(T x, T n)
 ); // hermite_polynomial_h_string
 
+const auto hermite_polynomial_he_string = jiterator_stringify(
+  template<typename T>
+  T hermite_polynomial_he_forward(T x, int64_t n) {
+    if (n < 0) {
+      return T(0.0);
+    }
+
+    if (n == 0) {
+      return T(1.0);
+    }
+
+    if (n == 1) {
+      return x;
+    }
+
+    T p = T(1.0);
+    T q = x;
+    T r;
+
+    for (int64_t k = 1; k < n; k++) {
+      r = x * q - k * p;
+      p = q;
+      q = r;
+    }
+
+    return r;
+  } // hermite_polynomial_he_forward(T x, int64_t n)
+
+  template<typename T>
+  T hermite_polynomial_he_forward(T x, T n) {
+    return hermite_polynomial_he_forward(x, static_cast<int64_t>(n));
+  } // hermite_polynomial_he_forward(T x, T n)
+); // hermite_polynomial_he_string
+
 #else // !AT_USE_JITERATOR() -- kernels must be precompiled
 
 template <typename scalar_t>
Lines changed: 33 additions & 0 deletions (new file)

@@ -0,0 +1,33 @@
+#define TORCH_ASSERT_NO_OPERATORS
+
+#include <ATen/Dispatch.h>
+#include <ATen/native/cuda/JitLoops.cuh>
+#include <ATen/native/cuda/Loops.cuh>
+#include <ATen/native/BinaryOps.h>
+#include <ATen/native/Math.h>
+#include <ATen/native/cuda/Math.cuh>
+#include <ATen/native/cuda/jit_utils.h>
+
+namespace at {
+namespace native {
+namespace {
+const char hermite_polynomial_he_name[] = "hermite_polynomial_he_forward";
+
+void hermite_polynomial_he_kernel_cuda(TensorIteratorBase& iterator) {
+#if AT_USE_JITERATOR()
+  AT_DISPATCH_FLOATING_TYPES(iterator.common_dtype(), "hermite_polynomial_he_cuda", [&]() {
+    opmath_jitted_gpu_kernel_with_scalars<hermite_polynomial_he_name, scalar_t, scalar_t>(iterator, hermite_polynomial_he_string);
+  });
+#else
+  AT_DISPATCH_FLOATING_TYPES(iterator.common_dtype(), "hermite_polynomial_he_cuda", [&]() {
+    gpu_kernel_with_scalars(iterator, []GPU_LAMBDA(scalar_t x, scalar_t n) -> scalar_t {
+      return hermite_polynomial_he_forward<scalar_t, true>(x, n);
+    });
+  });
+#endif
+} // hermite_polynomial_he_kernel_cuda
+} // namespace (anonymous)
+
+REGISTER_DISPATCH(hermite_polynomial_he_stub, &hermite_polynomial_he_kernel_cuda);
+} // namespace native
+} // namespace at

aten/src/ATen/native/mps/OperationUtils.h

Lines changed: 13 additions & 0 deletions

@@ -74,9 +74,22 @@ class Placeholder {
     return _value == nullptr;
   }
 
+  void allocateViewTensor(const at::Tensor& src)
+  {
+    assert (!_viewOutput.numel());
+    _viewOutput = at::native::empty_mps(
+                      src.sizes(),
+                      src.scalar_type(),
+                      c10::nullopt,
+                      kMPS,
+                      c10::nullopt,
+                      c10::nullopt);
+  }
+
 private:
   MPSGraphTensor* _placeholder;
   MPSGraphTensorData* _value;
+  Tensor _viewOutput;
 };
 
 void resize_tensor(Tensor* output);
