Improve the precision of abs() and sign() for large values (#99550) · pytorch/pytorch@09b189e · GitHub

Commit 09b189e

lezcano authored and pytorchmergebot committed
Improve the precision of abs() and sign() for large values (#99550)
@ev-br found in Quansight-Labs/numpy_pytorch_interop#117 (comment) that the precision of `abs()` for large values in the vectorised case is less than good. This PR fixes this issue. While doing that, we are able to remove a few test skips on extremal values.

Fixes #53958 #48486

Pull Request resolved: #99550
Approved by: https://github.com/ngimel, https://github.com/peterbell10
1 parent 5ee5afb commit 09b189e
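
The vectorised complex abs() previously computed sqrt(re*re + im*im), so the intermediate squares overflow (or underflow) long before the true magnitude does; hypot() avoids that by rescaling internally. A minimal scalar sketch of the failure mode, in standard C++ rather than the SIMD code touched by this commit:

    // Scalar illustration only; the actual patch changes the AVX2/AVX-512 paths below.
    #include <cmath>
    #include <complex>
    #include <cstdio>

    int main() {
      std::complex<double> z(3e200, 4e200);  // |z| = 5e200 is representable...
      double naive = std::sqrt(z.real() * z.real() + z.imag() * z.imag());  // ...but re*re overflows
      double safe  = std::hypot(z.real(), z.imag());
      std::printf("naive sqrt(a*a+b*b): %g\n", naive);  // inf
      std::printf("std::hypot(a, b):    %g\n", safe);   // 5e+200
    }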

File tree

5 files changed, +22 -53 lines changed

aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h

Lines changed: 6 additions & 6 deletions

@@ -121,7 +121,10 @@ template <> class Vectorized<c10::complex<double>> {
     return _mm256_hadd_pd(val_2, val_2);  // a*a+b*b a*a+b*b
   }
   __m256d abs_() const {
-    return _mm256_sqrt_pd(abs_2_());  // abs abs
+    auto real = _mm256_movedup_pd(values);       // real real
+    // movehdup_pd does not exist...
+    auto imag = _mm256_permute_pd(values, 0xf);  // imag imag
+    return Sleef_hypotd4_u05(real, imag);        // abs abs
   }
   Vectorized<c10::complex<double>> abs() const {
     const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0xFFFFFFFFFFFFFFFF, 0x0000000000000000,
@@ -143,11 +146,8 @@ template <> class Vectorized<c10::complex<double>> {
     auto abs = abs_();
     auto zero = _mm256_setzero_pd();
     auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ);
-    auto abs_val = Vectorized(abs);
-
-    auto div = values / abs_val.values;  // x / abs(x)
-
-    return blendv(div, zero, mask);
+    auto div = values / abs;
+    return _mm256_blendv_pd(div, zero, mask);
   }
   __m256d real_() const {
     const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0xFFFFFFFFFFFFFFFF, 0x0000000000000000,

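In the AVX2 double path above, each __m256d holds two complex numbers interleaved as [re0, im0, re1, im1]. _mm256_movedup_pd duplicates the even (real) lanes and _mm256_permute_pd(values, 0xf) duplicates the odd (imaginary) lanes, so Sleef_hypotd4_u05 (SLEEF's 0.5-ULP vector hypot) can fill both lanes of each pair with |z|. A small standalone sketch of the shuffles, assuming an AVX2-capable host and using a per-lane std::hypot in place of SLEEF so it builds without extra dependencies:

    // Sketch only (compile with e.g. g++ -O2 -mavx shuffle_demo.cpp).
    #include <immintrin.h>
    #include <cmath>
    #include <cstdio>

    int main() {
      // Two complex<double> values, interleaved: [re0, im0, re1, im1]
      __m256d values = _mm256_setr_pd(3e200, 4e200, -5.0, 12.0);

      __m256d real = _mm256_movedup_pd(values);       // [re0, re0, re1, re1]
      __m256d imag = _mm256_permute_pd(values, 0xf);  // [im0, im0, im1, im1]

      double re[4], im[4];
      _mm256_storeu_pd(re, real);
      _mm256_storeu_pd(im, imag);
      for (int i = 0; i < 4; ++i) {
        // std::hypot stands in for Sleef_hypotd4_u05; each lane gets |z|.
        std::printf("lane %d: |z| = %g\n", i, std::hypot(re[i], im[i]));
      }
    }
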
aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h

Lines changed: 4 additions & 5 deletions

@@ -157,7 +157,9 @@ template <> class Vectorized<c10::complex<float>> {
     return _mm256_permute_ps(ret, 0xD8);
   }
   __m256 abs_() const {
-    return _mm256_sqrt_ps(abs_2_());  // abs abs
+    auto real = _mm256_moveldup_ps(values);  // real real
+    auto imag = _mm256_movehdup_ps(values);  // imag imag
+    return Sleef_hypotf8_u05(real, imag);    // abs abs
   }
   Vectorized<c10::complex<float>> abs() const {
     const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000,
@@ -179,10 +181,7 @@ template <> class Vectorized<c10::complex<float>> {
     auto abs = abs_();
     auto zero = _mm256_setzero_ps();
     auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ);
-    auto abs_val = Vectorized(abs);
-
-    auto div = values / abs_val.values;  // x / abs(x)
-
+    auto div = values / abs;
     return _mm256_blendv_ps(div, zero, mask);
   }
   __m256 real_() const {

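The second hunk in each of these files rewrites what, per the commit title, is the complex sign() path: sign(z) = z / |z|, with |z| == 0 blended back to zero so the division cannot produce NaN. The rewrite drops the temporary Vectorized wrapper and divides and blends on the raw registers. A scalar sketch of the same logic (std::complex standing in for the interleaved lanes), showing why the hypot-based |z| matters here too:

    // Scalar sketch of the vectorised logic; not the actual kernel.
    #include <cmath>
    #include <complex>
    #include <cstdio>

    static std::complex<float> sign(std::complex<float> z) {
      float a = std::hypot(z.real(), z.imag());  // |z|, overflow-safe
      if (a == 0.0f) {
        return {0.0f, 0.0f};                     // the blend-with-zero branch
      }
      return {z.real() / a, z.imag() / a};       // x / abs(x)
    }

    int main() {
      auto s = sign({3e30f, 4e30f});  // the squares would overflow float; hypot gives 5e30
      std::printf("sign(3e30+4e30j) = (%g, %g)\n", s.real(), s.imag());  // (0.6, 0.8)
      auto z = sign({0.0f, 0.0f});
      std::printf("sign(0) = (%g, %g)\n", z.real(), z.imag());           // (0, 0)
    }
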
aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h

Lines changed: 6 additions & 8 deletions

@@ -174,7 +174,10 @@ template <> class Vectorized<c10::complex<double>> {
     return hadd_pd(val_2, val_2);  // a*a+b*b a*a+b*b
   }
   __m512d abs_() const {
-    return _mm512_sqrt_pd(abs_2_());  // abs abs
+    auto real = _mm512_movedup_pd(values);        // real real
+    // movehdup_pd does not exist...
+    auto imag = _mm512_permute_pd(values, 0xff);  // imag imag
+    return Sleef_hypotd8_u05(real, imag);         // abs abs
   }
   Vectorized<c10::complex<double>> abs() const {
     const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64(0xFFFFFFFFFFFFFFFF, 0x0000000000000000,
@@ -200,13 +203,8 @@ template <> class Vectorized<c10::complex<double>> {
     auto abs = abs_();
     auto zero = _mm512_setzero_pd();
     auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ);
-    auto mask_vec = _mm512_mask_set1_epi64(_mm512_castpd_si512(zero), mask,
-                                           0xFFFFFFFFFFFFFFFF);
-    auto abs_val = Vectorized(abs);
-
-    auto div = values / abs_val.values;  // x / abs(x)
-
-    return blendv(div, zero, _mm512_castsi512_pd(mask_vec));
+    auto div = values / abs;
+    return _mm512_mask_blend_pd(mask, div, zero);
   }
   __m512d real_() const {
     const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64(0xFFFFFFFFFFFFFFFF, 0x0000000000000000,

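One further simplification in the AVX-512 variants: _mm512_cmp_pd_mask / _mm512_cmp_ps_mask already return a k-register mask, which _mm512_mask_blend_pd / _mm512_mask_blend_ps can consume directly, so the old detour through _mm512_mask_set1_epi64 to materialise a vector-valued mask is no longer needed. A tiny sketch of that pattern, assuming an AVX-512F host (compile with -mavx512f):

    // Sketch only; `div` is a stand-in for values / abs from the kernel above.
    #include <immintrin.h>
    #include <cstdio>

    int main() {
      __m512d abs  = _mm512_setr_pd(0.0, 2.0, 0.0, 3.0, 1.0, 0.0, 4.0, 5.0);
      __m512d div  = _mm512_set1_pd(42.0);
      __m512d zero = _mm512_setzero_pd();

      // Bit i of the k-mask is set where abs[i] == 0.
      __mmask8 mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ);
      // Take `zero` where the mask bit is set, `div` elsewhere.
      __m512d out = _mm512_mask_blend_pd(mask, div, zero);

      double o[8];
      _mm512_storeu_pd(o, out);
      for (int i = 0; i < 8; ++i) std::printf("%g ", o[i]);  // 0 42 0 42 42 0 42 42
      std::printf("\n");
    }
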
aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h

Lines changed: 4 additions & 5 deletions

@@ -680,7 +680,9 @@ template <> class Vectorized<c10::complex<float>> {
     return ret;
   }
   __m512 abs_() const {
-    return _mm512_sqrt_ps(abs_2_());  // abs abs
+    auto real = _mm512_moveldup_ps(values);  // real real
+    auto imag = _mm512_movehdup_ps(values);  // imag imag
+    return Sleef_hypotf16_u05(real, imag);   // abs abs
   }
   Vectorized<c10::complex<float>> abs() const {
     const __m512 real_mask = _mm512_castsi512_ps(_mm512_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000,
@@ -706,10 +708,7 @@ template <> class Vectorized<c10::complex<float>> {
     auto abs = abs_();
     auto zero = _mm512_setzero_ps();
     auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ);
-    auto abs_val = Vectorized(abs);
-
-    auto div = values / abs_val.values;  // x / abs(x)
-
+    auto div = values / abs;
     return _mm512_mask_blend_ps(mask, div, zero);
  }
   __m512 real_() const {

torch/testing/_internal/common_methods_invocations.py

Lines changed: 2 additions & 29 deletions

@@ -8884,10 +8884,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                         'test_inplace_gradgrad', dtypes=(torch.cdouble,)),
            DecorateInfo(unittest.skip("In-place abs not supported for complex tensors"), 'TestFwdGradients',
                         'test_inplace_forward_mode_AD', dtypes=(torch.cdouble,)),
-           DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal',
-                        device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]),
-           DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large',
-                        device_type='cpu', dtypes=[torch.cfloat]),
            DecorateInfo(unittest.skip("In-place abs not supported for complex tensors"), "TestSparseUnaryUfuncs",
                         "test_inplace", dtypes=(torch.cdouble, torch.cfloat, torch.chalf)),
            # Reference: https://github.com/pytorch/pytorch/issues/49224
@@ -13185,11 +13181,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
                toleranceOverride({torch.float16: tol(atol=1e-03, rtol=1.3e-04)}), 'TestUnaryUfuncs',), ],
            skips=(
                DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_small',
-                            dtypes=(torch.int, torch.int8)),
-               # pytorch computes (0+nanj), numpy computes (-5e-18-1j) for input (-501.-1.0000e+20j)
-               DecorateInfo(unittest.expectedFailure, 'TestUnaryUfuncs',
-                            "test_reference_numerics_large", dtypes=(torch.complex64,), device_type='cpu',
-                            active_if=not IS_MACOS and not IS_WINDOWS),),
+                            dtypes=(torch.int, torch.int8)),),
        ),
        UnaryUfuncInfo(
            'nn.functional.tanhshrink',
@@ -14078,14 +14070,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
            # Reference: https://github.com/pytorch/pytorch/issues/41245
            DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal',
                         dtypes=[torch.bfloat16, torch.float16, torch.float32, torch.float64]),
-           # Reference: https://github.com/pytorch/pytorch/issues/53958
-           # Test fails in comparison on Nan as the `equal_nan` is True for
-           # comparing the CPU tensors.
-           DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal',
-                        device_type='cpu', dtypes=[torch.complex64, torch.complex128]),
-           # Reference: https://github.com/pytorch/pytorch/issues/48486
-           DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large',
-                        device_type='cpu', dtypes=[torch.complex64]),
            DecorateInfo(unittest.skip("Skipped! sparse backward not supported"),
                         'TestSparseUnaryUfuncs', 'test_sparse_fn_grad'),
        )),
@@ -18077,18 +18061,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1):
    #
    ElementwiseUnaryPythonRefInfo(
        "_refs.abs",
-        torch_opinfo_name="abs",
-        skips=(
-            # Reference result was farther (0.0) from the precise computation
-            # than the torch result was (nan)!
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref',
-                         dtypes=(torch.chalf,), device_type='cpu', active_if=not (IS_MACOS or IS_WINDOWS)),
-            # Reference result was farther (0.0) from the precise computation
-            # than the torch result was (nan)!
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_torch_fallback',
-                         dtypes=(torch.chalf,), device_type='cpu', active_if=not (IS_MACOS or IS_WINDOWS)),
-        )
-    ),
+        torch_opinfo_name="abs"),
    ElementwiseUnaryPythonRefInfo(
        "_refs.acos",
        torch_opinfo_name="acos",

0 commit comments
