ENH, SIMD: Replace raw SIMD of unary float point(32-64) with NPYV - g0 · numpy/numpy@34d2672 · GitHub
[go: up one dir, main page]

Skip to content

Commit 34d2672

Browse files
committed
ENH, SIMD: Replace raw SIMD of unary float point(32-64) with NPYV - g0
- only covers sqrt, absolute, square and reciprocal
- fix SIMD memory overlap check for aliasing(same ptr & stride)
- unify fp/domain errors for both scalars and vectors
1 parent ade6638 commit 34d2672

File tree

7 files changed

+327
-222
lines changed

7 files changed

+327
-222
lines changed

numpy/core/code_generators/generate_umath.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -352,14 +352,14 @@ def english_upper(s):
352352
Ufunc(1, 1, None,
353353
docstrings.get('numpy.core.umath.square'),
354354
None,
355-
TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'FDfd')]),
355+
TD(ints+inexact, simd=[('avx2', ints), ('avx512f', 'FD')], dispatch=[('loops_unary_fp', 'fd')]),
356356
TD(O, f='Py_square'),
357357
),
358358
'reciprocal':
359359
Ufunc(1, 1, None,
360360
docstrings.get('numpy.core.umath.reciprocal'),
361361
None,
362-
TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f','fd')]),
362+
TD(ints+inexact, simd=[('avx2', ints)], dispatch=[('loops_unary_fp', 'fd')]),
363363
TD(O, f='Py_reciprocal'),
364364
),
365365
# This is no longer used as numpy.ones_like, however it is
@@ -389,7 +389,7 @@ def english_upper(s):
389389
Ufunc(1, 1, None,
390390
docstrings.get('numpy.core.umath.absolute'),
391391
'PyUFunc_AbsoluteTypeResolver',
392-
TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
392+
TD(bints+flts+timedeltaonly, dispatch=[('loops_unary_fp', 'fd')]),
393393
TD(cmplx, simd=[('avx512f', cmplxvec)], out=('f', 'd', 'g')),
394394
TD(O, f='PyNumber_Absolute'),
395395
),
@@ -767,7 +767,7 @@ def english_upper(s):
767767
docstrings.get('numpy.core.umath.sqrt'),
768768
None,
769769
TD('e', f='sqrt', astype={'e':'f'}),
770-
TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
770+
TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
771771
TD('fdg' + cmplx, f='sqrt'),
772772
TD(P, f='sqrt'),
773773
),

numpy/core/setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -904,6 +904,7 @@ def generate_umath_c(ext, build_dir):
904904
join('src', 'umath', 'simd.inc.src'),
905905
join('src', 'umath', 'loops.h.src'),
906906
join('src', 'umath', 'loops.c.src'),
907+
join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
907908
join('src', 'umath', 'matmul.h.src'),
908909
join('src', 'umath', 'matmul.c.src'),
909910
join('src', 'umath', 'clip.h.src'),

numpy/core/src/umath/loops.c.src

Lines changed: 36 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -1490,26 +1490,6 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
14901490
*****************************************************************************
14911491
*/
14921492

1493-
/**begin repeat
1494-
* Float types
1495-
* #type = npy_float, npy_double#
1496-
* #TYPE = FLOAT, DOUBLE#
1497-
* #scalarf = npy_sqrtf, npy_sqrt#
1498-
*/
1499-
1500-
NPY_NO_EXPORT void
1501-
@TYPE@_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
1502-
{
1503-
if (!run_unary_simd_sqrt_@TYPE@(args, dimensions, steps)) {
1504-
UNARY_LOOP {
1505-
const @type@ in1 = *(@type@ *)ip1;
1506-
*(@type@ *)op1 = @scalarf@(in1);
1507-
}
1508-
}
1509-
}
1510-
1511-
/**end repeat**/
1512-
15131493
/**begin repeat
15141494
* #func = rint, ceil, floor, trunc#
15151495
* #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
@@ -1579,53 +1559,6 @@ DOUBLE_log(char **args, npy_intp const *dimensions, npy_intp const *steps, void
15791559
* #typesub = f, #
15801560
*/
15811561

1582-
NPY_NO_EXPORT NPY_GCC_OPT_3 void
1583-
@TYPE@_sqrt_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
1584-
{
1585-
if (!run_unary_@isa@_sqrt_@TYPE@(args, dimensions, steps)) {
1586-
UNARY_LOOP {
1587-
const @type@ in1 = *(@type@ *)ip1;
1588-
*(@type@ *)op1 = npy_sqrt@typesub@(in1);
1589-
}
1590-
}
1591-
}
1592-
1593-
NPY_NO_EXPORT NPY_GCC_OPT_3 void
1594-
@TYPE@_absolute_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
1595-
{
1596-
if (!run_unary_@isa@_absolute_@TYPE@(args, dimensions, steps)) {
1597-
UNARY_LOOP {
1598-
const @type@ in1 = *(@type@ *)ip1;
1599-
const @type@ tmp = in1 > 0 ? in1 : -in1;
1600-
/* add 0 to clear -0.0 */
1601-
*((@type@ *)op1) = tmp + 0;
1602-
}
1603-
}
1604-
npy_clear_floatstatus_barrier((char*)dimensions);
1605-
}
1606-
1607-
NPY_NO_EXPORT NPY_GCC_OPT_3 void
1608-
@TYPE@_square_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
1609-
{
1610-
if (!run_unary_@isa@_square_@TYPE@(args, dimensions, steps)) {
1611-
UNARY_LOOP {
1612-
const @type@ in1 = *(@type@ *)ip1;
1613-
*(@type@ *)op1 = in1*in1;
1614-
}
1615-
}
1616-
}
1617-
1618-
NPY_NO_EXPORT NPY_GCC_OPT_3 void
1619-
@TYPE@_reciprocal_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
1620-
{
1621-
if (!run_unary_@isa@_reciprocal_@TYPE@(args, dimensions, steps)) {
1622-
UNARY_LOOP {
1623-
const @type@ in1 = *(@type@ *)ip1;
1624-
*(@type@ *)op1 = 1.0f/in1;
1625-
}
1626-
}
1627-
}
1628-
16291562
/**begin repeat2
16301563
* #func = rint, ceil, floor, trunc#
16311564
* #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
@@ -2047,33 +1980,6 @@ NPY_NO_EXPORT void
20471980
}
20481981
}
20491982

2050-
NPY_NO_EXPORT void
2051-
@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
2052-
{
2053-
char * margs[] = {args[0], args[0], args[1]};
2054-
npy_intp msteps[] = {steps[0], steps[0], steps[1]};
2055-
if (!run_binary_simd_multiply_@TYPE@(margs, dimensions, msteps)) {
2056-
UNARY_LOOP {
2057-
const @type@ in1 = *(@type@ *)ip1;
2058-
*((@type@ *)op1) = in1*in1;
2059-
}
2060-
}
2061-
}
2062-
2063-
NPY_NO_EXPORT void
2064-
@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
2065-
{
2066-
@type@ one = 1.@c@;
2067-
char * margs[] = {(char*)&one, args[0], args[1]};
2068-
npy_intp msteps[] = {0, steps[0], steps[1]};
2069-
if (!run_binary_simd_divide_@TYPE@(margs, dimensions, msteps)) {
2070-
UNARY_LOOP {
2071-
const @type@ in1 = *(@type@ *)ip1;
2072-
*((@type@ *)op1) = 1/in1;
2073-
}
2074-
}
2075-
}
2076-
20771983
NPY_NO_EXPORT void
20781984
@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
20791985
{
@@ -2091,20 +1997,6 @@ NPY_NO_EXPORT void
20911997
}
20921998
}
20931999

2094-
NPY_NO_EXPORT void
2095-
@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
2096-
{
2097-
if (!run_unary_simd_absolute_@TYPE@(args, dimensions, steps)) {
2098-
UNARY_LOOP {
2099-
const @type@ in1 = *(@type@ *)ip1;
2100-
const @type@ tmp = in1 > 0 ? in1 : -in1;
2101-
/* add 0 to clear -0.0 */
2102-
*((@type@ *)op1) = tmp + 0;
2103-
}
2104-
}
2105-
npy_clear_floatstatus_barrier((char*)dimensions);
2106-
}
2107-
21082000
NPY_NO_EXPORT void
21092001
@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
21102002
{
@@ -2214,6 +2106,42 @@ NPY_NO_EXPORT void
22142106

22152107
/**end repeat**/
22162108

2109+
/*
2110+
*****************************************************************************
2111+
** LONGDOUBLE LOOPS **
2112+
*****************************************************************************
2113+
*/
2114+
2115+
NPY_NO_EXPORT void
2116+
LONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
2117+
{
2118+
UNARY_LOOP {
2119+
const npy_longdouble in1 = *(npy_longdouble*)ip1;
2120+
*((npy_longdouble *)op1) = 1/in1;
2121+
}
2122+
}
2123+
2124+
NPY_NO_EXPORT void
2125+
LONGDOUBLE_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
2126+
{
2127+
UNARY_LOOP {
2128+
const npy_longdouble in1 = *(npy_longdouble *)ip1;
2129+
const npy_longdouble tmp = in1 > 0 ? in1 : -in1;
2130+
/* add 0 to clear -0.0 */
2131+
*((npy_longdouble *)op1) = tmp + 0;
2132+
}
2133+
npy_clear_floatstatus_barrier((char*)dimensions);
2134+
}
2135+
2136+
NPY_NO_EXPORT void
2137+
LONGDOUBLE_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
2138+
{
2139+
UNARY_LOOP {
2140+
const npy_longdouble in1 = *(npy_longdouble *)ip1;
2141+
*((npy_longdouble *)op1) = in1*in1;
2142+
}
2143+
}
2144+
22172145
/*
22182146
*****************************************************************************
22192147
** HALF-FLOAT LOOPS **

numpy/core/src/umath/loops.h.src

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@
66
#ifndef _NPY_UMATH_LOOPS_H_
77
#define _NPY_UMATH_LOOPS_H_
88

9+
#ifndef NPY_NO_EXPORT
10+
#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
11+
#endif
12+
913
#define BOOL_invert BOOL_logical_not
1014
#define BOOL_add BOOL_logical_or
1115
#define BOOL_bitwise_and BOOL_logical_and
@@ -167,32 +171,29 @@ NPY_NO_EXPORT void
167171
** FLOAT LOOPS **
168172
*****************************************************************************
169173
*/
170-
174+
#ifndef NPY_DISABLE_OPTIMIZATION
175+
#include "loops_unary_fp.dispatch.h"
176+
#endif
171177
/**begin repeat
172178
* #TYPE = FLOAT, DOUBLE#
173179
*/
174-
NPY_NO_EXPORT void
175-
@TYPE@_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
176-
177180
/**begin repeat1
178-
* #func = maximum, minimum#
181+
* #kind = sqrt, absolute, square, reciprocal#
179182
*/
180-
NPY_NO_EXPORT void
181-
@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
182-
183+
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
184+
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
183185
/**end repeat1**/
186+
/**end repeat**/
184187

185-
/**begin repeat1
186-
* #isa = avx512f, fma#
188+
/**begin repeat
189+
* #TYPE = FLOAT, DOUBLE#
187190
*/
188-
189-
/**begin repeat2
190-
* #func = sqrt, absolute, square, reciprocal#
191+
/**begin repeat1
192+
* #func = maximum, minimum#
191193
*/
192194
NPY_NO_EXPORT void
193-
@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
195+
@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
194196

195-
/**end repeat2**/
196197
/**end repeat1**/
197198
/**end repeat**/
198199

0 commit comments

Comments
 (0)
0