Merge pull request #18766 from ganesh-k13/enh_simd_signed_division · numpy/numpy@7de0fa9 · GitHub

Commit 7de0fa9

Merge pull request #18766 from ganesh-k13/enh_simd_signed_division
ENH, SIMD: Replace libdivide functions of signed integer division with universal intrinsics

2 parents 5f3c318 + f74f500 · commit 7de0fa9

5 files changed: +161 -105 lines changed
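Every diff below turns on one semantic detail: numpy's floor_divide rounds quotients toward -INF, while C's `/` operator truncates toward zero, so each scalar fallback in this PR carries a one-off quotient correction. A minimal standalone sketch of that correction (illustrative only, not code from this commit):

#include <stdio.h>

/* Floor division built on C's truncating division: when the operand
 * signs differ and the division is inexact, the truncated quotient is
 * exactly one above the floored quotient. */
static int floor_div(int n, int d)
{
    int q = n / d;
    if (((n > 0) != (d > 0)) && q * d != n) {
        --q;
    }
    return q;
}

int main(void)
{
    printf("%d %d\n", -7 / 2, floor_div(-7, 2)); /* prints "-3 -4" */
    return 0;
}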

numpy/core/code_generators/generate_umath.py

Lines changed: 1 addition & 1 deletion
@@ -328,7 +328,7 @@ def english_upper(s):
           docstrings.get('numpy.core.umath.floor_divide'),
           'PyUFunc_DivisionTypeResolver',
           TD(ints, cfunc_alias='divide',
-             dispatch=[('loops_arithmetic', 'BHILQ')]),
+             dispatch=[('loops_arithmetic', 'bBhHiIlLqQ')]),
           TD(flts + cmplx),
           [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
            TypeDescription('m', FullTypeDescr, 'md', 'm'),
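The dispatch string above lists NumPy type characters, and the change widens it from the five unsigned integer types to all ten integer types. For reference, the letters map to types as follows (they match the NPY_*LTR constants in numpy's ndarraytypes.h):

/*
 *   b = BYTE       B = UBYTE
 *   h = SHORT      H = USHORT
 *   i = INT        I = UINT
 *   l = LONG       L = ULONG
 *   q = LONGLONG   Q = ULONGLONG
 *
 * 'BHILQ' dispatched only the unsigned loops to loops_arithmetic;
 * 'bBhHiIlLqQ' routes the signed loops there as well.
 */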

numpy/core/src/common/simd/intdiv.h

Lines changed: 12 additions & 12 deletions
@@ -368,18 +368,18 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d)
 {
     npy_int32 d1 = abs(d);
     npy_int32 sh, m;
-    if (d1 > 1) {
+    // Handle abs overflow
+    if ((npy_uint32)d == 0x80000000U) {
+        m = 0x80000001;
+        sh = 30;
+    }
+    else if (d1 > 1) {
         sh = npyv__bitscan_revnz_u32(d1 - 1); // ceil(log2(abs(d))) - 1
         m = (1ULL << (32 + sh)) / d1 + 1;     // multiplier
     }
     else if (d1 == 1) {
         sh = 0; m = 1;
     }
-    // fix abs overflow
-    else if (d == (1 << 31)) {
-        m = d + 1;
-        sh = 30;
-    }
     else {
         // raise arithmetic exception for d == 0
         sh = m = 1 / ((npy_int32 volatile *)&d)[0]; // LCOV_EXCL_LINE

@@ -445,18 +445,18 @@ NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d)
 #else
     npy_int64 d1 = llabs(d);
     npy_int64 sh, m;
-    if (d1 > 1) {
+    // Handle abs overflow
+    if ((npy_uint64)d == 0x8000000000000000ULL) {
+        m = 0x8000000000000001LL;
+        sh = 62;
+    }
+    else if (d1 > 1) {
         sh = npyv__bitscan_revnz_u64(d1 - 1);      // ceil(log2(abs(d))) - 1
         m = npyv__divh128_u64(1ULL << sh, d1) + 1; // multiplier
     }
     else if (d1 == 1) {
         sh = 0; m = 1;
     }
-    // fix abs overflow
-    else if (d == (1LL << 63)) {
-        m = d + 1;
-        sh = 62;
-    }
     else {
         // raise arithmetic exception for d == 0
         sh = m = 1 / ((npy_int64 volatile *)&d)[0]; // LCOV_EXCL_LINE
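npyv_divisor_s32/npyv_divisor_s64 precompute a magic multiplier m and shift count sh once per divisor, so the SIMD kernels can replace every division with a multiply-high and a shift. The new branch tests the raw bit pattern 0x80000000 up front because abs(INT32_MIN) overflows and the old test d == (1 << 31) shifts into the sign bit, both undefined behavior in C. As a simpler cousin of the same fixed-point idea (this is the Lemire-Kaser-Kurz reciprocal construction, not numpy's exact recipe; reciprocal/fastdiv are illustrative names and __uint128_t is a GCC/Clang extension):

#include <assert.h>
#include <stdint.h>

/* M = ceil(2^64 / d); valid for 1 < d < 2^32. */
static uint64_t reciprocal(uint32_t d)
{
    return UINT64_MAX / d + 1;
}

/* One widening multiply replaces the divide: the top 64 bits of M * n
 * equal n / d exactly, for every 32-bit n. */
static uint32_t fastdiv(uint32_t n, uint64_t M)
{
    return (uint32_t)(((__uint128_t)M * n) >> 64);
}

int main(void)
{
    uint64_t M = reciprocal(7);
    for (uint32_t n = 0; n < 1000000; ++n) {
        assert(fastdiv(n, M) == n / 7);
    }
    return 0;
}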

numpy/core/src/umath/loops.c.src

Lines changed: 0 additions & 86 deletions
@@ -843,92 +843,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
     UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
 }

-/* Libdivide only supports 32 and 64 bit types
- * We try to pick the best possible one */
-#if NPY_BITSOF_@TYPE@ <= 32
-#define libdivide_@type@_t libdivide_s32_t
-#define libdivide_@type@_gen libdivide_s32_gen
-#define libdivide_@type@_do libdivide_s32_do
-#else
-#define libdivide_@type@_t libdivide_s64_t
-#define libdivide_@type@_gen libdivide_s64_gen
-#define libdivide_@type@_do libdivide_s64_do
-#endif
-
-NPY_NO_EXPORT void
-@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    BINARY_DEFS
-
-    /* When the divisor is a constant, use libdivide for faster division */
-    if (steps[1] == 0) {
-        /* In case of empty array, just return */
-        if (n == 0) {
-            return;
-        }
-
-        const @type@ in2 = *(@type@ *)ip2;
-
-        /* If divisor is 0, we need not compute anything */
-        if (in2 == 0) {
-            npy_set_floatstatus_divbyzero();
-            BINARY_LOOP_SLIDING {
-                *((@type@ *)op1) = 0;
-            }
-        }
-        else {
-            struct libdivide_@type@_t fast_d = libdivide_@type@_gen(in2);
-            BINARY_LOOP_SLIDING {
-                const @type@ in1 = *(@type@ *)ip1;
-                /*
-                 * FIXME: On x86 at least, dividing the smallest representable integer
-                 * by -1 causes a SIGFPE (division overflow). We treat this case here
-                 * (to avoid a SIGFPE crash at python level), but a good solution would
-                 * be to treat integer division problems separately from FPU exceptions
-                 * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
-                 */
-                if (in1 == NPY_MIN_@TYPE@ && in2 == -1) {
-                    npy_set_floatstatus_divbyzero();
-                    *((@type@ *)op1) = 0;
-                }
-                else {
-                    *((@type@ *)op1) = libdivide_@type@_do(in1, &fast_d);
-
-                    /* Negative quotients need to be rounded down */
-                    if (((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) {
-                        *((@type@ *)op1) = *((@type@ *)op1) - 1;
-                    }
-                }
-            }
-        }
-    }
-    else {
-        BINARY_LOOP_SLIDING {
-            const @type@ in1 = *(@type@ *)ip1;
-            const @type@ in2 = *(@type@ *)ip2;
-            /*
-             * FIXME: On x86 at least, dividing the smallest representable integer
-             * by -1 causes a SIGFPE (division overflow). We treat this case here
-             * (to avoid a SIGFPE crash at python level), but a good solution would
-             * be to treat integer division problems separately from FPU exceptions
-             * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
-             */
-            if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
-                npy_set_floatstatus_divbyzero();
-                *((@type@ *)op1) = 0;
-            }
-            else {
-                *((@type@ *)op1) = in1/in2;
-
-                /* Negative quotients need to be rounded down */
-                if (((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) {
-                    *((@type@ *)op1) = *((@type@ *)op1) - 1;
-                }
-            }
-        }
-    }
-}
-
 NPY_NO_EXPORT void
 @TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
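The deleted loop folded three concerns together: the libdivide fast path for constant divisors, zero divisors, and the NPY_MIN_@TYPE@ / -1 overflow. The last two share a guard because, as the FIXME notes, x86 raises the same hardware fault (#DE, delivered as SIGFPE) for both. A compact sketch of the trap predicate, using plain int to stand in for the @TYPE@ template:

#include <limits.h>

/* The only two operand patterns that make a hardware signed divide
 * fault on x86; both must be intercepted before dividing. */
static int divide_would_trap(int n, int d)
{
    return d == 0 || (n == INT_MIN && d == -1);
}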

numpy/core/src/umath/loops.h.src

Lines changed: 2 additions & 4 deletions
@@ -58,7 +58,8 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
 #endif

 /**begin repeat
- * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+           BYTE, SHORT, INT, LONG, LONGLONG#
  */
 NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide,
     (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))

@@ -151,9 +152,6 @@ NPY_NO_EXPORT void
 NPY_NO_EXPORT void
 @S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));

-NPY_NO_EXPORT void
-@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
 NPY_NO_EXPORT void
 @S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));

numpy/core/src/umath/loops_arithmetic.dispatch.c.src

Lines changed: 146 additions & 2 deletions
@@ -20,10 +20,91 @@
 //###############################################################################
 /********************************************************************************
 ** Defining the SIMD kernels
+ *
+ * Floor division of signed integers is based on T. Granlund and P. L. Montgomery,
+ * "Division by invariant integers using multiplication" (see Figure 6.1,
+ * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556).
+ * For details on TRUNC division see simd/intdiv.h for more clarification.
+ ***********************************************************************************
+ ** Figure 6.1: Signed division by run-time invariant divisor, rounded towards -INF
+ ***********************************************************************************
+ * For q = FLOOR(a/d), all sword:
+ *     sword -dsign = SRL(d, N - 1);
+ *     uword -nsign = (n < -dsign);
+ *     uword -qsign = EOR(-nsign, -dsign);
+ *     q = TRUNC((n - (-dsign) + (-nsign))/d) - (-qsign);
 ********************************************************************************/
+
 #if NPY_SIMD
 /**begin repeat
- * #sfx = u8, u16, u32, u64#
+ * Signed types
+ * #sfx = s8, s16, s32, s64#
+ * #len = 8, 16, 32, 64#
+ */
+static NPY_INLINE void
+simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
+{
+    npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0];
+    npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1];
+    npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2];
+    const int vstep = npyv_nlanes_@sfx@;
+    const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);
+
+    if (scalar == -1) {
+        npyv_b@len@ noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1));
+        npyv_@sfx@ vzero = npyv_zero_@sfx@();
+        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+            npyv_@sfx@ a = npyv_load_@sfx@(src);
+            npyv_b@len@ gt_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@));
+            noverflow = npyv_and_b@len@(noverflow, gt_min);
+            npyv_@sfx@ neg = npyv_ifsub_@sfx@(gt_min, vzero, a, vzero);
+            npyv_store_@sfx@(dst, neg);
+        }
+
+        int raise_err = npyv_tobits_b@len@(npyv_not_b@len@(noverflow)) != 0;
+        for (; len > 0; --len, ++src, ++dst) {
+            npyv_lanetype_@sfx@ a = *src;
+            if (a == NPY_MIN_INT@len@) {
+                raise_err = 1;
+                *dst = 0;
+            } else {
+                *dst = -a;
+            }
+        }
+        if (raise_err) {
+            npy_set_floatstatus_divbyzero();
+        }
+    } else {
+        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+            npyv_@sfx@ nsign_d   = npyv_setall_@sfx@(scalar < 0);
+            npyv_@sfx@ a         = npyv_load_@sfx@(src);
+            npyv_@sfx@ nsign_a   = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d));
+            nsign_a              = npyv_and_@sfx@(nsign_a, npyv_setall_@sfx@(1));
+            npyv_@sfx@ diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d);
+            npyv_@sfx@ to_ninf   = npyv_xor_@sfx@(nsign_a, nsign_d);
+            npyv_@sfx@ trunc     = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor);
+            npyv_@sfx@ floor     = npyv_sub_@sfx@(trunc, to_ninf);
+            npyv_store_@sfx@(dst, floor);
+        }
+
+        for (; len > 0; --len, ++src, ++dst) {
+            const npyv_lanetype_@sfx@ a = *src;
+            npyv_lanetype_@sfx@ r = a / scalar;
+            // Negative quotients need to be rounded down
+            if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) {
+                r--;
+            }
+            *dst = r;
+        }
+    }
+    npyv_cleanup();
+}
+/**end repeat**/
+
+/**begin repeat
+ * Unsigned types
+ * #sfx = u8, u16, u32, u64#
+ * #len = 8, 16, 32, 64#
  */
 static NPY_INLINE void
 simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)

@@ -44,7 +125,6 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
         const npyv_lanetype_@sfx@ a = *src;
         *dst = a / scalar;
     }
-
     npyv_cleanup();
 }
 /**end repeat**/

@@ -54,6 +134,70 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
 ** Defining ufunc inner functions
 ********************************************************************************/

+/**begin repeat
+ * Signed types
+ * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
+ * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ */
+#undef TO_SIMD_SFX
+#if 0
+/**begin repeat1
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_BITSOF_@TYPE@ == @len@
+    #define TO_SIMD_SFX(X) X##_s@len@
+/**end repeat1**/
+#endif
+
+#if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+    #undef TO_SIMD_SFX
+#endif
+
+NPY_FINLINE @type@ floor_div_@TYPE@(const @type@ n, const @type@ d)
+{
+    /*
+     * FIXME: On x86 at least, dividing the smallest representable integer
+     * by -1 causes a SIGFPE (division overflow). We treat this case here
+     * (to avoid a SIGFPE crash at python level), but a good solution would
+     * be to treat integer division problems separately from FPU exceptions
+     * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+     */
+    if (NPY_UNLIKELY(d == 0 || (n == NPY_MIN_@TYPE@ && d == -1))) {
+        npy_set_floatstatus_divbyzero();
+        return 0;
+    }
+    @type@ r = n / d;
+    // Negative quotients need to be rounded down
+    if (((n > 0) != (d > 0)) && ((r * d) != n)) {
+        r--;
+    }
+    return r;
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP(@type@) {
+            io1 = floor_div_@TYPE@(io1, *(@type@*)ip2);
+        }
+        *((@type@ *)iop1) = io1;
+    }
+#if NPY_SIMD && defined(TO_SIMD_SFX)
+    // for contiguous block of memory, divisor is a scalar and not 0
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
+             (*(@type@ *)args[1]) != 0) {
+        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+    }
+#endif
+    else {
+        BINARY_LOOP {
+            *((@type@ *)op1) = floor_div_@TYPE@(*(@type@*)ip1, *(@type@*)ip2);
+        }
+    }
+}
+/**end repeat**/
+
 /**begin repeat
  * Unsigned types
  * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
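The signed SIMD kernel above is a lane-wise transcription of the Figure 6.1 identity quoted in its header comment. A scalar rendering of the same identity (floor_div_fig61 is an illustrative name; the formula is valid whenever n/d itself cannot overflow, and the kernel handles the d == -1 case separately):

#include <stdint.h>

/* Floor division recovered from truncating division, per Granlund &
 * Montgomery, Figure 6.1. */
static int32_t floor_div_fig61(int32_t n, int32_t d)
{
    int32_t ndsign = (int32_t)((uint32_t)d >> 31); /* -dsign: 1 iff d < 0 */
    int32_t nnsign = n < ndsign;                   /* -nsign              */
    int32_t nqsign = nnsign ^ ndsign;              /* -qsign              */
    /* q = TRUNC((n - (-dsign) + (-nsign)) / d) - (-qsign) */
    return (n - ndsign + nnsign) / d - nqsign;
}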
