//###############################################################################
/********************************************************************************
** Defining the SIMD kernels
+ *
+ * Floor division of signed integers is based on T. Granlund and P. L. Montgomery,
+ * "Division by invariant integers using multiplication" (see Figure 6.1,
+ * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556).
+ * For details on TRUNC division, see simd/intdiv.h.
+ ***********************************************************************************
+ ** Figure 6.1: Signed division by run-time invariant divisor, rounded towards -INF
+ ***********************************************************************************
+ * For q = FLOOR(n/d), all sword:
+ *     sword -dsign = SRL(d, N - 1);
+ *     uword -nsign = (n < -dsign);
+ *     uword -qsign = EOR(-nsign, -dsign);
+ *     q = TRUNC((n - (-dsign) + (-nsign))/d) - (-qsign);
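+ *
+ * For example, with n = -7 and d = 3: -dsign = 0, -nsign = 1, -qsign = 1,
+ * so q = TRUNC((-7 - 0 + 1)/3) - 1 = -2 - 1 = -3, which equals FLOOR(-7/3).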
********************************************************************************/
+
#if NPY_SIMD
/**begin repeat
- * #sfx = u8, u16, u32, u64#
+ * Signed types
+ * #sfx = s8, s16, s32, s64#
+ * #len = 8, 16, 32, 64#
+ */
+static NPY_INLINE void
+simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
+{
+    npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0];
+    npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1];
+    npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2];
+    const int vstep = npyv_nlanes_@sfx@;
+    const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);
+
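+    // Division by -1 is a negation; NPY_MIN_INT@len@ cannot be negated, so
+    // overflowing lanes are flagged, stored as 0, and reported via divbyzero.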
+    if (scalar == -1) {
+        npyv_b@len@ noverflow = npyv_cvt_b@len@_@sfx@(npyv_setall_@sfx@(-1));
+        npyv_@sfx@ vzero = npyv_zero_@sfx@();
+        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+            npyv_@sfx@ a = npyv_load_@sfx@(src);
+            npyv_b@len@ gt_min = npyv_cmpgt_@sfx@(a, npyv_setall_@sfx@(NPY_MIN_INT@len@));
+            noverflow = npyv_and_b@len@(noverflow, gt_min);
+            npyv_@sfx@ neg = npyv_ifsub_@sfx@(gt_min, vzero, a, vzero);
+            npyv_store_@sfx@(dst, neg);
+        }
+
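+        // set raise_err if any vector lane overflowed (held NPY_MIN_INT@len@)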
+        int raise_err = npyv_tobits_b@len@(npyv_not_b@len@(noverflow)) != 0;
+        for (; len > 0; --len, ++src, ++dst) {
+            npyv_lanetype_@sfx@ a = *src;
+            if (a == NPY_MIN_INT@len@) {
+                raise_err = 1;
+                *dst = 0;
+            } else {
+                *dst = -a;
+            }
+        }
+        if (raise_err) {
+            npy_set_floatstatus_divbyzero();
+        }
+    } else {
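+        // Figure 6.1 above: nsign_d = -dsign, nsign_a = -nsign, to_ninf = -qsign,
+        // so floor = TRUNC((a - (-dsign) + (-nsign)) / d) - (-qsign)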
+        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+            npyv_@sfx@ nsign_d = npyv_setall_@sfx@(scalar < 0);
+            npyv_@sfx@ a = npyv_load_@sfx@(src);
+            npyv_@sfx@ nsign_a = npyv_cvt_@sfx@_b@len@(npyv_cmplt_@sfx@(a, nsign_d));
+            nsign_a = npyv_and_@sfx@(nsign_a, npyv_setall_@sfx@(1));
+            npyv_@sfx@ diff_sign = npyv_sub_@sfx@(nsign_a, nsign_d);
+            npyv_@sfx@ to_ninf = npyv_xor_@sfx@(nsign_a, nsign_d);
+            npyv_@sfx@ trunc = npyv_divc_@sfx@(npyv_add_@sfx@(a, diff_sign), divisor);
+            npyv_@sfx@ floor = npyv_sub_@sfx@(trunc, to_ninf);
+            npyv_store_@sfx@(dst, floor);
+        }
+
+        for (; len > 0; --len, ++src, ++dst) {
+            const npyv_lanetype_@sfx@ a = *src;
+            npyv_lanetype_@sfx@ r = a / scalar;
+            // Negative quotients need to be rounded down
+            if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) {
+                r--;
+            }
+            *dst = r;
+        }
+    }
+    npyv_cleanup();
+}
+/**end repeat**/
+
+/**begin repeat
+ * Unsigned types
+ * #sfx = u8, u16, u32, u64#
+ * #len = 8, 16, 32, 64#
 */
static NPY_INLINE void
simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
@@ -44,7 +125,6 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
        const npyv_lanetype_@sfx@ a = *src;
        *dst = a / scalar;
    }
-
    npyv_cleanup();
}
/**end repeat**/
@@ -54,6 +134,70 @@ simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
** Defining ufunc inner functions
********************************************************************************/

+/**begin repeat
+ * Signed types
+ * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong#
+ * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
+ */
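+// Pick the fixed-width SIMD suffix (s8/s16/s32/s64) whose lane width matches
+// NPY_BITSOF_@TYPE@; TO_SIMD_SFX stays undefined when there is no match.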
+#undef TO_SIMD_SFX
+#if 0
+/**begin repeat1
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_BITSOF_@TYPE@ == @len@
+    #define TO_SIMD_SFX(X) X##_s@len@
+/**end repeat1**/
+#endif
+
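+// On NEON and on VSX targets without VSX4, skip the SIMD path for 64-bit lanes
+// and let the scalar floor_div loop below handle the division.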
+#if NPY_BITSOF_@TYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+    #undef TO_SIMD_SFX
+#endif
+
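+// Scalar floor division used by the reduce and non-contiguous paths below;
+// d == 0 and the NPY_MIN_@TYPE@ / -1 overflow raise divbyzero and return 0.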
+NPY_FINLINE @type@ floor_div_@TYPE@(const @type@ n, const @type@ d)
+{
+    /*
+     * FIXME: On x86 at least, dividing the smallest representable integer
+     * by -1 causes a SIGFPE (division overflow). We treat this case here
+     * (to avoid a SIGFPE crash at python level), but a good solution would
+     * be to treat integer division problems separately from FPU exceptions
+     * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+     */
+    if (NPY_UNLIKELY(d == 0 || (n == NPY_MIN_@TYPE@ && d == -1))) {
+        npy_set_floatstatus_divbyzero();
+        return 0;
+    }
+    @type@ r = n / d;
+    // Negative quotients need to be rounded down
+    if (((n > 0) != (d > 0)) && ((r * d) != n)) {
+        r--;
+    }
+    return r;
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
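+    // reduction (e.g. np.floor_divide.reduce): fold the inputs with scalar floor division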
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP(@type@) {
+            io1 = floor_div_@TYPE@(io1, *(@type@*)ip2);
+        }
+        *((@type@ *)iop1) = io1;
+    }
+#if NPY_SIMD && defined(TO_SIMD_SFX)
+    // for contiguous blocks of memory, when the divisor is a non-zero scalar
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
+             (*(@type@ *)args[1]) != 0) {
+        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+    }
+#endif
+    else {
+        BINARY_LOOP {
+            *((@type@ *)op1) = floor_div_@TYPE@(*(@type@*)ip1, *(@type@*)ip2);
+        }
+    }
+}
+/**end repeat**/
+
/**begin repeat
* Unsigned types
* #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#