Merge pull request #8852 from juliantaylor/gcc-inplace-fix · numpy/numpy@4497d47 · GitHub
[go: up one dir, main page]

Skip to content

Commit 4497d47

Browse files
authored
Merge pull request #8852 from juliantaylor/gcc-inplace-fix
MAINT: restore auto-vectorization of inplace operations
2 parents 1e8143c + 1964b6a commit 4497d47

File tree

1 file changed

+35
-4
lines changed

1 file changed

+35
-4
lines changed

numpy/core/src/umath/loops.c.src

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -138,38 +138,69 @@
138138
tout * out = (tout *)op1; \
139139
op; \
140140
}
141+
/*
142+
* IVDEP_LOOP: loop prefix placed directly before the for-loop in
* BASE_BINARY_LOOP_INP.
* Unfortunately gcc 6/7 regressed and needs additional hints to
143+
* vectorize in-place operations (GCC PR80198).
144+
* Must only be used after op1 == ip1 or op1 == ip2 has been checked.
* When __GNUC__ >= 6 it expands to _Pragma("GCC ivdep"); otherwise it
* expands to nothing.
145+
* TODO: using ivdep might allow other compilers to vectorize too
146+
*/
147+
#if __GNUC__ >= 6
148+
#define IVDEP_LOOP _Pragma("GCC ivdep")
149+
#else
150+
#define IVDEP_LOOP
151+
#endif
152+
/*
 * In-place variant of the binary loop, used by BINARY_LOOP_FAST when
 * args[2] equals args[0] or args[1].
 *
 * Declares ip1/ip2/op1 from args[0..2], the strides is1/is2/os1 from
 * steps[0..2] and n from dimensions[0], so `args`, `steps` and `dimensions`
 * must be in scope at the expansion site. Each of the n iterations loads
 * in1 and in2 as `tin`, points `out` (a `tout *`) at op1, and evaluates `op`.
 * The loop is prefixed with IVDEP_LOOP; the caller is responsible for having
 * checked the aliasing condition first.
 */
#define BASE_BINARY_LOOP_INP(tin, tout, op) \
153+
char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
154+
npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
155+
npy_intp n = dimensions[0];\
156+
npy_intp i;\
157+
IVDEP_LOOP \
158+
for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) { \
159+
const tin in1 = *(tin *)ip1; \
160+
const tin in2 = *(tin *)ip2; \
161+
tout * out = (tout *)op1; \
162+
op; \
163+
}
141164
/*
 * Binary loop where one operand is constant across iterations.
 *
 * cin/cinp: name and address of the constant operand; it is read once,
 *           as `tin`, before the loop.
 * vin/vinp: name and address of the varying operand; it is re-read, as
 *           `tin`, on every BINARY_LOOP iteration.
 * `out` is a `tout *` pointing at op1, after which `op` is evaluated.
 * NOTE(review): op1 is not declared here; presumably BINARY_LOOP provides
 * it and advances the pointers -- confirm against its definition.
 */
#define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \
142165
const tin cin = *(tin *)cinp; \
143166
BINARY_LOOP { \
144167
const tin vin = *(tin *)vinp; \
145168
tout * out = (tout *)op1; \
146169
op; \
147170
}
171+
/*
 * PR80198 again, scalar works without the pragma.
 *
 * In-place variant of BASE_BINARY_LOOP_S: same parameters and same loop,
 * except that `out` is taken from vinp (the varying input) instead of op1.
 * BINARY_LOOP_FAST selects it when args[2] is the same pointer as the
 * varying input's args[] entry.
 */
172+
#define BASE_BINARY_LOOP_S_INP(tin, tout, cin, cinp, vin, vinp, op) \
173+
const tin cin = *(tin *)cinp; \
174+
BINARY_LOOP { \
175+
const tin vin = *(tin *)vinp; \
176+
tout * out = (tout *)vinp; \
177+
op; \
178+
}
148179
#define BINARY_LOOP_FAST(tin, tout, op) \
149180
do { \
150181
/* condition allows compiler to optimize the generic macro */ \
151182
if (IS_BINARY_CONT(tin, tout)) { \
152183
if (args[2] == args[0]) { \
153-
BASE_BINARY_LOOP(tin, tout, op) \
184+
BASE_BINARY_LOOP_INP(tin, tout, op) \
154185
} \
155186
else if (args[2] == args[1]) { \
156-
BASE_BINARY_LOOP(tin, tout, op) \
187+
BASE_BINARY_LOOP_INP(tin, tout, op) \
157188
} \
158189
else { \
159190
BASE_BINARY_LOOP(tin, tout, op) \
160191
} \
161192
} \
162193
else if (IS_BINARY_CONT_S1(tin, tout)) { \
163194
if (args[1] == args[2]) { \
164-
BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
195+
BASE_BINARY_LOOP_S_INP(tin, tout, in1, args[0], in2, ip2, op) \
165196
} \
166197
else { \
167198
BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
168199
} \
169200
} \
170201
else if (IS_BINARY_CONT_S2(tin, tout)) { \
171202
if (args[0] == args[2]) { \
172-
BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
203+
BASE_BINARY_LOOP_S_INP(tin, tout, in2, args[1], in1, ip1, op) \
173204
} \
174205
else { \
175206
BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \

0 commit comments

Comments
 (0)
0