|
138 | 138 | tout * out = (tout *)op1; \
|
139 | 139 | op; \
|
140 | 140 | }
|
| 141 | +/* |
| 142 | + * unfortunately gcc 6/7 regressed and we need to give it additional hints to |
| 143 | + * vectorize inplace operations (PR80198) |
| 144 | + * must only be used after op1 == ip1 or op1 == ip2 has been checked |
| 145 | + * TODO: using ivdep might allow other compilers to vectorize too |
| 146 | + */ |
| 147 | +#if __GNUC__ >= 6 |
| 148 | +#define IVDEP_LOOP _Pragma("GCC ivdep") |
| 149 | +#else |
| 150 | +#define IVDEP_LOOP |
| 151 | +#endif |
| 152 | +#define BASE_BINARY_LOOP_INP(tin, tout, op) \ |
| 153 | + char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\ |
| 154 | + npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\ |
| 155 | + npy_intp n = dimensions[0];\ |
| 156 | + npy_intp i;\ |
| 157 | + IVDEP_LOOP \ |
| 158 | + for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) { \ |
| 159 | + const tin in1 = *(tin *)ip1; \ |
| 160 | + const tin in2 = *(tin *)ip2; \ |
| 161 | + tout * out = (tout *)op1; \ |
| 162 | + op; \ |
| 163 | + } |
141 | 164 | #define BASE_BINARY_LOOP_S(tin, tout, cin, cinp, vin, vinp, op) \
|
142 | 165 | const tin cin = *(tin *)cinp; \
|
143 | 166 | BINARY_LOOP { \
|
144 | 167 | const tin vin = *(tin *)vinp; \
|
145 | 168 | tout * out = (tout *)op1; \
|
146 | 169 | op; \
|
147 | 170 | }
|
| 171 | +/* PR80198 again; the scalar case works without the ivdep pragma */ |
| 172 | +#define BASE_BINARY_LOOP_S_INP(tin, tout, cin, cinp, vin, vinp, op) \ |
| 173 | + const tin cin = *(tin *)cinp; \ |
| 174 | + BINARY_LOOP { \ |
| 175 | + const tin vin = *(tin *)vinp; \ |
| 176 | + tout * out = (tout *)vinp; \ |
| 177 | + op; \ |
| 178 | + } |
148 | 179 | #define BINARY_LOOP_FAST(tin, tout, op) \
|
149 | 180 | do { \
|
150 | 181 | /* condition allows compiler to optimize the generic macro */ \
|
151 | 182 | if (IS_BINARY_CONT(tin, tout)) { \
|
152 | 183 | if (args[2] == args[0]) { \
|
153 |
| - BASE_BINARY_LOOP(tin, tout, op) \ |
| 184 | + BASE_BINARY_LOOP_INP(tin, tout, op) \ |
154 | 185 | } \
|
155 | 186 | else if (args[2] == args[1]) { \
|
156 |
| - BASE_BINARY_LOOP(tin, tout, op) \ |
| 187 | + BASE_BINARY_LOOP_INP(tin, tout, op) \ |
157 | 188 | } \
|
158 | 189 | else { \
|
159 | 190 | BASE_BINARY_LOOP(tin, tout, op) \
|
160 | 191 | } \
|
161 | 192 | } \
|
162 | 193 | else if (IS_BINARY_CONT_S1(tin, tout)) { \
|
163 | 194 | if (args[1] == args[2]) { \
|
164 |
| - BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \ |
| 195 | + BASE_BINARY_LOOP_S_INP(tin, tout, in1, args[0], in2, ip2, op) \ |
165 | 196 | } \
|
166 | 197 | else { \
|
167 | 198 | BASE_BINARY_LOOP_S(tin, tout, in1, args[0], in2, ip2, op) \
|
168 | 199 | } \
|
169 | 200 | } \
|
170 | 201 | else if (IS_BINARY_CONT_S2(tin, tout)) { \
|
171 | 202 | if (args[0] == args[2]) { \
|
172 |
| - BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \ |
| 203 | + BASE_BINARY_LOOP_S_INP(tin, tout, in2, args[1], in1, ip1, op) \ |
173 | 204 | } \
|
174 | 205 | else { \
|
175 | 206 | BASE_BINARY_LOOP_S(tin, tout, in2, args[1], in1, ip1, op) \
|
|
0 commit comments