Undo the remaining changes from "SIMD: Force inlining all functions t… · numpy/numpy@bbd571a · GitHub
[go: up one dir, main page]

Skip to content

Commit bbd571a

Browse files
committed
Undo the remaining changes from "SIMD: Force inlining all functions that accept AVX registers"
These changes are not present in `main`. I see no commits likely to have specifically changed whether these SIMD functions are inlined. Adding these back to `main` is left for another PR. The symptoms I saw were segfaults, basically because function calls do not preserve alignment information.
1 parent 5b8cb3a commit bbd571a

File tree

3 files changed

+97
-97
lines changed

3 files changed

+97
-97
lines changed

numpy/core/src/umath/loops_arithm_fp.dispatch.c.src

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -565,36 +565,36 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
565565
#endif
566566

567567
#ifdef AVX512F_NOMSVC
568-
NPY_FINLINE __mmask16
568+
static NPY_INLINE __mmask16
569569
avx512_get_full_load_mask_ps(void)
570570
{
571571
return 0xFFFF;
572572
}
573573

574-
NPY_FINLINE __mmask8
574+
static NPY_INLINE __mmask8
575575
avx512_get_full_load_mask_pd(void)
576576
{
577577
return 0xFF;
578578
}
579-
NPY_FINLINE __m512
579+
static NPY_INLINE __m512
580580
avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
581581
{
582582
return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
583583
}
584584

585-
NPY_FINLINE __m512d
585+
static NPY_INLINE __m512d
586586
avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
587587
{
588588
return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
589589
}
590590

591-
NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
591+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
592592
avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
593593
{
594594
return (0x0001 << num_elem) - 0x0001;
595595
}
596596

597-
NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
597+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8
598598
avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
599599
{
600600
return (0x01 << num_elem) - 0x01;
@@ -613,18 +613,18 @@ avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem
613613
* #INF = NPY_INFINITYF, NPY_INFINITY#
614614
* #NAN = NPY_NANF, NPY_NAN#
615615
*/
616-
NPY_FINLINE @vtype@
616+
static @vtype@
617617
avx512_hadd_@vsub@(const @vtype@ x)
618618
{
619619
return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
620620
}
621621

622-
NPY_FINLINE @vtype@
622+
static @vtype@
623623
avx512_hsub_@vsub@(const @vtype@ x)
624624
{
625625
return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@));
626626
}
627-
NPY_FINLINE @vtype@
627+
static NPY_INLINE @vtype@
628628
avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2)
629629
{
630630
// x1 = r1, i1

numpy/core/src/umath/loops_exponent_log.dispatch.c.src

Lines changed: 35 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -45,19 +45,19 @@
4545

4646
#ifdef SIMD_AVX2_FMA3
4747

48-
NPY_FINLINE __m256
48+
static NPY_INLINE __m256
4949
fma_get_full_load_mask_ps(void)
5050
{
5151
return _mm256_set1_ps(-1.0);
5252
}
5353

54-
NPY_FINLINE __m256i
54+
static NPY_INLINE __m256i
5555
fma_get_full_load_mask_pd(void)
5656
{
5757
return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
5858
}
5959

60-
NPY_FINLINE __m256
60+
static NPY_INLINE __m256
6161
fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
6262
{
6363
float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
@@ -66,15 +66,15 @@ fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
6666
return _mm256_loadu_ps(addr);
6767
}
6868

69-
NPY_FINLINE __m256i
69+
static NPY_INLINE __m256i
7070
fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
7171
{
7272
npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
7373
npy_int* addr = maskint + 2*num_lanes - 2*num_elem;
7474
return _mm256_loadu_si256((__m256i*) addr);
7575
}
7676

77-
NPY_FINLINE __m256
77+
static NPY_INLINE __m256
7878
fma_masked_gather_ps(__m256 src,
7979
npy_float* addr,
8080
__m256i vindex,
@@ -83,7 +83,7 @@ fma_masked_gather_ps(__m256 src,
8383
return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
8484
}
8585

86-
NPY_FINLINE __m256d
86+
static NPY_INLINE __m256d
8787
fma_masked_gather_pd(__m256d src,
8888
npy_double* addr,
8989
__m128i vindex,
@@ -92,49 +92,49 @@ fma_masked_gather_pd(__m256d src,
9292
return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
9393
}
9494

95-
NPY_FINLINE __m256
95+
static NPY_INLINE __m256
9696
fma_masked_load_ps(__m256 mask, npy_float* addr)
9797
{
9898
return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
9999
}
100100

101-
NPY_FINLINE __m256d
101+
static NPY_INLINE __m256d
102102
fma_masked_load_pd(__m256i mask, npy_double* addr)
103103
{
104104
return _mm256_maskload_pd(addr, mask);
105105
}
106106

107-
NPY_FINLINE __m256
107+
static NPY_INLINE __m256
108108
fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
109109
{
110110
return _mm256_blendv_ps(x, val, mask);
111111
}
112112

113-
NPY_FINLINE __m256d
113+
static NPY_INLINE __m256d
114114
fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
115115
{
116116
return _mm256_blendv_pd(x, val, mask);
117117
}
118118

119-
NPY_FINLINE __m256
119+
static NPY_INLINE __m256
120120
fma_blend(__m256 x, __m256 y, __m256 ymask)
121121
{
122122
return _mm256_blendv_ps(x, y, ymask);
123123
}
124124

125-
NPY_FINLINE __m256
125+
static NPY_INLINE __m256
126126
fma_invert_mask_ps(__m256 ymask)
127127
{
128128
return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
129129
}
130130

131-
NPY_FINLINE __m256i
131+
static NPY_INLINE __m256i
132132
fma_invert_mask_pd(__m256i ymask)
133133
{
134134
return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
135135
}
136136

137-
NPY_FINLINE __m256
137+
static NPY_INLINE __m256
138138
fma_get_exponent(__m256 x)
139139
{
140140
/*
@@ -165,7 +165,7 @@ fma_get_exponent(__m256 x)
165165
return _mm256_blendv_ps(exp, denorm_exp, denormal_mask);
166166
}
167167

168-
NPY_FINLINE __m256
168+
static NPY_INLINE __m256
169169
fma_get_mantissa(__m256 x)
170170
{
171171
/*
@@ -195,7 +195,7 @@ fma_get_mantissa(__m256 x)
195195
_mm256_castps_si256(x), mantissa_bits), exp_126_bits));
196196
}
197197

198-
NPY_FINLINE __m256
198+
static NPY_INLINE __m256
199199
fma_scalef_ps(__m256 poly, __m256 quadrant)
200200
{
201201
/*
@@ -238,31 +238,31 @@ fma_scalef_ps(__m256 poly, __m256 quadrant)
238238

239239
#ifdef SIMD_AVX512F
240240

241-
NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
241+
static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
242242
avx512_get_full_load_mask_ps(void)
243243
{
244244
return 0xFFFF;
245245
}
246246

247-
NPY_FINLINE __mmask8
247+
static NPY_INLINE __mmask8
248248
avx512_get_full_load_mask_pd(void)
249249
{
250250
return 0xFF;
251251
}
252252

253-
NPY_FINLINE __mmask16
253+
static NPY_INLINE __mmask16
254254
avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
255255
{
256256
return (0x0001 << num_elem) - 0x0001;
257257
}
258258

259-
NPY_FINLINE __mmask8
259+
static NPY_INLINE __mmask8
260260
avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
261261
{
262262
return (0x01 << num_elem) - 0x01;
263263
}
264264

265-
NPY_FINLINE __m512
265+
static NPY_INLINE __m512
266266
avx512_masked_gather_ps(__m512 src,
267267
npy_float* addr,
268268
__m512i vindex,
@@ -271,7 +271,7 @@ avx512_masked_gather_ps(__m512 src,
271271
return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
272272
}
273273

274-
NPY_FINLINE __m512d
274+
static NPY_INLINE __m512d
275275
avx512_masked_gather_pd(__m512d src,
276276
npy_double* addr,
277277
__m256i vindex,
@@ -280,67 +280,67 @@ avx512_masked_gather_pd(__m512d src,
280280
return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
281281
}
282282

283-
NPY_FINLINE __m512
283+
static NPY_INLINE __m512
284284
avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
285285
{
286286
return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
287287
}
288288

289-
NPY_FINLINE __m512d
289+
static NPY_INLINE __m512d
290290
avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
291291
{
292292
return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
293293
}
294294

295-
NPY_FINLINE __m512
295+
static NPY_INLINE __m512
296296
avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
297297
{
298298
return _mm512_mask_blend_ps(mask, x, val);
299299
}
300300

301-
NPY_FINLINE __m512d
301+
static NPY_INLINE __m512d
302302
avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
303303
{
304304
return _mm512_mask_blend_pd(mask, x, val);
305305
}
306306

307-
NPY_FINLINE __m512
307+
static NPY_INLINE __m512
308308
avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
309309
{
310310
return _mm512_mask_mov_ps(x, ymask, y);
311311
}
312312

313-
NPY_FINLINE __mmask16
313+
static NPY_INLINE __mmask16
314314
avx512_invert_mask_ps(__mmask16 ymask)
315315
{
316316
return _mm512_knot(ymask);
317317
}
318318

319-
NPY_FINLINE __mmask8
319+
static NPY_INLINE __mmask8
320320
avx512_invert_mask_pd(__mmask8 ymask)
321321
{
322322
return _mm512_knot(ymask);
323323
}
324324

325-
NPY_FINLINE __m512
325+
static NPY_INLINE __m512
326326
avx512_get_exponent(__m512 x)
327327
{
328328
return _mm512_add_ps(_mm512_getexp_ps(x), _mm512_set1_ps(1.0f));
329329
}
330330

331-
NPY_FINLINE __m512
331+
static NPY_INLINE __m512
332332
avx512_get_mantissa(__m512 x)
333333
{
334334
return _mm512_getmant_ps(x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
335335
}
336336

337-
NPY_FINLINE __m512
337+
static NPY_INLINE __m512
338338
avx512_scalef_ps(__m512 poly, __m512 quadrant)
339339
{
340340
return _mm512_scalef_ps(poly, quadrant);
341341
}
342342

343-
NPY_FINLINE __m512d
343+
static NPY_INLINE __m512d
344344
avx512_permute_x4var_pd(__m512d t0,
345345
__m512d t1,
346346
__m512d t2,
@@ -355,7 +355,7 @@ avx512_permute_x4var_pd(__m512d t0,
355355
return _mm512_mask_blend_pd(lut_mask, res1, res2);
356356
}
357357

358-
NPY_FINLINE __m512d
358+
static NPY_INLINE __m512d
359359
avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
360360
__m512d t4, __m512d t5, __m512d t6, __m512d t7,
361361
__m512i index)
@@ -401,7 +401,7 @@ avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
401401
* 3) x* = x - y*c3
402402
* c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
403403
*/
404-
NPY_FINLINE @vtype@
404+
static NPY_INLINE @vtype@
405405
simd_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3)
406406
{
407407
@vtype@ reduced_x = @fmadd@(y, c1, x);

0 commit comments

Comments (0)