Merge pull request #27115 from r-devulap/ldexp · numpy/numpy@e4a495d · GitHub
[go: up one dir, main page]

Skip to content

Commit e4a495d

Browse files
authored
Merge pull request #27115 from r-devulap/ldexp
BUG: Use the new npyv_loadable_stride_ functions for ldexp and frexp
2 parents 7533a4c + bbcedfc commit e4a495d

File tree

3 files changed

+13
-30
lines changed

3 files changed

+13
-30
lines changed

numpy/_core/src/common/simd/avx512/avx512.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
// Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_*
1212
#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16)
1313
#define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16)
14+
#define NPY_SIMD_MAXLOAD_STRIDE64 (0x7fffffff / 16)
15+
#define NPY_SIMD_MAXSTORE_STRIDE64 (0x7fffffff / 16)
1416

1517
typedef __m512i npyv_u8;
1618
typedef __m512i npyv_s8;

numpy/_core/src/umath/fast_loop_macros.h

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -323,34 +323,6 @@ abs_ptrdiff(char *a, char *b)
323323
((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
324324
((abs_ptrdiff(args[1], args[0]) == 0))))
325325

326-
/*
327-
* Avoid using SIMD for very large step sizes for several reasons:
328-
* 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions,
329-
* in which case we need two i64gather instructions and an additional vinsertf32x8
330-
* instruction to load a single zmm register (since one i64gather instruction
331-
* loads into a ymm register). This is not ideal for performance.
332-
* 2) Gather and scatter instructions can be slow when the loads/stores
333-
* cross page boundaries.
334-
*
335-
* We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
336-
* element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE
337-
* ensures this. The condition also requires that the input and output arrays
338-
* should have no overlap in memory.
339-
*/
340-
#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \
341-
((labs(steps[0]) < MAX_STEP_SIZE) && \
342-
(labs(steps[1]) < MAX_STEP_SIZE) && \
343-
(labs(steps[2]) < MAX_STEP_SIZE) && \
344-
(nomemoverlap(args[0], steps[0], args[2], steps[2], dimensions[0])) && \
345-
(nomemoverlap(args[1], steps[1], args[2], steps[2], dimensions[0])))
346-
347-
#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \
348-
((labs(steps[0]) < MAX_STEP_SIZE) && \
349-
(labs(steps[1]) < MAX_STEP_SIZE) && \
350-
(labs(steps[2]) < MAX_STEP_SIZE) && \
351-
(nomemoverlap(args[0], steps[0], args[2], steps[2], dimensions[0])) && \
352-
(nomemoverlap(args[0], steps[0], args[1], steps[1], dimensions[0])))
353-
354326
/*
355327
* 1) Output should be contiguous, can handle strided input data
356328
* 2) Input step should be smaller than MAX_STEP_SIZE for performance

numpy/_core/src/umath/loops_exponent_log.dispatch.c.src

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1350,12 +1350,17 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
13501350
* #TYPE = FLOAT, DOUBLE#
13511351
* #c = f, #
13521352
* #C = F, #
1353+
* #suffix = f32, f64#
13531354
*/
13541355
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_frexp)
13551356
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
13561357
{
13571358
#ifdef SIMD_AVX512_SKX
1358-
if (IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP) {
1359+
if ((npyv_loadable_stride_@suffix@(steps[0])) &&
1360+
(npyv_storable_stride_@suffix@(steps[1])) &&
1361+
(npyv_storable_stride_@suffix@(steps[2])) &&
1362+
(!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0])) &&
1363+
(!is_mem_overlap(args[0], steps[0], args[1], steps[1], dimensions[0]))) {
13591364
AVX512_SKX_frexp_@TYPE@(args, dimensions, steps);
13601365
return;
13611366
}
@@ -1370,7 +1375,11 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_ldexp)
13701375
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
13711376
{
13721377
#ifdef SIMD_AVX512_SKX
1373-
if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
1378+
if ((npyv_loadable_stride_@suffix@(steps[0])) &&
1379+
(npyv_storable_stride_@suffix@(steps[1])) &&
1380+
(npyv_storable_stride_@suffix@(steps[2])) &&
1381+
(!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0])) &&
1382+
(!is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
13741383
AVX512_SKX_ldexp_@TYPE@(args, dimensions, steps);
13751384
return;
13761385
}

0 commit comments

Comments
 (0)
0