8000 ENH, SIMD: Dispatch for unsigned floor division by ganesh-k13 · Pull Request #18075 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH, SIMD: Dispatch for unsigned floor division #18075

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Apr 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -215,5 +215,6 @@ numpy/core/src/_simd/_simd_inc.h
# umath module
numpy/core/src/umath/loops_unary_fp.dispatch.c
numpy/core/src/umath/loops_arithm_fp.dispatch.c
numpy/core/src/umath/loops_arithmetic.dispatch.c
numpy/core/src/umath/loops_trigonometric.dispatch.c
numpy/core/src/umath/loops_exponent_log.dispatch.c
15 changes: 8 additions & 7 deletions benchmarks/benchmarks/bench_ufunc.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,18 +135,19 @@ def time_less_than_scalar2(self, dtype):


class CustomScalarFloorDivideInt(Benchmark):
params = ([np.int8, np.int16, np.int32, np.int64], [8, -8, 43, -43, 0])
params = (np.sctypes['int'] + np.sctypes['uint'], [8, -8, 43, -43])
param_names = ['dtype', 'divisors']
max_value = 10**7
min_value = -10**7

def setup(self, dtype, divisor):
if dtype in np.sctypes['uint'] and divisor < 0:
raise NotImplementedError(
"Skipping test for negative divisor with unsigned type")

iinfo = np.iinfo(dtype)
self.x = np.arange(
max(iinfo.min, self.min_value),
min(iinfo.max, self.max_value), dtype=dtype)
self.x = np.random.randint(
iinfo.min, iinfo.max, size=10000, dtype=dtype)

def time_floor_divide_int(self, dtpye, divisor):
def time_floor_divide_int(self, dtype, divisor):
self.x // divisor


Expand Down
6 changes: 5 additions & 1 deletion numpy/core/code_generators/generate_umath.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,8 @@ def english_upper(s):
O = 'O'
P = 'P'
ints = 'bBhHiIlLqQ'
sints = 'bhilq'
uints = 'BHILQ'
times = 'Mm'
timedeltaonly = 'm'
intsO = ints + O
Expand Down Expand Up @@ -325,7 +327,9 @@ def english_upper(s):
Ufunc(2, 1, None, # One is only a unit to the right, not the left
docstrings.get('numpy.core.umath.floor_divide'),
'PyUFunc_DivisionTypeResolver',
TD(intfltcmplx),
TD(uints, cfunc_alias='divide',
dispatch=[('loops_arithmetic', 'BHILQ')]),
TD(sints + flts + cmplx),
[TypeDescription('m', FullTypeDescr, 'mq', 'm'),
TypeDescription('m', FullTypeDescr, 'md', 'm'),
TypeDescription('m', FullTypeDescr, 'mm', 'q'),
Expand Down
1 change: 1 addition & 0 deletions numpy/core/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -931,6 +931,7 @@ def generate_umath_c(ext, build_dir):
join('src', 'umath', 'loops.c.src'),
join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),
join('src', 'umath', 'matmul.h.src'),
Expand Down
16 changes: 0 additions & 16 deletions numpy/core/src/umath/loops.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -1014,22 +1014,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0);
}

NPY_NO_EXPORT void
@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
BINARY_LOOP {
const @type@ in1 = *(@type@ *)ip1;
const @type@ in2 = *(@type@ *)ip2;
if (in2 == 0) {
npy_set_floatstatus_divbyzero();
*((@type@ *)op1) = 0;
}
else {
*((@type@ *)op1)= in1/in2;
}
}
}

NPY_NO_EXPORT void
@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
Expand Down
13 changes: 12 additions & 1 deletion numpy/core/src/umath/loops.h.src
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,17 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
*****************************************************************************
*/

#ifndef NPY_DISABLE_OPTIMIZATION
#include "loops_arithmetic.dispatch.h"
#endif

/**begin repeat
* #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
*/
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
/**end repeat**/

/**begin repeat
* #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
*/
Expand Down Expand Up @@ -141,7 +152,7 @@ NPY_NO_EXPORT void
@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));

NPY_NO_EXPORT void
@S@@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));

NPY_NO_EXPORT void
@S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
Expand Down
118 changes: 118 additions & 0 deletions numpy/core/src/umath/loops_arithmetic.dispatch.c.src
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/*@targets
** $maxopt baseline
** sse2 sse41 avx2 avx512f avx512_skx
** vsx2
** neon
**/
#define _UMATHMODULE
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION

#include "simd/simd.h"
#include "loops_utils.h"
#include "loops.h"
#include "lowlevel_strided_loops.h"
// Provides the various *_LOOP macros
#include "fast_loop_macros.h"

//###############################################################################
//## Division
//###############################################################################
/********************************************************************************
** Defining the SIMD kernels
********************************************************************************/
#if NPY_SIMD
/**begin repeat
* #sfx = u8, u16, u32, u64#
*/
static NPY_INLINE void
simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
{
npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0];
npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1];
npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2];
const int vstep = npyv_nlanes_@sfx@;
const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);

for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
npyv_@sfx@ a = npyv_load_@sfx@(src);
npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor);
npyv_store_@sfx@(dst, c);
}

for (; len > 0; --len, ++src, ++dst) {
const npyv_lanetype_@sfx@ a = *src;
*dst = a / scalar;
}

npyv_cleanup();
}
/**end repeat**/
#endif

/********************************************************************************
** Defining ufunc inner functions
********************************************************************************/

/**begin repeat
* Unsigned types
* #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
* #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
* #STYPE = BYTE, SHORT, INT, LONG, LONGLONG#
*/
#undef TO_SIMD_SFX
#if 0
/**begin repeat1
* #len = 8, 16, 32, 64#
*/
#elif NPY_BITSOF_@STYPE@ == @len@
#define TO_SIMD_SFX(X) X##_u@len@
/**end repeat1**/
#endif
/*
* For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV fall-backs to the scalar division
* because emulating multiply-high on these architectures is going to be expensive comparing
* to the native scalar dividers.
* Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles.
* Power10(VSX4) is an exception here since it has native support for integer vector division,
* note neither infrastructure nor NPYV has supported VSX4 yet.
*/
#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
#undef TO_SIMD_SFX
#endif
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
if (IS_BINARY_REDUCE) {
BINARY_REDUCE_LOOP(@type@) {
const @type@ d = *(@type@ *)ip2;
if (NPY_UNLIKELY(d == 0)) {
npy_set_floatstatus_divbyzero();
io1 = 0;
} else {
io1 /= d;
}
}
*((@type@ *)iop1) = io1;
}
#if NPY_SIMD && defined(TO_SIMD_SFX)
// for contiguous block of memory, divisor is a scalar and not 0
else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
(*(@type@ *)args[1]) != 0) {
TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
}
#endif
else {
BINARY_LOOP {
const @type@ in1 = *(@type@ *)ip1;
const @type@ in2 = *(@type@ *)ip2;
if (NPY_UNLIKELY(in2 == 0)) {
npy_set_floatstatus_divbyzero();
*((@type@ *)op1) = 0;
} else{
*((@type@ *)op1) = in1 / in2;
}
}
}
}
/**end repeat**/
17 changes: 13 additions & 4 deletions numpy/core/tests/test_umath.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,13 +250,22 @@ def test_division_int(self):
assert_equal(x % 100, [5, 10, 90, 0, 95, 90, 10, 0, 80])

@pytest.mark.parametrize("input_dtype",
[np.int8, np.int16, np.int32, np.int64])
np.sctypes['int'] + np.sctypes['uint'])
def test_division_int_boundary(self, input_dtype):
iinfo = np.iinfo(input_dtype)

# Unsigned:
# Create list with 0, 25th, 50th, 75th percentile and max
if iinfo.min == 0:
lst = [0, iinfo.max//4, iinfo.max//2,
int(iinfo.max/1.33), iinfo.max]
divisors = [iinfo.max//4, iinfo.max//2,
int(iinfo.max/1.33), iinfo.max]
# Signed:
# Create list with min, 25th percentile, 0, 75th percentile, max
lst = [iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max]
divisors = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max]
else:
lst = [iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max]
divisors = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max]
a = np.array(lst, dtype=input_dtype)

for divisor in divisors:
Expand Down Expand Up @@ -926,7 +935,7 @@ def test_log_values(self):
assert_raises(FloatingPointError, np.log, np.float32(-np.inf))
assert_raises(FloatingPointError, np.log, np.float32(-1.0))

# See https://github.com/numpy/numpy/issues/18005
# See https://github.com/numpy/numpy/issues/18005
with assert_no_warnings():
a = np.array(1e9, dtype='float32')
np.log(a)
Expand Down
0