diff --git a/.gitignore b/.gitignore index 05df19335be2..736597b6b7af 100644 --- a/.gitignore +++ b/.gitignore @@ -215,5 +215,6 @@ numpy/core/src/_simd/_simd_inc.h # umath module numpy/core/src/umath/loops_unary_fp.dispatch.c numpy/core/src/umath/loops_arithm_fp.dispatch.c +numpy/core/src/umath/loops_arithmetic.dispatch.c numpy/core/src/umath/loops_trigonometric.dispatch.c numpy/core/src/umath/loops_exponent_log.dispatch.c diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py index 13b7382a1708..b036581e1aae 100644 --- a/benchmarks/benchmarks/bench_ufunc.py +++ b/benchmarks/benchmarks/bench_ufunc.py @@ -135,18 +135,19 @@ def time_less_than_scalar2(self, dtype): class CustomScalarFloorDivideInt(Benchmark): - params = ([np.int8, np.int16, np.int32, np.int64], [8, -8, 43, -43, 0]) + params = (np.sctypes['int'] + np.sctypes['uint'], [8, -8, 43, -43]) param_names = ['dtype', 'divisors'] - max_value = 10**7 - min_value = -10**7 def setup(self, dtype, divisor): + if dtype in np.sctypes['uint'] and divisor < 0: + raise NotImplementedError( + "Skipping test for negative divisor with unsigned type") + iinfo = np.iinfo(dtype) - self.x = np.arange( - max(iinfo.min, self.min_value), - min(iinfo.max, self.max_value), dtype=dtype) + self.x = np.random.randint( + iinfo.min, iinfo.max, size=10000, dtype=dtype) - def time_floor_divide_int(self, dtpye, divisor): + def time_floor_divide_int(self, dtype, divisor): self.x // divisor diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index b5305fbfce98..57c811ff3306 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -245,6 +245,8 @@ def english_upper(s): O = 'O' P = 'P' ints = 'bBhHiIlLqQ' +sints = 'bhilq' +uints = 'BHILQ' times = 'Mm' timedeltaonly = 'm' intsO = ints + O @@ -325,7 +327,9 @@ def english_upper(s): Ufunc(2, 1, None, # One is only a unit to the right, not the left docstrings.get('numpy.core.umath.floor_divide'), 'PyUFunc_DivisionTypeResolver', - TD(intfltcmplx), + TD(uints, cfunc_alias='divide', + dispatch=[('loops_arithmetic', 'BHILQ')]), + TD(sints + flts + cmplx), [TypeDescription('m', FullTypeDescr, 'mq', 'm'), TypeDescription('m', FullTypeDescr, 'md', 'm'), TypeDescription('m', FullTypeDescr, 'mm', 'q'), diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 8c34a3286d72..df405bcaf487 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -931,6 +931,7 @@ def generate_umath_c(ext, build_dir): join('src', 'umath', 'loops.c.src'), join('src', 'umath', 'loops_unary_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), + join('src', 'umath', 'loops_arithmetic.dispatch.c.src'), join('src', 'umath', 'loops_trigonometric.dispatch.c.src'), join('src', 'umath', 'loops_exponent_log.dispatch.c.src'), join('src', 'umath', 'matmul.h.src'), diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 68e209fe9312..04665dc5296e 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1014,22 +1014,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0); } -NPY_NO_EXPORT void -@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - if (in2 == 0) { - npy_set_floatstatus_divbyzero(); - *((@type@ *)op1) = 0; - } - else { - *((@type@ *)op1)= in1/in2; - } - } -} - NPY_NO_EXPORT void @TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index b3a19be12d62..0301aa5ed7b8 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -53,6 +53,17 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void ***************************************************************************** */ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithmetic.dispatch.h" +#endif + +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat**/ + /**begin repeat * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# */ @@ -141,7 +152,7 @@ NPY_NO_EXPORT void @S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void -@S@@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void @S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src new file mode 100644 index 000000000000..7e9f464636c5 --- /dev/null +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -0,0 +1,118 @@ +/*@targets + ** $maxopt baseline + ** sse2 sse41 avx2 avx512f avx512_skx + ** vsx2 + ** neon + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +//############################################################################### +//## Division +//############################################################################### +/******************************************************************************** + ** Defining the SIMD kernels + ********************************************************************************/ +#if NPY_SIMD +/**begin repeat + * #sfx = u8, u16, u32, u64# + */ +static NPY_INLINE void +simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len) +{ + npyv_lanetype_@sfx@ *src = (npyv_lanetype_@sfx@ *) args[0]; + npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1]; + npyv_lanetype_@sfx@ *dst = (npyv_lanetype_@sfx@ *) args[2]; + const int vstep = npyv_nlanes_@sfx@; + const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar); + + for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { + npyv_@sfx@ a = npyv_load_@sfx@(src); + npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor); + npyv_store_@sfx@(dst, c); + } + + for (; len > 0; --len, ++src, ++dst) { + const npyv_lanetype_@sfx@ a = *src; + *dst = a / scalar; + } + + npyv_cleanup(); +} +/**end repeat**/ +#endif + +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ + +/**begin repeat + * Unsigned types + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# + * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG# + */ +#undef TO_SIMD_SFX +#if 0 +/**begin repeat1 + * #len = 8, 16, 32, 64# + */ +#elif NPY_BITSOF_@STYPE@ == @len@ + #define TO_SIMD_SFX(X) X##_u@len@ +/**end repeat1**/ +#endif +/* + * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV fall-backs to the scalar division + * because emulating multiply-high on these architectures is going to be expensive comparing + * to the native scalar dividers. + * Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles. + * Power10(VSX4) is an exception here since it has native support for integer vector division, + * note neither infrastructure nor NPYV has supported VSX4 yet. + */ +#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON)) + #undef TO_SIMD_SFX +#endif +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP(@type@) { + const @type@ d = *(@type@ *)ip2; + if (NPY_UNLIKELY(d == 0)) { + npy_set_floatstatus_divbyzero(); + io1 = 0; + } else { + io1 /= d; + } + } + *((@type@ *)iop1) = io1; + } +#if NPY_SIMD && defined(TO_SIMD_SFX) + // for contiguous block of memory, divisor is a scalar and not 0 + else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) && + (*(@type@ *)args[1]) != 0) { + TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]); + } +#endif + else { + BINARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + const @type@ in2 = *(@type@ *)ip2; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *((@type@ *)op1) = 0; + } else{ + *((@type@ *)op1) = in1 / in2; + } + } + } +} +/**end repeat**/ diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index 2249c866caf5..b31b84d0cc2b 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -250,13 +250,22 @@ def test_division_int(self): assert_equal(x % 100, [5, 10, 90, 0, 95, 90, 10, 0, 80]) @pytest.mark.parametrize("input_dtype", - [np.int8, np.int16, np.int32, np.int64]) + np.sctypes['int'] + np.sctypes['uint']) def test_division_int_boundary(self, input_dtype): iinfo = np.iinfo(input_dtype) + # Unsigned: + # Create list with 0, 25th, 50th, 75th percentile and max + if iinfo.min == 0: + lst = [0, iinfo.max//4, iinfo.max//2, + int(iinfo.max/1.33), iinfo.max] + divisors = [iinfo.max//4, iinfo.max//2, + int(iinfo.max/1.33), iinfo.max] + # Signed: # Create list with min, 25th percentile, 0, 75th percentile, max - lst = [iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max] - divisors = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max] + else: + lst = [iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max] + divisors = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max] a = np.array(lst, dtype=input_dtype) for divisor in divisors: @@ -926,7 +935,7 @@ def test_log_values(self): assert_raises(FloatingPointError, np.log, np.float32(-np.inf)) assert_raises(FloatingPointError, np.log, np.float32(-1.0)) - # See https://github.com/numpy/numpy/issues/18005 + # See https://github.com/numpy/numpy/issues/18005 with assert_no_warnings(): a = np.array(1e9, dtype='float32') np.log(a)