ENH, TST: Bring the NumPy C SIMD vectorization interface "NPYV" to Python by seiko2plus · Pull Request #16782 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH, TST: Bring the NumPy C SIMD vectorization interface "NPYV" to Python #16782

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Oct 29, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
MAINT, TST: Several improvements to _SIMD module
 - use plain variables
 - clean up aligned allocate
 - use `PyArg_ParseTuple` for empty args
 - use `Py_ssize_t` instead of `unsigned` and `size_t`
 - improve coding style
 - no need for custom raises assertions
 - use parametrize instead of inner loops
 - leave a comment about nature of mode testing unit
 - shift to get max/min of int72
 - add more info to repr of vector object
 - get rid of exec() and use type() instead
 - use `.inc` as extension for sub-headers instead of `.h`
 - add `FMA4` and drop `SSE41` from _SIMD targets

Co-authored-by: Eric Wieser <wieser.eric@gmail.com>
  • Loading branch information
seiko2plus and eric-wieser committed Oct 27, 2020
commit 7d125fb70cb149207171b7181312f9679dd4d451
12 changes: 6 additions & 6 deletions numpy/core/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -983,18 +983,18 @@ def generate_umath_c(ext, build_dir):
join('src', 'common', 'npy_cpu_features.c.src'),
join('src', '_simd', '_simd.c'),
join('src', '_simd', '_simd_inc.h.src'),
join('src', '_simd', '_simd_inc_data.h.src'),
join('src', '_simd', '_simd_data.inc.src'),
join('src', '_simd', '_simd.dispatch.c.src'),
], depends=[
join('src', 'common', 'npy_cpu_dispatch.h'),
join('src', 'common', 'simd', 'simd.h'),
join('src', '_simd', '_simd.h'),
join('src', '_simd', '_simd_inc.h.src'),
join('src', '_simd', '_simd_inc_data.h.src'),
join('src', '_simd', '_simd_inc_arg.h'),
join('src', '_simd', '_simd_inc_convert.h'),
join('src', '_simd', '_simd_inc_easyintrin.h'),
join('src', '_simd', '_simd_inc_vector.h'),
join('src', '_simd', '_simd_data.inc.src'),
join('src', '_simd', '_simd_arg.inc'),
join('src', '_simd', '_simd_convert.inc'),
join('src', '_simd', '_simd_easyintrin.inc'),
join('src', '_simd', '_simd_vector.inc'),
])

config.add_subpackage('tests')
Expand Down
34 changes: 16 additions & 18 deletions numpy/core/src/_simd/_simd.dispatch.c.src
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
#include "_simd_inc.h"

#if NPY_SIMD
#include "_simd_inc_data.h"
#include "_simd_inc_convert.h"
#include "_simd_inc_vector.h"
#include "_simd_inc_arg.h"
#include "_simd_inc_easyintrin.h"
#include "_simd_data.inc"
#include "_simd_convert.inc"
#include "_simd_vector.inc"
#include "_simd_arg.inc"
#include "_simd_easyintrin.inc"

/*************************************************************************
* Defining NPYV intrinsics as module functions
Expand Down Expand Up @@ -39,26 +39,24 @@ SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, q@sfx@)
static PyObject *
simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
{
simd_arg req_args[] = {
{.dtype = simd_data_q@sfx@},
{.dtype = simd_data_v@sfx@},
};
simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
if (!PyArg_ParseTuple(
args, "O&O&:@intrin@_@sfx@",
simd_arg_converter, &req_args[0],
simd_arg_converter, &req_args[1]
simd_arg_converter, &seq_arg,
simd_arg_converter, &vec_arg
)) {
return NULL;
}
npyv_@intrin@_@sfx@(
req_args[0].data.q@sfx@, req_args[1].data.v@sfx@
seq_arg.data.q@sfx@, vec_arg.data.v@sfx@
);
// write-back
if (simd_sequence_fill_obj(req_args[0].obj, req_args[0].data.q@sfx@, simd_data_q@sfx@)) {
simd_args_sequence_free(req_args, 2);
if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
simd_arg_free(&seq_arg);
return NULL;
}
simd_args_sequence_free(req_args, 2);
simd_arg_free(&seq_arg);
Py_RETURN_NONE;
}
/**end repeat1**/
Expand Down Expand Up @@ -89,7 +87,7 @@ SIMD_IMPL_INTRIN_1(reinterpret_@sfx_to@_@sfx@, v@sfx_to@, v@sfx@)
static PyObject *
simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
{
npyv_lanetype_@sfx@ *data = simd_sequence_from_obj(args, simd_data_q@sfx@, npyv_nlanes_@sfx@);
npyv_lanetype_@sfx@ *data = simd_sequence_from_iterable(args, simd_data_q@sfx@, npyv_nlanes_@sfx@);
if (data == NULL) {
return NULL;
}
Expand All @@ -105,7 +103,7 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
data[64] // for setf
)};
simd_sequence_free(data);
return (PyObject*)simd_vector_to_obj(r, simd_data_v@sfx@);
return (PyObject*)PySIMDVector_FromData(r, simd_data_v@sfx@);
}
/**end repeat1**/

Expand Down Expand Up @@ -354,7 +352,7 @@ NPY_CPU_DISPATCH_CURFX(simd_create_module)(void)
goto err;
}
#if NPY_SIMD
if (simd_vector_register(m)) {
if (PySIMDVectorType_Init(m)) {
goto err;
}
/**begin repeat
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#if !NPY_SIMD
#error "Not a standalone header, only works through 'simd.dispatch.c.src'"
#endif

/**
* This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
* therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
* symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
* deemed too harmful to readability.
*/
/************************************
** Protected Definitions
************************************/
Expand All @@ -11,17 +13,17 @@ simd_arg_from_obj(PyObject *obj, simd_arg *arg)
assert(arg->dtype != 0);
const simd_data_info *info = simd_data_getinfo(arg->dtype);
if (info->is_scalar) {
arg->data = simd_scalar_from_obj(obj, arg->dtype);
arg->data = simd_scalar_from_number(obj, arg->dtype);
}
else if (info->is_sequence) {
unsigned min_seq_size = simd_data_getinfo(info->to_vector)->nlanes;
arg->data.qu8 = simd_sequence_from_obj(obj, arg->dtype, min_seq_size);
arg->data.qu8 = simd_sequence_from_iterable(obj, arg->dtype, min_seq_size);
}
else if (info->is_vectorx) {
arg->data = simd_vectorx_from_obj(obj, arg->dtype);
arg->data = simd_vectorx_from_tuple(obj, arg->dtype);
}
else if (info->is_vector) {
arg->data = simd_vector_from_obj((simd_vector*)obj, arg->dtype);
arg->data = PySIMDVector_AsData((PySIMDVectorObject*)obj, arg->dtype);
} else {
arg->data.u64 = 0;
PyErr_Format(PyExc_RuntimeError,
Expand All @@ -41,16 +43,16 @@ simd_arg_to_obj(const simd_arg *arg)
assert(arg->dtype != 0);
const simd_data_info *info = simd_data_getinfo(arg->dtype);
if (info->is_scalar) {
return simd_scalar_to_obj(arg->data, arg->dtype);
return simd_scalar_to_number(arg->data, arg->dtype);
}
if (info->is_sequence) {
return simd_sequence_to_obj(arg->data.qu8, arg->dtype);
return simd_sequence_to_list(arg->data.qu8, arg->dtype);
}
if (info->is_vectorx) {
return simd_vectorx_to_obj(arg->data, arg->dtype);
return simd_vectorx_to_tuple(arg->data, arg->dtype);
}
if (info->is_vector) {
return (PyObject*)simd_vector_to_obj(arg->data, arg->dtype);
return (PyObject*)PySIMDVector_FromData(arg->data, arg->dtype);
}
PyErr_Format(PyExc_RuntimeError,
"unhandled arg to object type id:%d, name:%s", arg->dtype, info->pyname
Expand All @@ -59,15 +61,10 @@ simd_arg_to_obj(const simd_arg *arg)
}

static void
simd_args_sequence_free(simd_arg *args, int args_len)
simd_arg_free(simd_arg *arg)
{
assert(args_len > 0);
while (--args_len >= 0) {
simd_arg *arg = &args[args_len];
const simd_data_info *info = simd_data_getinfo(arg->dtype);
if (!info->is_sequence) {
continue;
}
const simd_data_info *info = simd_data_getinfo(arg->dtype);
if (info->is_sequence) {
simd_sequence_free(arg->data.qu8);
}
}
Expand All @@ -82,7 +79,7 @@ simd_arg_converter(PyObject *obj, simd_arg *arg)
arg->obj = obj;
return Py_CLEANUP_SUPPORTED;
} else {
simd_args_sequence_free(arg, 1);
simd_arg_free(arg);
}
return 1;
}
BEA4
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
#if !NPY_SIMD
#error "Not a standalone header, only works through 'simd.dispatch.c.src'"
#endif

/**
* This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
* therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
* symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
* deemed too harmful to readability.
*/
/************************************
** Protected Definitions
************************************/
static simd_data
simd_scalar_from_obj(PyObject *obj, simd_data_type dtype)
simd_scalar_from_number(PyObject *obj, simd_data_type dtype)
{
const simd_data_info *info = simd_data_getinfo(dtype);
assert(info->is_scalar && info->lane_size > 0);
Expand All @@ -23,7 +25,7 @@ simd_scalar_from_obj(PyObject *obj, simd_data_type dtype)
}

static PyObject *
simd_scalar_to_obj(simd_data data, simd_data_type dtype)
simd_scalar_to_number(simd_data data, simd_data_type dtype)
{
const simd_data_info *info = simd_data_getinfo(dtype);
assert(info->is_scalar && info->lane_size > 0);
Expand All @@ -41,43 +43,44 @@ simd_scalar_to_obj(simd_data data, simd_data_type dtype)
return PyLong_FromUnsignedLongLong(data.u64 >> leftb);
}

typedef struct {
Py_ssize_t len;
void *ptr;
} simd__alloc_data;

static void *
simd_sequence_new(Py_ssize_t len, simd_data_type dtype)
{
const simd_data_info *info = simd_data_getinfo(dtype);
assert(info->is_sequence && info->lane_size > 0);

size_t size = NPY_SIMD_WIDTH + sizeof(size_t) + sizeof(size_t*);
size += len * info->lane_size;

size_t *ptr = malloc(size);
assert(len > 0 && info->is_sequence && info->lane_size > 0);
size_t size = sizeof(simd__alloc_data) + len * info->lane_size + NPY_SIMD_WIDTH;
void *ptr = malloc(size);
if (ptr == NULL) {
return PyErr_NoMemory();
}
*(ptr++) = len;
size_t **a_ptr = (size_t**)(
((size_t)ptr + NPY_SIMD_WIDTH) & ~(size_t)(NPY_SIMD_WIDTH-1)
// align the pointer
simd__alloc_data *a_ptr = (simd__alloc_data *)(
((uintptr_t)ptr + sizeof(simd__alloc_data) + NPY_SIMD_WIDTH) & ~(uintptr_t)(NPY_SIMD_WIDTH-1)
);
a_ptr[-1] = ptr;
a_ptr[-1].len = len;
a_ptr[-1].ptr = ptr;
return a_ptr;
}

static size_t
simd_sequence_len(const void *ptr)
static Py_ssize_t
simd_sequence_len(void const *ptr)
{
size_t *ptrz = ((size_t**)ptr)[-1];
return *(ptrz-1);
return ((simd__alloc_data const*)ptr)[-1].len;
}

static void
simd_sequence_free(void *ptr)
{
size_t *ptrz = ((size_t**)ptr)[-1];
free(ptrz-1);
free(((simd__alloc_data *)ptr)[-1].ptr);
}

static void *
simd_sequence_from_obj(PyObject *obj, simd_data_type dtype, unsigned min_size)
simd_sequence_from_iterable(PyObject *obj, simd_data_type dtype, Py_ssize_t min_size)
{
const simd_data_info *info = simd_data_getinfo(dtype);
assert(info->is_sequence && info->lane_size > 0);
Expand All @@ -86,7 +89,7 @@ simd_sequence_from_obj(PyObject *obj, simd_data_type dtype, unsigned min_size)
return NULL;
}
Py_ssize_t seq_size = PySequence_Fast_GET_SIZE(seq_obj);
if (seq_size < (Py_ssize_t)min_size) {
if (seq_size < min_size) {
PyErr_Format(PyExc_ValueError,
"minimum acceptable size of the required sequence is %d, given(%d)",
min_size, seq_size
Expand All @@ -99,7 +102,7 @@ simd_sequence_from_obj(PyObject *obj, simd_data_type dtype, unsigned min_size)
}
PyObject **seq_items = PySequence_Fast_ITEMS(seq_obj);
for (Py_ssize_t i = 0; i < seq_size; ++i) {
simd_data data = simd_scalar_from_obj(seq_items[i], info->to_scalar);
simd_data data = simd_scalar_from_number(seq_items[i], info->to_scalar);
npyv_lanetype_u8 *sdst = dst + i * info->lane_size;
memcpy(sdst, &data.u64, info->lane_size);
}
Expand All @@ -113,7 +116,7 @@ simd_sequence_from_obj(PyObject *obj, simd_data_type dtype, unsigned min_size)
}

static int
simd_sequence_fill_obj(PyObject *obj, const void *ptr, simd_data_type dtype)
simd_sequence_fill_iterable(PyObject *obj, const void *ptr, simd_data_type dtype)
{
const simd_data_info *info = simd_data_getinfo(dtype);
if (!PySequence_Check(obj)) {
Expand All @@ -123,12 +126,12 @@ simd_sequence_fill_obj(PyObject *obj, const void *ptr, simd_data_type dtype)
return -1;
}
const npyv_lanetype_u8 *src = ptr;
Py_ssize_t seq_len = (Py_ssize_t)simd_sequence_len(ptr);
Py_ssize_t seq_len = simd_sequence_len(ptr);
for (Py_ssize_t i = 0; i < seq_len; ++i) {
const npyv_lanetype_u8 *ssrc = src + i * info->lane_size;
simd_data data;
memcpy(&data.u64, ssrc, info->lane_size);
PyObject *item = simd_scalar_to_obj(data, info->to_scalar);
PyObject *item = simd_scalar_to_number(data, info->to_scalar);
if (item == NULL) {
return -1;
}
Expand All @@ -141,21 +144,21 @@ simd_sequence_fill_obj(PyObject *obj, const void *ptr, simd_data_type dtype)
}

static PyObject *
simd_sequence_to_obj(const void *ptr, simd_data_type dtype)
simd_sequence_to_list(const void *ptr, simd_data_type dtype)
{
PyObject *list = PyList_New((Py_ssize_t)simd_sequence_len(ptr));
PyObject *list = PyList_New(simd_sequence_len(ptr));
if (list == NULL) {
return NULL;
}
if (simd_sequence_fill_obj(list, ptr, dtype) < 0) {
if (simd_sequence_fill_iterable(list, ptr, dtype) < 0) {
Py_DECREF(list);
return NULL;
}
return list;
}

static simd_data
simd_vectorx_from_obj(PyObject *obj, simd_data_type dtype)
simd_vectorx_from_tuple(PyObject *obj, simd_data_type dtype)
{
const simd_data_info *info = simd_data_getinfo(dtype);
// NPYV currently only supports x2 and x3
Expand All @@ -172,7 +175,7 @@ simd_vectorx_from_obj(PyObject *obj, simd_data_type dtype)
for (int i = 0; i < info->is_vectorx; ++i) {
PyObject *item = PyTuple_GET_ITEM(obj, i);
// get the max multi-vec and let the compiler do the rest
data.vu64x3.val[i] = simd_vector_from_obj((simd_vector*)item, info->to_vector).vu64;
data.vu64x3.val[i] = PySIMDVector_AsData((PySIMDVectorObject*)item, info->to_vector).vu64;
if (PyErr_Occurred()) {
return data;
}
Expand All @@ -181,7 +184,7 @@ simd_vectorx_from_obj(PyObject *obj, simd_data_type dtype)
}

static PyObject *
simd_vectorx_to_obj(simd_data data, simd_data_type dtype)
simd_vectorx_to_tuple(simd_data data, simd_data_type dtype)
{
const simd_data_info *info = simd_data_getinfo(dtype);
// NPYV currently only supports x2 and x3
Expand All @@ -194,7 +197,7 @@ simd_vectorx_to_obj(simd_data data, simd_data_type dtype)
for (int i = 0; i < info->is_vectorx; ++i) {
// get the max multi-vector and let the compiler handle the rest
simd_data vdata = {.vu64 = data.vu64x3.val[i]};
PyObject *item = (PyObject*)simd_vector_to_obj(vdata, info->to_vector);
PyObject *item = (PyObject*)PySIMDVector_FromData(vdata, info->to_vector);
if (item == NULL) {
// TODO: improve log add item number
Py_DECREF(tuple);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#if !NPY_SIMD
#error "Not a standalone header, only works through 'simd.dispatch.c.src'"
#endif

/**
* This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
* therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
* symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
* deemed too harmful to readability.
*/
/************************************
** Private Definitions
************************************/
Expand Down
Loading
0