Remove leftovers, align targets · numpy/numpy@84695e0 · GitHub
[go: up one dir, main page]

Skip to content

Commit 84695e0

Browse files
committed
Remove leftovers, align targets
1 parent be908f1 commit 84695e0

File tree

3 files changed

+107
-85
lines changed

3 files changed

+107
-85
lines changed

numpy/core/meson.build

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,6 @@ endif
7979
# important, because code in NumPy typically does not check the value but only
8080
# whether the symbol is defined. So `#define HAVE_SOMETHING 0` is wrong.
8181

82-
#cmake = import('cmake')
83-
#hwy = cmake.subproject('highway')
84-
8582

8683
cdata = configuration_data()
8784

@@ -812,6 +809,11 @@ umath_gen_headers = [
812809
]
813810

814811
foreach gen_mtargets : [
812+
[
813+
'simd_absolute.dispatch.h',
814+
'src/umath/absolute.dispatch.cpp',
815+
[AVX2, SSE2, ASIMD]
816+
],
815817
[
816818
'loops_arithm_fp.dispatch.h',
817819
src_file.process('src/umath/loops_arithm_fp.dispatch.c.src'),
@@ -915,7 +917,7 @@ foreach gen_mtargets : [
915917
'loops_unary_fp.dispatch.h',
916918
src_file.process('src/umath/loops_unary_fp.dispatch.c.src'),
917919
[
918-
SSE41, SSE2,
920+
AVX2, SSE2,
919921
VSX2,
920922
ASIMD, NEON,
921923
VXE, VX
@@ -964,7 +966,8 @@ foreach gen_mtargets : [
964966
'src/common',
965967
'src/multiarray',
966968
'src/npymath',
967-
'src/umath'
969+
'src/umath',
970+
'src/highway',
968971
]
969972
)
970973
if not is_variable('multiarray_umath_mtargets')
@@ -1053,10 +1056,6 @@ src_multiarray = multiarray_gen_headers + [
10531056
'src/multiarray/typeinfo.c',
10541057
'src/multiarray/usertypes.c',
10551058
'src/multiarray/vdot.c',
1056-
src_file.process('src/common/npy_sort.h.src'),
1057-
1058-
'src/npysort/simd_qsort.dispatch.cpp',
1059-
'src/npysort/simd_qsort_16bit.dispatch.cpp',
10601059
'src/npysort/quicksort.cpp',
10611060
'src/npysort/mergesort.cpp',
10621061
'src/npysort/timsort.cpp',
Lines changed: 98 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*@targets
22
* $maxopt baseline
3-
* avx512_skx
3+
* sse2 sse41 xop avx2 avx512_skx
44
* asimd
55
*/
66

@@ -11,16 +11,16 @@
1111
#define PY_SSIZE_T_CLEAN
1212
#include <Python.h>
1313

14-
#include <hwy/highway.h>
15-
#include "hwy/aligned_allocator.h"
16-
1714
#include "numpy/ndarraytypes.h"
1815
#include "numpy/npy_common.h"
1916
#include "numpy/npy_math.h"
2017
#include "numpy/utils.h"
2118

22-
#include "loops_utils.h"
2319
#include "fast_loop_macros.h"
20+
#include "loops_utils.h"
21+
22+
#include "hwy/aligned_allocator.h"
23+
#include <hwy/highway.h>
2424

2525
namespace numpy {
2626
namespace HWY_NAMESPACE { // required: unique per target
@@ -30,100 +30,123 @@ namespace hn = hwy::HWY_NAMESPACE;
3030

3131
// Alternative to per-function HWY_ATTR: see HWY_BEFORE_NAMESPACE
3232
template <typename T>
33-
HWY_ATTR void SuperAbsolute(char **args, npy_intp const *dimensions, npy_intp const *steps) {
34-
const T* HWY_RESTRICT input_array = (const T*) args[0];
35-
T* HWY_RESTRICT output_array = (T*) args[1];
36-
const size_t size = dimensions[0];
37-
const hn::ScalableTag<T> d;
38-
33+
HWY_ATTR void
34+
SuperAbsolute(char **args, npy_intp const *dimensions, npy_intp const *steps)
35+
{
36+
const T *HWY_RESTRICT input_array = (const T *)args[0];
37+
T *HWY_RESTRICT output_array = (T *)args[1];
38+
const size_t size = dimensions[0];
39+
const hn::ScalableTag<T> d;
40+
3941
if (is_mem_overlap(input_array, steps[0], output_array, steps[1], size)) {
40-
for (size_t i = 0; i < size; i++) {
41-
const auto in = hn::LoadN(d, input_array + i, 1);
42-
auto x = hn::Abs(in);
43-
hn::StoreN(x, d, output_array + i, 1);
44-
}
45-
} else if (IS_UNARY_CONT(input_array, output_array)) {
46-
size_t full = size & -hn::Lanes(d);
47-
size_t remainder = size - full;
48-
for (size_t i = 0; i < full; i += hn::Lanes(d)) {
49-
const auto in = hn::LoadU(d, input_array + i);
50-
auto x = hn::Abs(in);
51-
hn::StoreU(x, d, output_array + i);
52-
}
53-
if (remainder) {
54-
const auto in = hn::LoadN(d, input_array + full, remainder);
55-
auto x = hn::Abs(in);
56-
hn::StoreN(x, d, output_array + full, remainder);
57-
}
58-
} else {
59-
using TI = hwy::MakeSigned<T>;
60-
const hn::Rebind<TI, hn::ScalableTag<T>> di;
61-
62-
const int lsize = sizeof(input_array[0]);
63-
const npy_intp ssrc = steps[0] / lsize;
64-
const npy_intp sdst = steps[1] / lsize;
65-
auto load_index = hwy::AllocateAligned<TI>(hn::Lanes(d));
66-
for (size_t i = 0; i < hn::Lanes(d); ++i) {
67-
load_index[i] = i * ssrc;
68-
}
69-
auto store_index = hwy::AllocateAligned<TI>(hn::Lanes(d));
70-
for (size_t i = 0; i < hn::Lanes(d); ++i) {
71-
store_index[i] = i * sdst;
72-
}
73-
74-
size_t full = size & -hn::Lanes(d);
75-
size_t remainder = size - full;
76-
for (size_t i = 0; i < full; i += hn::Lanes(d)) {
77-
const auto in = hn::GatherIndex(d, input_array + i * ssrc, Load(di, load_index.get()));
78-
auto x = hn::Abs(in);
79-
hn::ScatterIndex(x, d, output_array + i * sdst, Load(di, store_index.get()));
80-
}
81-
if (remainder) {
82-
const auto in = hn::GatherIndexN(d, input_array + full * ssrc, Load(di, load_index.get()), remainder);
83-
auto x = hn::Abs(in);
84-
hn::ScatterIndexN(x, d, output_array + full * sdst, Load(di, store_index.get()), remainder);
85-
}
42+
for (size_t i = 0; i < size; i++) {
43+
const auto in = hn::LoadN(d, input_array + i, 1);
44+
auto x = hn::Abs(in);
45+
hn::StoreN(x, d, output_array + i, 1);
46+
}
47+
}
48+
else if (IS_UNARY_CONT(input_array, output_array)) {
49+
size_t full = size & -hn::Lanes(d);
50+
size_t remainder = size - full;
51+
for (size_t i = 0; i < full; i += hn::Lanes(d)) {
52+
const auto in = hn::LoadU(d, input_array + i);
53+
auto x = hn::Abs(in);
54+
hn::StoreU(x, d, output_array + i);
55+
}
56+
if (remainder) {
57+
const auto in = hn::LoadN(d, input_array + full, remainder);
58+
auto x = hn::Abs(in);
59+
hn::StoreN(x, d, output_array + full, remainder);
60+
}
61+
}
62+
else {
63+
using TI = hwy::MakeSigned<T>;
64+
const hn::Rebind<TI, hn::ScalableTag<T>> di;
65+
66+
const int lsize = sizeof(input_array[0]);
67+
const npy_intp ssrc = steps[0] / lsize;
68+
const npy_intp sdst = steps[1] / lsize;
69+
auto load_index = hwy::AllocateAligned<TI>(hn::Lanes(d));
70+
for (size_t i = 0; i < hn::Lanes(d); ++i) {
71+
load_index[i] = i * ssrc;
72+
}
73+
auto store_index = hwy::AllocateAligned<TI>(hn::Lanes(d));
74+
for (size_t i = 0; i < hn::Lanes(d); ++i) {
75+
store_index[i] = i * sdst;
76+
}
77+
78+
size_t full = size & -hn::Lanes(d);
79+
size_t remainder = size - full;
80+
for (size_t i = 0; i < full; i += hn::Lanes(d)) {
81+
const auto in = hn::GatherIndex(d, input_array + i * ssrc,
82+
Load(di, load_index.get()));
83+
auto x = hn::Abs(in);
84+
hn::ScatterIndex(x, d, output_array + i * sdst,
85+
Load(di, store_index.get()));
86+
}
87+
if (remainder) {
88+
const auto in =
89+
hn::GatherIndexN(d, input_array + full * ssrc,
90+
Load(di, load_index.get()), remainder);
91+
auto x = hn::Abs(in);
92+
hn::ScatterIndexN(x, d, output_array + full * sdst,
93+
Load(di, store_index.get()), remainder);
94+
}
8695
}
8796
}
8897

89-
HWY_ATTR void INT_SuperAbsolute(char **args, npy_intp const *dimensions, npy_intp const *steps) {
90-
SuperAbsolute<npy_int>(args, dimensions, steps);
98+
HWY_ATTR void
99+
INT_SuperAbsolute(char **args, npy_intp const *dimensions,
100+
npy_intp const *steps)
101+
{
102+
SuperAbsolute<npy_int>(args, dimensions, steps);
91103
}
92104

93-
HWY_ATTR void DOUBLE_SuperAbsolute(char **args, npy_intp const *dimensions, npy_intp const *steps) {
94-
SuperAbsolute<npy_double>(args, dimensions, steps);
105+
HWY_ATTR void
106+
DOUBLE_SuperAbsolute(char **args, npy_intp const *dimensions,
107+
npy_intp const *steps)
108+
{
109+
SuperAbsolute<npy_double>(args, dimensions, steps);
95110
}
96111

97-
HWY_ATTR void FLOAT_SuperAbsolute(char **args, npy_intp const *dimensions, npy_intp const *steps) {
98-
SuperAbsolute<npy_float>(args, dimensions, steps);
112+
HWY_ATTR void
113+
FLOAT_SuperAbsolute(char **args, npy_intp const *dimensions,
114+
npy_intp const *steps)
115+
{
116+
SuperAbsolute<npy_float>(args, dimensions, steps);
99117
}
100118

101-
}
102-
}
119+
} // namespace HWY_NAMESPACE
120+
} // namespace numpy
103121

104122
namespace numpy {
105123

106124
extern "C" {
107125

108126
NPY_NO_EXPORT void
109-
NPY_CPU_DISPATCH_CURFX(INT_absolute)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
127+
NPY_CPU_DISPATCH_CURFX(INT_absolute)(char **args, npy_intp const *dimensions,
128+
npy_intp const *steps,
129+
void *NPY_UNUSED(func))
110130
{
111-
HWY_STATIC_DISPATCH(INT_SuperAbsolute)(args, dimensions, steps);
131+
HWY_STATIC_DISPATCH(INT_SuperAbsolute)(args, dimensions, steps);
112132
}
113133

114134
NPY_NO_EXPORT void
115-
NPY_CPU_DISPATCH_CURFX(DOUBLE_absolute)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
135+
NPY_CPU_DISPATCH_CURFX(DOUBLE_absolute)(char **args,
136+
npy_intp const *dimensions,
137+
npy_intp const *steps,
138+
void *NPY_UNUSED(func))
116139
{
117-
HWY_STATIC_DISPATCH(DOUBLE_SuperAbsolute)(args, dimensions, steps);
140+
HWY_STATIC_DISPATCH(DOUBLE_SuperAbsolute)(args, dimensions, steps);
118141
}
119142

120143
NPY_NO_EXPORT void
121-
NPY_CPU_DISPATCH_CURFX(FLOAT_absolute)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
144+
NPY_CPU_DISPATCH_CURFX(FLOAT_absolute)(char **args, npy_intp const *dimensions,
145+
npy_intp const *steps,
146+
void *NPY_UNUSED(func))
122147
{
123-
HWY_STATIC_DISPATCH(FLOAT_SuperAbsolute)(args, dimensions, steps);
148+
HWY_STATIC_DISPATCH(FLOAT_SuperAbsolute)(args, dimensions, steps);
124149
}
125150

126-
} // extern "C"
127-
} // numpy
128-
129-
151+
} // extern "C"
152+
} // namespace numpy

numpy/core/src/umath/loops_unary_fp.dispatch.c.src

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*@targets
22
** $maxopt baseline
3-
** sse2 sse41
3+
** sse2 avx2
44
** vsx2
55
** neon asimd
66
** vx vxe

0 commit comments

Comments
 (0)
0