1
1
/* @targets
2
2
* $maxopt baseline
3
- * avx512_skx
3
+ * sse2 sse41 xop avx2 avx512_skx
4
4
* asimd
5
5
*/
6
6
11
11
#define PY_SSIZE_T_CLEAN
12
12
#include < Python.h>
13
13
14
- #include < hwy/highway.h>
15
- #include " hwy/aligned_allocator.h"
16
-
17
14
#include " numpy/ndarraytypes.h"
18
15
#include " numpy/npy_common.h"
19
16
#include " numpy/npy_math.h"
20
17
#include " numpy/utils.h"
21
18
22
- #include " loops_utils.h"
23
19
#include " fast_loop_macros.h"
20
+ #include " loops_utils.h"
21
+
22
+ #include " hwy/aligned_allocator.h"
23
+ #include < hwy/highway.h>
24
24
25
25
namespace numpy {
26
26
namespace HWY_NAMESPACE { // required: unique per target
@@ -30,100 +30,123 @@ namespace hn = hwy::HWY_NAMESPACE;
30
30
31
31
// Alternative to per-function HWY_ATTR: see HWY_BEFORE_NAMESPACE
32
32
template <typename T>
33
- HWY_ATTR void SuperAbsolute (char **args, npy_intp const *dimensions, npy_intp const *steps) {
34
- const T* HWY_RESTRICT input_array = (const T*) args[0 ];
35
- T* HWY_RESTRICT output_array = (T*) args[1 ];
36
- const size_t size = dimensions[0 ];
37
- const hn::ScalableTag<T> d;
38
-
33
+ HWY_ATTR void
34
+ SuperAbsolute (char **args, npy_intp const *dimensions, npy_intp const *steps)
35
+ {
36
+ const T *HWY_RESTRICT input_array = (const T *)args[0 ];
37
+ T *HWY_RESTRICT output_array = (T *)args[1 ];
38
+ const size_t size = dimensions[0 ];
39
+ const hn::ScalableTag<T> d;
40
+
39
41
if (is_mem_overlap (input_array, steps[0 ], output_array, steps[1 ], size)) {
40
- for (size_t i = 0 ; i < size; i++) {
41
- const auto in = hn::LoadN (d, input_array + i, 1 );
42
- auto x = hn::Abs (in);
43
- hn::StoreN (x, d, output_array + i, 1 );
44
- }
45
- } else if (IS_UNARY_CONT (input_array, output_array)) {
46
- size_t full = size & -hn::Lanes (d);
47
- size_t remainder = size - full;
48
- for (size_t i = 0 ; i < full; i += hn::Lanes (d)) {
49
- const auto in = hn::LoadU (d, input_array + i);
50
- auto x = hn::Abs (in);
51
- hn::StoreU (x, d, output_array + i);
52
- }
53
- if (remainder ) {
54
- const auto in = hn::LoadN (d, input_array + full, remainder );
55
- auto x = hn::Abs (in);
56
- hn::StoreN (x, d, output_array + full, remainder );
57
- }
58
- } else {
59
- using TI = hwy::MakeSigned<T>;
60
- const hn::Rebind<TI, hn::ScalableTag<T>> di;
61
-
62
- const int lsize = sizeof (input_array[0 ]);
63
- const npy_intp ssrc = steps[0 ] / lsize;
64
- const npy_intp sdst = steps[1 ] / lsize;
65
- auto load_index = hwy::AllocateAligned<TI>(hn::Lanes (d));
66
- for (size_t i = 0 ; i < hn::Lanes (d); ++i) {
67
- load_index[i] = i * ssrc;
68
- }
69
- auto store_index = hwy::AllocateAligned<TI>(hn::Lanes (d));
70
- for (size_t i = 0 ; i < hn::Lanes (d); ++i) {
71
- store_index[i] = i * sdst;
72
- }
73
-
74
- size_t full = size & -hn::Lanes (d);
75
- size_t remainder = size - full;
76
- for (size_t i = 0 ; i < full; i += hn::Lanes (d)) {
77
- const auto in = hn::GatherIndex (d, input_array + i * ssrc, Load (di, load_index.get ()));
78
- auto x = hn::Abs (in);
79
- hn::ScatterIndex (x, d, output_array + i * sdst, Load (di, store_index.get ()));
80
- }
81
- if (remainder ) {
82
- const auto in = hn::GatherIndexN (d, input_array + full * ssrc, Load (di, load_index.get ()), remainder );
83
- auto x = hn::Abs (in);
84
- hn::ScatterIndexN (x, d, output_array + full * sdst, Load (di, store_index.get ()), remainder );
85
- }
42
+ for (size_t i = 0 ; i < size; i++) {
43
+ const auto in = hn::LoadN (d, input_array + i, 1 );
44
+ auto x = hn::Abs (in);
45
+ hn::StoreN (x, d, output_array + i, 1 );
46
+ }
47
+ }
48
+ else if (IS_UNARY_CONT (input_array, output_array)) {
49
+ size_t full = size & -hn::Lanes (d);
50
+ size_t remainder = size - full;
51
+ for (size_t i = 0 ; i < full; i += hn::Lanes (d)) {
52
+ const auto in = hn::LoadU (d, input_array + i);
53
+ auto x = hn::Abs (in);
54
+ hn::StoreU (x, d, output_array + i);
55
+ }
56
+ if (remainder ) {
57
+ const auto in = hn::LoadN (d, input_array + full, remainder );
58
+ auto x = hn::Abs (in);
59
+ hn::StoreN (x, d, output_array + full, remainder );
60
+ }
61
+ }
62
+ else {
63
+ using TI = hwy::MakeSigned<T>;
64
+ const hn::Rebind<TI, hn::ScalableTag<T>> di;
65
+
66
+ const int lsize = sizeof (input_array[0 ]);
67
+ const npy_intp ssrc = steps[0 ] / lsize;
68
+ const npy_intp sdst = steps[1 ] / lsize;
69
+ auto load_index = hwy::AllocateAligned<TI>(hn::Lanes (d));
70
+ for (size_t i = 0 ; i < hn::Lanes (d); ++i) {
71
+ load_index[i] = i * ssrc;
72
+ }
73
+ auto store_index = hwy::AllocateAligned<TI>(hn::Lanes (d));
74
+ for (size_t i = 0 ; i < hn::Lanes (d); ++i) {
75
+ store_index[i] = i * sdst;
76
+ }
77
+
78
+ size_t full = size & -hn::Lanes (d);
79
+ size_t remainder = size - full;
80
+ for (size_t i = 0 ; i < full; i += hn::Lanes (d)) {
81
+ const auto in = hn::GatherIndex (d, input_array + i * ssrc,
82
+ Load (di, load_index.get ()));
83
+ auto x = hn::Abs (in);
84
+ hn::ScatterIndex (x, d, output_array + i * sdst,
85
+ Load (di, store_index.get ()));
86
+ }
87
+ if (remainder ) {
88
+ const auto in =
89
+ hn::GatherIndexN (d, input_array + full * ssrc,
90
+ Load (di, load_index.get ()), remainder );
91
+ auto x = hn::Abs (in);
92
+ hn::ScatterIndexN (x, d, output_array + full * sdst,
93
+ Load (di, store_index.get ()), remainder );
94
+ }
86
95
}
87
96
}
88
97
89
- HWY_ATTR void INT_SuperAbsolute (char **args, npy_intp const *dimensions, npy_intp const *steps) {
90
- SuperAbsolute<npy_int>(args, dimensions, steps);
98
+ HWY_ATTR void
99
+ INT_SuperAbsolute (char **args, npy_intp const *dimensions,
100
+ npy_intp const *steps)
101
+ {
102
+ SuperAbsolute<npy_int>(args, dimensions, steps);
91
103
}
92
104
93
- HWY_ATTR void DOUBLE_SuperAbsolute (char **args, npy_intp const *dimensions, npy_intp const *steps) {
94
- SuperAbsolute<npy_double>(args, dimensions, steps);
105
+ HWY_ATTR void
106
+ DOUBLE_SuperAbsolute (char **args, npy_intp const *dimensions,
107
+ npy_intp const *steps)
108
+ {
109
+ SuperAbsolute<npy_double>(args, dimensions, steps);
95
110
}
96
111
97
- HWY_ATTR void FLOAT_SuperAbsolute (char **args, npy_intp const *dimensions, npy_intp const *steps) {
98
- SuperAbsolute<npy_float>(args, dimensions, steps);
112
+ HWY_ATTR void
113
+ FLOAT_SuperAbsolute (char **args, npy_intp const *dimensions,
114
+ npy_intp const *steps)
115
+ {
116
+ SuperAbsolute<npy_float>(args, dimensions, steps);
99
117
}
100
118
101
- }
102
- }
119
+ } // namespace HWY_NAMESPACE
120
+ } // namespace numpy
103
121
104
122
namespace numpy {
105
123
106
124
extern " C" {
107
125
108
126
NPY_NO_EXPORT void
109
- NPY_CPU_DISPATCH_CURFX (INT_absolute)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED (func))
127
+ NPY_CPU_DISPATCH_CURFX (INT_absolute)(char **args, npy_intp const *dimensions,
128
+ npy_intp const *steps,
129
+ void *NPY_UNUSED (func))
110
130
{
111
- HWY_STATIC_DISPATCH (INT_SuperAbsolute)(args, dimensions, steps);
131
+ HWY_STATIC_DISPATCH (INT_SuperAbsolute)(args, dimensions, steps);
112
132
}
113
133
114
134
NPY_NO_EXPORT void
115
- NPY_CPU_DISPATCH_CURFX (DOUBLE_absolute)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED (func))
135
+ NPY_CPU_DISPATCH_CURFX (DOUBLE_absolute)(char **args,
136
+ npy_intp const *dimensions,
137
+ npy_intp const *steps,
138
+ void *NPY_UNUSED (func))
116
139
{
117
- HWY_STATIC_DISPATCH (DOUBLE_SuperAbsolute)(args, dimensions, steps);
140
+ HWY_STATIC_DISPATCH (DOUBLE_SuperAbsolute)(args, dimensions, steps);
118
141
}
119
142
120
143
NPY_NO_EXPORT void
121
- NPY_CPU_DISPATCH_CURFX (FLOAT_absolute)(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED (func))
144
+ NPY_CPU_DISPATCH_CURFX (FLOAT_absolute)(char **args, npy_intp const *dimensions,
145
+ npy_intp const *steps,
146
+ void *NPY_UNUSED (func))
122
147
{
123
- HWY_STATIC_DISPATCH (FLOAT_SuperAbsolute)(args, dimensions, steps);
148
+ HWY_STATIC_DISPATCH (FLOAT_SuperAbsolute)(args, dimensions, steps);
124
149
}
125
150
126
- } // extern "C"
127
- } // numpy
128
-
129
-
151
+ } // extern "C"
152
+ } // namespace numpy
0 commit comments