8000 MAINT: make every part use copy_input and copy_output · numpy/numpy@aa97df0 · GitHub
[go: up one dir, main page]

Skip to content

Commit aa97df0

Browse files
committed
MAINT: make every part use copy_input and copy_output
1 parent 542f2e6 commit aa97df0

File tree

1 file changed

+114
-82
lines changed

1 file changed

+114
-82
lines changed

numpy/fft/_pocketfft_umath.cpp

Lines changed: 114 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -23,105 +23,135 @@
2323
#include "pocketfft/pocketfft_hdronly.h"
2424

2525
/*
26-
* Copy all nin elements of input to the first nin of the output,
27-
* and any set any remaining nout-nin output elements to 0
28-
* (if nout < nin, copy only nout).
26+
* Transfer to and from a contiguous buffer.
27+
* copy_input: copy min(nin, n) elements from input to buffer and zero rest.
28+
* copy_output: copy n elements from buffer to output.
2929
*/
3030
template <typename T>
3131
static inline void
32-
copy_data(char* in, npy_intp step_in, npy_intp nin,
33-
char* out, npy_intp step_out, npy_intp nout)
32+
copy_input(char *in, npy_intp step_in, size_t nin,
33+
T buff[], size_t n)
3434
{
35-
npy_intp ncopy = nin <= nout? nin : nout;
36-
char *op = out;
37-
if (ncopy > 0) {
38-
char *ip = in;
39-
for (npy_intp i = 0; i < ncopy; i++, ip += step_in, op += step_out) {
40-
*(T *)op = *(T *)ip;
41-
}
42-
}
43-
else {
44-
assert(ncopy == 0);
35+
size_t ncopy = nin <= n ? nin : n;
36+
char *ip = in;
37+
size_t i;
38+
for (i = 0; i < ncopy; i++, ip += step_in) {
39+
buff[i] = *(T *)ip;
4540
}
46-
if (nout > ncopy) {
47-
for (npy_intp i = ncopy; i < nout; i++, op += step_out) {
48-
*(T *)op = 0;
49-
}
41+
for (; i < n; i++) {
42+
buff[i] = 0;
5043
}
5144
}
5245

46+
template <typename T>
47+
static inline void
48+
copy_output(T buff[], char *out, npy_intp step_out, size_t n)
49+
{
50+
char *op = out;
51+
for (size_t i = 0; i < n; i++, op += step_out) {
52+
*(T *)op = buff[i];
53+
}
54+
}
5355

5456
/*
55-
* Loops calling the pocketfft code.
57+
* Gufunc loops calling the pocketfft code.
5658
*/
5759
template <typename T>
5860
static void
5961
fft_loop(char **args, npy_intp const *dimensions, ptrdiff_t const *steps,
6062
void *func)
6163
{
6264
char *ip = args[0], *fp = args[1], *op = args[2];
63-
npy_intp n_outer = dimensions[0];
65+
size_t n_outer = (size_t)dimensions[0];
6466
ptrdiff_t si = steps[0], sf = steps[1], so = steps[2];
65-
npy_intp nin = dimensions[1], nout = dimensions[2];
67+
size_t nin = (size_t)dimensions[1], nout = (size_t)dimensions[2];
6668
ptrdiff_t step_in = steps[3], step_out = steps[4];
6769
bool direction = *((bool *)func); /* pocketfft::FORWARD or BACKWARD */
6870

6971
assert (nout > 0);
7072

71-
if (nin >= nout && (sf == 0 || n_outer == 1)) {
72-
/*
73-
* For the common case of nin == nout and fixed factor, we can call
74-
* pocketfft directly, and benefit from vectorization for >1D. For
75-
* nin>nout, this just removes the extra input points, as required.
76-
*/
77-
std::vector<size_t> shape = { (size_t) n_outer, (size_t) nout };
73+
#ifndef POCKETFFT_NO_VECTORS
74+
/*
75+
* For the common case of nin >= nout, fixed factor, and suitably sized
76+
* outer loop, we call pocketfft directly to benefit from its vectorization.
77+
* (For nin>nout, this just removes the extra input points, as required.)
78+
*/
79+
constexpr auto vlen = pocketfft::detail::VLEN<T>::val;
80+
if (vlen > 1 && n_outer >= vlen && nin >= nout && sf == 0) {
81+
std::vector<size_t> shape = { n_outer, nout };
7882
std::vector<ptrdiff_t> strides_in = { si, step_in };
7983
std::vector<ptrdiff_t> strides_out = { so, step_out};
8084
std::vector<size_t> axes = { 1 };
8185
pocketfft::c2c(shape, strides_in, strides_out, axes, direction,
8286
(std::complex<T> *)ip, (std::complex<T> *)op, *(T *)fp);
87+
return;
8388
}
84-
else {
85-
/*
86-
* Input is short, so copy to output, padding with zeros.
87-
* (also covers variable factor, where copy may not strictly be needed)
88-
*/
89-
std::vector<size_t> axes = { 0 };
90-
std::vector<size_t> shape = { (size_t)nout };
91-
std::vector<ptrdiff_t> strides_out = { step_out };
92-
for (npy_intp i = 0; i < n_outer; i++, ip += si, fp += sf, op += so) {
93-
copy_data<std::complex<T>>(ip, step_in, nin, op, step_out, nout);
94-
pocketfft::c2c(shape, strides_out, strides_out, axes, direction,
95-
(std::complex<T> *)op, (std::complex<T> *)op, *(T *)fp);
89+
#endif
90+
/*
91+
* Otherwise, use a non-vectorized loop in which we try to minimize copies.
92+
* We do still need a buffer if the output is not contiguous.
93+
*/
94+
auto plan = pocketfft::detail::get_plan<pocketfft::detail::pocketfft_c<T>>(nout);
95+
char *buff = NULL;
96+
if (step_out != sizeof(std::complex<T>)) {
97+
buff = (char *)malloc(nout * sizeof(std::complex<T>));
98+
if (buff == NULL) {
99+
goto fail;
100+
}
101+
}
102+
for (size_t i = 0; i < n_outer; i++, ip += si, fp += sf, op += so) {
103+
std::complex<T> *op_or_buff = (std::complex<T> *)(buff == NULL ? op : buff);
104+
if (ip != (char*)op_or_buff) {
105+
copy_input(ip, step_in, nin, op_or_buff, nout);
106+
}
107+
plan->exec((pocketfft::detail::cmplx<T> *)op_or_buff, *(T *)fp, direction);
108+
if (buff != NULL) {
109+
copy_output(op_or_buff, op, step_out, nout);
96110
}
97111
}
112+
free(buff);
113+
return;
114+
115+
fail:
116+
/* TODO: Requires use of new ufunc API to indicate error return */
117+
NPY_ALLOW_C_API_DEF
118+
NPY_ALLOW_C_API;
119+
PyErr_NoMemory();
120+
NPY_DISABLE_C_API;
121+
return;
98122
}
99123

100124
template <typename T>
101125
static void
102126
rfft_impl(char **args, npy_intp const *dimensions, npy_intp const *steps,
103-
void *func, npy_intp npts)
127+
void *func, size_t npts)
104128
{
105129
char *ip = args[0], *fp = args[1], *op = args[2];
106-
npy_intp n_outer = dimensions[0];
130+
size_t n_outer = (size_t)dimensions[0];
107131
ptrdiff_t si = steps[0], sf = steps[1], so = steps[2];
108-
npy_intp nin = dimensions[1], nout = dimensions[2];
132+
size_t nin = (size_t)dimensions[1], nout = (size_t)dimensions[2];
109133
ptrdiff_t step_in = steps[3], step_out = steps[4];
110134

111-
assert (nout == npts / 2 + 1);
135+
assert (nout > 0 && nout == npts / 2 + 1);
112136

113-
if (nin >= npts && (sf == 0 || n_outer == 1)) {
114-
std::vector<size_t> shape_in = { (size_t) n_outer, (size_t) npts };
137+
#ifndef POCKETFFT_NO_VECTORS
138+
/*
139+
* Call pocketfft directly if vectorization is possible.
140+
*/
141+
constexpr auto vlen = pocketfft::detail::VLEN<T>::val;
142+
if (vlen > 1 && n_outer >= vlen && nin >= npts && sf == 0) {
143+
std::vector<size_t> shape_in = { n_outer, npts };
115144
std::vector<ptrdiff_t> strides_in = { si, step_in };
116145
std::vector<ptrdiff_t> strides_out = { so, step_out};
117146
std::vector<size_t> axes = { 1 };
118147
pocketfft::r2c(shape_in, strides_in, strides_out, axes, pocketfft::FORWARD,
119148
(T *)ip, (std::complex<T> *)op, *(T *)fp);
120149
return;
121150
}
151+
#endif
122152
/*
123-
* Input short, so need a padded copy; we'll use out if contiguous.
124-
* We also use internal pocketfft routines to avoid a second copy.
153+
* Otherwise, use a non-vectorized loop in which we try to minimize copies.
154+
* We do still need a buffer if the output is not contiguous.
125155
*/
126156
auto plan = pocketfft::detail::get_plan<pocketfft::detail::pocketfft_r<T>>(npts);
127157
char *buff = NULL;
@@ -131,9 +161,8 @@ rfft_impl(char **args, npy_intp const *dimensions, npy_intp const *steps,
131161
goto fail;
132162
}
133163
}
134-
for (npy_intp i = 0; i < n_outer; i++, ip += si, fp += sf, op += so) {
135-
char *op_or_buff = buff == NULL ? op : buff;
136-
T *op_T = (T *)op_or_buff;
164+
for (size_t i = 0; i < n_outer; i++, ip += si, fp += sf, op += so) {
165+
std::complex<T> *op_or_buff = (std::complex<T> *)(buff == NULL ? op : buff);
137166
/*
138167
* The internal pocketfft routines work in-place and for real
139168
* transforms the frequency data thus needs to be compressed, using
@@ -146,15 +175,11 @@ rfft_impl(char **args, npy_intp const *dimensions, npy_intp const *steps,
146175
* create I0=0. Note that copy_data will zero the In component for
147176
* even number of points.
148177
*/
149-
copy_data<T>(ip, step_in, nin,
150-
(char *)&op_T[1], sizeof(T), nout*2 - 1);
151-
plan->exec(&op_T[1], *(T *)fp, pocketfft::FORWARD);
152-
op_T[0] = op_T[1];
153-
op_T[1] = (T)0;
154-
if (op_or_buff == buff) {
155-
copy_data<std::complex<T>>
156-
(op_or_buff, sizeof(std:: F42D complex<T>), nout,
157-
op, step_out, nout);
178+
copy_input(ip, step_in, nin, &((T *)op_or_buff)[1], nout*2 - 1);
179+
plan->exec(&((T *)op_or_buff)[1], *(T *)fp, pocketfft::FORWARD);
180+
op_or_buff[0] = op_or_buff[0].imag(); // I0->R0, I0=0
181+
if (buff != NULL) {
182+
copy_output(op_or_buff, op, step_out, nout);
158183
}
159184
}
160185
free(buff);
@@ -179,15 +204,19 @@ template <typename T>
179204
static void
180205
rfft_n_even_loop(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
181206
{
182-
npy_intp npts = 2 * dimensions[2] - 2;
207+
size_t nout = (size_t)dimensions[2];
208+
assert (nout > 0);
209+
size_t npts = 2 * nout - 2;
183210
rfft_impl<T>(args, dimensions, steps, func, npts);
184211
}
185212

186213
template <typename T>
187214
static void
188215
rfft_n_odd_loop(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
189216
{
190-
npy_intp npts = 2 * dimensions[2] - 1;
217+
size_t nout = (size_t)dimensions[2];
218+
assert (nout > 0);
219+
size_t npts = 2 * nout - 1;
191220
rfft_impl<T>(args, dimensions, steps, func, npts);
192221
}
193222

@@ -196,27 +225,33 @@ static void
196225
irfft_loop(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
197226
{
198227
char *ip = args[0], *fp = args[1], *op = args[2];
199-
npy_intp n_outer = dimensions[0];
228+
size_t n_outer = (size_t)dimensions[0];
200229
ptrdiff_t si = steps[0], sf = steps[1], so = steps[2];
201-
npy_intp nin = dimensions[1], nout = dimensions[2];
230+
size_t nin = (size_t)dimensions[1], nout = (size_t)dimensions[2];
202231
ptrdiff_t step_in = steps[3], step_out = steps[4];
203232

204-
npy_intp npts_in = nout / 2 + 1;
233+
size_t npts_in = nout / 2 179B + 1;
205234

206235
assert(nout > 0);
207236

208-
if (nin >= npts_in && (sf == 0 || n_outer == 1)) {
237+
#ifndef POCKETFFT_NO_VECTORS
238+
/*
239+
* Call pocketfft directly if vectorization is possible.
240+
*/
241+
constexpr auto vlen = pocketfft::detail::VLEN<T>::val;
242+
if (vlen > 1 && n_outer >= vlen && nin >= npts_in && sf == 0) {
209243
std::vector<size_t> axes = { 1 };
210-
std::vector<size_t> shape_out = { (size_t) n_outer, (size_t) nout };
244+
std::vector<size_t> shape_out = { n_outer, nout };
211245
std::vector<ptrdiff_t> strides_in = { si, step_in };
212246
std::vector<ptrdiff_t> strides_out = { so, step_out};
213247
pocketfft::c2r(shape_out, strides_in, strides_out, axes, pocketfft::BACKWARD,
214248
(std::complex<T> *)ip, (T *)op, *(T *)fp);
215249
return;
216250
}
251+
#endif
217252
/*
218-
* Input short, so make a padded copy; we want to use out if possible,
219-
* so go directly to FFTpack format which has the same number of bytes.
253+
* Otherwise, use a non-vectorized loop in which we try to minimize copies.
254+
* We do still need a buffer if the output is not contiguous.
220255
*/
221256
auto plan = pocketfft::detail::get_plan<pocketfft::detail::pocketfft_r<T>>(nout);
222257
char *buff = NULL;
@@ -226,9 +261,8 @@ irfft_loop(char **args, npy_intp const *dimensions, npy_intp const *steps, void
226261
goto fail;
227262
}
228263
}
229-
for (npy_intp i = 0; i < n_outer; i++, ip += si, fp += sf, op += so) {
230-
char *op_or_buff = buff == NULL ? op : buff;
231-
T *op_T = (T *)op_or_buff;
264+
for (size_t i = 0; i < n_outer; i++, ip += si, fp += sf, op += so) {
265+
T *op_or_buff = (T *)(buff == NULL ? op : buff);
232266
/*
233267
* Pocket_fft works in-place and for inverse real transforms the
234268
* frequency data thus needs to be compressed, removing the imaginary
@@ -238,26 +272,24 @@ irfft_loop(char **args, npy_intp const *dimensions, npy_intp const *steps, void
238272
* the data to the buffer in the following order (also used by
239273
* FFTpack): R0,R1,I1,...Rn-1,In-1,Rn[,In] (last for npts odd only).
240274
*/
241-
op_T[0] = ((T *)ip)[0]; /* copy R0 */
275+
op_or_buff[0] = ((T *)ip)[0]; /* copy R0 */
242276
if (nout > 1) {
243277
/*
244278
* Copy R1,I1... up to Rn-1,In-1 if possible, stopping earlier
245279
* if not all the input points are needed or if the input is short
246280
* (in the latter case, zeroing after).
247281
*/
248-
copy_data<std::complex<T>>
249-
(ip + step_in, step_in, nin - 1,
250-
(char *)&op_T[1], sizeof(std::complex<T>), (nout - 1) / 2);
282+
copy_input(ip + step_in, step_in, nin - 1,
283+
(std::complex<T> *)&op_or_buff[1], (nout - 1) / 2);
251284
/* For even nout, we still need to set Rn. */
252285
if (nout % 2 == 0) {
253-
op_T[nout - 1] = (nout / 2 >= nin) ? (T)0 :
286+
op_or_buff[nout - 1] = (nout / 2 >= nin) ? (T)0 :
254287
((T *)(ip + (nout / 2) * step_in))[0];
255288
}
256289
}
257-
plan->exec(op_T, *(T *)fp, pocketfft::BACKWARD);
258-
if (op_or_buff == buff) {
259-
copy_data<T>(op_or_buff, sizeof(T), nout,
260-
op, step_out, nout);
290+
plan->exec(op_or_buff, *(T *)fp, pocketfft::BACKWARD);
291+
if (buff != NULL) {
292+
copy_output(op_or_buff, op, step_out, nout);
261293
}
262294
}
263295
free(buff);

0 commit comments

Comments
 (0)
0