23
23
#include " pocketfft/pocketfft_hdronly.h"
24
24
25
25
/*
26
- * Copy all nin elements of input to the first nin of the output,
27
- * and any set any remaining nout- nin output elements to 0
28
- * (if nout < nin, copy only nout) .
26
+ * Transfer to and from a contiguous buffer.
27
+ * copy_input: copy min( nin, n) elements from input to buffer and zero rest.
28
+ * copy_output: copy n elements from buffer to output .
29
29
*/
30
30
template <typename T>
31
31
static inline void
32
- copy_data (char * in, npy_intp step_in, npy_intp nin,
33
- char * out, npy_intp step_out, npy_intp nout )
32
+ copy_input (char * in, npy_intp step_in, size_t nin,
33
+ T buff[], size_t n )
34
34
{
35
- npy_intp ncopy = nin <= nout? nin : nout;
36
- char *op = out;
37
- if (ncopy > 0 ) {
38
- char *ip = in;
39
- for (npy_intp i = 0 ; i < ncopy; i++, ip += step_in, op += step_out) {
40
- *(T *)op = *(T *)ip;
41
- }
42
- }
43
- else {
44
- assert (ncopy == 0 );
35
+ size_t ncopy = nin <= n ? nin : n;
36
+ char *ip = in;
37
+ size_t i;
38
+ for (i = 0 ; i < ncopy; i++, ip += step_in) {
39
+ buff[i] = *(T *)ip;
45
40
}
46
- if (nout > ncopy) {
47
- for (npy_intp i = ncopy; i < nout; i++, op += step_out) {
48
- *(T *)op = 0 ;
49
- }
41
+ for (; i < n; i++) {
42
+ buff[i] = 0 ;
50
43
}
51
44
}
52
45
46
+ template <typename T>
47
+ static inline void
48
+ copy_output (T buff[], char *out, npy_intp step_out, size_t n)
49
+ {
50
+ char *op = out;
51
+ for (size_t i = 0 ; i < n; i++, op += step_out) {
52
+ *(T *)op = buff[i];
53
+ }
54
+ }
53
55
54
56
/*
55
- * Loops calling the pocketfft code.
57
+ * Gufunc loops calling the pocketfft code.
56
58
*/
57
59
template <typename T>
58
60
static void
59
61
fft_loop (char **args, npy_intp const *dimensions, ptrdiff_t const *steps,
60
62
void *func)
61
63
{
62
64
char *ip = args[0 ], *fp = args[1 ], *op = args[2 ];
63
- npy_intp n_outer = dimensions[0 ];
65
+ size_t n_outer = ( size_t ) dimensions[0 ];
64
66
ptrdiff_t si = steps[0 ], sf = steps[1 ], so = steps[2 ];
65
- npy_intp nin = dimensions[1 ], nout = dimensions[2 ];
67
+ size_t nin = ( size_t ) dimensions[1 ], nout = ( size_t ) dimensions[2 ];
66
68
ptrdiff_t step_in = steps[3 ], step_out = steps[4 ];
67
69
bool direction = *((bool *)func); /* pocketfft::FORWARD or BACKWARD */
68
70
69
71
assert (nout > 0 );
70
72
71
- if (nin >= nout && (sf == 0 || n_outer == 1 )) {
72
- /*
73
- * For the common case of nin == nout and fixed factor, we can call
74
- * pocketfft directly, and benefit from vectorization for >1D. For
75
- * nin>nout, this just removes the extra input points, as required.
76
- */
77
- std::vector<size_t > shape = { (size_t ) n_outer, (size_t ) nout };
73
+ #ifndef POCKETFFT_NO_VECTORS
74
+ /*
75
+ * For the common case of nin >= nout, fixed factor, and suitably sized
76
+ * outer loop, we call pocketfft directly to benefit from its vectorization.
77
+ * (For nin>nout, this just removes the extra input points, as required.)
78
+ */
79
+ constexpr auto vlen = pocketfft::detail::VLEN<T>::val;
80
+ if (vlen > 1 && n_outer >= vlen && nin >= nout && sf == 0 ) {
81
+ std::vector<size_t > shape = { n_outer, nout };
78
82
std::vector<ptrdiff_t > strides_in = { si, step_in };
79
83
std::vector<ptrdiff_t > strides_out = { so, step_out};
80
84
std::vector<size_t > axes = { 1 };
81
85
pocketfft::c2c (shape, strides_in, strides_out, axes, direction,
82
86
(std::complex<T> *)ip, (std::complex<T> *)op, *(T *)fp);
87
+ return ;
83
88
}
84
- else {
85
- /*
86
- * Input is short, so copy to output, padding with zeros.
87
- * (also covers variable factor, where copy may not strictly be needed)
88
- */
89
- std::vector<size_t > axes = { 0 };
90
- std::vector<size_t > shape = { (size_t )nout };
91
- std::vector<ptrdiff_t > strides_out = { step_out };
92
- for (npy_intp i = 0 ; i < n_outer; i++, ip += si, fp += sf, op += so) {
93
- copy_data<std::complex<T>>(ip, step_in, nin, op, step_out, nout);
94
- pocketfft::c2c (shape, strides_out, strides_out, axes, direction,
95
- (std::complex<T> *)op, (std::complex<T> *)op, *(T *)fp);
89
+ #endif
90
+ /*
91
+ * Otherwise, use a non-vectorized loop in which we try to minimize copies.
92
+ * We do still need a buffer if the output is not contiguous.
93
+ */
94
+ auto plan = pocketfft::detail::get_plan<pocketfft::detail::pocketfft_c<T>>(nout);
95
+ char *buff = NULL ;
96
+ if (step_out != sizeof (std::complex<T>)) {
97
+ buff = (char *)malloc (nout * sizeof (std::complex<T>));
98
+ if (buff == NULL ) {
99
+ goto fail;
100
+ }
101
+ }
102
+ for (size_t i = 0 ; i < n_outer; i++, ip += si, fp += sf, op += so) {
103
+ std::complex<T> *op_or_buff = (std::complex<T> *)(buff == NULL ? op : buff);
104
+ if (ip != (char *)op_or_buff) {
105
+ copy_input (ip, step_in, nin, op_or_buff, nout);
106
+ }
107
+ plan->exec ((pocketfft::detail::cmplx<T> *)op_or_buff, *(T *)fp, direction);
108
+ if (buff != NULL ) {
109
+ copy_output (op_or_buff, op, step_out, nout);
96
110
}
97
111
}
112
+ free (buff);
113
+ return ;
114
+
115
+ fail:
116
+ /* TODO: Requires use of new ufunc API to indicate error return */
117
+ NPY_ALLOW_C_API_DEF
118
+ NPY_ALLOW_C_API;
119
+ PyErr_NoMemory ();
120
+ NPY_DISABLE_C_API;
121
+ return ;
98
122
}
99
123
100
124
template <typename T>
101
125
static void
102
126
rfft_impl (char **args, npy_intp const *dimensions, npy_intp const *steps,
103
- void *func, npy_intp npts)
127
+ void *func, size_t npts)
104
128
{
105
129
char *ip = args[0 ], *fp = args[1 ], *op = args[2 ];
106
- npy_intp n_outer = dimensions[0 ];
130
+ size_t n_outer = ( size_t ) dimensions[0 ];
107
131
ptrdiff_t si = steps[0 ], sf = steps[1 ], so = steps[2 ];
108
- npy_intp nin = dimensions[1 ], nout = dimensions[2 ];
132
+ size_t nin = ( size_t ) dimensions[1 ], nout = ( size_t ) dimensions[2 ];
109
133
ptrdiff_t step_in = steps[3 ], step_out = steps[4 ];
110
134
111
- assert (nout == npts / 2 + 1 );
135
+ assert (nout > 0 && nout == npts / 2 + 1 );
112
136
113
- if (nin >= npts && (sf == 0 || n_outer == 1 )) {
114
- std::vector<size_t > shape_in = { (size_t ) n_outer, (size_t ) npts };
137
+ #ifndef POCKETFFT_NO_VECTORS
138
+ /*
139
+ * Call pocketfft directly if vectorization is possible.
140
+ */
141
+ constexpr auto vlen = pocketfft::detail::VLEN<T>::val;
142
+ if (vlen > 1 && n_outer >= vlen && nin >= npts && sf == 0 ) {
143
+ std::vector<size_t > shape_in = { n_outer, npts };
115
144
std::vector<ptrdiff_t > strides_in = { si, step_in };
116
145
std::vector<ptrdiff_t > strides_out = { so, step_out};
117
146
std::vector<size_t > axes = { 1 };
118
147
pocketfft::r2c (shape_in, strides_in, strides_out, axes, pocketfft::FORWARD,
119
148
(T *)ip, (std::complex<T> *)op, *(T *)fp);
120
149
return ;
121
150
}
151
+ #endif
122
152
/*
123
- * Input short, so need a padded copy; we'll use out if contiguous .
124
- * We also use internal pocketfft routines to avoid a second copy .
153
+ * Otherwise, use a non-vectorized loop in which we try to minimize copies .
154
+ * We do still need a buffer if the output is not contiguous .
125
155
*/
126
156
auto plan = pocketfft::detail::get_plan<pocketfft::detail::pocketfft_r<T>>(npts);
127
157
char *buff = NULL ;
@@ -131,9 +161,8 @@ rfft_impl(char **args, npy_intp const *dimensions, npy_intp const *steps,
131
161
goto fail;
132
162
}
133
163
}
134
- for (npy_intp i = 0 ; i < n_outer; i++, ip += si, fp += sf, op += so) {
135
- char *op_or_buff = buff == NULL ? op : buff;
136
- T *op_T = (T *)op_or_buff;
164
+ for (size_t i = 0 ; i < n_outer; i++, ip += si, fp += sf, op += so) {
165
+ std::complex<T> *op_or_buff = (std::complex<T> *)(buff == NULL ? op : buff);
137
166
/*
138
167
* The internal pocketfft routines work in-place and for real
139
168
* transforms the frequency data thus needs to be compressed, using
@@ -146,15 +175,11 @@ rfft_impl(char **args, npy_intp const *dimensions, npy_intp const *steps,
146
175
* create I0=0. Note that copy_data will zero the In component for
147
176
* even number of points.
148
177
*/
149
- copy_data<T>(ip, step_in, nin,
150
- (char *)&op_T[1 ], sizeof (T), nout*2 - 1 );
151
- plan->exec (&op_T[1 ], *(T *)fp, pocketfft::FORWARD);
152
- op_T[0 ] = op_T[1 ];
153
- op_T[1 ] = (T)0 ;
154
- if (op_or_buff == buff) {
155
- copy_data<std::complex<T>>
156
- (op_or_buff, sizeof (std::
F42D
complex<T>), nout,
157
- op, step_out, nout);
178
+ copy_input (ip, step_in, nin, &((T *)op_or_buff)[1 ], nout*2 - 1 );
179
+ plan->exec (&((T *)op_or_buff)[1 ], *(T *)fp, pocketfft::FORWARD);
180
+ op_or_buff[0 ] = op_or_buff[0 ].imag (); // I0->R0, I0=0
181
+ if (buff != NULL ) {
182
+ copy_output (op_or_buff, op, step_out, nout);
158
183
}
159
184
}
160
185
free (buff);
@@ -179,15 +204,19 @@ template <typename T>
179
204
static void
180
205
rfft_n_even_loop (char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
181
206
{
182
- npy_intp npts = 2 * dimensions[2 ] - 2 ;
207
+ size_t nout = (size_t )dimensions[2 ];
208
+ assert (nout > 0 );
209
+ size_t npts = 2 * nout - 2 ;
183
210
rfft_impl<T>(args, dimensions, steps, func, npts);
184
211
}
185
212
186
213
template <typename T>
187
214
static void
188
215
rfft_n_odd_loop (char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
189
216
{
190
- npy_intp npts = 2 * dimensions[2 ] - 1 ;
217
+ size_t nout = (size_t )dimensions[2 ];
218
+ assert (nout > 0 );
219
+ size_t npts = 2 * nout - 1 ;
191
220
rfft_impl<T>(args, dimensions, steps, func, npts);
192
221
}
193
222
@@ -196,27 +225,33 @@ static void
196
225
irfft_loop (char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
197
226
{
198
227
char *ip = args[0 ], *fp = args[1 ], *op = args[2 ];
199
- npy_intp n_outer = dimensions[0 ];
228
+ size_t n_outer = ( size_t ) dimensions[0 ];
200
229
ptrdiff_t si = steps[0 ], sf = steps[1 ], so = steps[2 ];
201
- npy_intp nin = dimensions[1 ], nout = dimensions[2 ];
230
+ size_t nin = ( size_t ) dimensions[1 ], nout = ( size_t ) dimensions[2 ];
202
231
ptrdiff_t step_in = steps[3 ], step_out = steps[4 ];
203
232
204
- npy_intp npts_in = nout / 2 + 1 ;
233
+ size_t npts_in = nout / 2
179B
+ 1 ;
205
234
206
235
assert (nout > 0 );
207
236
208
- if (nin >= npts_in && (sf == 0 || n_outer == 1 )) {
237
+ #ifndef POCKETFFT_NO_VECTORS
238
+ /*
239
+ * Call pocketfft directly if vectorization is possible.
240
+ */
241
+ constexpr auto vlen = pocketfft::detail::VLEN<T>::val;
242
+ if (vlen > 1 && n_outer >= vlen && nin >= npts_in && sf == 0 ) {
209
243
std::vector<size_t > axes = { 1 };
210
- std::vector<size_t > shape_out = { ( size_t ) n_outer, ( size_t ) nout };
244
+ std::vector<size_t > shape_out = { n_outer, nout };
211
245
std::vector<ptrdiff_t > strides_in = { si, step_in };
212
246
std::vector<ptrdiff_t > strides_out = { so, step_out};
213
247
pocketfft::c2r (shape_out, strides_in, strides_out, axes, pocketfft::BACKWARD,
214
248
(std::complex<T> *)ip, (T *)op, *(T *)fp);
215
249
return ;
216
250
}
251
+ #endif
217
252
/*
218
- * Input short, so make a padded copy; we want to use out if possible,
219
- * so go directly to FFTpack format which has the same number of bytes .
253
+ * Otherwise, use a non-vectorized loop in which we try to minimize copies.
254
+ * We do still need a buffer if the output is not contiguous .
220
255
*/
221
256
auto plan = pocketfft::detail::get_plan<pocketfft::detail::pocketfft_r<T>>(nout);
222
257
char *buff = NULL ;
@@ -226,9 +261,8 @@ irfft_loop(char **args, npy_intp const *dimensions, npy_intp const *steps, void
226
261
goto fail;
227
262
}
228
263
}
229
- for (npy_intp i = 0 ; i < n_outer; i++, ip += si, fp += sf, op += so) {
230
- char *op_or_buff = buff == NULL ? op : buff;
231
- T *op_T = (T *)op_or_buff;
264
+ for (size_t i = 0 ; i < n_outer; i++, ip += si, fp += sf, op += so) {
265
+ T *op_or_buff = (T *)(buff == NULL ? op : buff);
232
266
/*
233
267
* Pocket_fft works in-place and for inverse real transforms the
234
268
* frequency data thus needs to be compressed, removing the imaginary
@@ -238,26 +272,24 @@ irfft_loop(char **args, npy_intp const *dimensions, npy_intp const *steps, void
238
272
* the data to the buffer in the following order (also used by
239
273
* FFTpack): R0,R1,I1,...Rn-1,In-1,Rn[,In] (last for npts odd only).
240
274
*/
241
- op_T [0 ] = ((T *)ip)[0 ]; /* copy R0 */
275
+ op_or_buff [0 ] = ((T *)ip)[0 ]; /* copy R0 */
242
276
if (nout > 1 ) {
243
277
/*
244
278
* Copy R1,I1... up to Rn-1,In-1 if possible, stopping earlier
245
279
* if not all the input points are needed or if the input is short
246
280
* (in the latter case, zeroing after).
247
281
*/
248
- copy_data<std::complex<T>>
249
- (ip + step_in, step_in, nin - 1 ,
250
- (char *)&op_T[1 ], sizeof (std::complex<T>), (nout - 1 ) / 2 );
282
+ copy_input (ip + step_in, step_in, nin - 1 ,
283
+ (std::complex<T> *)&op_or_buff[1 ], (nout - 1 ) / 2 );
251
284
/* For even nout, we still need to set Rn. */
252
285
if (nout % 2 == 0 ) {
253
- op_T [nout - 1 ] = (nout / 2 >= nin) ? (T)0 :
286
+ op_or_buff [nout - 1 ] = (nout / 2 >= nin) ? (T)0 :
254
287
((T *)(ip + (nout / 2 ) * step_in))[0 ];
255
288
}
256
289
}
257
- plan->exec (op_T, *(T *)fp, pocketfft::BACKWARD);
258
- if (op_or_buff == buff) {
259
- copy_data<T>(op_or_buff, sizeof (T), nout,
260
- op, step_out, nout);
290
+ plan->exec (op_or_buff, *(T *)fp, pocketfft::BACKWARD);
291
+ if (buff != NULL ) {
292
+ copy_output (op_or_buff, op, step_out, nout);
261
293
}
262
294
}
263
295
free (buff);
0 commit comments