ENH: add neon intrinsics to fft radix2&4 by DumbMice · Pull Request #17231 · numpy/numpy · GitHub
ENH: add neon intrinsics to fft radix2&4 #17231

Closed
wants to merge 6 commits into from

Changes from all commits
25 changes: 25 additions & 0 deletions benchmarks/benchmarks/bench_fft.py
@@ -0,0 +1,25 @@
from .common import Benchmark

import numpy as np

class FFT(Benchmark):
    # Time 1-D and 2-D complex FFT/IFFT over power-of-two sizes.
    params = [2**x for x in range(6, 26)]
    param_names = ['size']

    def setup(self, size):
        self.x1 = np.ones(size, dtype=np.complex128)
        self.x1 += 1j
        self.x2 = np.ones((size, 4), dtype=np.complex128)
        self.x2 += 1j

    def time_1dfft(self, size):
        np.fft.fft(self.x1)

    def time_1difft(self, size):
        np.fft.ifft(self.x1)

    def time_2dfft(self, size):
        np.fft.fft2(self.x2)

    def time_2difft(self, size):
        np.fft.ifft2(self.x2)
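
Note: these parametrized cases are run through NumPy's asv benchmark suite. Assuming a standard asv setup under benchmarks/, something like "asv run --bench FFT" followed by "asv compare <before> <after>" can be used to compare timings with and without the SIMD paths; the exact invocation depends on the local setup.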
1 change: 1 addition & 0 deletions numpy/core/src/common/simd/avx2/avx2.h
@@ -65,3 +65,4 @@ typedef struct { __m256d val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
#include "interleave.h"
27 changes: 27 additions & 0 deletions numpy/core/src/common/simd/avx2/interleave.h
@@ -0,0 +1,27 @@
#ifndef NPY_SIMD
#error "Not a standalone header"
#endif

#include "memory.h" // load*
#include "reorder.h" // npyv_zip*

#ifndef _NPY_SIMD_AVX2_INTERLEAVE_H
#define _NPY_SIMD_AVX2_INTERLEAVE_H

NPY_FINLINE npyv_f64x2 npyv_load_deinterleave_f64x2(const double *ptr)
{
    npyv_f64 ab0 = npyv_load_f64(ptr);
    npyv_f64 ab1 = npyv_load_f64(ptr + npyv_nlanes_f64);
    npyv_f64x2 ab, swap_halves = npyv_combine_f64(ab0, ab1);
    ab.val[0] = _mm256_unpacklo_pd(swap_halves.val[0], swap_halves.val[1]);
    ab.val[1] = _mm256_unpackhi_pd(swap_halves.val[0], swap_halves.val[1]);
    return ab;
}

NPY_FINLINE void npyv_store_interleave_f64x2(double *ptr, npyv_f64x2 a)
{
    npyv_f64x2 zip = npyv_zip_f64(a.val[0], a.val[1]);
    npyv_store_f64(ptr, zip.val[0]);
    npyv_store_f64(ptr + npyv_nlanes_f64, zip.val[1]);
}
#endif // _NPY_SIMD_AVX2_INTERLEAVE_H
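
These two helpers define the contract that every backend below implements: npyv_load_deinterleave_f64x2 reads 2*npyv_nlanes_f64 doubles stored as interleaved pairs (here, the real/imaginary parts of complex values) and splits them into a vector of even-indexed and a vector of odd-indexed elements; npyv_store_interleave_f64x2 is the inverse. A scalar model of the intended semantics (an illustrative sketch, not part of the patch):

/* Scalar reference for the new helpers; n plays the role of npyv_nlanes_f64. */
static void load_deinterleave_ref(const double *ptr, double *even, double *odd, int n)
{
    for (int i = 0; i < n; ++i) {
        even[i] = ptr[2*i];      /* e.g. real parts */
        odd[i]  = ptr[2*i + 1];  /* e.g. imaginary parts */
    }
}
static void store_interleave_ref(double *ptr, const double *even, const double *odd, int n)
{
    for (int i = 0; i < n; ++i) {
        ptr[2*i]     = even[i];
        ptr[2*i + 1] = odd[i];
    }
}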
1 change: 1 addition & 0 deletions numpy/core/src/common/simd/avx512/avx512.h
@@ -69,3 +69,4 @@ typedef struct { __m512d val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
#include "interleave.h"
27 changes: 27 additions & 0 deletions numpy/core/src/common/simd/avx512/interleave.h
@@ -0,0 +1,27 @@
#ifndef NPY_SIMD
#error "Not a standalone header"
#endif

#include "memory.h" // load*
#include "reorder.h" // npyv_zip*

#ifndef _NPY_SIMD_AVX512_INTERLEAVE_H
#define _NPY_SIMD_AVX512_INTERLEAVE_H

NPY_FINLINE npyv_f64x2 npyv_load_deinterleave_f64x2(const double *ptr)
{
    npyv_f64 ab0 = npyv_load_f64(ptr);
    npyv_f64 ab1 = npyv_load_f64(ptr + npyv_nlanes_f64);
    npyv_f64x2 ab;
    ab.val[0] = _mm512_permutex2var_pd(ab0, _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14), ab1);
    ab.val[1] = _mm512_permutex2var_pd(ab0, _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15), ab1);
    return ab;
}

NPY_FINLINE void npyv_store_interleave_f64x2(double *ptr, npyv_f64x2 a)
{
    npyv_f64x2 zip = npyv_zip_f64(a.val[0], a.val[1]);
    npyv_store_f64(ptr, zip.val[0]);
    npyv_store_f64(ptr + npyv_nlanes_f64, zip.val[1]);
}
#endif // _NPY_SIMD_AVX512_INTERLEAVE_H
16 changes: 16 additions & 0 deletions numpy/core/src/common/simd/neon/interleave.h
@@ -0,0 +1,16 @@
#ifndef NPY_SIMD
#error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_NEON_INTERLEAVE_H
#define _NPY_SIMD_NEON_INTERLEAVE_H

#if NPY_SIMD_F64
NPY_FINLINE npyv_f64x2 npyv_load_deinterleave_f64x2(const double *ptr)
{ return vld2q_f64(ptr); }

NPY_FINLINE void npyv_store_interleave_f64x2(double *ptr, npyv_f64x2 a)
{ vst2q_f64(ptr, a); }
#endif // NPY_SIMD_F64

#endif
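
On AArch64 the pair maps directly onto vld2q_f64/vst2q_f64, which de-interleave and re-interleave in a single instruction, whereas the x86 and VSX backends need explicit permutes. The payoff in the FFT kernels below is that a complex multiply over de-interleaved data becomes a handful of vector ops. A hedged usage sketch of the new intrinsics (the function and variable names are illustrative, not from the patch; assumes simd/simd.h is included and NPY_SIMD_F64 is nonzero):

/* out[k] = a[k] * b[k] for n complex doubles stored as interleaved (re, im) pairs.
   Sketch only; assumes n is a multiple of npyv_nlanes_f64. */
static void cmul_f64(double *out, const double *a, const double *b, size_t n)
{
    for (size_t i = 0; i < 2*n; i += 2*npyv_nlanes_f64) {
        npyv_f64x2 va = npyv_load_deinterleave_f64x2(a + i);  /* val[0]=re, val[1]=im */
        npyv_f64x2 vb = npyv_load_deinterleave_f64x2(b + i);
        npyv_f64x2 vr;
        /* (ar + i*ai)*(br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br) */
        vr.val[0] = npyv_nmuladd_f64(va.val[1], vb.val[1], npyv_mul_f64(va.val[0], vb.val[0]));
        vr.val[1] = npyv_muladd_f64(va.val[1], vb.val[0], npyv_mul_f64(va.val[0], vb.val[1]));
        npyv_store_interleave_f64x2(out + i, vr);
    }
}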
1 change: 1 addition & 0 deletions numpy/core/src/common/simd/neon/neon.h
@@ -72,3 +72,4 @@ typedef float64x2x3_t npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
#include "interleave.h"
24 changes: 24 additions & 0 deletions numpy/core/src/common/simd/sse/interleave.h
@@ -0,0 +1,24 @@
#ifndef NPY_SIMD
#error "Not a standalone header"
#endif

#include "memory.h" // load*
#include "reorder.h" // npyv_zip*

#ifndef _NPY_SIMD_SSE_INTERLEAVE_H
#define _NPY_SIMD_SSE_INTERLEAVE_H

NPY_FINLINE npyv_f64x2 npyv_load_deinterleave_f64x2(const double *ptr)
{
    npyv_f64 a = npyv_load_f64(ptr);
    npyv_f64 b = npyv_load_f64(ptr + npyv_nlanes_f64);
    return npyv_zip_f64(a, b);
}

NPY_FINLINE void npyv_store_interleave_f64x2(double *ptr, npyv_f64x2 a)
{
    npyv_f64x2 zip = npyv_zip_f64(a.val[0], a.val[1]);
    npyv_store_f64(ptr, zip.val[0]);
    npyv_store_f64(ptr + npyv_nlanes_f64, zip.val[1]);
}
#endif // _NPY_SIMD_SSE_INTERLEAVE_H
1 change: 1 addition & 0 deletions numpy/core/src/common/simd/sse/sse.h
@@ -64,3 +64,4 @@ typedef struct { __m128d val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
#include "interleave.h"
24 changes: 24 additions & 0 deletions numpy/core/src/common/simd/vsx/interleave.h
@@ -0,0 +1,24 @@
#ifndef NPY_SIMD
#error "Not a standalone header"
#endif

#include "memory.h" // load*
#include "reorder.h" // npyv_zip*

#ifndef _NPY_SIMD_VSX_INTERLEAVE_H
#define _NPY_SIMD_VSX_INTERLEAVE_H

NPY_FINLINE npyv_f64x2 npyv_load_deinterleave_f64x2(const double *ptr)
{
    npyv_f64 a = npyv_load_f64(ptr);
    npyv_f64 b = npyv_load_f64(ptr + npyv_nlanes_f64);
    return npyv_zip_f64(a, b);
}

NPY_FINLINE void npyv_store_interleave_f64x2(double *ptr, npyv_f64x2 a)
{
    npyv_f64x2 zip = npyv_zip_f64(a.val[0], a.val[1]);
    npyv_store_f64(ptr, zip.val[0]);
    npyv_store_f64(ptr + npyv_nlanes_f64, zip.val[1]);
}
#endif // _NPY_SIMD_VSX_INTERLEAVE_H
1 change: 1 addition & 0 deletions numpy/core/src/common/simd/vsx/vsx.h
@@ -62,3 +62,4 @@ typedef struct { npyv_f64 val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
#include "interleave.h"
150 changes: 146 additions & 4 deletions numpy/fft/_pocketfft.c
@@ -20,6 +20,9 @@
#include <stdlib.h>

#include "npy_config.h"

#include "simd/simd.h"

#define restrict NPY_RESTRICT

#define RALLOC(type,num) \
@@ -321,7 +324,29 @@ NOINLINE static void pass2b (size_t ido, size_t l1, const cmplx * restrict cc,
for (size_t k=0; k<l1; ++k)
{
PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
for (size_t i=1; i<ido; ++i)
size_t i=1;
#if NPY_SIMD_F64
size_t step=NPY_SIMD_WIDTH/sizeof(double);
for (; i+step<=ido; i+=step)
{
npyv_f64x2 c, d, tmp, twd;
c = npyv_load_deinterleave_f64x2((double*)&cc[(i)+ido*((0)+cdim*(k))]);
d = npyv_load_deinterleave_f64x2((double*)&cc[(i)+ido*((1)+cdim*(k))]);
tmp.val[0] = npyv_add_f64(c.val[0], d.val[0]);
tmp.val[1] = npyv_add_f64(c.val[1], d.val[1]);
npyv_store_interleave_f64x2((double*)&ch[(i)+ido*((k)+l1*(0))], tmp);
tmp.val[0] = npyv_sub_f64(c.val[0], d.val[0]);
tmp.val[1] = npyv_sub_f64(c.val[1], d.val[1]);
twd = npyv_load_deinterleave_f64x2((double*)&wa[(i)-1+(0)*(ido-1)]);
c.val[0] = npyv_mul_f64(twd.val[0], tmp.val[0]);
c.val[1] = npyv_mul_f64(twd.val[0], tmp.val[1]);
c.val[0] = npyv_nmuladd_f64(twd.val[1], tmp.val[1], c.val[0]);
c.val[1] = npyv_muladd_f64(twd.val[1], tmp.val[0], c.val[1]);
npyv_store_interleave_f64x2((double*)&ch[(i)+ido*((k)+l1*(1))], c);
}
#endif
// scalar operations
for (; i<ido; ++i)
{
cmplx t;
PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
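
In the vectorized body above, tmp holds the sum and then the difference of CC(i,0,k) and CC(i,1,k); the sum goes straight to CH(i,k,0) and the difference is multiplied by the twiddle factor wa[i-1]. Since npyv_muladd_f64(a,b,c) computes a*b + c and npyv_nmuladd_f64(a,b,c) computes c - a*b, the four FMA lines form an ordinary complex product with wa for the backward transform, while pass2f below uses the conjugated twiddle. A self-contained scalar model of one element (a sketch for illustration; the actual scalar tail keeps using the PMC macro):

/* Backward radix-2 step for one element: out0 = c0 + c1, out1 = (c0 - c1) * w.
   The forward pass (pass2f) uses (c0 - c1) * conj(w) instead. */
typedef struct { double r, i; } cpx;

static void radix2_backward_elem(cpx c0, cpx c1, cpx w, cpx *out0, cpx *out1)
{
    cpx t = { c0.r - c1.r, c0.i - c1.i };
    out0->r = c0.r + c1.r;
    out0->i = c0.i + c1.i;
    out1->r = w.r*t.r - w.i*t.i;
    out1->i = w.r*t.i + w.i*t.r;
}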
@@ -342,7 +367,29 @@ NOINLINE static void pass2f (size_t ido, size_t l1, const cmplx * restrict cc,
for (size_t k=0; k<l1; ++k)
{
PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
for (size_t i=1; i<ido; ++i)
size_t i=1;
#if NPY_SIMD_F64
size_t step=NPY_SIMD_WIDTH/sizeof(double);
for (; i+step<=ido; i+=step)
{
npyv_f64x2 c, d, tmp, twd;
c = npyv_load_deinterleave_f64x2((double*)&cc[(i)+ido*((0)+cdim*(k))]);
d = npyv_load_deinterleave_f64x2((double*)&cc[(i)+ido*((1)+cdim*(k))]);
tmp.val[0] = npyv_add_f64(c.val[0], d.val[0]);
tmp.val[1] = npyv_add_f64(c.val[1], d.val[1]);
npyv_store_interleave_f64x2((double*)&ch[(i)+ido*((k)+l1*(0))], tmp);
tmp.val[0] = npyv_sub_f64(c.val[0], d.val[0]);
tmp.val[1] = npyv_sub_f64(c.val[1], d.val[1]);
twd = npyv_load_deinterleave_f64x2((double*)&wa[(i)-1+(0)*(ido-1)]);
c.val[0] = npyv_mul_f64(twd.val[0], tmp.val[0]);
c.val[1] = npyv_mul_f64(twd.val[0], tmp.val[1]);
c.val[0] = npyv_muladd_f64(twd.val[1], tmp.val[1], c.val[0]);
c.val[1] = npyv_nmuladd_f64(twd.val[1], tmp.val[0], c.val[1]);
npyv_store_interleave_f64x2((double*)&ch[(i)+ido*((k)+l1*(1))], c);
}
#endif
// scalar operations
for (; i<ido; ++i)
{
cmplx t;
PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
@@ -467,7 +514,54 @@ NOINLINE static void pass4b (size_t ido, size_t l1, const cmplx * restrict cc,
PMC(CH(0,k,0),CH(0,k,2),t2,t3)
PMC(CH(0,k,1),CH(0,k,3),t1,t4)
}
for (size_t i=1; i<ido; ++i)
size_t i=1;
#if NPY_SIMD_F64
size_t step=NPY_SIMD_WIDTH/sizeof(double);
for (; i+step<=ido; i+=step)
{
npyv_f64x2 c1, c2, c3, c4, t1, t2, t3, t4, wa0, wa1, wa2;
c1 = npyv_load_deinterleave_f64x2((double*)&cc[(i)+ido*((0)+cdim*(k))]);
c3 = npyv_load_deinterleave_f64x2((double*)&cc[(i)+ido*((2)+cdim*(k))]);
c2 = npyv_load_deinterleave_f64x2((double*)&cc[(i)+ido*((1)+cdim*(k))]);
c4 = npyv_load_deinterleave_f64x2((double*)&cc[(i)+ido*((3)+cdim*(k))]);
wa0 = npyv_load_deinterleave_f64x2((double*)&wa[(i)-1+(0)*(ido-1)]);
wa1 = npyv_load_deinterleave_f64x2((double*)&wa[(i)-1+(1)*(ido-1)]);
wa2 = npyv_load_deinterleave_f64x2((double*)&wa[(i)-1+(2)*(ido-1)]);
t2.val[0] = npyv_add_f64(c1.val[0], c3.val[0]);
t2.val[1] = npyv_add_f64(c1.val[1], c3.val[1]);
t1.val[0] = npyv_sub_f64(c1.val[0], c3.val[0]);
t1.val[1] = npyv_sub_f64(c1.val[1], c3.val[1]);
t3.val[0] = npyv_add_f64(c2.val[0], c4.val[0]);
t3.val[1] = npyv_add_f64(c2.val[1], c4.val[1]);
t4.val[0] = npyv_sub_f64(c2.val[0], c4.val[0]);
t4.val[1] = npyv_sub_f64(c2.val[1], c4.val[1]);
c1.val[0] = npyv_add_f64(t2.val[0], t3.val[0]);
c1.val[1] = npyv_add_f64(t2.val[1], t3.val[1]);
c3.val[0] = npyv_sub_f64(t2.val[0], t3.val[0]);
c3.val[1] = npyv_sub_f64(t2.val[1], t3.val[1]);
c2.val[0] = npyv_sub_f64(t1.val[0], t4.val[1]);
c2.val[1] = npyv_add_f64(t1.val[1], t4.val[0]);
c4.val[0] = npyv_add_f64(t1.val[0], t4.val[1]);
c4.val[1] = npyv_sub_f64(t1.val[1], t4.val[0]);
t1.val[0] = npyv_mul_f64(wa0.val[0], c2.val[0]);
t1.val[1] = npyv_mul_f64(wa0.val[0], c2.val[1]);
t1.val[0] = npyv_nmuladd_f64(wa0.val[1], c2.val[1], t1.val[0]);
t1.val[1] = npyv_muladd_f64(wa0.val[1], c2.val[0], t1.val[1]);
t2.val[0] = npyv_mul_f64(wa1.val[0], c3.val[0]);
t2.val[1] = npyv_mul_f64(wa1.val[0], c3.val[1]);
t2.val[0] = npyv_nmuladd_f64(wa1.val[1], c3.val[1], t2.val[0]);
t2.val[1] = npyv_muladd_f64(wa1.val[1], c3.val[0], t2.val[1]);
t3.val[0] = npyv_mul_f64(wa2.val[0], c4.val[0]);
t3.val[1] = npyv_mul_f64(wa2.val[0], c4.val[1]);
t3.val[0] = npyv_nmuladd_f64(wa2.val[1], c4.val[1], t3.val[0]);
t3.val[1] = npyv_muladd_f64(wa2.val[1], c4.val[0], t3.val[1]);
npyv_store_interleave_f64x2((double*)&ch[(i)+ido*((k)+l1*(0))], c1);
npyv_store_interleave_f64x2((double*)&ch[(i)+ido*((k)+l1*(1))], t1);
npyv_store_interleave_f64x2((double*)&ch[(i)+ido*((k)+l1*(2))], t2);
npyv_store_interleave_f64x2((double*)&ch[(i)+ido*((k)+l1*(3))], t3);
}
#endif
for (; i<ido; ++i)
{
cmplx c2, c3, c4, t1, t2, t3, t4;
cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
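
The radix-4 kernel applies the same idea twice over: a 4-point butterfly on the de-interleaved real/imag vectors (t2 = c1 + c3, t1 = c1 - c3, t3 = c2 + c4, t4 = c2 - c4), where multiplying by +/-i is just a swap of the real and imaginary vectors with one sign flipped, followed by three twiddle multiplications with the same FMA pattern as the radix-2 pass. pass4b rotates t4 by +i for output 1 and by -i for output 3; pass4f below swaps those signs and uses conjugated twiddles. A scalar model of the butterfly core (sketch only; it reuses the cpx struct from the radix-2 sketch, and outputs 1-3 still need the twiddle multiply):

/* Backward radix-4 butterfly for one element; in[0..3] are CC(i,0..3,k). */
static void radix4_backward_butterfly(const cpx in[4], cpx out[4])
{
    cpx t2 = { in[0].r + in[2].r, in[0].i + in[2].i };
    cpx t1 = { in[0].r - in[2].r, in[0].i - in[2].i };
    cpx t3 = { in[1].r + in[3].r, in[1].i + in[3].i };
    cpx t4 = { in[1].r - in[3].r, in[1].i - in[3].i };
    out[0] = (cpx){ t2.r + t3.r, t2.i + t3.i };  /* t2 + t3 */
    out[2] = (cpx){ t2.r - t3.r, t2.i - t3.i };  /* t2 - t3 */
    out[1] = (cpx){ t1.r - t4.i, t1.i + t4.r };  /* t1 + i*t4 (forward: t1 - i*t4) */
    out[3] = (cpx){ t1.r + t4.i, t1.i - t4.r };  /* t1 - i*t4 (forward: t1 + i*t4) */
}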
@@ -509,7 +603,54 @@ NOINLINE static void pass4f (size_t ido, size_t l1, const cmplx * restrict cc,
PMC(CH(0,k,0),CH(0,k,2),t2,t3)
PMC (CH(0,k,1),CH(0,k,3),t1,t4)
}
for (size_t i=1; i<ido; ++i)
size_t i=1;
#if NPY_SIMD_F64
size_t step=NPY_SIMD_WIDTH/sizeof(double);
for (; i+step<=ido; i+=step)
{
npyv_f64x2 c1, c2, c3, c4, t1, t2, t3, t4, wa0, wa1, wa2;
c1 = npyv_load_deinterleave_f64x2((double*)&cc[(i)+ido*((0)+cdim*(k))]);
c3 = npyv_load_deinterleave_f64x2((double*)&cc[(i)+ido*((2)+cdim*(k))]);
c2 = npyv_load_deinterleave_f64x2((double*)&cc[(i)+ido*((1)+cdim*(k))]);
c4 = npyv_load_deinterleave_f64x2((double*)&cc[(i)+ido*((3)+cdim*(k))]);
wa0 = npyv_load_deinterleave_f64x2((double*)&wa[(i)-1+(0)*(ido-1)]);
wa1 = npyv_load_deinterleave_f64x2((double*)&wa[(i)-1+(1)*(ido-1)]);
wa2 = npyv_load_deinterleave_f64x2((double*)&wa[(i)-1+(2)*(ido-1)]);
t2.val[0] = npyv_add_f64(c1.val[0], c3.val[0]);
t2.val[1] = npyv_add_f64(c1.val[1], c3.val[1]);
t1.val[0] = npyv_sub_f64(c1.val[0], c3.val[0]);
t1.val[1] = npyv_sub_f64(c1.val[1], c3.val[1]);
t3.val[0] = npyv_add_f64(c2.val[0], c4.val[0]);
t3.val[1] = npyv_add_f64(c2.val[1], c4.val[1]);
t4.val[0] = npyv_sub_f64(c2.val[0], c4.val[0]);
t4.val[1] = npyv_sub_f64(c2.val[1], c4.val[1]);
c1.val[0] = npyv_add_f64(t2.val[0], t3.val[0]);
c1.val[1] = npyv_add_f64(t2.val[1], t3.val[1]);
c3.val[0] = npyv_sub_f64(t2.val[0], t3.val[0]);
c3.val[1] = npyv_sub_f64(t2.val[1], t3.val[1]);
c2.val[0] = npyv_add_f64(t1.val[0], t4.val[1]);
c2.val[1] = npyv_sub_f64(t1.val[1], t4.val[0]);
c4.val[0] = npyv_sub_f64(t1.val[0], t4.val[1]);
c4.val[1] = npyv_add_f64(t1.val[1], t4.val[0]);
t1.val[0] = npyv_mul_f64(wa0.val[0], c2.val[0]);
t1.val[1] = npyv_mul_f64(wa0.val[0], c2.val[1]);
t1.val[0] = npyv_muladd_f64(wa0.val[1], c2.val[1], t1.val[0]);
t1.val[1] = npyv_nmuladd_f64(wa0.val[1], c2.val[0], t1.val[1]);
t2.val[0] = npyv_mul_f64(wa1.val[0], c3.val[0]);
t2.val[1] = npyv_mul_f64(wa1.val[0], c3.val[1]);
t2.val[0] = npyv_muladd_f64(wa1.val[1], c3.val[1], t2.val[0]);
t2.val[1] = npyv_nmuladd_f64(wa1.val[1], c3.val[0], t2.val[1]);
t3.val[0] = npyv_mul_f64(wa2.val[0], c4.val[0]);
t3.val[1] = npyv_mul_f64(wa2.val[0], c4.val[1]);
t3.val[0] = npyv_muladd_f64(wa2.val[1], c4.val[1], t3.val[0]);
t3.val[1] = npyv_nmuladd_f64(wa2.val[1], c4.val[0], t3.val[1]);
npyv_store_interleave_f64x2((double*)&ch[(i)+ido*((k)+l1*(0))], c1);
npyv_store_interleave_f64x2((double*)&ch[(i)+ido*((k)+l1*(1))], t1);
npyv_store_interleave_f64x2((double*)&ch[(i)+ido*((k)+l1*(2))], t2);
npyv_store_interleave_f64x2((double*)&ch[(i)+ido*((k)+l1*(3))], t3);
}
#endif
for (; i<ido; ++i)
{
cmplx c2, c3, c4, t1, t2, t3, t4;
cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
@@ -954,6 +1095,7 @@ NOINLINE WARN_UNUSED_RESULT static int pass_all(cfftp_plan plan, cmplx c[], doub
#undef ADDC
#undef PMC

// complex Fourier transform
NOINLINE WARN_UNUSED_RESULT
static int cfftp_forward(cfftp_plan plan, double c[], double fct)
{ return pass_all(plan,(cmplx *)c, fct, -1); }