diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h index c99d628ee408..0641f2314ec6 100644 --- a/numpy/core/src/common/simd/avx2/avx2.h +++ b/numpy/core/src/common/simd/avx2/avx2.h @@ -5,6 +5,8 @@ #define NPY_SIMD 256 #define NPY_SIMD_WIDTH 32 #define NPY_SIMD_F64 1 +// Enough limit to allow us to use _mm256_i32gather_* +#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8) typedef __m256i npyv_u8; typedef __m256i npyv_s8; diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h index 5ea7414fdf5c..e27bf15fec2e 100644 --- a/numpy/core/src/common/simd/avx2/memory.h +++ b/numpy/core/src/common/simd/avx2/memory.h @@ -2,6 +2,8 @@ #error "Not a standalone header" #endif +#include "misc.h" + #ifndef _NPY_SIMD_AVX2_MEMORY_H #define _NPY_SIMD_AVX2_MEMORY_H @@ -66,5 +68,289 @@ NPYV_IMPL_AVX2_MEM_INT(npy_int64, s64) // store higher part #define npyv_storeh_f32(PTR, VEC) _mm_storeu_ps(PTR, _mm256_extractf128_ps(VEC, 1)) #define npyv_storeh_f64(PTR, VEC) _mm_storeu_pd(PTR, _mm256_extractf128_pd(VEC, 1)) +/*************************** + * Non-contiguous Load + ***************************/ +//// 32 +NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride) +{ + assert(llabs(stride) <= NPY_SIMD_MAXLOAD_STRIDE32); + const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + const __m256i idx = _mm256_mullo_epi32(_mm256_set1_epi32((int)stride), steps); + return _mm256_i32gather_epi32((const int*)ptr, idx, 4); +} +NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride) +{ return npyv_loadn_u32((const npy_uint32*)ptr, stride); } +NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride) +{ return _mm256_castsi256_ps(npyv_loadn_u32((const npy_uint32*)ptr, stride)); } +//// 64 +#if 0 // slower +NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) +{ + const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride); + return _mm256_i64gather_epi64((const void*)ptr, idx, 8); +} +NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) +{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); } +NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) +{ return _mm256_castsi256_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); } +#endif +NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) +{ + __m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr)); + __m128d a2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2))); + __m128d a01 = _mm_loadh_pd(a0, ptr + stride); + __m128d a23 = _mm_loadh_pd(a2, ptr + stride*3); + return _mm256_insertf128_pd(_mm256_castpd128_pd256(a01), a23, 1); +} +NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) +{ return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); } +NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) +{ return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); } +/*************************** + * Non-contiguous Store + ***************************/ +//// 32 +NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ + __m128i a0 = _mm256_castsi256_si128(a); + __m128i a1 = _mm256_extracti128_si256(a, 1); + ptr[stride * 0] = _mm_cvtsi128_si32(a0); + ptr[stride * 1] = _mm_extract_epi32(a0, 1); + ptr[stride * 2] = _mm_extract_epi32(a0, 2); + ptr[stride * 3] = _mm_extract_epi32(a0, 3); + ptr[stride * 4] = _mm_cvtsi128_si32(a1); + 
ptr[stride * 5] = _mm_extract_epi32(a1, 1); + ptr[stride * 6] = _mm_extract_epi32(a1, 2); + ptr[stride * 7] = _mm_extract_epi32(a1, 3); +} +NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ npyv_storen_s32((npy_int32*)ptr, stride, a); } +NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen_s32((npy_int32*)ptr, stride, _mm256_castps_si256(a)); } +//// 64 +NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ + __m128d a0 = _mm256_castpd256_pd128(a); + __m128d a1 = _mm256_extractf128_pd(a, 1); + _mm_storel_pd(ptr + stride * 0, a0); + _mm_storeh_pd(ptr + stride * 1, a0); + _mm_storel_pd(ptr + stride * 2, a1); + _mm_storeh_pd(ptr + stride * 3, a1); +} +NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ npyv_storen_f64((double*)ptr, stride, _mm256_castsi256_pd(a)); } +NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ npyv_storen_f64((double*)ptr, stride, _mm256_castsi256_pd(a)); } + +/********************************* + * Partial Load + *********************************/ +//// 32 +NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill) +{ + assert(nlane > 0); + const __m256i vfill = _mm256_set1_epi32(fill); + const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane); + __m256i mask = _mm256_cmpgt_epi32(vnlane, steps); + __m256i payload = _mm256_maskload_epi32((const int*)ptr, mask); + return _mm256_blendv_epi8(vfill, payload, mask); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ + assert(nlane > 0); + const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane); + __m256i mask = _mm256_cmpgt_epi32(vnlane, steps); + return _mm256_maskload_epi32((const int*)ptr, mask); +} +//// 64 +NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill) +{ + assert(nlane > 0); + const __m256i vfill = _mm256_set1_epi64x(fill); + const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3); + __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane); + __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); + __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask); + return _mm256_blendv_epi8(vfill, payload, mask); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ + assert(nlane > 0); + const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3); + __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane); + __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); + return _mm256_maskload_epi64((const void*)ptr, mask); +} +/********************************* + * Non-contiguous partial load + *********************************/ +//// 32 +NPY_FINLINE npyv_s32 +npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill) +{ + assert(nlane > 0); + assert(llabs(stride) <= NPY_SIMD_MAXLOAD_STRIDE32); + const __m256i vfill = _mm256_set1_epi32(fill); + const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + const __m256i idx = _mm256_mullo_epi32(_mm256_set1_epi32((int)stride), steps); + __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 
8 : (int)nlane); + __m256i mask = _mm256_cmpgt_epi32(vnlane, steps); + return _mm256_mask_i32gather_epi32(vfill, (const int*)ptr, idx, mask, 4); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 +npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn_till_s32(ptr, stride, nlane, 0); } +//// 64 +NPY_FINLINE npyv_s64 +npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill) +{ + assert(nlane > 0); + const __m256i vfill = _mm256_set1_epi64x(fill); + const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride); + const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3); + __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane); + __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); + return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 +npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); } +/********************************* + * Partial store + *********************************/ +//// 32 +NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane); + __m256i mask = _mm256_cmpgt_epi32(vnlane, steps); + _mm256_maskstore_epi32((int*)ptr, mask, a); +} +//// 64 +NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3); + __m256i vnlane = _mm256_set1_epi64x(nlane > 8 ? 8 : (int)nlane); + __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); + _mm256_maskstore_epi64((void*)ptr, mask, a); +} +/********************************* + * Non-contiguous partial store + *********************************/ +//// 32 +NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + __m128i a0 = _mm256_castsi256_si128(a); + __m128i a1 = _mm256_extracti128_si256(a, 1); + switch(nlane) { + default: + ptr[stride*7] = _mm_extract_epi32(a1, 3); + case 7: + ptr[stride*6] = _mm_extract_epi32(a1, 2); + case 6: + ptr[stride*5] = _mm_extract_epi32(a1, 1); + case 5: + ptr[stride*4] = _mm_extract_epi32(a1, 0); + case 4: + ptr[stride*3] = _mm_extract_epi32(a0, 3); + case 3: + ptr[stride*2] = _mm_extract_epi32(a0, 2); + case 2: + ptr[stride*1] = _mm_extract_epi32(a0, 1); + case 1: + ptr[stride*0] = _mm_extract_epi32(a0, 0); + } +} +//// 64 +NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a)); + __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1); + double *dptr = (double*)ptr; + switch(nlane) { + default: + _mm_storeh_pd(dptr + stride * 3, a1); + case 3: + _mm_storel_pd(dptr + stride * 2, a1); + case 2: + _mm_storeh_pd(dptr + stride * 1, a0); + case 1: + _mm_storel_pd(dptr + stride * 0, a0); + } +} + +/***************************************************************************** + * Implement partial load/store for u32/f32/u64/f64... 
via reinterpret cast + *****************************************************************************/ +#define NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \ + { \ + union { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + } pun = {.from_##F_SFX = fill}; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill) \ + { \ + union { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + } pun = {.from_##F_SFX = fill}; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(u32, s32) +NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f32, s32) +NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(u64, s64) +NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f64, s64) #endif // _NPY_SIMD_AVX2_MEMORY_H diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h index 96fdf72b91f1..b09d772f2fec 100644 --- a/numpy/core/src/common/simd/avx512/avx512.h +++ b/numpy/core/src/common/simd/avx512/avx512.h @@ -4,6 +4,9 @@ #define NPY_SIMD 512 #define NPY_SIMD_WIDTH 64 #define NPY_SIMD_F64 1 +// Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_* +#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16) +#define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16) typedef __m512i npyv_u8; typedef __m512i npyv_s8; diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h index e212c4555270..bffd6e907246 100644 --- a/numpy/core/src/common/simd/avx512/memory.h +++ b/numpy/core/src/common/simd/avx512/memory.h @@ -90,5 +90,243 @@ NPYV_IMPL_AVX512_MEM_INT(npy_int64, s64) // store higher part #define npyv_storeh_f32(PTR, VEC) _mm256_storeu_ps(PTR, npyv512_higher_ps256(VEC)) #define npyv_storeh_f64(PTR, VEC) _mm256_storeu_pd(PTR, npyv512_higher_pd256(VEC)) +/*************************** + * Non-contiguous Load + ***************************/ +//// 32 +NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride) +{ + assert(llabs(stride) <= NPY_SIMD_MAXLOAD_STRIDE32); + const 
__m512i steps = npyv_set_s32( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + ); + const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride)); + return _mm512_i32gather_epi32(idx, (const __m512i*)ptr, 4); +} +NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride) +{ return npyv_loadn_u32((const npy_uint32*)ptr, stride); } +NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride) +{ return _mm512_castsi512_ps(npyv_loadn_u32((const npy_uint32*)ptr, stride)); } +//// 64 +NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) +{ + const __m512i idx = _mm512_setr_epi64( + 0*stride, 1*stride, 2*stride, 3*stride, + 4*stride, 5*stride, 6*stride, 7*stride + ); + return _mm512_i64gather_epi64(idx, (const __m512i*)ptr, 8); +} +NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) +{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); } +NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) +{ return _mm512_castsi512_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); } +/*************************** + * Non-contiguous Store + ***************************/ +//// 32 +NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ + assert(llabs(stride) <= NPY_SIMD_MAXSTORE_STRIDE32); + const __m512i steps = _mm512_setr_epi32( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + ); + const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride)); + _mm512_i32scatter_epi32((__m512i*)ptr, idx, a, 4); +} +NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen_u32((npy_uint32*)ptr, stride, a); } +NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen_u32((npy_uint32*)ptr, stride, _mm512_castps_si512(a)); } +//// 64 +NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ + const __m512i idx = _mm512_setr_epi64( + 0*stride, 1*stride, 2*stride, 3*stride, + 4*stride, 5*stride, 6*stride, 7*stride + ); + _mm512_i64scatter_epi64((__m512i*)ptr, idx, a, 8); +} +NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ npyv_storen_u64((npy_uint64*)ptr, stride, a); } +NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ npyv_storen_u64((npy_uint64*)ptr, stride, _mm512_castpd_si512(a)); } + +/********************************* + * Partial Load + *********************************/ +//// 32 +NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill) +{ + assert(nlane > 0); + const __m512i vfill = _mm512_set1_epi32(fill); + const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + return _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ + assert(nlane > 0); + const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + return _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr); +} +//// 64 +NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill) +{ + assert(nlane > 0); + const __m512i vfill = _mm512_set1_epi64(fill); + const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ + assert(nlane > 0); + const __mmask8 mask = nlane > 15 ? 
-1 : (1 << nlane) - 1; + return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr); +} +/********************************* + * Non-contiguous partial load + *********************************/ +//// 32 +NPY_FINLINE npyv_s32 +npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill) +{ + assert(nlane > 0); + assert(llabs(stride) <= NPY_SIMD_MAXLOAD_STRIDE32); + const __m512i steps = npyv_set_s32( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + ); + const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride)); + const __m512i vfill = _mm512_set1_epi32(fill); + const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + return _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 +npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn_till_s32(ptr, stride, nlane, 0); } +//// 64 +NPY_FINLINE npyv_s64 +npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill) +{ + assert(nlane > 0); + const __m512i idx = _mm512_setr_epi64( + 0*stride, 1*stride, 2*stride, 3*stride, + 4*stride, 5*stride, 6*stride, 7*stride + ); + const __m512i vfill = _mm512_set1_epi64(fill); + const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 +npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); } +/********************************* + * Partial store + *********************************/ +//// 32 +NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + _mm512_mask_storeu_epi32((__m512i*)ptr, mask, a); +} +//// 64 +NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1; + _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a); +} +/********************************* + * Non-contiguous partial store + *********************************/ +//// 32 +NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + assert(llabs(stride) <= NPY_SIMD_MAXSTORE_STRIDE32); + const __m512i steps = _mm512_setr_epi32( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + ); + const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride)); + const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + _mm512_mask_i32scatter_epi32((__m512i*)ptr, mask, idx, a, 4); +} +//// 64 +NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + const __m512i idx = _mm512_setr_epi64( + 0*stride, 1*stride, 2*stride, 3*stride, + 4*stride, 5*stride, 6*stride, 7*stride + ); + const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1; + _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 8); +} + +/***************************************************************************** + * Implement partial load/store for u32/f32/u64/f64... 
via reinterpret cast + *****************************************************************************/ +#define NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \ + { \ + union { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + } pun = {.from_##F_SFX = fill}; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill) \ + { \ + union { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + } pun = {.from_##F_SFX = fill}; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(u32, s32) +NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f32, s32) +NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(u64, s64) +NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f64, s64) #endif // _NPY_SIMD_AVX512_MEMORY_H diff --git a/numpy/core/src/common/simd/neon/memory.h b/numpy/core/src/common/simd/neon/memory.h index afa703584dac..1e258f1bcbef 100644 --- a/numpy/core/src/common/simd/neon/memory.h +++ b/numpy/core/src/common/simd/neon/memory.h @@ -5,6 +5,8 @@ #ifndef _NPY_SIMD_NEON_MEMORY_H #define _NPY_SIMD_NEON_MEMORY_H +#include "misc.h" + /*************************** * load/store ***************************/ @@ -45,5 +47,290 @@ NPYV_IMPL_NEON_MEM(f32, float) #if NPY_SIMD_F64 NPYV_IMPL_NEON_MEM(f64, double) #endif +/*************************** + * Non-contiguous Load + ***************************/ +NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride) +{ + switch (stride) { + case 2: + return vld2q_s32((const int32_t*)ptr).val[0]; + case 3: + return vld3q_s32((const int32_t*)ptr).val[0]; + case 4: + return vld4q_s32((const int32_t*)ptr).val[0]; + default:; + int32x2_t ax = vcreate_s32(*ptr); + int32x4_t a = vcombine_s32(ax, ax); + a = vld1q_lane_s32((const int32_t*)ptr + stride, a, 1); + a = vld1q_lane_s32((const int32_t*)ptr + stride*2, a, 2); + a = vld1q_lane_s32((const int32_t*)ptr + stride*3, a, 3); + return a; + } +} + +NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride) +{ + return npyv_reinterpret_u32_s32( + 
npyv_loadn_s32((const npy_int32*)ptr, stride) + ); +} +NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride) +{ + return npyv_reinterpret_f32_s32( + npyv_loadn_s32((const npy_int32*)ptr, stride) + ); +} +//// 64 +NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) +{ + return vcombine_s64( + vld1_s64((const int64_t*)ptr), vld1_s64((const int64_t*)ptr + stride) + ); +} +NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) +{ + return npyv_reinterpret_u64_s64( + npyv_loadn_s64((const npy_int64*)ptr, stride) + ); +} +#if NPY_SIMD_F64 +NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) +{ + return npyv_reinterpret_f64_s64( + npyv_loadn_s64((const npy_int64*)ptr, stride) + ); +} +#endif +/*************************** + * Non-contiguous Store + ***************************/ +//// 32 +NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ + vst1q_lane_s32((int32_t*)ptr, a, 0); + vst1q_lane_s32((int32_t*)ptr + stride, a, 1); + vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2); + vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3); +} +NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ npyv_storen_s32((npy_int32*)ptr, stride, npyv_reinterpret_s32_u32(a)); } +NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen_s32((npy_int32*)ptr, stride, npyv_reinterpret_s32_f32(a)); } +//// 64 +NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ + vst1q_lane_s64((int64_t*)ptr, a, 0); + vst1q_lane_s64((int64_t*)ptr + stride, a, 1); +} +NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_u64(a)); } + +#if NPY_SIMD_F64 +NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_f64(a)); } +#endif + +/********************************* + * Partial Load + *********************************/ +//// 32 +NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill) +{ + assert(nlane > 0); + switch(nlane) { + case 1: + return vld1q_lane_s32((const int32_t*)ptr, vdupq_n_s32(fill), 0); + case 2: + return vcombine_s32(vld1_s32((const int32_t*)ptr), vdup_n_s32(fill)); + case 3: + return vcombine_s32( + vld1_s32((const int32_t*)ptr), + vld1_lane_s32((const int32_t*)ptr + 2, vdup_n_s32(fill), 0) + ); + default: + return npyv_load_s32(ptr); + } +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return npyv_load_till_s32(ptr, nlane, 0); } +//// 64 +NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill) +{ + assert(nlane > 0); + if (nlane == 1) { + return vcombine_s64(vld1_s64((const int64_t*)ptr), vdup_n_s64(fill)); + } + return npyv_load_s64(ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ return npyv_load_till_s64(ptr, nlane, 0); } + +/********************************* + * Non-contiguous partial load + *********************************/ +//// 32 +NPY_FINLINE npyv_s32 +npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill) +{ + assert(nlane > 0); + int32x4_t vfill = vdupq_n_s32(fill); + switch(nlane) { + case 3: + vfill = vld1q_lane_s32((const int32_t*)ptr + stride*2, vfill, 2); + case 2: + vfill = vld1q_lane_s32((const 
int32_t*)ptr + stride, vfill, 1); + case 1: + vfill = vld1q_lane_s32((const int32_t*)ptr, vfill, 0); + return vfill; + default: + return npyv_loadn_s32(ptr, stride); + } +} +NPY_FINLINE npyv_s32 +npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn_till_s32(ptr, stride, nlane, 0); } + +NPY_FINLINE npyv_s64 +npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill) +{ + assert(nlane > 0); + if (nlane == 1) { + return vcombine_s64(vld1_s64((const int64_t*)ptr), vdup_n_s64(fill)); + } + return npyv_loadn_s64(ptr, stride); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); } + +/********************************* + * Partial store + *********************************/ +//// 32 +NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + switch(nlane) { + case 1: + vst1q_lane_s32((int32_t*)ptr, a, 0); + break; + case 2: + vst1_s32((int32_t*)ptr, vget_low_s32(a)); + break; + case 3: + vst1_s32((int32_t*)ptr, vget_low_s32(a)); + vst1q_lane_s32((int32_t*)ptr + 2, a, 2); + break; + default: + npyv_store_s32(ptr, a); + } +} +//// 64 +NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + if (nlane == 1) { + vst1q_lane_s64((int64_t*)ptr, a, 0); + return; + } + npyv_store_s64(ptr, a); +} +/********************************* + * Non-contiguous partial store + *********************************/ +//// 32 +NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + switch(nlane) { + default: + vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3); + case 3: + vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2); + case 2: + vst1q_lane_s32((int32_t*)ptr + stride, a, 1); + case 1: + vst1q_lane_s32((int32_t*)ptr, a, 0); + break; + } +} +//// 64 +NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + if (nlane == 1) { + vst1q_lane_s64((int64_t*)ptr, a, 0); + return; + } + npyv_storen_s64(ptr, stride, a); +} + +/***************************************************************** + * Implement partial load/store for u32/f32/u64/f64... 
via casting + *****************************************************************/ +#define NPYV_IMPL_NEON_REST_PARTIAL_TYPES(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \ + { \ + union { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + } pun = {.from_##F_SFX = fill}; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill) \ + { \ + union { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + } pun = {.from_##F_SFX = fill}; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_NEON_REST_PARTIAL_TYPES(u32, s32) +NPYV_IMPL_NEON_REST_PARTIAL_TYPES(f32, s32) +NPYV_IMPL_NEON_REST_PARTIAL_TYPES(u64, s64) +#if NPY_SIMD_F64 +NPYV_IMPL_NEON_REST_PARTIAL_TYPES(f64, s64) +#endif #endif // _NPY_SIMD_NEON_MEMORY_H diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h index 2f39c8427b5d..8804223c9fef 100644 --- a/numpy/core/src/common/simd/simd.h +++ b/numpy/core/src/common/simd/simd.h @@ -49,6 +49,55 @@ typedef double npyv_lanetype_f64; #define NPY_SIMD_WIDTH 0 #define NPY_SIMD_F64 0 #endif +/** + * Some SIMD extensions (currently AVX2 and AVX512F) impose a de facto limit + * on the stride size when dealing with non-contiguous memory access. + * + * Therefore, the following functions must be used to check that a stride is + * within the acceptable limit before using any of the non-contiguous load/store intrinsics. + * + * For instance: + * npy_intp ld_stride = step[0] / sizeof(float); + * npy_intp st_stride = step[1] / sizeof(float); + * + * if (npyv_loadable_stride_f32(ld_stride) && npyv_storable_stride_f32(st_stride)) { + * for (;;) + * npyv_f32 a = npyv_loadn_f32(ld_pointer, ld_stride); + * // ...
+ * npyv_storen_f32(st_pointer, st_stride, a); + * } + * else { + * for (;;) + * // C scalars + * } + */ +#ifndef NPY_SIMD_MAXLOAD_STRIDE32 + #define NPY_SIMD_MAXLOAD_STRIDE32 0 +#endif +#ifndef NPY_SIMD_MAXSTORE_STRIDE32 + #define NPY_SIMD_MAXSTORE_STRIDE32 0 +#endif +#ifndef NPY_SIMD_MAXLOAD_STRIDE64 + #define NPY_SIMD_MAXLOAD_STRIDE64 0 +#endif +#ifndef NPY_SIMD_MAXSTORE_STRIDE64 + #define NPY_SIMD_MAXSTORE_STRIDE64 0 +#endif +#define NPYV_IMPL_MAXSTRIDE(SFX, MAXLOAD, MAXSTORE) \ + NPY_FINLINE int npyv_loadable_stride_##SFX(npy_intp stride) \ + { return MAXLOAD > 0 ? llabs(stride) <= MAXLOAD : 1; } \ + NPY_FINLINE int npyv_storable_stride_##SFX(npy_intp stride) \ + { return MAXSTORE > 0 ? llabs(stride) <= MAXSTORE : 1; } +#if NPY_SIMD + NPYV_IMPL_MAXSTRIDE(u32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32) + NPYV_IMPL_MAXSTRIDE(s32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32) + NPYV_IMPL_MAXSTRIDE(f32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32) + NPYV_IMPL_MAXSTRIDE(u64, NPY_SIMD_MAXLOAD_STRIDE64, NPY_SIMD_MAXSTORE_STRIDE64) + NPYV_IMPL_MAXSTRIDE(s64, NPY_SIMD_MAXLOAD_STRIDE64, NPY_SIMD_MAXSTORE_STRIDE64) +#endif +#if NPY_SIMD_F64 + NPYV_IMPL_MAXSTRIDE(f64, NPY_SIMD_MAXLOAD_STRIDE64, NPY_SIMD_MAXSTORE_STRIDE64) +#endif #ifdef __cplusplus } diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h index 1a555d6f03de..1074c3b02efe 100644 --- a/numpy/core/src/common/simd/sse/memory.h +++ b/numpy/core/src/common/simd/sse/memory.h @@ -5,6 +5,8 @@ #ifndef _NPY_SIMD_SSE_MEMORY_H #define _NPY_SIMD_SSE_MEMORY_H +#include "misc.h" + /*************************** * load/store ***************************/ @@ -70,5 +72,427 @@ NPYV_IMPL_SSE_MEM_INT(npy_int64, s64) // store higher part #define npyv_storeh_f32(PTR, VEC) npyv_storeh_u32((npy_uint32*)(PTR), _mm_castps_si128(VEC)) #define npyv_storeh_f64(PTR, VEC) npyv_storeh_u32((npy_uint32*)(PTR), _mm_castpd_si128(VEC)) +/*************************** + * Non-contiguous Load + ***************************/ +//// 32 +NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride) +{ + __m128i a = _mm_cvtsi32_si128(*ptr); +#ifdef NPY_HAVE_SSE41 + a = _mm_insert_epi32(a, ptr[stride], 1); + a = _mm_insert_epi32(a, ptr[stride*2], 2); + a = _mm_insert_epi32(a, ptr[stride*3], 3); +#else + __m128i a1 = _mm_cvtsi32_si128(ptr[stride]); + __m128i a2 = _mm_cvtsi32_si128(ptr[stride*2]); + __m128i a3 = _mm_cvtsi32_si128(ptr[stride*3]); + a = _mm_unpacklo_epi32(a, a1); + a = _mm_unpacklo_epi64(a, _mm_unpacklo_epi32(a2, a3)); +#endif + return a; +} +NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride) +{ return npyv_loadn_s32((const npy_int32*)ptr, stride); } +NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride) +{ return _mm_castsi128_ps(npyv_loadn_s32((const npy_int32*)ptr, stride)); } +//// 64 +NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) +{ return _mm_loadh_pd(npyv_loadl_f64(ptr), ptr + stride); } +NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) +{ return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); } +NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) +{ return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); } +/*************************** + * Non-contiguous Store + ***************************/ +//// 32 +NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ + ptr[stride * 0] = _mm_cvtsi128_si32(a); +#ifdef 
NPY_HAVE_SSE41 + ptr[stride * 1] = _mm_extract_epi32(a, 1); + ptr[stride * 2] = _mm_extract_epi32(a, 2); + ptr[stride * 3] = _mm_extract_epi32(a, 3); +#else + ptr[stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1))); + ptr[stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2))); + ptr[stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3))); +#endif +} +NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ npyv_storen_s32((npy_int32*)ptr, stride, a); } +NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen_s32((npy_int32*)ptr, stride, _mm_castps_si128(a)); } +//// 64 +NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ + _mm_storel_pd(ptr, a); + _mm_storeh_pd(ptr + stride, a); +} +NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ npyv_storen_f64((double*)ptr, stride, _mm_castsi128_pd(a)); } +NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ npyv_storen_f64((double*)ptr, stride, _mm_castsi128_pd(a)); } + +/********************************* + * Partial Load + *********************************/ +#if defined(__clang__) && __clang_major__ > 7 + /** + * Clang >=8 performs aggressive optimization that tends to + * zero the bits of the upper half of vectors even + * when we try to fill them with certain scalars, + * which may lead to division-by-zero errors. + */ + #define NPYV__CLANG_ZEROUPPER +#endif +//// 32 +NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill) +{ + assert(nlane > 0); +#ifdef NPYV__CLANG_ZEROUPPER + if (nlane > 3) { + return npyv_load_s32(ptr); + } + npy_int32 NPY_DECL_ALIGNED(16) data[4] = {fill, fill, fill, fill}; + for (npy_uint64 i = 0; i < nlane; ++i) { + data[i] = ptr[i]; + } + return npyv_loada_s32(data); +#else + #ifndef NPY_HAVE_SSE41 + const short *wptr = (const short*)ptr; + #endif + const __m128i vfill = npyv_setall_s32(fill); + __m128i a; + switch(nlane) { + case 2: + return _mm_castpd_si128( + _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr) + ); + #ifdef NPY_HAVE_SSE41 + case 1: + return _mm_insert_epi32(vfill, ptr[0], 0); + case 3: + a = _mm_loadl_epi64((const __m128i*)ptr); + a = _mm_insert_epi32(a, ptr[2], 2); + a = _mm_insert_epi32(a, fill, 3); + return a; + #else + case 1: + a = _mm_insert_epi16(vfill, wptr[0], 0); + return _mm_insert_epi16(a, wptr[1], 1); + case 3: + a = _mm_loadl_epi64((const __m128i*)ptr); + a = _mm_unpacklo_epi64(a, vfill); + a = _mm_insert_epi16(a, wptr[4], 4); + a = _mm_insert_epi16(a, wptr[5], 5); + return a; + #endif // NPY_HAVE_SSE41 + default: + return npyv_load_s32(ptr); + } +#endif +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ + assert(nlane > 0); + switch(nlane) { + case 1: + return _mm_cvtsi32_si128(*ptr); + case 2: + return _mm_loadl_epi64((const __m128i*)ptr); + case 3:; + npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr); + #ifdef NPY_HAVE_SSE41 + return _mm_insert_epi32(a, ptr[2], 2); + #else + return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2])); + #endif + default: + return npyv_load_s32(ptr); + } +} +//// 64 +NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill) +{ + assert(nlane > 0); +#ifdef NPYV__CLANG_ZEROUPPER + if (nlane <= 2) { + npy_int64 NPY_DECL_ALIGNED(16) data[2] = {fill, fill}; + for (npy_uint64 i = 0; i < nlane; ++i) { + data[i] =
ptr[i]; + } + return npyv_loada_s64(data); + } +#else + if (nlane == 1) { + const __m128i vfill = npyv_setall_s64(fill); + return _mm_castpd_si128( + _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr) + ); + } +#endif + return npyv_load_s64(ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ + assert(nlane > 0); + if (nlane == 1) { + return _mm_loadl_epi64((const __m128i*)ptr); + } + return npyv_load_s64(ptr); +} +/********************************* + * Non-contiguous partial load + *********************************/ +//// 32 +NPY_FINLINE npyv_s32 +npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill) +{ + assert(nlane > 0); +#ifdef NPYV__CLANG_ZEROUPPER + if (nlane > 3) { + return npyv_loadn_s32(ptr, stride); + } + npy_int32 NPY_DECL_ALIGNED(16) data[4] = {fill, fill, fill, fill}; + for (npy_uint64 i = 0; i < nlane; ++i) { + data[i] = ptr[stride*i]; + } + return npyv_loada_s32(data); +#else + __m128i vfill = npyv_setall_s32(fill); + #ifndef NPY_HAVE_SSE41 + const short *wptr = (const short*)ptr; + #endif + switch(nlane) { + #ifdef NPY_HAVE_SSE41 + case 3: + vfill = _mm_insert_epi32(vfill, ptr[stride*2], 2); + case 2: + vfill = _mm_insert_epi32(vfill, ptr[stride], 1); + case 1: + vfill = _mm_insert_epi32(vfill, ptr[0], 0); + break; + #else + case 3: + vfill = _mm_unpacklo_epi32(_mm_cvtsi32_si128(ptr[stride*2]), vfill); + case 2: + vfill = _mm_unpacklo_epi64(_mm_unpacklo_epi32( + _mm_cvtsi32_si128(*ptr), _mm_cvtsi32_si128(ptr[stride]) + ), vfill); + break; + case 1: + vfill = _mm_insert_epi16(vfill, wptr[0], 0); + vfill = _mm_insert_epi16(vfill, wptr[1], 1); + break; + #endif // NPY_HAVE_SSE41 + default: + return npyv_loadn_s32(ptr, stride); + } // switch + return vfill; +#endif +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 +npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ + assert(nlane > 0); + switch(nlane) { + case 1: + return _mm_cvtsi32_si128(ptr[0]); + case 2:; + npyv_s32 a = _mm_cvtsi32_si128(ptr[0]); +#ifdef NPY_HAVE_SSE41 + return _mm_insert_epi32(a, ptr[stride], 1); +#else + return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride])); +#endif // NPY_HAVE_SSE41 + case 3:; + a = _mm_cvtsi32_si128(ptr[0]); +#ifdef NPY_HAVE_SSE41 + a = _mm_insert_epi32(a, ptr[stride], 1); + a = _mm_insert_epi32(a, ptr[stride*2], 2); + return a; +#else + a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride])); + a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2])); + return a; +#endif // NPY_HAVE_SSE41 + default: + return npyv_loadn_s32(ptr, stride); + } +} +//// 64 +NPY_FINLINE npyv_s64 +npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill) +{ + assert(nlane > 0); +#ifdef NPYV__CLANG_ZEROUPPER + if (nlane <= 2) { + npy_int64 NPY_DECL_ALIGNED(16) data[2] = {fill, fill}; + for (npy_uint64 i = 0; i < nlane; ++i) { + data[i] = ptr[i*stride]; + } + return npyv_loada_s64(data); + } +#else + if (nlane == 1) { + const __m128i vfill = npyv_setall_s64(fill); + return _mm_castpd_si128( + _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr) + ); + } +#endif + return npyv_loadn_s64(ptr, stride); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ + assert(nlane > 0); + if (nlane == 1) { + return _mm_loadl_epi64((const __m128i*)ptr); + } + return npyv_loadn_s64(ptr, stride); +} +/********************************* + * Partial store + 
*********************************/ +//// 32 +NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + switch(nlane) { + case 1: + *ptr = _mm_cvtsi128_si32(a); + break; + case 2: + _mm_storel_epi64((__m128i *)ptr, a); + break; + case 3: + _mm_storel_epi64((__m128i *)ptr, a); + #ifdef NPY_HAVE_SSE41 + ptr[2] = _mm_extract_epi32(a, 2); + #else + ptr[2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2))); + #endif + break; + default: + npyv_store_s32(ptr, a); + } +} +//// 64 +NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + if (nlane == 1) { + _mm_storel_epi64((__m128i *)ptr, a); + return; + } + npyv_store_s64(ptr, a); +} +/********************************* + * Non-contiguous partial store + *********************************/ +//// 32 +NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + switch(nlane) { +#ifdef NPY_HAVE_SSE41 + default: + ptr[stride*3] = _mm_extract_epi32(a, 3); + case 3: + ptr[stride*2] = _mm_extract_epi32(a, 2); + case 2: + ptr[stride*1] = _mm_extract_epi32(a, 1); +#else + default: + ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3))); + case 3: + ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2))); + case 2: + ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1))); +#endif + case 1: + ptr[stride*0] = _mm_cvtsi128_si32(a); + break; + } +} +//// 64 +NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + if (nlane == 1) { + _mm_storel_epi64((__m128i *)ptr, a); + return; + } + npyv_storen_s64(ptr, stride, a); +} +/***************************************************************** + * Implement partial load/store for u32/f32/u64/f64... 
via casting + *****************************************************************/ +#define NPYV_IMPL_SSE_REST_PARTIAL_TYPES(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \ + { \ + union { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + } pun = {.from_##F_SFX = fill}; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill) \ + { \ + union { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + } pun = {.from_##F_SFX = fill}; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u32, s32) +NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f32, s32) +NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u64, s64) +NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f64, s64) #endif // _NPY_SIMD_SSE_MEMORY_H diff --git a/numpy/core/src/common/simd/vsx/memory.h b/numpy/core/src/common/simd/vsx/memory.h index e0d908bf906b..08a0a9276cc6 100644 --- a/numpy/core/src/common/simd/vsx/memory.h +++ b/numpy/core/src/common/simd/vsx/memory.h @@ -4,147 +4,343 @@ #ifndef _NPY_SIMD_VSX_MEMORY_H #define _NPY_SIMD_VSX_MEMORY_H + +#include "misc.h" + /**************************** - * load/store + * Private utilities ****************************/ // TODO: test load by cast #define VSX__CAST_lOAD 0 #if VSX__CAST_lOAD - #define npyv__load(PTR, T_VEC) (*((T_VEC*)(PTR))) + #define npyv__load(T_VEC, PTR) (*((T_VEC*)(PTR))) #else /** * CLANG fails to load unaligned addresses via vec_xl, vec_xst * so we failback to vec_vsx_ld, vec_vsx_st */ #if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__)) - #define npyv__load(PTR, T_VEC) vec_vsx_ld(0, PTR) + #define npyv__load(T_VEC, PTR) vec_vsx_ld(0, PTR) #else - #define npyv__load(PTR, T_VEC) vec_xl(0, PTR) + #define npyv__load(T_VEC, PTR) vec_xl(0, PTR) #endif #endif -// unaligned load -#define npyv_load_u8(PTR) npyv__load(PTR, npyv_u8) -#define npyv_load_s8(PTR) npyv__load(PTR, npyv_s8) -#define npyv_load_u16(PTR) npyv__load(PTR, npyv_u16) -#define npyv_load_s16(PTR) npyv__load(PTR, npyv_s16) -#define npyv_load_u32(PTR) npyv__load(PTR, npyv_u32) 
-#define npyv_load_s32(PTR) npyv__load(PTR, npyv_s32) -#define npyv_load_f32(PTR) npyv__load(PTR, npyv_f32) -#define npyv_load_f64(PTR) npyv__load(PTR, npyv_f64) -#if VSX__CAST_lOAD - #define npyv_load_u64(PTR) npyv__load(PTR, npyv_u64) - #define npyv_load_s64(PTR) npyv__load(PTR, npyv_s64) +// unaligned store +#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__)) + #define npyv__store(PTR, VEC) vec_vsx_st(VEC, 0, PTR) #else - #define npyv_load_u64(PTR) ((npyv_u64)npyv_load_u32((const unsigned int*)PTR)) - #define npyv_load_s64(PTR) ((npyv_s64)npyv_load_s32((const unsigned int*)PTR)) + #define npyv__store(PTR, VEC) vec_xst(VEC, 0, PTR) #endif -// aligned load -#define npyv_loada_u8(PTR) vec_ld(0, PTR) -#define npyv_loada_s8 npyv_loada_u8 -#define npyv_loada_u16 npyv_loada_u8 -#define npyv_loada_s16 npyv_loada_u8 -#define npyv_loada_u32 npyv_loada_u8 -#define npyv_loada_s32 npyv_loada_u8 -#define npyv_loada_u64 npyv_load_u64 -#define npyv_loada_s64 npyv_load_s64 -#define npyv_loada_f32 npyv_loada_u8 -#define npyv_loada_f64 npyv_load_f64 -// stream load -#define npyv_loads_u8 npyv_loada_u8 -#define npyv_loads_s8 npyv_loada_s8 -#define npyv_loads_u16 npyv_loada_u16 -#define npyv_loads_s16 npyv_loada_s16 -#define npyv_loads_u32 npyv_loada_u32 -#define npyv_loads_s32 npyv_loada_s32 -#define npyv_loads_u64 npyv_loada_u64 -#define npyv_loads_s64 npyv_loada_s64 -#define npyv_loads_f32 npyv_loada_f32 -#define npyv_loads_f64 npyv_loada_f64 -// load lower part + // avoid aliasing rules #ifdef __cplusplus template - NPY_FINLINE npy_uint64 *npyv__ptr2u64(T_PTR *ptr) - { return npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr; } + NPY_FINLINE npy_uint64 *npyv__ptr2u64(const T_PTR *ptr) + { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; } #else - NPY_FINLINE npy_uint64 *npyv__ptr2u64(void *ptr) - { npy_uint64 *ptr64 = ptr; return ptr64; } + NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr) + { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; } #endif // __cplusplus -#if defined(__clang__) && !defined(__IBMC__) - // vec_promote doesn't support doubleword on clang - #define npyv_loadl_u64(PTR) npyv_setall_u64(*npyv__ptr2u64(PTR)) -#else - #define npyv_loadl_u64(PTR) vec_promote(*npyv__ptr2u64(PTR), 0) -#endif -#define npyv_loadl_u8(PTR) ((npyv_u8)npyv_loadl_u64(PTR)) -#define npyv_loadl_s8(PTR) ((npyv_s8)npyv_loadl_u64(PTR)) -#define npyv_loadl_u16(PTR) ((npyv_u16)npyv_loadl_u64(PTR)) -#define npyv_loadl_s16(PTR) ((npyv_s16)npyv_loadl_u64(PTR)) -#define npyv_loadl_u32(PTR) ((npyv_u32)npyv_loadl_u64(PTR)) -#define npyv_loadl_s32(PTR) ((npyv_s32)npyv_loadl_u64(PTR)) -#define npyv_loadl_s64(PTR) ((npyv_s64)npyv_loadl_u64(PTR)) -#define npyv_loadl_f32(PTR) ((npyv_f32)npyv_loadl_u64(PTR)) -#define npyv_loadl_f64(PTR) ((npyv_f64)npyv_loadl_u64(PTR)) -// unaligned store -#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__)) - #define npyv_store_u8(PTR, VEC) vec_vsx_st(VEC, 0, PTR) -#else - #define npyv_store_u8(PTR, VEC) vec_xst(VEC, 0, PTR) -#endif -#define npyv_store_s8 npyv_store_u8 -#define npyv_store_u16 npyv_store_u8 -#define npyv_store_s16 npyv_store_u8 -#define npyv_store_u32 npyv_store_u8 -#define npyv_store_s32 npyv_store_u8 -#define npyv_store_u64(PTR, VEC) npyv_store_u8((unsigned int*)PTR, (npyv_u32)VEC) -#define npyv_store_s64(PTR, VEC) npyv_store_u8((unsigned int*)PTR, (npyv_u32)VEC) -#define npyv_store_f32 npyv_store_u8 -#define npyv_store_f64 npyv_store_u8 -// aligned store -#define npyv_storea_u8(PTR, VEC) 
vec_st(VEC, 0, PTR) -#define npyv_storea_s8 npyv_storea_u8 -#define npyv_storea_u16 npyv_storea_u8 -#define npyv_storea_s16 npyv_storea_u8 -#define npyv_storea_u32 npyv_storea_u8 -#define npyv_storea_s32 npyv_storea_u8 -#define npyv_storea_u64 npyv_store_u64 -#define npyv_storea_s64 npyv_store_s64 -#define npyv_storea_f32 npyv_storea_u8 -#define npyv_storea_f64 npyv_store_f64 -// stream store -#define npyv_stores_u8 npyv_storea_u8 -#define npyv_stores_s8 npyv_storea_s8 -#define npyv_stores_u16 npyv_storea_u16 -#define npyv_stores_s16 npyv_storea_s16 -#define npyv_stores_u32 npyv_storea_u32 -#define npyv_stores_s32 npyv_storea_s32 -#define npyv_stores_u64 npyv_storea_u64 -#define npyv_stores_s64 npyv_storea_s64 -#define npyv_stores_f32 npyv_storea_f32 -#define npyv_stores_f64 npyv_storea_f64 + +// load lower part +NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr) +{ + #if defined(__clang__) && !defined(__IBMC__) + // vec_promote doesn't support doubleword on clang + return npyv_setall_u64(*npyv__ptr2u64(ptr)); + #else + return vec_promote(*npyv__ptr2u64(ptr), 0); + #endif +} // store lower part -#define npyv_storel_u8(PTR, VEC) \ +#define npyv__storel(PTR, VEC) \ *npyv__ptr2u64(PTR) = vec_extract(((npyv_u64)VEC), 0) -#define npyv_storel_s8 npyv_storel_u8 -#define npyv_storel_u16 npyv_storel_u8 -#define npyv_storel_s16 npyv_storel_u8 -#define npyv_storel_u32 npyv_storel_u8 -#define npyv_storel_s32 npyv_storel_u8 -#define npyv_storel_s64 npyv_storel_u8 -#define npyv_storel_u64 npyv_storel_u8 -#define npyv_storel_f32 npyv_storel_u8 -#define npyv_storel_f64 npyv_storel_u8 -// store higher part -#define npyv_storeh_u8(PTR, VEC) \ + +#define npyv__storeh(PTR, VEC) \ *npyv__ptr2u64(PTR) = vec_extract(((npyv_u64)VEC), 1) -#define npyv_storeh_s8 npyv_storeh_u8 -#define npyv_storeh_u16 npyv_storeh_u8 -#define npyv_storeh_s16 npyv_storeh_u8 -#define npyv_storeh_u32 npyv_storeh_u8 -#define npyv_storeh_s32 npyv_storeh_u8 -#define npyv_storeh_s64 npyv_storeh_u8 -#define npyv_storeh_u64 npyv_storeh_u8 -#define npyv_storeh_f32 npyv_storeh_u8 -#define npyv_storeh_f64 npyv_storeh_u8 + +/**************************** + * load/store + ****************************/ +#define NPYV_IMPL_VSX_MEM(SFX, DW_CAST) \ + NPY_FINLINE npyv_##SFX npyv_load_##SFX(const npyv_lanetype_##SFX *ptr) \ + { return (npyv_##SFX)npyv__load(npyv_##SFX, (const npyv_lanetype_##DW_CAST*)ptr); } \ + NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const npyv_lanetype_##SFX *ptr) \ + { return (npyv_##SFX)vec_ld(0, (const npyv_lanetype_u32*)ptr); } \ + NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const npyv_lanetype_##SFX *ptr) \ + { return npyv_loada_##SFX(ptr); } \ + NPY_FINLINE npyv_##SFX npyv_loadl_##SFX(const npyv_lanetype_##SFX *ptr) \ + { return (npyv_##SFX)npyv__loadl(ptr); } \ + NPY_FINLINE void npyv_store_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \ + { npyv__store((npyv_lanetype_##DW_CAST*)ptr, (npyv_##DW_CAST)vec); } \ + NPY_FINLINE void npyv_storea_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \ + { vec_st((npyv_u32)vec, 0, (npyv_lanetype_u32*)ptr); } \ + NPY_FINLINE void npyv_stores_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \ + { npyv_storea_##SFX(ptr, vec); } \ + NPY_FINLINE void npyv_storel_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \ + { npyv__storel(ptr, vec); } \ + NPY_FINLINE void npyv_storeh_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \ + { npyv__storeh(ptr, vec); } + +NPYV_IMPL_VSX_MEM(u8, u8) +NPYV_IMPL_VSX_MEM(s8, s8) +NPYV_IMPL_VSX_MEM(u16, u16) +NPYV_IMPL_VSX_MEM(s16, s16) +NPYV_IMPL_VSX_MEM(u32, u32) 
+NPYV_IMPL_VSX_MEM(s32, s32) +NPYV_IMPL_VSX_MEM(u64, f64) +NPYV_IMPL_VSX_MEM(s64, f64) +NPYV_IMPL_VSX_MEM(f32, f32) +NPYV_IMPL_VSX_MEM(f64, f64) + +/*************************** + * Non-contiguous Load + ***************************/ +//// 32 +NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride) +{ + return npyv_set_u32( + ptr[stride * 0], ptr[stride * 1], + ptr[stride * 2], ptr[stride * 3] + ); +} +NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride) +{ return (npyv_s32)npyv_loadn_u32((const npy_uint32*)ptr, stride); } +NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride) +{ return (npyv_f32)npyv_loadn_u32((const npy_uint32*)ptr, stride); } +//// 64 +NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) +{ return npyv_set_u64(ptr[0], ptr[stride]); } +NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) +{ return npyv_set_s64(ptr[0], ptr[stride]); } +NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) +{ return npyv_set_f64(ptr[0], ptr[stride]); } +/*************************** + * Non-contiguous Store + ***************************/ +//// 32 +NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ + ptr[stride * 0] = vec_extract(a, 0); + ptr[stride * 1] = vec_extract(a, 1); + ptr[stride * 2] = vec_extract(a, 2); + ptr[stride * 3] = vec_extract(a, 3); +} +NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen_u32((npy_uint32*)ptr, stride, (npyv_u32)a); } +NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen_u32((npy_uint32*)ptr, stride, (npyv_u32)a); } +//// 64 +NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ + ptr[stride * 0] = vec_extract(a, 0); + ptr[stride * 1] = vec_extract(a, 1); +} +NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ npyv_storen_u64((npy_uint64*)ptr, stride, (npyv_u64)a); } +NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ npyv_storen_u64((npy_uint64*)ptr, stride, (npyv_u64)a); } + +/********************************* + * Partial Load + *********************************/ +//// 32 +NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill) +{ + assert(nlane > 0); + npyv_s32 vfill = npyv_setall_s32(fill); + switch(nlane) { + case 1: + return vec_insert(ptr[0], vfill, 0); + case 2: + return (npyv_s32)vec_insert( + *npyv__ptr2u64(ptr), (npyv_u64)vfill, 0 + ); + case 3: + vfill = vec_insert(ptr[2], vfill, 2); + return (npyv_s32)vec_insert( + *npyv__ptr2u64(ptr), (npyv_u64)vfill, 0 + ); + default: + return npyv_load_s32(ptr); + } +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return npyv_load_till_s32(ptr, nlane, 0); } +//// 64 +NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill) +{ + assert(nlane > 0); + if (nlane == 1) { + return npyv_set_s64(ptr[0], fill); + } + return npyv_load_s64(ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ return npyv_load_till_s64(ptr, nlane, 0); } +/********************************* + * Non-contiguous partial load + *********************************/ +//// 32 +NPY_FINLINE npyv_s32 +npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill) +{ + assert(nlane > 0); + npyv_s32 vfill = 
+/*********************************
+ * Non-contiguous partial load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32
+npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill)
+{
+    assert(nlane > 0);
+    npyv_s32 vfill = npyv_setall_s32(fill);
+    switch(nlane) {
+    case 3:
+        vfill = vec_insert(ptr[stride*2], vfill, 2);
+    case 2:
+        vfill = vec_insert(ptr[stride], vfill, 1);
+    case 1:
+        vfill = vec_insert(*ptr, vfill, 0);
+        break;
+    default:
+        return npyv_loadn_s32(ptr, stride);
+    } // switch
+    return vfill;
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s32
+npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s32(ptr, stride, nlane, 0); }
+//// 64
+NPY_FINLINE npyv_s64
+npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
+{
+    assert(nlane > 0);
+    if (nlane == 1) {
+        return npyv_set_s64(*ptr, fill);
+    }
+    return npyv_loadn_s64(ptr, stride);
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+/*********************************
+ * Partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+    assert(nlane > 0);
+    switch(nlane) {
+    case 1:
+        *ptr = vec_extract(a, 0);
+        break;
+    case 2:
+        npyv_storel_s32(ptr, a);
+        break;
+    case 3:
+        npyv_storel_s32(ptr, a);
+        ptr[2] = vec_extract(a, 2);
+        break;
+    default:
+        npyv_store_s32(ptr, a);
+    }
+}
+//// 64
+NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+    assert(nlane > 0);
+    if (nlane == 1) {
+        npyv_storel_s64(ptr, a);
+        return;
+    }
+    npyv_store_s64(ptr, a);
+}
+/*********************************
+ * Non-contiguous partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+    assert(nlane > 0);
+    switch(nlane) {
+    default:
+        ptr[stride*3] = vec_extract(a, 3);
+    case 3:
+        ptr[stride*2] = vec_extract(a, 2);
+    case 2:
+        ptr[stride*1] = vec_extract(a, 1);
+    case 1:
+        ptr[stride*0] = vec_extract(a, 0);
+        break;
+    }
+}
+//// 64
+NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+    assert(nlane > 0);
+    if (nlane == 1) {
+        npyv_storel_s64(ptr, a);
+        return;
+    }
+    npyv_storen_s64(ptr, stride, a);
+}
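To show how the strided and partial variants are intended to compose (not part of the patch): a sketch that copies a strided int32 column into a contiguous buffer, handling the tail with the *_till_* functions. It assumes npyv_nlanes_s32 is the framework's lane count for s32 (4 on VSX); the helper name gather_column_s32 is hypothetical.

// Hypothetical helper, not part of the patch. Assumes "simd/simd.h" exposes
// the npyv_* API and npyv_nlanes_s32 on a VSX target.
#include "simd/simd.h"

static void gather_column_s32(const npy_int32 *src, npy_intp stride,
                              npy_int32 *dst, npy_intp len)
{
    const npy_intp vstep = npyv_nlanes_s32;
    npy_intp i = 0;
    // whole vectors: strided load, contiguous store
    for (; i + vstep <= len; i += vstep) {
        npyv_s32 v = npyv_loadn_s32(src + i*stride, stride);
        npyv_store_s32(dst + i, v);
    }
    // remainder: fewer than vstep elements left
    if (i < len) {
        npy_uintp tail = (npy_uintp)(len - i);
        npyv_s32 v = npyv_loadn_tillz_s32(src + i*stride, stride, tail);
        npyv_store_till_s32(dst + i, tail, v);
    }
}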
+/*****************************************************************
+ * Implement partial load/store for u32/f32/u64/f64... via casting
+ *****************************************************************/
+#define NPYV_IMPL_VSX_REST_PARTIAL_TYPES(F_SFX, T_SFX)                              \
+    NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX                                 \
+    (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \
+    {                                                                               \
+        union {                                                                     \
+            npyv_lanetype_##F_SFX from_##F_SFX;                                     \
+            npyv_lanetype_##T_SFX to_##T_SFX;                                       \
+        } pun = {.from_##F_SFX = fill};                                             \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX(           \
+            (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX               \
+        ));                                                                          \
+    }                                                                               \
+    NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX                                \
+    (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane,            \
+     npyv_lanetype_##F_SFX fill)                                                    \
+    {                                                                               \
+        union {                                                                     \
+            npyv_lanetype_##F_SFX from_##F_SFX;                                     \
+            npyv_lanetype_##T_SFX to_##T_SFX;                                       \
+        } pun = {.from_##F_SFX = fill};                                             \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX(          \
+            (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX       \
+        ));                                                                          \
+    }                                                                               \
+    NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX                                \
+    (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane)                             \
+    {                                                                               \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX(          \
+            (const npyv_lanetype_##T_SFX *)ptr, nlane                               \
+        ));                                                                          \
+    }                                                                               \
+    NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX                               \
+    (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane)            \
+    {                                                                               \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX(         \
+            (const npyv_lanetype_##T_SFX *)ptr, stride, nlane                       \
+        ));                                                                          \
+    }                                                                               \
+    NPY_FINLINE void npyv_store_till_##F_SFX                                        \
+    (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a)                   \
+    {                                                                               \
+        npyv_store_till_##T_SFX(                                                    \
+            (npyv_lanetype_##T_SFX *)ptr, nlane,                                    \
+            npyv_reinterpret_##T_SFX##_##F_SFX(a)                                   \
+        );                                                                          \
+    }                                                                               \
+    NPY_FINLINE void npyv_storen_till_##F_SFX                                       \
+    (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a)  \
+    {                                                                               \
+        npyv_storen_till_##T_SFX(                                                   \
+            (npyv_lanetype_##T_SFX *)ptr, stride, nlane,                            \
+            npyv_reinterpret_##T_SFX##_##F_SFX(a)                                   \
+        );                                                                          \
+    }
+
+NPYV_IMPL_VSX_REST_PARTIAL_TYPES(u32, s32)
+NPYV_IMPL_VSX_REST_PARTIAL_TYPES(f32, s32)
+NPYV_IMPL_VSX_REST_PARTIAL_TYPES(u64, s64)
+NPYV_IMPL_VSX_REST_PARTIAL_TYPES(f64, s64)
 #endif // _NPY_SIMD_VSX_MEMORY_H
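A standalone sketch (not part of the patch) of the union bit-punning that NPYV_IMPL_VSX_REST_PARTIAL_TYPES relies on: the f32/f64 fill value is reinterpreted as an integer bit pattern so the unsigned/float variants can delegate to the s32/s64 implementations without altering the fill bits.

// Plain C illustration of the pun used above; the demo itself is hypothetical.
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    float fill = 1.0f;
    // same trick as the macro: read the float's raw bits through a union
    union { float from_f32; int32_t to_s32; } pun = {.from_f32 = fill};
    // the s32 partial load receives these bits; reinterpreting the loaded
    // vector back to f32 restores 1.0f exactly in the filled lanes
    printf("fill bits: 0x%08x\n", (uint32_t)pun.to_s32);  // 0x3f800000
    return 0;
}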