#ifndef NPY_SIMD
    #error "Not a standalone header"
#endif

#ifndef _NPY_SIMD_VSX_MEMORY_H
#define _NPY_SIMD_VSX_MEMORY_H
/****************************
 * load/store
 ****************************/
// TODO: test load by cast
#define VSX__CAST_lOAD 0
#if VSX__CAST_lOAD
    #define npyv__load(PTR, T_VEC) (*((T_VEC*)(PTR)))
#else
    /**
     * Clang fails to load/store unaligned addresses via vec_xl and vec_xst,
     * so we fall back to vec_vsx_ld and vec_vsx_st.
     */
    #if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
        #define npyv__load(PTR, T_VEC) vec_vsx_ld(0, PTR)
    #else
        #define npyv__load(PTR, T_VEC) vec_xl(0, PTR)
    #endif
#endif
// unaligned load
#define npyv_load_u8(PTR)  npyv__load(PTR, npyv_u8)
#define npyv_load_s8(PTR)  npyv__load(PTR, npyv_s8)
#define npyv_load_u16(PTR) npyv__load(PTR, npyv_u16)
#define npyv_load_s16(PTR) npyv__load(PTR, npyv_s16)
#define npyv_load_u32(PTR) npyv__load(PTR, npyv_u32)
#define npyv_load_s32(PTR) npyv__load(PTR, npyv_s32)
#define npyv_load_f32(PTR) npyv__load(PTR, npyv_f32)
#define npyv_load_f64(PTR) npyv__load(PTR, npyv_f64)
#if VSX__CAST_lOAD
    #define npyv_load_u64(PTR) npyv__load(PTR, npyv_u64)
    #define npyv_load_s64(PTR) npyv__load(PTR, npyv_s64)
#else
    #define npyv_load_u64(PTR) ((npyv_u64)npyv_load_u32((const unsigned int*)PTR))
    #define npyv_load_s64(PTR) ((npyv_s64)npyv_load_s32((const int*)PTR))
#endif
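/**
 * Illustrative usage sketch (an editorial addition, not part of the npyv API
 * defined in this file): the unaligned loads above accept any address, so
 * reading from an offset that is not 16-byte aligned is fine. The buffer name
 * below is only an example.
 *
 *     const npy_int32 buf[5] = {0, 1, 2, 3, 4};
 *     npyv_s32 v = npyv_load_s32(buf + 1); // lanes {1, 2, 3, 4}, no alignment required
 */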
// aligned load
#define npyv_loada_u8(PTR)  vec_ld(0, PTR)
#define npyv_loada_s8  npyv_loada_u8
#define npyv_loada_u16 npyv_loada_u8
#define npyv_loada_s16 npyv_loada_u8
#define npyv_loada_u32 npyv_loada_u8
#define npyv_loada_s32 npyv_loada_u8
#define npyv_loada_u64 npyv_load_u64
#define npyv_loada_s64 npyv_load_s64
#define npyv_loada_f32 npyv_loada_u8
#define npyv_loada_f64 npyv_load_f64
// stream load
#define npyv_loads_u8  npyv_loada_u8
#define npyv_loads_s8  npyv_loada_s8
#define npyv_loads_u16 npyv_loada_u16
#define npyv_loads_s16 npyv_loada_s16
#define npyv_loads_u32 npyv_loada_u32
#define npyv_loads_s32 npyv_loada_s32
#define npyv_loads_u64 npyv_loada_u64
#define npyv_loads_s64 npyv_loada_s64
#define npyv_loads_f32 npyv_loada_f32
#define npyv_loads_f64 npyv_loada_f64
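/**
 * Illustrative note (editorial addition): the aligned variants route through
 * vec_ld, which expects 16-byte aligned addresses, while the 64-bit and f64
 * aligned/stream variants above simply reuse the unaligned loads. The stream
 * variants are plain aliases of the aligned ones here (no non-temporal hint).
 *
 *     _Alignas(16) npy_uint8 abuf[16] = {0}; // C11 alignment, just for illustration
 *     npyv_u8 v = npyv_loada_u8(abuf);
 */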
// load lower part
// go through npy_uint64* to avoid strict-aliasing issues
#ifdef __cplusplus
    template<typename T_PTR>
    NPY_FINLINE npy_uint64 *npyv__ptr2u64(T_PTR *ptr)
    { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
#else
    NPY_FINLINE npy_uint64 *npyv__ptr2u64(void *ptr)
    { npy_uint64 *ptr64 = ptr; return ptr64; }
#endif // __cplusplus
#if defined(__clang__) && !defined(__IBMC__)
    // vec_promote doesn't support doubleword on clang
    #define npyv_loadl_u64(PTR) npyv_setall_u64(*npyv__ptr2u64(PTR))
#else
    #define npyv_loadl_u64(PTR) vec_promote(*npyv__ptr2u64(PTR), 0)
#endif
#define npyv_loadl_u8(PTR)  ((npyv_u8)npyv_loadl_u64(PTR))
#define npyv_loadl_s8(PTR)  ((npyv_s8)npyv_loadl_u64(PTR))
#define npyv_loadl_u16(PTR) ((npyv_u16)npyv_loadl_u64(PTR))
#define npyv_loadl_s16(PTR) ((npyv_s16)npyv_loadl_u64(PTR))
#define npyv_loadl_u32(PTR) ((npyv_u32)npyv_loadl_u64(PTR))
#define npyv_loadl_s32(PTR) ((npyv_s32)npyv_loadl_u64(PTR))
#define npyv_loadl_s64(PTR) ((npyv_s64)npyv_loadl_u64(PTR))
#define npyv_loadl_f32(PTR) ((npyv_f32)npyv_loadl_u64(PTR))
#define npyv_loadl_f64(PTR) ((npyv_f64)npyv_loadl_u64(PTR))
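/**
 * Illustrative sketch (editorial addition): npyv_loadl_* reads exactly 64 bits
 * from memory into one doubleword of the register; the other doubleword is a
 * duplicate on the clang path above and unspecified on the vec_promote path,
 * so only the loaded half should be relied upon (e.g. via npyv_storel_* below).
 *
 *     const float pair[2] = {1.0f, 2.0f};
 *     npyv_f32 lo = npyv_loadl_f32(pair); // only 8 bytes are read
 */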
// unaligned store
#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
    #define npyv_store_u8(PTR, VEC) vec_vsx_st(VEC, 0, PTR)
#else
    #define npyv_store_u8(PTR, VEC) vec_xst(VEC, 0, PTR)
#endif
#define npyv_store_s8  npyv_store_u8
#define npyv_store_u16 npyv_store_u8
#define npyv_store_s16 npyv_store_u8
#define npyv_store_u32 npyv_store_u8
#define npyv_store_s32 npyv_store_u8
#define npyv_store_u64(PTR, VEC) npyv_store_u8((unsigned int*)PTR, (npyv_u32)VEC)
#define npyv_store_s64(PTR, VEC) npyv_store_u8((unsigned int*)PTR, (npyv_u32)VEC)
#define npyv_store_f32 npyv_store_u8
#define npyv_store_f64 npyv_store_u8
// aligned store
#define npyv_storea_u8(PTR, VEC) vec_st(VEC, 0, PTR)
#define npyv_storea_s8  npyv_storea_u8
#define npyv_storea_u16 npyv_storea_u8
#define npyv_storea_s16 npyv_storea_u8
#define npyv_storea_u32 npyv_storea_u8
#define npyv_storea_s32 npyv_storea_u8
#define npyv_storea_u64 npyv_store_u64
#define npyv_storea_s64 npyv_store_s64
#define npyv_storea_f32 npyv_storea_u8
#define npyv_storea_f64 npyv_store_f64
// stream store
#define npyv_stores_u8  npyv_storea_u8
#define npyv_stores_s8  npyv_storea_s8
#define npyv_stores_u16 npyv_storea_u16
#define npyv_stores_s16 npyv_storea_s16
#define npyv_stores_u32 npyv_storea_u32
#define npyv_stores_s32 npyv_storea_s32
#define npyv_stores_u64 npyv_storea_u64
#define npyv_stores_s64 npyv_storea_s64
#define npyv_stores_f32 npyv_storea_f32
#define npyv_stores_f64 npyv_storea_f64
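/**
 * Illustrative sketch (editorial addition): a plain copy loop combining the
 * unaligned load/store pairs above. npyv_nlanes_f64 is assumed to be the lane
 * count provided by the surrounding npyv headers, and copy_f64 is just an
 * example name.
 *
 *     static void copy_f64(double *dst, const double *src, npy_intp len)
 *     {
 *         npy_intp i = 0;
 *         for (; i + npyv_nlanes_f64 <= len; i += npyv_nlanes_f64) {
 *             npyv_store_f64(dst + i, npyv_load_f64(src + i));
 *         }
 *         for (; i < len; ++i) { // scalar tail
 *             dst[i] = src[i];
 *         }
 *     }
 */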
// store lower part
#define npyv_storel_u8(PTR, VEC) \
    *npyv__ptr2u64(PTR) = vec_extract(((npyv_u64)VEC), 0)
#define npyv_storel_s8  npyv_storel_u8
#define npyv_storel_u16 npyv_storel_u8
#define npyv_storel_s16 npyv_storel_u8
#define npyv_storel_u32 npyv_storel_u8
#define npyv_storel_s32 npyv_storel_u8
#define npyv_storel_s64 npyv_storel_u8
#define npyv_storel_u64 npyv_storel_u8
#define npyv_storel_f32 npyv_storel_u8
#define npyv_storel_f64 npyv_storel_u8
// store higher part
#define npyv_storeh_u8(PTR, VEC) \
    *npyv__ptr2u64(PTR) = vec_extract(((npyv_u64)VEC), 1)
#define npyv_storeh_s8  npyv_storeh_u8
#define npyv_storeh_u16 npyv_storeh_u8
#define npyv_storeh_s16 npyv_storeh_u8
#define npyv_storeh_u32 npyv_storeh_u8
#define npyv_storeh_s32 npyv_storeh_u8
#define npyv_storeh_s64 npyv_storeh_u8
#define npyv_storeh_u64 npyv_storeh_u8
#define npyv_storeh_f32 npyv_storeh_u8
#define npyv_storeh_f64 npyv_storeh_u8
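/**
 * Illustrative sketch (editorial addition): npyv_storel_*/npyv_storeh_* each
 * write exactly 64 bits, which makes it easy to scatter the two halves of a
 * full 128-bit register to separate destinations.
 *
 *     const double src[2] = {1.0, 2.0};
 *     double lo, hi;
 *     npyv_f64 v = npyv_load_f64(src);
 *     npyv_storel_f64(&lo, v); // lower doubleword
 *     npyv_storeh_f64(&hi, v); // upper doubleword
 */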

#endif // _NPY_SIMD_VSX_MEMORY_H