@@ -3023,31 +3023,148 @@ struct DecimateAlpha
3023
3023
float alpha;
3024
3024
};
3025
3025
3026
+ namespace
3027
+ {
3028
+ static inline void vx_load_as (const uchar* ptr, v_float32& a)
3029
+ { a = v_cvt_f32 (v_reinterpret_as_s32 (vx_load_expand_q (ptr))); }
3030
+
3031
+ static inline void vx_load_as (const schar* ptr, v_float32& a)
3032
+ { a = v_cvt_f32 (vx_load_expand_q (ptr)); }
3033
+
3034
+ static inline void vx_load_as (const ushort* ptr, v_float32& a)
3035
+ { a = v_cvt_f32 (v_reinterpret_as_s32 (vx_load_expand (ptr))); }
3036
+
3037
+ static inline void vx_load_as (const short * ptr, v_float32& a)
3038
+ { a = v_cvt_f32 (v_reinterpret_as_s32 (vx_load_expand (ptr))); }
3039
+
3040
+ static inline void vx_load_as (const int * ptr, v_float32& a)
3041
+ { a = v_cvt_f32 (vx_load (ptr)); }
3042
+
3043
+ static inline void vx_load_as (const float * ptr, v_float32& a)
3044
+ { a = vx_load (ptr); }
3045
+
3046
+ static inline void vx_load_as (const float16_t * ptr, v_float32& a)
3047
+ { a = vx_load_expand (ptr); }
3048
+
3049
+ static inline void v_store_as (uchar* ptr, const v_float32& a)
3050
+ {
3051
+ constexpr int VECSZ = v_float32::nlanes;
3052
+ #if 0
3053
+ ushort b[VECSZ];
3054
+ v_pack_u_store(b, v_round(a));
3055
+ for( int i = 0; i < VECSZ; ++i) ptr[i] = saturate_cast<uchar>(b[i]);
3056
+ #else
3057
+ float b[VECSZ];
3058
+ v_store (b, a);
3059
+ for ( int i = 0 ; i < VECSZ; ++i) ptr[i] = saturate_cast<uchar>(b[i]);
3060
+ #endif
3061
+ }
3062
+
3063
+ static inline void v_store_as (schar* ptr, const v_float32& a)
3064
+ {
3065
+ constexpr int VECSZ = v_float32::nlanes;
3066
+ #if 0
3067
+ short b[VECSZ];
3068
+ v_pack_store(b, v_round(a));
3069
+ for( int i = 0; i < VECSZ; ++i) ptr[i] = saturate_cast<schar>(b[i]);
3070
+ #else
3071
+ float b[VECSZ];
3072
+ v_store (b, a);
3073
+ for ( int i = 0 ; i < VECSZ; ++i) ptr[i] = saturate_cast<schar>(b[i]);
3074
+ #endif
3075
+ }
3076
+
3077
+ static inline void v_store_as (ushort* ptr, const v_float32& a)
3078
+ { v_pack_u_store (ptr, v_round (a)); }
3079
+
3080
+ static inline void v_store_as (short * ptr, const v_float32& a)
3081
+ { v_pack_store (ptr, v_round (a)); }
3082
+
3083
+ static inline void v_store_as (int * ptr, const v_float32& a)
3084
+ { v_store (ptr, v_round (a)); }
3085
+
3086
+ static inline void v_store_as (float * ptr, const v_float32& a)
3087
+ { v_store (ptr, a); }
3088
+
3089
+ static inline void v_store_as (float16_t * ptr, const v_float32& a)
3090
+ { v_pack_store (ptr, a); }
3091
+
3026
3092
template <typename VT>
3027
3093
VT vx_setall_local (double coeff);
3028
3094
template <>
3029
3095
v_float32 vx_setall_local (double coeff) {
3030
3096
return v_setall_f32 (coeff);
3031
3097
}
3032
3098
// Vectorized "sum[i] = buf[i] * coeff" over the leading part of a row.
//
// col_end  number of elements in the row
// buf      source row, already converted to the working type WT
// v_coeff  the coefficient broadcast into every lane of VT
// sum      destination accumulator row
// x        out: index of the first element NOT processed here; the caller
//          finishes [x, col_end) with a scalar loop.
template <typename WT, typename VT>
void v_inter_area_set_sum(int col_end, const WT* const buf, const VT& v_coeff,
                          WT* sum, int& x)
{
    constexpr int step = VT::nlanes;
    // "<=" so that a final, completely filled vector is still handled here
    // instead of being left to the scalar tail.
    for (x = 0; x + step <= col_end; x += step)
    {
        const VT line = vx_load(buf + x);
        v_store(sum + x, line * v_coeff);
    }
}
3041
3108
// Vectorized "sum[i] += buf[i] * coeff" over the leading part of a row.
//
// col_end  number of elements in the row
// buf      source row, already converted to the working type WT
// v_coeff  the coefficient broadcast into every lane of VT
// sum      accumulator row, updated in place
// x        out: index of the first element NOT processed here; the caller
//          finishes [x, col_end) with a scalar loop.
template <typename WT, typename VT>
void v_inter_area_update_sum(int col_end, const WT* const buf, const VT& v_coeff,
                             WT* sum, int& x)
{
    constexpr int step = VT::nlanes;
    // "<=" so that a final, completely filled vector is still handled here
    // instead of being left to the scalar tail.
    for (x = 0; x + step <= col_end; x += step)
    {
        const VT line = vx_load(buf + x);
        const VT sum_x = vx_load(sum + x);
        v_store(sum + x, sum_x + line * v_coeff);
    }
}
3119
+ template <typename S>
3120
+ void v_inter_area_copy_or_not (const S* s, int n, float *d, float const **buf)
3121
+ {
3122
+ static_assert (!std::is_same<S, float >::value, " Do not specialize for float" );
3123
+ constexpr int step = v_float32::nlanes;
3124
+ int x = 0 ;
3125
+ for (; x + step < n; x += step)
3126
+ {
3127
+ v_float32 a;
3128
+ vx_load_as (s + x, a);
3129
+ v_store (d + x, a);
3130
+ }
3131
+ for (; x < n; ++x) d[x] = saturate_cast<float >(s[x]);
3132
+ *buf = d;
3133
+ }
3134
// double rows are already in the working type: nothing is copied, the length
// and scratch row are ignored, and `*buf` simply aliases the source row.
void v_inter_area_copy_or_not(const double* s, int /*n*/, double* /*d*/, const double** buf)
{
    *buf = s;
}
3140
// float rows are already in the working type: nothing is copied, the length
// and scratch row are ignored, and `*buf` simply aliases the source row.
void v_inter_area_copy_or_not(const float* s, int /*n*/, float* /*d*/, const float** buf)
{
    *buf = s;
}
3146
+ template <typename D>
3147
+ void v_inter_area_copy (const float * s, int n, D *d)
3148
+ {
3149
+ static_assert (!std::is_same<float , D>::value, " Do not specialize for float" );
3150
+ constexpr int step = v_float32::nlanes;
3151
+ int x = 0 ;
3152
+ for (; x + step < n; x += step)
3153
+ {
3154
+ v_float32 a = vx_load (s + x);
3155
+ v_store_as (d + x, a);
3156
+ }
3157
+ for (; x < n; ++x) d[x] = saturate_cast<D>(s[x]);
3158
+ }
3159
// double -> double: no conversion required, copy the n elements verbatim.
void v_inter_area_copy(const double* s, int n, double* d)
{
    std::copy_n(s, n, d);
}
3163
// float -> float: no conversion required, copy the n elements verbatim.
void v_inter_area_copy(const float* s, int n, float* d)
{
    std::copy_n(s, n, d);
}
3167
+
3051
3168
#if CV_SIMD128_64F
3052
3169
template <>
3053
3170
v_float64 vx_setall_local (double coeff) {
@@ -3060,26 +3177,25 @@ v_uint8 vx_setall_local(double coeff) {
3060
3177
return v_setall_u8 (0 );
3061
3178
}
3062
3179
template <>
3063
- void v_inter_area_set_sum (int step, int col_end, const double *const buf, const v_uint8 &v_coeff,
3180
+ void v_inter_area_set_sum (int col_end, const double *const buf, const v_uint8 &v_coeff,
3064
3181
double *sum, int &x) {
3065
- (void )step;
3066
3182
(void )col_end;
3067
3183
(void )buf;
3068
3184
(void )v_coeff;
3069
3185
(void )sum;
3070
3186
x = 0 ;
3071
3187
}
3072
3188
template <>
3073
- void v_inter_area_update_sum (int step, int col_end, const double *const buf, const v_uint8 &v_coeff,
3189
+ void v_inter_area_update_sum (int col_end, const double *const buf, const v_uint8 &v_coeff,
3074
3190
double *sum, int &x) {
3075
- (void )step;
3076
3191
(void )col_end;
3077
3192
(void )buf;
3078
3193
(void )v_coeff;
3079
3194
(void )sum;
3080
3195
x = 0 ;
3081
3196
}
3082
3197
#endif
3198
+ }
3083
3199
3084
3200
template <typename T, typename WT, typename VT>
3085
3201
class ResizeArea_Invoker : public ParallelLoopBody
@@ -3104,10 +3220,11 @@ class ResizeArea_Invoker : public ParallelLoopBody
3104
3220
Size dsize = dst->size ();
3105
3221
int cn = dst->channels ();
3106
3222
dsize.width *= cn;
3107
- AutoBuffer<WT> _buffer (src->cols * cn + std::max (src->cols * cn, dst->rows * cn));
3223
+ const int max_buffer_size = std::max (src->cols * cn, dst->rows * cn);
3224
+ AutoBuffer<WT> _buffer (2 * max_buffer_size);
3108
3225
const DecimateAlpha* xtab = xtab0;
3109
3226
int xtab_size = xtab_size0;
3110
- WT *buf = _buffer.data (), *sum = buf + src-> cols * cn ;
3227
+ WT *buf = _buffer.data (), *sum = buf + max_buffer_size ;
3111
3228
int j_start = tabofs[range.start ], j_end = tabofs[range.end ], j, dx;
3112
3229
3113
3230
static_assert (
@@ -3119,7 +3236,6 @@ class ResizeArea_Invoker : public ParallelLoopBody
3119
3236
&& std::is_same<typename VTraits<VT>::lane_type, uint8_t >::value)
3120
3237
#endif
3121
3238
, " Lane type mismatch" );
3122
- int step = VT ().nlanes ;
3123
3239
cv::Mat tmp (ytab[j_end - 1 ].di - ytab[j_start].di + 1 , src->cols ,
3124
3240
CV_MAKETYPE (cv::DataType<WT>::type, cn));
3125
3241
int prev_di = -1 ;
@@ -3141,12 +3257,13 @@ class ResizeArea_Invoker : public ParallelLoopBody
3141
3257
{
3142
3258
row_start = 0 ;
3143
3259
row_end = xtab_size;
3144
- col_end = dst-> rows * cn;
3260
+ col_end = range. size () * cn;
3145
3261
start_di = xtab[0 ].di ;
3146
3262
}
3147
3263
prev_di = start_di;
3148
3264
for (dx = 0 ; dx < col_end; dx++) sum[dx] = (WT)0 ;
3149
3265
int di;
3266
+ const WT* buf_local;
3150
3267
for (j = row_start; j < row_end; ++j)
3151
3268
{
3152
3269
WT coeff;
@@ -3156,16 +3273,16 @@ class ResizeArea_Invoker : public ParallelLoopBody
3156
3273
coeff = ytab[j].alpha ;
3157
3274
di = ytab[j].di ;
3158
3275
si = ytab[j].si ;
3159
- // Convert the line to the proper float/double type.
3160
3276
const T* S = src->template ptr <T>(si);
3161
- std::copy (S, S + src->cols * cn, buf);
3277
+ // Convert the line to the proper float/double type.
3278
+ v_inter_area_copy_or_not (S, src->cols * cn, buf, &buf_local);
3162
3279
}
3163
3280
else
3164
3281
{
3165
3282
coeff = xtab[j].alpha ;
3166
3283
di = xtab[j].di / cn;
3167
3284
si = xtab[j].si / cn;
3168
- buf = tmp.template ptr <WT>(si);
3285
+ buf_local = tmp.template ptr <WT>(si);
3169
3286
}
3170
3287
const VT v_coeff = vx_setall_local<VT>(coeff);
3171
3288
@@ -3174,15 +3291,16 @@ class ResizeArea_Invoker : public ParallelLoopBody
3174
3291
int x;
3175
3292
WT* D = tmp.template ptr <WT>(prev_di - start_di);
3176
3293
for (x = 0 ; x < col_end; ++x) D[x] = sum[x];
3177
- v_inter_area_set_sum (step, col_end, buf , v_coeff, sum, x);
3178
- for (; x < col_end; ++x) sum[x] = buf [x] * coeff;
3294
+ v_inter_area_set_sum (col_end, buf_local , v_coeff, sum, x);
3295
+ for (; x < col_end; ++x) sum[x] = buf_local [x] * coeff;
3179
3296
prev_di = di;
3180
3297
}
3181
3298
else
3182
3299
{
3183
3300
int x;
3184
- v_inter_area_update_sum (step, col_end, buf, v_coeff, sum, x);
3185
- for (; x < col_end; ++x) sum[x] += buf[x] * coeff;
3301
+ assert (col_end <= max_buffer_size);
3302
+ v_inter_area_update_sum (col_end, buf_local, v_coeff, sum, x);
3303
+ for (; x < col_end; ++x) sum[x] += buf_local[x] * coeff;
3186
3304
}
3187
3305
}
3188
3306
// Re-define di to please the compiler.
@@ -3203,11 +3321,16 @@ class ResizeArea_Invoker : public ParallelLoopBody
3203
3321
}
3204
3322
// Saturate_cast to dst.
3205
3323
for (int y = 0 ; y < tmp.rows ; ++y) {
3206
- T* D = dst->template ptr <T>(ytab[j_start].di + y);
3324
+ #if 0
3325
+ T* D = dst->template ptr<T>(ytab[j_start].di + y);
3207
3326
const WT* S = tmp.template ptr<WT>(y);
3208
3327
for(int x = 0; x < dst->cols * cn; ++x) {
3209
3328
D[x] = saturate_cast<T>(S[x]);
3210
3329
}
3330
+ #else
3331
+ v_inter_area_copy (tmp.template ptr <WT>(y), dst->cols * cn,
3332
+ dst->template ptr <T>(ytab[j_start].di + y));
3333
+ #endif
3211
3334
}
3212
3335
}
3213
3336
0 commit comments