@@ -3023,31 +3023,107 @@ struct DecimateAlpha
3023
3023
float alpha;
3024
3024
};
3025
3025
3026
+ namespace
3027
+ {
3028
+ static inline void vx_load_as (const uchar* ptr, v_float32& a)
3029
+ { a = v_cvt_f32 (v_reinterpret_as_s32 (vx_load_expand_q (ptr))); }
3030
+
3031
+ static inline void vx_load_as (const ushort* ptr, v_float32& a)
3032
+ { a = v_cvt_f32 (v_reinterpret_as_s32 (vx_load_expand (ptr))); }
3033
+
3034
+ static inline void vx_load_as (const short * ptr, v_float32& a)
3035
+ { a = v_cvt_f32 (v_reinterpret_as_s32 (vx_load_expand (ptr))); }
3036
+
3037
+ static inline void v_store_as (uchar* ptr, const v_float32& a)
3038
+ {
3039
+ constexpr int VECSZ = v_float32::nlanes;
3040
+ float b[VECSZ];
3041
+ v_store (b, a);
3042
+ for ( int i = 0 ; i < VECSZ; ++i) ptr[i] = saturate_cast<uchar>(b[i]);
8000
3043
+ }
3044
+
3045
+ static inline void v_store_as (ushort* ptr, const v_float32& a)
3046
+ { v_pack_u_store (ptr, v_round (a)); }
3047
+
3048
+ static inline void v_store_as (short * ptr, const v_float32& a)
3049
+ { v_pack_store (ptr, v_round (a)); }
3050
+
3026
3051
template <typename VT>
3027
3052
VT vx_setall_local (double coeff);
3028
3053
template <>
3029
3054
v_float32 vx_setall_local (double coeff) {
3030
3055
return v_setall_f32 (coeff);
3031
3056
}
3032
3057
// Initialises the running sum from one buffered line, using whole vectors:
//     sum[i] = buf[i] * v_coeff    for every i in [0, x)
//
// col_end  number of elements of `buf` / `sum` that have to be processed.
// buf      input line (read only).
// v_coeff  coefficient vector multiplied into every loaded vector (the
//          caller builds it with vx_setall_local<VT>(coeff)).
// sum      accumulator; the processed range is overwritten.
// x        out: first index that was NOT handled here. The caller is
//          expected to finish [x, col_end) with scalar code.
template <typename WT, typename VT>
void v_inter_area_set_sum(int col_end, const WT* const buf, const VT& v_coeff,
                          WT* sum, int& x)
{
    constexpr int step = VT::nlanes;
    // `<=` (not `<`): a trailing vector that fits exactly is still in
    // bounds, so process it here instead of leaving it to the scalar tail.
    for (x = 0; x + step <= col_end; x += step)
    {
        const VT line = vx_load(buf + x);
        v_store(sum + x, line * v_coeff);
    }
}
3041
3067
// Accumulates one scaled buffered line into the running sum, using whole
// vectors:
//     sum[i] += buf[i] * v_coeff    for every i in [0, x)
//
// col_end  number of elements of `buf` / `sum` that have to be processed.
// buf      input line (read only).
// v_coeff  coefficient vector multiplied into every loaded vector (the
//          caller builds it with vx_setall_local<VT>(coeff)).
// sum      accumulator; updated in place for the processed range.
// x        out: first index that was NOT handled here. The caller is
//          expected to finish [x, col_end) with scalar code.
template <typename WT, typename VT>
void v_inter_area_update_sum(int col_end, const WT* const buf, const VT& v_coeff,
                             WT* sum, int& x)
{
    constexpr int step = VT::nlanes;
    // `<=` (not `<`): a trailing vector that fits exactly is still in
    // bounds, so process it here instead of leaving it to the scalar tail.
    for (x = 0; x + step <= col_end; x += step)
    {
        const VT line = vx_load(buf + x);
        const VT sum_x = vx_load(sum + x);
        v_store(sum + x, sum_x + line * v_coeff);
    }
}
3078
+ template <typename S>
3079
+ void v_inter_area_copy_or_not (const S* s, int n, float *d, float const **buf)
3080
+ {
3081
+ static_assert (!std::is_same<S, float >::value, " Do not specialize for float" );
3082
+ constexpr int step = v_float32::nlanes;
3083
+ int x = 0 ;
3084
+ for (; x + step < n; x += step)
3085
+ {
3086
+ v_float32 a;
3087
+ vx_load_as (s + x, a);
3088
+ v_store (d + x, a);
3089
+ }
3090
+ for (; x < n; ++x) d[x] = saturate_cast<float >(s[x]);
3091
+ *buf = d;
3092
+ }
3093
// double -> double: no conversion is required, so nothing is copied.
// The scratch buffer and the element count are ignored and `*buf` is
// pointed straight at the source line.
void v_inter_area_copy_or_not(const double* s, int /*n*/, double* /*d*/, const double** buf)
{
    *buf = s;
}
3099
// float -> float: no conversion is required, so nothing is copied.
// The scratch buffer and the element count are ignored and `*buf` is
// pointed straight at the source line.
void v_inter_area_copy_or_not(const float* s, int /*n*/, float* /*d*/, const float** buf)
{
    *buf = s;
}
3105
+ template <typename D>
3106
+ void v_inter_area_copy (const float * s, int n, D *d)
3107
+ {
3108
+ static_assert (!std::is_same<float , D>::value, " Do not specialize for float" );
3109
+ constexpr int step = v_float32::nlanes;
3110
+ int x = 0 ;
3111
+ for (; x + step < n; x += step)
3112
+ {
3113
+ v_float32 a = vx_load (s + x);
3114
+ v_store_as (d + x, a);
3115
+ }
3116
+ for (; x < n; ++x) d[x] = saturate_cast<D>(s[x]);
3117
+ }
3118
// double -> double: copies `n` elements from `s` to `d` unchanged.
void v_inter_area_copy(const double* s, int n, double* d)
{
    const double* const last = s + n;
    std::copy(s, last, d);
}
3122
// float -> float: copies `n` elements from `s` to `d` unchanged.
void v_inter_area_copy(const float* s, int n, float* d)
{
    const float* const last = s + n;
    std::copy(s, last, d);
}
3126
+
3051
3127
#if CV_SIMD128_64F
3052
3128
template <>
3053
3129
v_float64 vx_setall_local (double coeff) {
@@ -3060,26 +3136,25 @@ v_uint8 vx_setall_local(double coeff) {
3060
3136
return v_setall_u8 (0 );
3061
3137
}
3062
3138
template <>
3063
- void v_inter_area_set_sum (int step, int col_end, const double *const buf, const v_uint8 &v_coeff,
3139
+ void v_inter_area_set_sum (int col_end, const double *const buf, const v_uint8 &v_coeff,
3064
3140
double *sum, int &x) {
3065
- (void )step;
3066
3141
(void )col_end;
3067
3142
(void )buf;
3068
3143
(void )v_coeff;
3069
3144
(void )sum;
3070
3145
x = 0 ;
3071
3146
}
3072
3147
template <>
3073
- void v_inter_area_update_sum (int step, int col_end, const double *const buf, const v_uint8 &v_coeff,
3148
+ void v_inter_area_update_sum (int col_end, const double *const buf, const v_uint8 &v_coeff,
3074
3149
double *sum, int &x) {
3075
- (void )step;
3076
3150
(void )col_end;
3077
3151
(void )buf;
3078
3152
(void )v_coeff;
3079
3153
(void )sum;
3080
3154
x = 0 ;
3081
3155
}
3082
3156
#endif
3157
+ }
3083
3158
3084
3159
template <typename T, typename WT, typename VT>
3085
3160
class ResizeArea_Invoker : public ParallelLoopBody
@@ -3104,10 +3179,11 @@ class ResizeArea_Invoker : public ParallelLoopBody
3104
3179
Size dsize = dst->size ();
3105
3180
int cn = dst->channels ();
3106
3181
dsize.width *= cn;
3107
- AutoBuffer<WT> _buffer (src->cols * cn + std::max (src->cols * cn, dst->rows * cn));
3182
+ const int max_buffer_size = std::max (src->cols * cn, dst->rows * cn);
3183
+ AutoBuffer<WT> _buffer (2 * max_buffer_size);
3108
3184
const DecimateAlpha* xtab = xtab0;
3109
3185
int xtab_size = xtab_size0;
3110
- WT *buf = _buffer.data (), *sum = buf + src-> cols * cn ;
3186
+ WT *buf = _buffer.data (), *sum = buf + max_buffer_size ;
3111
3187
int j_start = tabofs[range.start ], j_end = tabofs[range.end ], j, dx;
3112
3188
3113
3189
static_assert (
@@ -3119,7 +3195,6 @@ class ResizeArea_Invoker : public ParallelLoopBody
3119
3195
&& std::is_same<typename VTraits<VT>::lane_type, uint8_t >::value)
3120
3196
#endif
3121
3197
, " Lane type mismatch" );
3122
- int step = VT ().nlanes ;
3123
3198
cv::Mat tmp (ytab[j_end - 1 ].di - ytab[j_start].di + 1 , src->cols ,
3124
3199
CV_MAKETYPE (cv::DataType<WT>::type, cn));
3125
3200
int prev_di = -1 ;
@@ -3141,12 +3216,13 @@ class ResizeArea_Invoker : public ParallelLoopBody
3141
3216
{
3142
3217
row_start = 0 ;
3143
3218
row_end = xtab_size;
3144
- col_end = dst-> rows * cn;
3219
+ col_end = range. size () * cn;
3145
3220
start_di = xtab[0 ].di ;
3146
3221
}
3147
3222
prev_di = start_di;
3148
3223
for (dx = 0 ; dx < col_end; dx++) sum[dx] = (WT)0 ;
3149
3224
int di;
3225
+ const WT* buf_local;
3150
3226
for (j = row_start; j < row_end; ++j)
3151
3227
{
3152
3228
WT coeff;
@@ -3156,16 +3232,16 @@ class ResizeArea_Invoker : public ParallelLoopBody
3156
3232
coeff = ytab[j].alpha ;
3157
3233
di = ytab[j].di ;
3158
3234
si = ytab[j].si ;
3159
- // Convert the line to the proper float/double type.
3160
3235
const T* S = src->template ptr <T>(si);
3161
- std::copy (S, S + src->cols * cn, buf);
3236
+ // Convert the line to the proper float/double type.
3237
+ v_inter_area_copy_or_not (S, src->cols * cn, buf, &buf_local);
3162
3238
}
3163
3239
else
3164
3240
{
3165
3241
coeff = xtab[j].alpha ;
3166
3242
di = xtab[j].di / cn;
3167
3243
si = xtab[j].si / cn;
3168
- buf = tmp.template ptr <WT>(si);
3244
+ buf_local = tmp.template ptr <WT>(si);
3169
3245
}
3170
3246
const VT v_coeff = vx_setall_local<VT>(coeff);
3171
3247
@@ -3174,15 +3250,16 @@ class ResizeArea_Invoker : public ParallelLoopBody
3174
3250
int x;
3175
3251
WT* D = tmp.template ptr <WT>(prev_di - start_di);
3176
3252
for (x = 0 ; x < col_end; ++x) D[x] = sum[x];
3177
- v_inter_area_set_sum (step, col_end, buf , v_coeff, sum, x);
3178
- for (; x < col_end; ++x) sum[x] = buf [x] * coeff;
3253
+ v_inter_area_set_sum (col_end, buf_local , v_coeff, sum, x);
3254
+ for (; x < col_end; ++x) sum[x] = buf_local [x] * coeff;
3179
3255
prev_di = di;
3180
3256
}
3181
3257
else
3182
3258
{
3183
3259
int x;
3184
- v_inter_area_update_sum (step, col_end, buf, v_coeff, sum, x);
3185
- for (; x < col_end; ++x) sum[x] += buf[x] * coeff;
3260
+ assert (col_end <= max_buffer_size);
3261
+ v_inter_area_update_sum (col_end, buf_local, v_coeff, sum, x);
3262
+ for (; x < col_end; ++x) sum[x] += buf_local[x] * coeff;
3186
3263
}
3187
3264
}
3188
3265
// Re-define di to please the compiler.
@@ -3203,11 +3280,16 @@ class ResizeArea_Invoker : public ParallelLoopBody
3203
3280
}
3204
3281
// Saturate_cast to dst.
3205
3282
for (int y = 0 ; y < tmp.rows ; ++y) {
3206
- T* D = dst->template ptr <T>(ytab[j_start].di + y);
3283
+ #if 0
3284
+ T* D = dst->template ptr<T>(ytab[j_start].di + y);
3207
3285
const WT* S = tmp.template ptr<WT>(y);
3208
3286
for(int x = 0; x < dst->cols * cn; ++x) {
3209
3287
D[x] = saturate_cast<T>(S[x]);
3210
3288
}
3289
+ #else
3290
+ v_inter_area_copy (tmp.template ptr <WT>(y), dst->cols * cn,
3291
+ dst->template ptr <T>(ytab[j_start].di + y));
3292
+ #endif
3211
3293
}
3212
3294
}
3213
3295
0 commit comments