Speed up saturate_cast copies. · opencv/opencv@192bbe0 · GitHub
[go: up one dir, main page]

Skip to content

Commit 192bbe0

Browse files
committed
Speed up saturate_cast copies.
1 parent de855a1 commit 192bbe0

File tree

1 file changed

+142
-19
lines changed

1 file changed

+142
-19
lines changed

modules/imgproc/src/resize.cpp

Lines changed: 142 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3023,31 +3023,148 @@ struct DecimateAlpha
30233023
float alpha;
30243024
};
30253025

3026+
namespace
3027+
{
3028+
// Loads uchar values from ptr into the float vector `a`: the data goes through
// vx_load_expand_q, is reinterpreted as signed 32-bit lanes, then converted
// with v_cvt_f32.
static inline void vx_load_as(const uchar* ptr, v_float32& a)
3029+
{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(ptr))); }
3030+
3031+
// Loads schar values from ptr into the float vector `a` via vx_load_expand_q
// followed by v_cvt_f32 (no reinterpret step, unlike the uchar overload).
static inline void vx_load_as(const schar* ptr, v_float32& a)
3032+
{ a = v_cvt_f32(vx_load_expand_q(ptr)); }
3033+
3034+
// Loads ushort values from ptr into the float vector `a`: vx_load_expand, then
// reinterpret as signed 32-bit lanes, then v_cvt_f32.
static inline void vx_load_as(const ushort* ptr, v_float32& a)
3035+
{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(ptr))); }
3036+
3037+
// Loads short values from ptr into the float vector `a`: vx_load_expand, then
// v_reinterpret_as_s32, then v_cvt_f32.
// NOTE(review): the reinterpret looks redundant if vx_load_expand on short
// already yields signed 32-bit lanes -- confirm against the intrinsics headers.
static inline void vx_load_as(const short* ptr, v_float32& a)
3038+
{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(ptr))); }
3039+
3040+
// Loads int values from ptr with vx_load and converts them to float with
// v_cvt_f32, storing the result in `a`.
static inline void vx_load_as(const int* ptr, v_float32& a)
3041+
{ a = v_cvt_f32(vx_load(ptr)); }
3042+
3043+
// Loads float values from ptr directly into `a`; no conversion is applied.
static inline void vx_load_as(const float* ptr, v_float32& a)
3044+
{ a = vx_load(ptr); }
3045+
3046+
// Loads float16_t values from ptr into the float vector `a` through
// vx_load_expand.
static inline void vx_load_as(const float16_t* ptr, v_float32& a)
3047+
{ a = vx_load_expand(ptr); }
3048+
3049+
// Writes the v_float32::nlanes lanes of `a` to ptr as uchar.
// Active (#else) branch: the vector is spilled to a temporary float array and
// every element is converted individually with saturate_cast<uchar>.
// The #if 0 branch is a disabled alternative that first packs the rounded
// lanes to ushort with v_pack_u_store and then saturates each ushort to uchar.
static inline void v_store_as(uchar* ptr, const v_float32& a)
3050+
{
3051+
constexpr int VECSZ = v_float32::nlanes;
3052+
#if 0
3053+
ushort b[VECSZ];
3054+
v_pack_u_store(b, v_round(a));
3055+
for( int i = 0; i < VECSZ; ++i) ptr[i] = saturate_cast<uchar>(b[i]);
3056+
#else
3057+
float b[VECSZ];
3058+
v_store(b, a);
3059+
for( int i = 0; i < VECSZ; ++i) ptr[i] = saturate_cast<uchar>(b[i]);
3060+
#endif
3061+
}
3062+
3063+
// Writes the v_float32::nlanes lanes of `a` to ptr as schar.
// Active (#else) branch: the vector is spilled to a temporary float array and
// every element is converted individually with saturate_cast<schar>.
// The #if 0 branch is a disabled alternative that first packs the rounded
// lanes to short with v_pack_store and then saturates each short to schar.
static inline void v_store_as(schar* ptr, const v_float32& a)
3064+
{
3065+
constexpr int VECSZ = v_float32::nlanes;
3066+
#if 0
3067+
short b[VECSZ];
3068+
v_pack_store(b, v_round(a));
3069+
for( int i = 0; i < VECSZ; ++i) ptr[i] = saturate_cast<schar>(b[i]);
3070+
#else
3071+
float b[VECSZ];
3072+
v_store(b, a);
3073+
for( int i = 0; i < VECSZ; ++i) ptr[i] = saturate_cast<schar>(b[i]);
3074+
#endif
3075+
}
3076+
3077+
// Rounds the lanes of `a` with v_round and writes them to ptr as ushort using
// v_pack_u_store.
static inline void v_store_as(ushort* ptr, const v_float32& a)
3078+
{ v_pack_u_store(ptr, v_round(a)); }
3079+
3080+
// Rounds the lanes of `a` with v_round and writes them to ptr as short using
// v_pack_store.
static inline void v_store_as(short* ptr, const v_float32& a)
3081+
{ v_pack_store(ptr, v_round(a)); }
3082+
3083+
// Rounds the lanes of `a` with v_round and stores the integer result to ptr.
static inline void v_store_as(int* ptr, const v_float32& a)
3084+
{ v_store(ptr, v_round(a)); }
3085+
3086+
// Stores the lanes of `a` to ptr unchanged (float destination, no conversion).
static inline void v_store_as(float* ptr, const v_float32& a)
3087+
{ v_store(ptr, a); }
3088+
3089+
// Writes the lanes of `a` to ptr as float16_t using v_pack_store.
static inline void v_store_as(float16_t* ptr, const v_float32& a)
3090+
{ v_pack_store(ptr, a); }
3091+
30263092
// Primary template: returns a vector of type VT built from the scalar coeff.
// Only declared here; the behavior is supplied by explicit specializations
// for each supported VT.
template <typename VT>
30273093
VT vx_setall_local(double coeff);
30283094
template <>
30293095
v_float32 vx_setall_local(double coeff) {
30303096
return v_setall_f32(coeff);
30313097
}
30323098
// Vectorized part of "sum[i] = buf[i] * coeff".
// Processes VT::nlanes elements per iteration while x + step < col_end and
// leaves in x the index of the first element it did not handle, so the caller
// can finish the remainder with a scalar loop.
// The strict '<' means that when col_end is a multiple of step the last full
// vector is left to the caller's scalar loop.
// (Diff view: the '-' line is the previous signature, which took step as an
// argument; the '+' lines derive step from VT::nlanes instead.)
template <typename WT, typename VT>
3033-
void v_inter_area_set_sum(int step, int col_end, const WT *const buf, const VT &v_coeff,
3099+
void v_inter_area_set_sum(int col_end, const WT *const buf, const VT &v_coeff,
30343100
WT *sum, int &x) {
3101+
constexpr int step = VT::nlanes;
30353102
for (x = 0; x + step < col_end; x += step)
30363103
{
30373104
const VT line = vx_load(buf + x);
30383105
v_store(sum + x, line * v_coeff);
30393106
}
30403107
}
30413108
// Vectorized part of "sum[i] += buf[i] * coeff".
// Processes VT::nlanes elements per iteration while x + step < col_end and
// leaves in x the index of the first element it did not handle, so the caller
// can finish the remainder with a scalar loop.
// (Diff view: '-' lines are the previous version, which took step as an
// argument and used a non-const sum_x; '+' lines are the replacement.)
template <typename WT, typename VT>
3042-
void v_inter_area_update_sum(int step, int col_end, const WT *const buf, const VT &v_coeff,
3109+
void v_inter_area_update_sum(int col_end, const WT *const buf, const VT &v_coeff,
30433110
WT *sum, int &x) {
3111+
constexpr int step = VT::nlanes;
30443112
for (x = 0; x + step < col_end; x += step)
30453113
{
30463114
const VT line = vx_load(buf + x);
3047-
VT sum_x = vx_load(sum + x);
3115+
const VT sum_x = vx_load(sum + x);
30483116
v_store(sum + x, sum_x + line * v_coeff);
30493117
}
30503118
}
3119+
template <typename S>
3120+
void v_inter_area_copy_or_not(const S* s, int n, float *d, float const**buf)
3121+
{
3122+
static_assert(!std::is_same<S, float>::value, "Do not specialize for float");
3123+
constexpr int step = v_float32::nlanes;
3124+
int x = 0;
3125+
for(; x + step < n; x += step)
3126+
{
3127+
v_float32 a;
3128+
vx_load_as(s + x, a);
3129+
v_store(d + x, a);
3130+
}
3131+
for(; x < n; ++x) d[x] = saturate_cast<float>(s[x]);
3132+
*buf = d;
3133+
}
3134+
// double overload: no conversion is performed. n and d are ignored and *buf is
// pointed directly at the source row s.
void v_inter_area_copy_or_not(const double* s, int n, double *d, const double **buf)
3135+
{
3136+
(void)n;
3137+
(void)d;
3138+
*buf = s;
3139+
}
3140+
// float overload: no conversion is performed. n and d are ignored and *buf is
// pointed directly at the source row s.
void v_inter_area_copy_or_not(const float* s, int n, float *d, const float **buf)
3141+
{
3142+
(void)n;
3143+
(void)d;
3144+
*buf = s;
3145+
}
3146+
template <typename D>
3147+
void v_inter_area_copy(const float* s, int n, D *d)
3148+
{
3149+
static_assert(!std::is_same<float, D>::value, "Do not specialize for float");
3150+
constexpr int step = v_float32::nlanes;
3151+
int x = 0;
3152+
for(; x + step < n; x += step)
3153+
{
3154+
v_float32 a = vx_load(s + x);
3155+
v_store_as(d + x, a);
3156+
}
3157+
for(; x < n; ++x) d[x] = saturate_cast<D>(s[x]);
3158+
}
3159+
// double -> double: copies n elements from s to d unchanged with std::copy.
void v_inter_area_copy(const double* s, int n, double *d)
3160+
{
3161+
std::copy(s, s + n, d);
3162+
}
3163+
// float -> float: copies n elements from s to d unchanged with std::copy.
void v_inter_area_copy(const float* s, int n, float *d)
3164+
{
3165+
std::copy(s, s + n, d);
3166+
}
3167+
30513168
#if CV_SIMD128_64F
30523169
template <>
30533170
v_float64 vx_setall_local(double coeff) {
@@ -3060,26 +3177,25 @@ v_uint8 vx_setall_local(double coeff) {
30603177
return v_setall_u8(0);
30613178
}
30623179
// Specialization for double data with a v_uint8 placeholder vector type: does
// no vector work at all. All inputs are ignored and x is set to 0, so the
// caller's scalar loop processes the whole range.
// (Diff view: '-' lines are the previous version with the step parameter.)
template <>
3063-
void v_inter_area_set_sum(int step, int col_end, const double *const buf, const v_uint8 &v_coeff,
3180+
void v_inter_area_set_sum(int col_end, const double *const buf, const v_uint8 &v_coeff,
30643181
double *sum, int &x) {
3065-
(void)step;
30663182
(void)col_end;
30673183
(void)buf;
30683184
(void)v_coeff;
30693185
(void)sum;
30703186
x = 0;
30713187
}
30723188
// Specialization for double data with a v_uint8 placeholder vector type: does
// no vector work at all. All inputs are ignored and x is set to 0, so the
// caller's scalar loop processes the whole range.
// (Diff view: '-' lines are the previous version with the step parameter.)
template <>
3073-
void v_inter_area_update_sum(int step, int col_end, const double *const buf, const v_uint8 &v_coeff,
3189+
void v_inter_area_update_sum(int col_end, const double *const buf, const v_uint8 &v_coeff,
30743190
double *sum, int &x) {
3075-
(void)step;
30763191
(void)col_end;
30773192
(void)buf;
30783193
(void)v_coeff;
30793194
(void)sum;
30803195
x = 0;
30813196
}
30823197
#endif
3198+
}
30833199

30843200
template <typename T, typename WT, typename VT>
30853201
class ResizeArea_Invoker : public ParallelLoopBody
@@ -3104,10 +3220,11 @@ class ResizeArea_Invoker : public ParallelLoopBody
31043220
Size dsize = dst->size();
31053221
int cn = dst->channels();
31063222
dsize.width *= cn;
3107-
AutoBuffer<WT> _buffer(src->cols * cn + std::max(src->cols * cn, dst->rows * cn));
3223+
const int max_buffer_size = std::max(src->cols * cn, dst->rows * cn);
3224+
AutoBuffer<WT> _buffer(2 * max_buffer_size);
31083225
const DecimateAlpha* xtab = xtab0;
31093226
int xtab_size = xtab_size0;
3110-
WT *buf = _buffer.data(), *sum = buf + src->cols * cn;
3227+
WT *buf = _buffer.data(), *sum = buf + max_buffer_size;
31113228
int j_start = tabofs[range.start], j_end = tabofs[range.end], j, dx;
31123229

31133230
static_assert(
@@ -3119,7 +3236,6 @@ class ResizeArea_Invoker : public ParallelLoopBody
31193236
&& std::is_same<typename VTraits<VT>::lane_type, uint8_t>::value)
31203237
#endif
31213238
, "Lane type mismatch");
3122-
int step = VT().nlanes;
31233239
cv::Mat tmp(ytab[j_end - 1].di - ytab[j_start].di + 1, src->cols,
31243240
CV_MAKETYPE(cv::DataType<WT>::type, cn));
31253241
int prev_di = -1;
@@ -3141,12 +3257,13 @@ class ResizeArea_Invoker : public ParallelLoopBody
31413257
{
31423258
row_start = 0;
31433259
row_end = xtab_size;
3144-
col_end = dst->rows * cn;
3260+
col_end = range.size() * cn;
31453261
start_di = xtab[0].di;
31463262
}
31473263
prev_di = start_di;
31483264
for (dx = 0; dx < col_end; dx++) sum[dx] = (WT)0;
31493265
int di;
3266+
const WT* buf_local;
31503267
for (j = row_start; j < row_end; ++j)
31513268
{
31523269
WT coeff;
@@ -3156,16 +3273,16 @@ class ResizeArea_Invoker : public ParallelLoopBody
31563273
coeff = ytab[j].alpha;
31573274
di = ytab[j].di;
31583275
si = ytab[j].si;
3159-
// Convert the line to the proper float/double type.
31603276
const T* S = src->template ptr<T>(si);
3161-
std::copy(S, S + src->cols * cn, buf);
3277+
// Convert the line to the proper float/double type.
3278+
v_inter_area_copy_or_not(S, src->cols * cn, buf, &buf_local);
31623279
}
31633280
else
31643281
{
31653282
coeff = xtab[j].alpha;
31663283
di = xtab[j].di / cn;
31673284
si = xtab[j].si / cn;
3168-
buf = tmp.template ptr<WT>(si);
3285+
buf_local = tmp.template ptr<WT>(si);
31693286
}
31703287
const VT v_coeff = vx_setall_local<VT>(coeff);
31713288

@@ -3174,15 +3291,16 @@ class ResizeArea_Invoker : public ParallelLoopBody
31743291
int x;
31753292
WT* D = tmp.template ptr<WT>(prev_di - start_di);
31763293
for (x = 0; x < col_end; ++x) D[x] = sum[x];
3177-
v_inter_area_set_sum(step, col_end, buf, v_coeff, sum, x);
3178-
for (; x < col_end; ++x) sum[x] = buf[x] * coeff;
3294+
v_inter_area_set_sum(col_end, buf_local, v_coeff, sum, x);
3295+
for (; x < col_end; ++x) sum[x] = buf_local[x] * coeff;
31793296
prev_di = di;
31803297
}
31813298
else
31823299
{
31833300
int x;
3184-
v_inter_area_update_sum(step, col_end, buf, v_coeff, sum, x);
3185-
for (; x < col_end; ++x) sum[x] += buf[x] * coeff;
3301+
assert(col_end <= max_buffer_size);
3302+
v_inter_area_update_sum(col_end, buf_local, v_coeff, sum, x);
3303+
for (; x < col_end; ++x) sum[x] += buf_local[x] * coeff;
31863304
}
31873305
}
31883306
// Re-define di to please the compiler.
@@ -3203,11 +3321,16 @@ class ResizeArea_Invoker : public ParallelLoopBody
32033321
}
32043322
// Saturate_cast to dst.
32053323
for(int y = 0; y < tmp.rows; ++y) {
3206-
T* D = dst->template ptr<T>(ytab[j_start].di + y);
3324+
#if 0
3325+
T* D = dst->template ptr<T>(ytab[j_start].di + y);
32073326
const WT* S = tmp.template ptr<WT>(y);
32083327
for(int x = 0; x < dst->cols * cn; ++x) {
32093328
D[x] = saturate_cast<T>(S[x]);
32103329
}
3330+
#else
3331+
v_inter_area_copy(tmp.template ptr<WT>(y), dst->cols * cn,
3332+
dst->template ptr<T>(ytab[j_start].di + y));
3333+
#endif
32113334
}
32123335
}
32133336

0 commit comments

Comments
 (0)
0