Speed up saturate_cast copies. · opencv/opencv@c5ffc7d · GitHub
[go: up one dir, main page]

Skip to content

Commit c5ffc7d

Browse files
committed
Speed up saturate_cast copies.
1 parent de855a1 commit c5ffc7d

File tree

1 file changed

+101
-19
lines changed

1 file changed

+101
-19
lines changed

modules/imgproc/src/resize.cpp

Lines changed: 101 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3023,31 +3023,107 @@ struct DecimateAlpha
30233023
float alpha;
30243024
};
30253025

3026+
namespace
3027+
{
3028+
static inline void vx_load_as(const uchar* ptr, v_float32& a)
3029+
{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(ptr))); }
3030+
3031+
static inline void vx_load_as(const ushort* ptr, v_float32& a)
3032+
{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(ptr))); }
3033+
3034+
static inline void vx_load_as(const short* ptr, v_float32& a)
3035+
{ a = v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(ptr))); }
3036+
3037+
static inline void v_store_as(uchar* ptr, const v_float32& a)
3038+
{
3039+
constexpr int VECSZ = v_float32::nlanes;
3040+
float b[VECSZ];
3041+
v_store(b, a);
3042+
for( int i = 0; i < VECSZ; ++i) ptr[i] = saturate_cast<uchar>(b[i]);
3043+
}
3044+
3045+
static inline void v_store_as(ushort* ptr, const v_float32& a)
3046+
{ v_pack_u_store(ptr, v_round(a)); }
3047+
3048+
static inline void v_store_as(short* ptr, const v_float32& a)
3049+
{ v_pack_store(ptr, v_round(a)); }
3050+
30263051
template <typename VT>
30273052
VT vx_setall_local(double coeff);
30283053
template <>
30293054
v_float32 vx_setall_local(double coeff) {
30303055
return v_setall_f32(coeff);
30313056
}
30323057
template <typename WT, typename VT>
3033-
void v_inter_area_set_sum(int step, int col_end, const WT *const buf, const VT &v_coeff,
3058+
void v_inter_area_set_sum(int col_end, const WT *const buf, const VT &v_coeff,
30343059
WT *sum, int &x) {
3060+
constexpr int step = VT::nlanes;
30353061
for (x = 0; x + step < col_end; x += step)
30363062
{
30373063
const VT line = vx_load(buf + x);
30383064
v_store(sum + x, line * v_coeff);
30393065
}
30403066
}
30413067
template <typename WT, typename VT>
3042-
void v_inter_area_update_sum(int step, int col_end, const WT *const buf, const VT &v_coeff,
3068+
void v_inter_area_update_sum(int col_end, const WT *const buf, const VT &v_coeff,
30433069
WT *sum, int &x) {
3070+
constexpr int step = VT::nlanes;
30443071
for (x = 0; x + step < col_end; x += step)
30453072
{
30463073
const VT line = vx_load(buf + x);
3047-
VT sum_x = vx_load(sum + x);
3074+
const VT sum_x = vx_load(sum + x);
30483075
v_store(sum + x, sum_x + line * v_coeff);
30493076
}
30503077
}
3078+
template <typename S>
3079+
void v_inter_area_copy_or_not(const S* s, int n, float *d, float const**buf)
3080+
{
3081+
static_assert(!std::is_same<S, float>::value, "Do not specialize for float");
3082+
constexpr int step = v_float32::nlanes;
3083+
int x = 0;
3084+
for(; x + step < n; x += step)
3085+
{
3086+
v_float32 a;
3087+
vx_load_as(s + x, a);
3088+
v_store(d + x, a);
3089+
}
3090+
for(; x < n; ++x) d[x] = saturate_cast<float>(s[x]);
3091+
*buf = d;
3092+
}
3093+
void v_inter_area_copy_or_not(const double* s, int n, double *d, const double **buf)
3094+
{
3095+
(void)n;
3096+
(void)d;
3097+
*buf = s;
3098+
}
3099+
void v_inter_area_copy_or_not(const float* s, int n, float *d, const float **buf)
3100+
{
3101+
(void)n;
3102+
(void)d;
3103+
*buf = s;
3104+
}
3105+
template <typename D>
3106+
void v_inter_area_copy(const float* s, int n, D *d)
3107+
{
3108+
static_assert(!std::is_same<float, D>::value, "Do not specialize for float");
3109+
constexpr int step = v_float32::nlanes;
3110+
int x = 0;
3111+
for(; x + step < n; x += step)
3112+
{
3113+
v_float32 a = vx_load(s + x);
3114+
v_store_as(d + x, a);
3115+
}
3116+
for(; x < n; ++x) d[x] = saturate_cast<D>(s[x]);
3117+
}
3118+
void v_inter_area_copy(const double* s, int n, double *d)
3119+
{
3120+
std::copy(s, s + n, d);
3121+
}
3122+
void v_inter_area_copy(const float* s, int n, float *d)
3123+
{
3124+
std::copy(s, s + n, d);
3125+
}
3126+
30513127
#if CV_SIMD128_64F
30523128
template <>
30533129
v_float64 vx_setall_local(double coeff) {
@@ -3060,26 +3136,25 @@ v_uint8 vx_setall_local(double coeff) {
30603136
return v_setall_u8(0);
30613137
}
30623138
template <>
3063-
void v_inter_area_set_sum(int step, int col_end, const double *const buf, const v_uint8 &v_coeff,
3139+
void v_inter_area_set_sum(int col_end, const double *const buf, const v_uint8 &v_coeff,
30643140
double *sum, int &x) {
3065-
(void)step;
30663141
(void)col_end;
30673142
(void)buf;
30683143
(void)v_coeff;
30693144
(void)sum;
30703145
x = 0;
30713146
}
30723147
template <>
3073-
void v_inter_area_update_sum(int step, int col_end, const double *const buf, const v_uint8 &v_coeff,
3148+
void v_inter_area_update_sum(int col_end, const double *const buf, const v_uint8 &v_coeff,
30743149
double *sum, int &x) {
3075-
(void)step;
30763150
(void)col_end;
30773151
(void)buf;
30783152
(void)v_coeff;
30793153
(void)sum;
30803154
x = 0;
30813155
}
30823156
#endif
3157+
}
30833158

30843159
template <typename T, typename WT, typename VT>
30853160
class ResizeArea_Invoker : public ParallelLoopBody
@@ -3104,10 +3179,11 @@ class ResizeArea_Invoker : public ParallelLoopBody
31043179
Size dsize = dst->size();
31053180
int cn = dst->channels();
31063181
dsize.width *= cn;
3107-
AutoBuffer<WT> _buffer(src->cols * cn + std::max(src->cols * cn, dst->rows * cn));
3182+
const int max_buffer_size = std::max(src->cols * cn, dst->rows * cn);
3183+
AutoBuffer<WT> _buffer(2 * max_buffer_size);
31083184
const DecimateAlpha* xtab = xtab0;
31093185
int xtab_size = xtab_size0;
3110-
WT *buf = _buffer.data(), *sum = buf + src->cols * cn;
3186+
WT *buf = _buffer.data(), *sum = buf + max_buffer_size;
31113187
int j_start = tabofs[range.start], j_end = tabofs[range.end], j, dx;
31123188

31133189
static_assert(
@@ -3119,7 +3195,6 @@ class ResizeArea_Invoker : public ParallelLoopBody
31193195
&& std::is_same<typename VTraits<VT>::lane_type, uint8_t>::value)
31203196
#endif
31213197
, "Lane type mismatch");
3122-
int step = VT().nlanes;
31233198
cv::Mat tmp(ytab[j_end - 1].di - ytab[j_start].di + 1, src->cols,
31243199
CV_MAKETYPE(cv::DataType<WT>::type, cn));
31253200
int prev_di = -1;
@@ -3141,12 +3216,13 @@ class ResizeArea_Invoker : public ParallelLoopBody
31413216
{
31423217
row_start = 0;
31433218
row_end = xtab_size;
3144-
col_end = dst->rows * cn;
3219+
col_end = range.size() * cn;
31453220
start_di = xtab[0].di;
31463221
}
31473222
prev_di = start_di;
31483223
for (dx = 0; dx < col_end; dx++) sum[dx] = (WT)0;
31493224
int di;
3225+
const WT* buf_local;
31503226
for (j = row_start; j < row_end; ++j)
31513227
{
31523228
WT coeff;
@@ -3156,16 +3232,16 @@ class ResizeArea_Invoker : public ParallelLoopBody
31563232
coeff = ytab[j].alpha;
31573233
di = ytab[j].di;
31583234
si = ytab[j].si;
3159-
// Convert the line to the proper float/double type.
31603235
const T* S = src->template ptr<T>(si);
3161-
std::copy(S, S + src->cols * cn, buf);
3236+
// Convert the line to the proper float/double type.
3237+
v_inter_area_copy_or_not(S, src->cols * cn, buf, &buf_local);
31623238
}
31633239
else
31643240
{
31653241
coeff = xtab[j].alpha;
31663242
di = xtab[j].di / cn;
31673243
si = xtab[j].si / cn;
3168-
buf = tmp.template ptr<WT>(si);
3244+
buf_local = tmp.template ptr<WT>(si);
31693245
}
31703246
const VT v_coeff = vx_setall_local<VT>(coeff);
31713247

@@ -3174,15 +3250,16 @@ class ResizeArea_Invoker : public ParallelLoopBody
31743250
int x;
31753251
WT* D = tmp.template ptr<WT>(prev_di - start_di);
31763252
for (x = 0; x < col_end; ++x) D[x] = sum[x];
3177-
v_inter_area_set_sum(step, col_end, buf, v_coeff, sum, x);
3178-
for (; x < col_end; ++x) sum[x] = buf[x] * coeff;
3253+
v_inter_area_set_sum(col_end, buf_local, v_coeff, sum, x);
3254+
for (; x < col_end; ++x) sum[x] = buf_local[x] * coeff;
31793255
prev_di = di;
31803256
}
31813257
else
31823258
{
31833259
int x;
3184-
v_inter_area_update_sum(step, col_end, buf, v_coeff, sum, x);
3185-
for (; x < col_end; ++x) sum[x] += buf[x] * coeff;
3260+
assert(col_end <= max_buffer_size);
3261+
v_inter_area_update_sum(col_end, buf_local, v_coeff, sum, x);
3262+
for (; x < col_end; ++x) sum[x] += buf_local[x] * coeff;
31863263
}
31873264
}
31883265
// Re-define di to please the compiler.
@@ -3203,11 +3280,16 @@ class ResizeArea_Invoker : public ParallelLoopBody
32033280
}
32043281
// Saturate_cast to dst.
32053282
for(int y = 0; y < tmp.rows; ++y) {
3206-
T* D = dst->template ptr<T>(ytab[j_start].di + y);
3283+
#if 0
3284+
T* D = dst->template ptr<T>(ytab[j_start].di + y);
32073285
const WT* S = tmp.template ptr<WT>(y);
32083286
for(int x = 0; x < dst->cols * cn; ++x) {
32093287
D[x] = saturate_cast<T>(S[x]);
32103288
}
3289+
#else
3290+
v_inter_area_copy(tmp.template ptr<WT>(y), dst->cols * cn,
3291+
dst->template ptr<T>(ytab[j_start].di + y));
3292+
#endif
32113293
}
32123294
}
32133295

0 commit comments

Comments
 (0)
0