Merge pull request #24412 from vrabaud:inter_area1 · thewoz/opencv@ae66990 · GitHub

Commit ae66990

vrabaud authored and thewoz committed
Merge pull request opencv#24412 from vrabaud:inter_area1
Speed up line merging in INTER_AREA opencv#24412

This provides a 10 to 20% speed-up. Related perf test fix: opencv#24417

This is a split of opencv#23525 that will be updated to only deal with column merging.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable. Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
1 parent b26398a commit ae66990
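
For context, the code path this merge speeds up is the general INTER_AREA downscale inside cv::resize. Below is a minimal usage sketch of how that path is exercised; the image size and scale factors are illustrative and not taken from the patch.

// Illustrative only: a call that exercises the INTER_AREA area-averaging path.
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

int main() {
    cv::Mat src(1080, 1920, CV_8UC3, cv::Scalar(40, 80, 120));  // synthetic input
    cv::Mat dst;
    // A non-integer decimation ratio (here 1/0.4 = 2.5) goes through the general
    // area-averaging code (ResizeArea_Invoker), whose row merging this PR vectorizes.
    cv::resize(src, dst, cv::Size(), 0.4, 0.4, cv::INTER_AREA);
    return 0;
}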

File tree

1 file changed: +109 -14 lines changed


modules/imgproc/src/resize.cpp

Lines changed: 109 additions & 14 deletions
@@ -3019,6 +3019,111 @@ struct DecimateAlpha
 };
 
 
+namespace inter_area {
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+inline void saturate_store(const float* src, uchar* dst) {
+    const v_int32 tmp0 = v_round(vx_load(src + 0 * VTraits<v_float32>::vlanes()));
+    const v_int32 tmp1 = v_round(vx_load(src + 1 * VTraits<v_float32>::vlanes()));
+    const v_int32 tmp2 = v_round(vx_load(src + 2 * VTraits<v_float32>::vlanes()));
+    const v_int32 tmp3 = v_round(vx_load(src + 3 * VTraits<v_float32>::vlanes()));
+    v_store(dst, v_pack(v_pack_u(tmp0, tmp1), v_pack_u(tmp2, tmp3)));
+}
+
+inline void saturate_store(const float* src, ushort* dst) {
+    const v_int32 tmp0 = v_round(vx_load(src + 0 * VTraits<v_float32>::vlanes()));
+    const v_int32 tmp1 = v_round(vx_load(src + 1 * VTraits<v_float32>::vlanes()));
+    v_store(dst, v_pack_u(tmp0, tmp1));
+}
+
+inline void saturate_store(const float* src, short* dst) {
+    const v_int32 tmp0 = v_round(vx_load(src + 0 * VTraits<v_float32>::vlanes()));
+    const v_int32 tmp1 = v_round(vx_load(src + 1 * VTraits<v_float32>::vlanes()));
+    v_store(dst, v_pack(tmp0, tmp1));
+}
+
+static inline v_float32 vx_setall(float coeff) { return vx_setall_f32(coeff); }
+
+template <typename T>
+struct VArea {};
+
+template <>
+struct VArea<float> {
+    typedef v_float32 vWT;
+};
+#endif
+
+#if (CV_SIMD128_64F || CV_SIMD_SCALABLE_64F)
+static inline v_float64 vx_setall(double coeff) { return vx_setall_f64(coeff); }
+
+template <>
+struct VArea<double> {
+    typedef v_float64 vWT;
+};
+
+#else
+inline void mul(const double* buf, int width, double beta, double* sum) {
+    for (int dx = 0; dx < width; ++dx) {
+        sum[dx] = beta * buf[dx];
+    }
+}
+
+inline void muladd(const double* buf, int width, double beta, double* sum) {
+    for (int dx = 0; dx < width; ++dx) {
+        sum[dx] += beta * buf[dx];
+    }
+}
+#endif
+
+template <typename T, typename WT>
+inline void saturate_store(const WT* sum, int width, T* D) {
+    int dx = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int step = VTraits<typename VArea<WT>::vWT>::vlanes() * sizeof(WT) / sizeof(T);
+    for (; dx + step < width; dx += step) {
+        saturate_store(sum + dx, D + dx);
+    }
+#endif
+    for (; dx < width; ++dx) {
+        D[dx] = saturate_cast<T>(sum[dx]);
+    }
+}
+
+// Optimization when T == WT.
+template <typename WT>
+inline void saturate_store(const WT* sum, int width, WT* D) {
+    std::copy(sum, sum + width, D);
+}
+
+template <typename WT>
+inline void mul(const WT* buf, int width, WT beta, WT* sum) {
+    int dx = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int step = VTraits<typename VArea<WT>::vWT>::vlanes();
+    for (; dx + step < width; dx += step) {
+        vx_store(sum + dx, v_mul(vx_setall(beta), vx_load(buf + dx)));
+    }
+#endif
+    for (; dx < width; ++dx) {
+        sum[dx] = beta * buf[dx];
+    }
+}
+
+template <typename WT>
+inline void muladd(const WT* buf, int width, WT beta, WT* sum) {
+    int dx = 0;
+#if (CV_SIMD || CV_SIMD_SCALABLE)
+    const int step = VTraits<typename VArea<WT>::vWT>::vlanes();
+    for (; dx + step < width; dx += step) {
+        vx_store(sum + dx, v_add(vx_load(sum + dx), v_mul(vx_setall(beta), vx_load(buf + dx))));
+    }
+#endif
+    for (; dx < width; ++dx) {
+        sum[dx] += beta * buf[dx];
+    }
+}
+
+} // namespace inter_area
+
 template<typename T, typename WT> class ResizeArea_Invoker :
     public ParallelLoopBody
 {
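
The helpers added above factor the vertical merge into three whole-row operations: mul starts a new weighted row sum, muladd folds another weighted source row into it, and saturate_store rounds and clamps the accumulated row into the destination type (with a plain copy when the destination type equals the working type). The following standalone scalar sketch models the same scheme for a float working type and an 8-bit destination; the names and values are illustrative and this is not OpenCV code.

// Standalone scalar model of the row-merging helpers (illustrative only).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Start a new accumulated row: sum = beta * buf.
static void row_mul(const float* buf, int width, float beta, float* sum) {
    for (int dx = 0; dx < width; ++dx)
        sum[dx] = beta * buf[dx];
}

// Fold another weighted source row into the running sum: sum += beta * buf.
static void row_muladd(const float* buf, int width, float beta, float* sum) {
    for (int dx = 0; dx < width; ++dx)
        sum[dx] += beta * buf[dx];
}

// Round and clamp the accumulated row into an 8-bit destination row.
static void row_saturate_store(const float* sum, int width, uint8_t* D) {
    for (int dx = 0; dx < width; ++dx) {
        float v = std::round(sum[dx]);
        D[dx] = static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, v)));
    }
}

int main() {
    // Two source rows merged into one destination row with weights 0.5 + 0.5.
    std::vector<float> row0 = {10.f, 300.f, -5.f, 128.f};
    std::vector<float> row1 = {20.f, 280.f, 15.f, 130.f};
    std::vector<float> sum(row0.size());
    std::vector<uint8_t> out(row0.size());

    row_mul(row0.data(), (int)row0.size(), 0.5f, sum.data());
    row_muladd(row1.data(), (int)row1.size(), 0.5f, sum.data());
    row_saturate_store(sum.data(), (int)sum.size(), out.data());

    for (uint8_t v : out) std::printf("%d ", v);  // prints: 15 255 5 129
    std::printf("\n");
    return 0;
}

The SIMD versions in the diff do the same work a full vector register at a time and fall back to this scalar form for the tail of each row.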
@@ -3120,27 +3225,17 @@ template<typename T, typename WT> class ResizeArea_Invoker :
 
             if( dy != prev_dy )
             {
-                T* D = dst->template ptr<T>(prev_dy);
-
-                for( dx = 0; dx < dsize.width; dx++ )
-                {
-                    D[dx] = saturate_cast<T>(sum[dx]);
-                    sum[dx] = beta*buf[dx];
-                }
+                inter_area::saturate_store(sum, dsize.width, dst->template ptr<T>(prev_dy));
+                inter_area::mul(buf, dsize.width, beta, sum);
                 prev_dy = dy;
             }
             else
             {
-                for( dx = 0; dx < dsize.width; dx++ )
-                    sum[dx] += beta*buf[dx];
+                inter_area::muladd(buf, dsize.width, beta, sum);
             }
         }
 
-        {
-            T* D = dst->template ptr<T>(prev_dy);
-            for( dx = 0; dx < dsize.width; dx++ )
-                D[dx] = saturate_cast<T>(sum[dx]);
-        }
+        inter_area::saturate_store(sum, dsize.width, dst->template ptr<T>(prev_dy));
     }
 
 private:
0 commit comments