opencv
diff --git a/modules/imgproc/src/resize.cpp b/modules/imgproc/src/resize.cpp
Lines changed: 142 additions & 19 deletions
@@ -3023,31 +3023,148 @@ struct DecimateAlpha
     float alpha;
 };
 
+namespace
+{
+// uchar source: widen with vx_load_expand_q, reinterpret the lanes as s32, convert to f32.
+static inline void vx_load_as(const uchar* ptr, v_float32& a)
+{
+    const auto widened = v_reinterpret_as_s32(vx_load_expand_q(ptr));
+    a = v_cvt_f32(widened);
+}
+
+// schar source: widen with vx_load_expand_q, then convert to f32.
+static inline void vx_load_as(const schar* ptr, v_float32& a)
+{
+    const auto widened = vx_load_expand_q(ptr);
+    a = v_cvt_f32(widened);
+}
+
+// ushort source: widen with vx_load_expand, reinterpret the lanes as s32, convert to f32.
+static inline void vx_load_as(const ushort* ptr, v_float32& a)
+{
+    const auto widened = v_reinterpret_as_s32(vx_load_expand(ptr));
+    a = v_cvt_f32(widened);
+}
+
+// short source: widen with vx_load_expand, pass through v_reinterpret_as_s32, convert to f32.
+static inline void vx_load_as(const short* ptr, v_float32& a)
+{
+    const auto widened = v_reinterpret_as_s32(vx_load_expand(ptr));
+    a = v_cvt_f32(widened);
+}
+
+// int source: load the lanes as they are and convert them to f32.
+static inline void vx_load_as(const int* ptr, v_float32& a)
+{
+    const auto lanes = vx_load(ptr);
+    a = v_cvt_f32(lanes);
+}
+
+// float source: a plain vector load, no conversion involved.
+static inline void vx_load_as(const float* ptr, v_float32& a)
+{
+    a = vx_load(ptr);
+}
+
+// float16_t source: vx_load_expand yields the v_float32 directly.
+static inline void vx_load_as(const float16_t* ptr, v_float32& a)
+{
+    a = vx_load_expand(ptr);
+}
+
+// Stores the lanes of `a` to ptr as uchar, one saturate_cast per lane.
+// The lanes are first spilled to a float scratch array with v_store.
+// The pack-based variant that used to sit under "#if 0" was never compiled
+// and has been dropped.
+static inline void v_store_as(uchar* ptr, const v_float32& a)
+{
+    constexpr int VECSZ = v_float32::nlanes;
+    float lanes[VECSZ];
+    v_store(lanes, a);
+    for (int i = 0; i < VECSZ; ++i)
+        ptr[i] = saturate_cast<uchar>(lanes[i]);
+}
+
+// Stores the lanes of `a` to ptr as schar, one saturate_cast per lane.
+// The lanes are first spilled to a float scratch array with v_store.
+// The pack-based variant that used to sit under "#if 0" was never compiled
+// and has been dropped.
+static inline void v_store_as(schar* ptr, const v_float32& a)
+{
+    constexpr int VECSZ = v_float32::nlanes;
+    float lanes[VECSZ];
+    v_store(lanes, a);
+    for (int i = 0; i < VECSZ; ++i)
+        ptr[i] = saturate_cast<schar>(lanes[i]);
+}
+
+// ushort destination: round to integer lanes, then v_pack_u_store.
+static inline void v_store_as(ushort* ptr, const v_float32& a)
+{
+    const auto rounded = v_round(a);
+    v_pack_u_store(ptr, rounded);
+}
+
+// short destination: round to integer lanes, then v_pack_store.
+static inline void v_store_as(short* ptr, const v_float32& a)
+{
+    const auto rounded = v_round(a);
+    v_pack_store(ptr, rounded);
+}
+
+// int destination: round to integer lanes and store them unchanged.
+static inline void v_store_as(int* ptr, const v_float32& a)
+{
+    const auto rounded = v_round(a);
+    v_store(ptr, rounded);
+}
+
+// float destination: a plain vector store, no conversion involved.
+static inline void v_store_as(float* ptr, const v_float32& a)
+{
+    v_store(ptr, a);
+}
+
+// float16_t destination: v_pack_store takes the f32 lanes directly.
+static inline void v_store_as(float16_t* ptr, const v_float32& a)
+{
+    v_pack_store(ptr, a);
+}
+
+// vx_setall_local<VT>(coeff): builds a VT from a scalar coefficient; the v_float32 case forwards coeff to v_setall_f32.
 template <typename VT>
 VT vx_setall_local(double coeff);
 template <>
 v_float32 vx_setall_local(double coeff) {
     return v_setall_f32(coeff);
 }
 template <typename WT, typename VT>
-void v_inter_area_set_sum(int step, int col_end, const WT *const buf, const VT &v_coeff,
+void v_inter_area_set_sum(int col_end, const WT *const buf, const VT &v_coeff,  // vector part of sum = buf * v_coeff; on return x is the first index not handled here
                           WT *sum, int &x) {
+    constexpr int step = VT::nlanes;  // lanes per vector; replaces the removed runtime "step" parameter
     for (x = 0; x + step < col_end; x += step)
     {
         const VT line = vx_load(buf + x);
         v_store(sum + x, line * v_coeff);
     }
 }
 template <typename WT, typename VT>
-void v_inter_area_update_sum(int step, int col_end, const WT *const buf, const VT &v_coeff,
+void v_inter_area_update_sum(int col_end, const WT *const buf, const VT &v_coeff,  // vector part of sum += buf * v_coeff; on return x is the first index not handled here
                              WT *sum, int &x) {
+    constexpr int step = VT::nlanes;  // lanes per vector; replaces the removed runtime "step" parameter
     for (x = 0; x + step < col_end; x += step)
     {
         const VT line = vx_load(buf + x);
-        VT sum_x = vx_load(sum + x);
+        const VT sum_x = vx_load(sum + x);  // current accumulator lanes, read-only from here on
         v_store(sum + x, sum_x + line * v_coeff);
     }
 }
+// Converts the n elements of s to float into d and points *buf at d.
+// Complete vectors go through vx_load_as + v_store; the remaining elements
+// are converted one at a time with saturate_cast<float>.
+// float input is rejected at compile time (see the static_assert).
+template <typename S>
+void v_inter_area_copy_or_not(const S* s, int n, float *d, float const**buf)
+{
+    static_assert(!std::is_same<S, float>::value, "Do not specialize for float");
+    constexpr int step = v_float32::nlanes;
+    int x = 0;
+    // "<=" (not "<") so the last complete vector is also handled with SIMD.
+    // Indices stay below n, assuming each vx_load_as overload reads exactly
+    // v_float32::nlanes elements - TODO confirm for every source type.
+    for(; x + step <= n; x += step)
+    {
+        v_float32 a;
+        vx_load_as(s + x, a);
+        v_store(d + x, a);
+    }
+    for(; x < n; ++x) d[x] = saturate_cast<float>(s[x]);
+    *buf = d;
+}
+// double input needs no conversion: nothing is copied and *buf is set to s itself.
+void v_inter_area_copy_or_not(const double* s, int n, double *d, const double **buf)
+{
+    // n and d are accepted only so the parameter list matches the converting overload.
+    static_cast<void>(n);
+    static_cast<void>(d);
+    *buf = s;
+}
+// float input needs no conversion: nothing is copied and *buf is set to s itself.
+void v_inter_area_copy_or_not(const float* s, int n, float *d, const float **buf)
+{
+    // n and d are accepted only so the parameter list matches the converting overload.
+    static_cast<void>(n);
+    static_cast<void>(d);
+    *buf = s;
+}
+// Converts the n floats of s to D and writes them to d.
+// Complete vectors go through vx_load + v_store_as; the remaining elements
+// are converted one at a time with saturate_cast<D>.
+// float output is rejected at compile time (see the static_assert).
+template <typename D>
+void v_inter_area_copy(const float* s, int n, D *d)
+{
+    static_assert(!std::is_same<float, D>::value, "Do not specialize for float");
+    constexpr int step = v_float32::nlanes;
+    int x = 0;
+    // "<=" (not "<") so the last complete vector is also handled with SIMD.
+    // Indices stay below n, assuming each v_store_as overload writes exactly
+    // v_float32::nlanes elements - TODO confirm for every destination type.
+    for(; x + step <= n; x += step)
+    {
+        const v_float32 a = vx_load(s + x);
+        v_store_as(d + x, a);
+    }
+    for(; x < n; ++x) d[x] = saturate_cast<D>(s[x]);
+}
+// double -> double: element-wise copy of n values from s to d, no conversion.
+void v_inter_area_copy(const double* s, int n, double *d)
+{
+    std::copy_n(s, n, d);
+}
+// float -> float: element-wise copy of n values from s to d, no conversion.
+void v_inter_area_copy(const float* s, int n, float *d)
+{
+    std::copy_n(s, n, d);
+}
+
 #if CV_SIMD128_64F
 template <>
 v_float64 vx_setall_local(double coeff) {
@@ -3060,26 +3177,25 @@ v_uint8 vx_setall_local(double coeff) {
     return v_setall_u8(0);
 }
 template <>
-void v_inter_area_set_sum(int step, int col_end, const double *const buf, const v_uint8 &v_coeff,
+void v_inter_area_set_sum(int col_end, const double *const buf, const v_uint8 &v_coeff,  // double/v_uint8 variant: no vector work, only sets x to 0
                           double *sum, int &x) {
-    (void)step;
     (void)col_end;
     (void)buf;
     (void)v_coeff;
     (void)sum;
     x = 0;
 }
 template <>
-void v_inter_area_update_sum(int step, int col_end, const double *const buf, const v_uint8 &v_coeff,
+void v_inter_area_update_sum(int col_end, const double *const buf, const v_uint8 &v_coeff,  // double/v_uint8 variant: no vector work, only sets x to 0
                              double *sum, int &x) {
-    (void)step;
     (void)col_end;
     (void)buf;
     (void)v_coeff;
     (void)sum;
     x = 0;
 }
 #endif
+}
 
 template <typename T, typename WT, typename VT>
 class ResizeArea_Invoker : public ParallelLoopBody
@@ -3104,10 +3220,11 @@ class ResizeArea_Invoker : public ParallelLoopBody
         Size dsize = dst->size();
         int cn = dst->channels();
         dsize.width *= cn;
-        AutoBuffer<WT> _buffer(src->cols * cn + std::max(src->cols * cn, dst->rows * cn));
+        const int max_buffer_size = std::max(src->cols * cn, dst->rows * cn);
+        AutoBuffer<WT> _buffer(2 * max_buffer_size);
         const DecimateAlpha* xtab = xtab0;
         int xtab_size = xtab_size0;
-        WT *buf = _buffer.data(), *sum = buf + src->cols * cn;
+        WT *buf = _buffer.data(), *sum = buf + max_buffer_size;
         int j_start = tabofs[range.start], j_end = tabofs[range.end], j, dx;
 
         static_assert(
@@ -3119,7 +3236,6 @@ class ResizeArea_Invoker : public ParallelLoopBody
                       && std::is_same<typename VTraits<VT>::lane_type, uint8_t>::value)
 #endif
                       , "Lane type mismatch");
-        int step = VT().nlanes;
         cv::Mat tmp(ytab[j_end - 1].di - ytab[j_start].di + 1, src->cols,
                     CV_MAKETYPE(cv::DataType<WT>::type, cn));
         int prev_di = -1;
@@ -3141,12 +3257,13 @@ class ResizeArea_Invoker : public ParallelLoopBody
             {
                 row_start = 0;
                 row_end = xtab_size;
-                col_end = dst->rows * cn;
+                col_end = range.size() * cn;
                 start_di = xtab[0].di;
             }
             prev_di = start_di;
             for (dx = 0; dx < col_end; dx++) sum[dx] = (WT)0;
             int di;
+            const WT* buf_local;
             for (j = row_start; j < row_end; ++j)
             {
                 WT coeff;
@@ -3156,16 +3273,16 @@ class ResizeArea_Invoker : public ParallelLoopBody
                     coeff = ytab[j].alpha;
                     di = ytab[j].di;
                     si = ytab[j].si;
-                    // Convert the line to the proper float/double type.
                     const T* S = src->template ptr<T>(si);
-                    std::copy(S, S + src->cols * cn, buf);
+                    // Convert the line to the proper float/double type.
+                    v_inter_area_copy_or_not(S, src->cols * cn, buf, &buf_local);
                 }
                 else
                 {
                     coeff = xtab[j].alpha;
                     di = xtab[j].di / cn;
                     si = xtab[j].si / cn;
-                    buf = tmp.template ptr<WT>(si);
+                    buf_local = tmp.template ptr<WT>(si);
                 }
                 const VT v_coeff = vx_setall_local<VT>(coeff);
 
@@ -3174,15 +3291,16 @@ class ResizeArea_Invoker : public ParallelLoopBody
                     int x;
                     WT* D = tmp.template ptr<WT>(prev_di - start_di);
                     for (x = 0; x < col_end; ++x) D[x] = sum[x];
-                    v_inter_area_set_sum(step, col_end, buf, v_coeff, sum, x);
-                    for (; x < col_end; ++x) sum[x] = buf[x] * coeff;
+                    v_inter_area_set_sum(col_end, buf_local, v_coeff, sum, x);
+                    for (; x < col_end; ++x) sum[x] = buf_local[x] * coeff;
                     prev_di = di;
                 }
                 else
                 {
                     int x;
-                    v_inter_area_update_sum(step, col_end, buf, v_coeff, sum, x);
-                    for (; x < col_end; ++x) sum[x] += buf[x] * coeff;
+                    assert(col_end <= max_buffer_size);
+                    v_inter_area_update_sum(col_end, buf_local, v_coeff, sum, x);
+                    for (; x < col_end; ++x) sum[x] += buf_local[x] * coeff;
                 }
             }
             // Re-define di to please the compiler.
@@ -3203,11 +3321,16 @@ class ResizeArea_Invoker : public ParallelLoopBody
         }
         // Saturate_cast to dst.
         for(int y = 0; y < tmp.rows; ++y) {
-            T* D = dst->template ptr<T>(ytab[j_start].di + y);
+#if 0
+T* D = dst->template ptr<T>(ytab[j_start].di + y);
             const WT* S = tmp.template ptr<WT>(y);
                 D[x] = saturate_cast<T>(S[x]);
             }
+#else
+            v_inter_area_copy(tmp.template ptr<WT>(y), dst->cols * cn,
+                              dst->template ptr<T>(ytab[j_start].di + y));
+#endif
         }
     }