提交 5bc104ce 编写于 作者: A Andrey Kamaev

Merge branch '2.4'

......@@ -188,7 +188,7 @@
{% if theme_lang == 'py' %}
<li>Try the <a href="cookbook.html">Cookbook</a>.</li>
{% endif %}
<li>Ask a question in the <a href="http://tech.groups.yahoo.com/group/OpenCV/">user group/mailing list</a>.</li>
<li>Ask a question on the <a href="http://answers.opencv.org">Q&amp;A forum</a>.</li>
<li>If you think something is missing or wrong in the documentation,
please file a <a href="http://code.opencv.org">bug report</a>.</li>
</ul>
......
......@@ -1218,28 +1218,21 @@ namespace cv
// Determine the smallest OpenCV depth that can represent every value in
// `data` exactly.
//
// data : array of `len` scalar components (one per channel).
// len  : number of components to inspect.
//
// Returns CV_8U/CV_8S/CV_16U/CV_16S/CV_32S when all values are integers
// fitting the corresponding range, otherwise CV_64F.
//
// NOTE: the original text contained two concatenated implementations of
// this function (a collapsed diff); only the newer one is kept here.
static int actualScalarDepth(const double* data, int len)
{
    int i = 0, minval = INT_MAX, maxval = INT_MIN;
    for(; i < len; ++i)
    {
        int ival = cvRound(data[i]);
        if( ival != data[i] )
            break;  // non-integer component: CV_64F is required
        minval = MIN(minval, ival);
        maxval = MAX(maxval, ival);
    }
    return i < len ? CV_64F :
        minval >= 0 && maxval <= UCHAR_MAX ? CV_8U :
        minval >= SCHAR_MIN && maxval <= SCHAR_MAX ? CV_8S :
        minval >= 0 && maxval <= USHRT_MAX ? CV_16U :
        minval >= SHRT_MIN && maxval <= SHRT_MAX ? CV_16S :
        CV_32S;
}
static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
......@@ -1264,7 +1257,9 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
bool haveScalar = false, swapped12 = false;
int depth2 = src2.depth();
if( src1.size != src2.size || src1.channels() != src2.channels() )
if( src1.size != src2.size || src1.channels() != src2.channels() ||
((kind1 == _InputArray::MATX || kind2 == _InputArray::MATX) &&
src1.cols == 1 && src2.rows == 4) )
{
if( checkScalar(src1, src2.type(), kind1, kind2) )
{
......@@ -1279,10 +1274,14 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
haveScalar = true;
CV_Assert(src2.type() == CV_64F && (src2.rows == 4 || src2.rows == 1));
if (usrdata == 0) // hack to filter out multiply and divide
if (!muldiv)
{
depth2 = actualScalarDepth(src2.ptr<double>(), src1.channels());
if( depth2 == CV_64F && (src1.depth() < CV_32S || src1.depth() == CV_32F) )
depth2 = CV_32F;
}
else
depth2 = CV_64F;
depth2 = src1.depth() < CV_32S || src1.depth() == CV_32F ? CV_32F : CV_64F;
}
int cn = src1.channels(), depth1 = src1.depth(), wtype;
......
......@@ -42,6 +42,17 @@
#include "precomp.hpp"
#if defined __linux__ || defined __APPLE__
#include <unistd.h>
#include <stdio.h>
#include <sys/types.h>
#if defined ANDROID
#include <sys/sysconf.h>
#else
#include <sys/sysctl.h>
#endif
#endif
#ifdef _OPENMP
#define HAVE_OPENMP
#endif
......@@ -85,7 +96,6 @@
#include <omp.h>
#elif defined HAVE_GCD
#include <dispatch/dispatch.h>
#include <sys/sysctl.h>
#include <pthread.h>
#elif defined HAVE_CONCURRENCY
#include <ppl.h>
......
......@@ -76,6 +76,7 @@ protected:
bool TestVec();
bool TestMatxMultiplication();
bool TestSubMatAccess();
bool TestExp();
bool TestSVD();
bool operations1();
......@@ -1003,6 +1004,17 @@ bool CV_OperationsTest::operations1()
}
bool CV_OperationsTest::TestExp()
{
Mat1f tt = Mat1f::ones(4,2);
Mat1f outs;
exp(-tt, outs);
Mat1f tt2 = Mat1f::ones(4,1), outs2;
exp(-tt2, outs2);
return true;
}
bool CV_OperationsTest::TestSVD()
{
try
......@@ -1079,6 +1091,9 @@ void CV_OperationsTest::run( int /* start_from */)
if (!TestSubMatAccess())
return;
if (!TestExp())
return;
if (!TestSVD())
return;
......
......@@ -377,10 +377,13 @@ namespace cv { namespace gpu { namespace device
}
template void linearColumnFilter_gpu<float , uchar >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float3, uchar3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float4, uchar4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float , int >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
} // namespace column_filter
}}} // namespace cv { namespace gpu { namespace device
......
......@@ -376,10 +376,13 @@ namespace cv { namespace gpu { namespace device
}
template void linearRowFilter_gpu<uchar , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<uchar3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<uchar4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<int , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
} // namespace row_filter
}}} // namespace cv { namespace gpu { namespace device
......
......@@ -922,7 +922,7 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 || srcType == CV_16SC3 || srcType == CV_32SC1 || srcType == CV_32FC1);
CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC3 || srcType == CV_8UC4 || srcType == CV_16SC3 || srcType == CV_32SC1 || srcType == CV_32FC1 || srcType == CV_32FC3 || srcType == CV_32FC4);
CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(srcType) == CV_MAT_CN(bufType));
......@@ -942,6 +942,9 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
case CV_8UC1:
func = linearRowFilter_gpu<uchar, float>;
break;
case CV_8UC3:
func = linearRowFilter_gpu<uchar3, float3>;
break;
case CV_8UC4:
func = linearRowFilter_gpu<uchar4, float4>;
break;
......@@ -954,6 +957,12 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
case CV_32FC1:
func = linearRowFilter_gpu<float, float>;
break;
case CV_32FC3:
func = linearRowFilter_gpu<float3, float3>;
break;
case CV_32FC4:
func = linearRowFilter_gpu<float4, float4>;
break;
}
return Ptr<BaseRowFilter_GPU>(new GpuLinearRowFilter(ksize, anchor, gpu_row_krnl, func, gpuBorderType));
......@@ -1034,7 +1043,7 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
CV_Assert(dstType == CV_8UC1 || dstType == CV_8UC4 || dstType == CV_16SC3 || dstType == CV_32SC1 || dstType == CV_32FC1);
CV_Assert(dstType == CV_8UC1 || dstType == CV_8UC3 || dstType == CV_8UC4 || dstType == CV_16SC3 || dstType == CV_32SC1 || dstType == CV_32FC1 || dstType == CV_32FC3 || dstType == CV_32FC4);
CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(dstType) == CV_MAT_CN(bufType));
......@@ -1054,6 +1063,9 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
case CV_8UC1:
func = linearColumnFilter_gpu<float, uchar>;
break;
case CV_8UC3:
func = linearColumnFilter_gpu<float3, uchar3>;
break;
case CV_8UC4:
func = linearColumnFilter_gpu<float4, uchar4>;
break;
......@@ -1066,6 +1078,12 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
case CV_32FC1:
func = linearColumnFilter_gpu<float, float>;
break;
case CV_32FC3:
func = linearColumnFilter_gpu<float3, float3>;
break;
case CV_32FC4:
func = linearColumnFilter_gpu<float4, float4>;
break;
}
return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, gpu_col_krnl, func, gpuBorderType));
......
......@@ -152,13 +152,13 @@ TEST_P(Sobel, Accuracy)
cv::Mat dst_gold;
cv::Sobel(src, dst_gold, -1, dx, dy, ksize.width, 1.0, 0.0, borderType);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
}
INSTANTIATE_TEST_CASE_P(GPU_Filter, Sobel, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7))),
testing::Values(Deriv_X(0), Deriv_X(1), Deriv_X(2)),
testing::Values(Deriv_Y(0), Deriv_Y(1), Deriv_Y(2)),
......@@ -208,13 +208,13 @@ TEST_P(Scharr, Accuracy)
cv::Mat dst_gold;
cv::Scharr(src, dst_gold, -1, dx, dy, 1.0, 0.0, borderType);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
}
INSTANTIATE_TEST_CASE_P(GPU_Filter, Scharr, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
testing::Values(Deriv_X(0), Deriv_X(1)),
testing::Values(Deriv_Y(0), Deriv_Y(1)),
testing::Values(BorderType(cv::BORDER_REFLECT101),
......@@ -281,7 +281,7 @@ TEST_P(GaussianBlur, Accuracy)
INSTANTIATE_TEST_CASE_P(GPU_Filter, GaussianBlur, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4)),
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
testing::Values(KSize(cv::Size(3, 3)),
KSize(cv::Size(5, 5)),
KSize(cv::Size(7, 7)),
......
......@@ -221,6 +221,18 @@ static const void* initInterTab2D( int method, bool fixpt )
}
// Eagerly build both the floating-point and fixed-point 2D interpolation
// tables for every supported resampling method. Short-circuits (stops
// initializing) as soon as one table fails to build.
static bool initAllInterTab2D()
{
    static const int methods[] = { INTER_LINEAR, INTER_CUBIC, INTER_LANCZOS4 };
    bool ok = true;
    for( int m = 0; m < 3 && ok; m++ )
    {
        ok = ok && initInterTab2D( methods[m], false ) != 0;
        ok = ok && initInterTab2D( methods[m], true ) != 0;
    }
    return ok;
}
static volatile bool doInitAllInterTab2D = initAllInterTab2D();
template<typename ST, typename DT> struct Cast
{
typedef ST type1;
......@@ -1390,72 +1402,85 @@ struct DecimateAlpha
float alpha;
};
template <typename T, typename WT>
class resizeArea_Invoker :
template<typename T, typename WT> class ResizeArea_Invoker :
public ParallelLoopBody
{
public:
resizeArea_Invoker(const Mat& _src, Mat& _dst, const DecimateAlpha* _xofs,
int _xofs_count, double _scale_y_, const int* _cur_dy_ofs,
const std::vector<std::pair<int, int> >& _bands) :
ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs),
xofs_count(_xofs_count), scale_y_(_scale_y_),
cur_dy_ofs(_cur_dy_ofs), bands(_bands)
ResizeArea_Invoker( const Mat& _src, Mat& _dst,
const DecimateAlpha* _xtab, int _xtab_size,
const DecimateAlpha* _ytab, int _ytab_size,
const int* _tabofs )
{
src = &_src;
dst = &_dst;
xtab0 = _xtab;
xtab_size0 = _xtab_size;
ytab = _ytab;
ytab_size = _ytab_size;
tabofs = _tabofs;
}
void resize_single_band(const Range& range) const
virtual void operator() (const Range& range) const
{
Size ssize = src.size(), dsize = dst.size();
int cn = src.channels();
Size dsize = dst->size();
int cn = dst->channels();
dsize.width *= cn;
AutoBuffer<WT> _buffer(dsize.width*2);
const DecimateAlpha* xtab = xtab0;
int xtab_size = xtab_size0;
WT *buf = _buffer, *sum = buf + dsize.width;
int k = 0, sy = 0, dx = 0, cur_dy = 0;
WT scale_y = (WT)scale_y_;
int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di;
CV_Assert( cn <= 4 );
for( dx = 0; dx < dsize.width; dx++ )
buf[dx] = sum[dx] = 0;
sum[dx] = (WT)0;
cur_dy = cur_dy_ofs[range.start];
for (sy = range.start; sy < range.end; sy++)
for( j = j_start; j < j_end; j++ )
{
const T* S = (const T*)(src.data + src.step*sy);
WT beta = ytab[j].alpha;
int dy = ytab[j].di;
int sy = ytab[j].si;
{
const T* S = (const T*)(src->data + src->step*sy);
for( dx = 0; dx < dsize.width; dx++ )
buf[dx] = (WT)0;
if( cn == 1 )
for( k = 0; k < xofs_count; k++ )
for( k = 0; k < xtab_size; k++ )
{
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
buf[dxn] += S[xofs[k].si]*alpha;
int dxn = xtab[k].di;
WT alpha = xtab[k].alpha;
buf[dxn] += S[xtab[k].si]*alpha;
}
else if( cn == 2 )
for( k = 0; k < xofs_count; k++ )
for( k = 0; k < xtab_size; k++ )
{
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
int sxn = xtab[k].si;
int dxn = xtab[k].di;
WT alpha = xtab[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1;
}
else if( cn == 3 )
for( k = 0; k < xofs_count; k++ )
for( k = 0; k < xtab_size; k++ )
{
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
int sxn = xtab[k].si;
int dxn = xtab[k].di;
WT alpha = xtab[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
}
else
for( k = 0; k < xofs_count; k++ )
else if( cn == 4 )
{
int sxn = xofs[k].si;
int dxn = xofs[k].di;
WT alpha = xofs[k].alpha;
for( k = 0; k < xtab_size; k++ )
{
int sxn = xtab[k].si;
int dxn = xtab[k].di;
WT alpha = xtab[k].alpha;
WT t0 = buf[dxn] + S[sxn]*alpha;
WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
buf[dxn] = t0; buf[dxn+1] = t1;
......@@ -1463,99 +1488,64 @@ public:
t1 = buf[dxn+3] + S[sxn+3]*alpha;
buf[dxn+2] = t0; buf[dxn+3] = t1;
}
if( (cur_dy + 1)*scale_y <= sy + 1 || sy == ssize.height - 1 )
{
WT beta = std::max(sy + 1 - (cur_dy+1)*scale_y, (WT)0);
WT beta1 = 1 - beta;
T* D = (T*)(dst.data + dst.step*cur_dy);
if( fabs(beta) < 1e-3 )
}
else
{
if(cur_dy >= dsize.height)
return;
for( dx = 0; dx < dsize.width; dx++ )
for( k = 0; k < xtab_size; k++ )
{
D[dx] = saturate_cast<T>((sum[dx] + buf[dx]) / min(scale_y, src.rows - cur_dy * scale_y)); //
sum[dx] = buf[dx] = 0;
int sxn = xtab[k].si;
int dxn = xtab[k].di;
WT alpha = xtab[k].alpha;
for( int c = 0; c < cn; c++ )
buf[dxn + c] += S[sxn + c]*alpha;
}
}
else
}
if( dy != prev_dy )
{
T* D = (T*)(dst->data + dst->step*prev_dy);
for( dx = 0; dx < dsize.width; dx++ )
{
D[dx] = saturate_cast<T>((sum[dx] + buf[dx]* beta1)/ min(scale_y, src.rows - cur_dy*scale_y)); //
sum[dx] = buf[dx]*beta;
buf[dx] = 0;
D[dx] = saturate_cast<T>(sum[dx]);
sum[dx] = beta*buf[dx];
}
cur_dy++;
prev_dy = dy;
}
else
{
for( dx = 0; dx <= dsize.width - 2; dx += 2 )
{
WT t0 = sum[dx] + buf[dx];
WT t1 = sum[dx+1] + buf[dx+1];
sum[dx] = t0; sum[dx+1] = t1;
buf[dx] = buf[dx+1] = 0;
}
for( ; dx < dsize.width; dx++ )
{
sum[dx] += buf[dx];
buf[dx] = 0;
}
}
for( dx = 0; dx < dsize.width; dx++ )
sum[dx] += beta*buf[dx];
}
}
virtual void operator() (const Range& range) const
{
for (int i = range.start; i < range.end; ++i)
{
Range band_range(bands[i].first, bands[i].second);
resize_single_band(band_range);
T* D = (T*)(dst->data + dst->step*prev_dy);
for( dx = 0; dx < dsize.width; dx++ )
D[dx] = saturate_cast<T>(sum[dx]);
}
}
private:
Mat src;
Mat dst;
const DecimateAlpha* xofs;
int xofs_count;
double scale_y_;
const int *cur_dy_ofs;
std::vector<std::pair<int, int> > bands;
const Mat* src;
Mat* dst;
const DecimateAlpha* xtab0;
const DecimateAlpha* ytab;
int xtab_size0, ytab_size;
const int* tabofs;
};
template <typename T, typename WT>
static void resizeArea_( const Mat& src, Mat& dst, const DecimateAlpha* xofs, int xofs_count, double scale_y_)
{
Size ssize = src.size(), dsize = dst.size();
AutoBuffer<int> _yofs(ssize.height);
int *cur_dy_ofs = _yofs;
int cur_dy = 0, index = 0;
std::vector<std::pair<int, int> > bands;
for (int sy = 0; sy < ssize.height; sy++)
{
cur_dy_ofs[sy] = cur_dy;
if ((cur_dy + 1) * scale_y_ <= sy + 1 || sy == ssize.height - 1 )
{
WT beta = (WT)std::max(sy + 1 - (cur_dy + 1) * scale_y_, 0.);
if (fabs(beta) < 1e-3 )
{
if (cur_dy >= dsize.height)
break;
bands.push_back(std::make_pair(index, sy + 1));
index = sy + 1;
}
cur_dy++;
}
}
Range range(0, (int)bands.size());
resizeArea_Invoker<T, WT> invoker(src, dst, xofs, xofs_count, scale_y_, cur_dy_ofs, bands);
//parallel_for_(range, invoker);
invoker(Range(range.start, range.end));
template <typename T, typename WT>
static void resizeArea_( const Mat& src, Mat& dst,
const DecimateAlpha* xtab, int xtab_size,
const DecimateAlpha* ytab, int ytab_size,
const int* tabofs )
{
parallel_for_(Range(0, dst.rows),
ResizeArea_Invoker<T, WT>(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs),
dst.total()/((double)(1 << 16)));
}
......@@ -1569,8 +1559,52 @@ typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst,
int scale_x, int scale_y );
typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
const DecimateAlpha* xofs, int xofs_count,
double scale_y_);
const DecimateAlpha* xtab, int xtab_size,
const DecimateAlpha* ytab, int ytab_size,
const int* yofs);
static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab )
{
int k = 0;
for(int dx = 0; dx < dsize; dx++ )
{
double fsx1 = dx * scale;
double fsx2 = fsx1 + scale;
double cellWidth = min(scale, ssize - fsx1);
int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
sx2 = std::min(sx2, ssize - 1);
sx1 = std::min(sx1, sx2);
if( sx1 - fsx1 > 1e-3 )
{
assert( k < ssize*2 );
tab[k].di = dx * cn;
tab[k].si = (sx1 - 1) * cn;
tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth);
}
for(int sx = sx1; sx < sx2; sx++ )
{
assert( k < ssize*2 );
tab[k].di = dx * cn;
tab[k].si = sx * cn;
tab[k++].alpha = float(1.0 / cellWidth);
}
if( fsx2 - sx2 > 1e-3 )
{
assert( k < ssize*2 );
tab[k].di = dx * cn;
tab[k].si = sx2 * cn;
tab[k++].alpha = (float)(min(min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
}
}
return k;
}
}
......@@ -1766,43 +1800,25 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
ResizeAreaFunc func = area_tab[depth];
CV_Assert( func != 0 && cn <= 4 );
AutoBuffer<DecimateAlpha> _xofs(ssize.width*2);
DecimateAlpha* xofs = _xofs;
for( dx = 0, k = 0; dx < dsize.width; dx++ )
{
double fsx1 = dx*scale_x;
double fsx2 = fsx1 + scale_x;
int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
sx1 = std::min(sx1, ssize.width-1);
sx2 = std::min(sx2, ssize.width-1);
AutoBuffer<DecimateAlpha> _xytab((ssize.width + ssize.height)*2);
DecimateAlpha* xtab = _xytab, *ytab = xtab + ssize.width*2;
if( sx1 > fsx1 )
{
assert( k < ssize.width*2 );
xofs[k].di = dx*cn;
xofs[k].si = (sx1-1)*cn;
xofs[k++].alpha = (float)((sx1 - fsx1) / min(scale_x, src.cols - fsx1));
}
int xtab_size = computeResizeAreaTab(ssize.width, dsize.width, cn, scale_x, xtab);
int ytab_size = computeResizeAreaTab(ssize.height, dsize.height, 1, scale_y, ytab);
for( sx = sx1; sx < sx2; sx++ )
AutoBuffer<int> _tabofs(dsize.height + 1);
int* tabofs = _tabofs;
for( k = 0, dy = 0; k < ytab_size; k++ )
{
assert( k < ssize.width*2 );
xofs[k].di = dx*cn;
xofs[k].si = sx*cn;
xofs[k++].alpha = float(1.0 / min(scale_x, src.cols - fsx1));
}
if( fsx2 - sx2 > 1e-3 )
if( k == 0 || ytab[k].di != ytab[k-1].di )
{
assert( k < ssize.width*2 );
xofs[k].di = dx*cn;
xofs[k].si = sx2*cn;
xofs[k++].alpha = (float)(min(fsx2 - sx2, 1.) / min(scale_x, src.cols - fsx1));
assert( ytab[k].di == dy );
tabofs[dy++] = k;
}
}
tabofs[dy] = ytab_size;
func( src, dst, xofs, k, scale_y);
func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs );
return;
}
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册