Merge pull request #13879 from chacha21:REDUCE_SUM2

add REDUCE_SUM2 (#13879): proposal to add REDUCE_SUM2 to cv::reduce, an operation that sums the squares of the elements

Merge pull request #13879 from chacha21:REDUCE_SUM2
add REDUCE_SUM2 (#13879): proposal to add REDUCE_SUM2 to cv::reduce, an operation that sums the squares of the elements
6dd8a9b6 · Pierre Chatelier · GitHub · 23b819ef · 6dd8a9b6 · 6dd8a9b6
9 changed files
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -230,7 +230,8 @@ enum KmeansFlags {
 enum ReduceTypes { REDUCE_SUM = 0, //!< the output is the sum of all rows/columns of the matrix.
                    REDUCE_AVG = 1, //!< the output is the mean vector of all rows/columns of the matrix.
                    REDUCE_MAX = 2, //!< the output is the maximum (column/row-wise) of all rows/columns of the matrix.
-                   REDUCE_MIN = 3  //!< the output is the minimum (column/row-wise) of all rows/columns of the matrix.
+                   REDUCE_MIN = 3,  //!< the output is the minimum (column/row-wise) of all rows/columns of the matrix.
+                   REDUCE_SUM2 = 4  //!< the output is the sum of the squared elements of all rows/columns of the matrix.
                  };

 //! @} core_array
@@ -903,7 +904,7 @@ The function #reduce reduces the matrix to a vector by treating the matrix rows/
 1D vectors and performing the specified operation on the vectors until a single row/column is
 obtained. For example, the function can be used to compute horizontal and vertical projections of a
 raster image. In case of #REDUCE_MAX and #REDUCE_MIN , the output image should have the same type as the source one.
-In case of #REDUCE_SUM and #REDUCE_AVG , the output may have a larger element bit-depth to preserve accuracy.
+In case of #REDUCE_SUM, #REDUCE_SUM2 and #REDUCE_AVG , the output may have a larger element bit-depth to preserve accuracy.
 And multi-channel arrays are also supported in these two reduction modes.

 The following code demonstrates its usage for a single channel matrix.

--- a/modules/core/perf/opencl/perf_arithm.cpp
+++ b/modules/core/perf/opencl/perf_arithm.cpp
@@ -1150,7 +1150,7 @@ OCL_PERF_TEST_P(ReduceMinMaxFixture, Reduce,
    SANITY_CHECK(dst, eps);
 }

-CV_ENUM(ReduceAccOp, CV_REDUCE_SUM, CV_REDUCE_AVG)
+CV_ENUM(ReduceAccOp, REDUCE_SUM, REDUCE_AVG, REDUCE_SUM2)

 typedef tuple<Size, std::pair<MatType, MatType>, int, ReduceAccOp> ReduceAccParams;
 typedef TestBaseWithParam<ReduceAccParams> ReduceAccFixture;
@@ -1168,7 +1168,6 @@ OCL_PERF_TEST_P(ReduceAccFixture, Reduce,
            dim = get<2>(params), op = get<3>(params);
    const Size srcSize = get<0>(params),
            dstSize(dim == 0 ? srcSize.width : 1, dim == 0 ? 1 : srcSize.height);
-    const double eps = CV_MAT_DEPTH(dtype) <= CV_32S ? 1 : 3e-4;

    checkDeviceMaxMemoryAllocSize(srcSize, stype);
    checkDeviceMaxMemoryAllocSize(srcSize, dtype);
@@ -1178,7 +1177,7 @@ OCL_PERF_TEST_P(ReduceAccFixture, Reduce,

    OCL_TEST_CYCLE() cv::reduce(src, dst, dim, op, dtype);

-    SANITY_CHECK(dst, eps);
+    SANITY_CHECK_NOTHING();
 }

 } } // namespace opencv_test::ocl

--- a/modules/core/perf/perf_reduce.cpp
+++ b/modules/core/perf/perf_reduce.cpp
@@ -5,7 +5,7 @@ namespace opencv_test
 {
 using namespace perf;

-CV_ENUM(ROp, CV_REDUCE_SUM, CV_REDUCE_AVG, CV_REDUCE_MAX, CV_REDUCE_MIN)
+CV_ENUM(ROp, REDUCE_SUM, REDUCE_AVG, REDUCE_MAX, REDUCE_MIN, REDUCE_SUM2)
 typedef tuple<Size, MatType, ROp> Size_MatType_ROp_t;
 typedef perf::TestBaseWithParam<Size_MatType_ROp_t> Size_MatType_ROp;

@@ -23,7 +23,7 @@ PERF_TEST_P(Size_MatType_ROp, reduceR,
    int reduceOp = get<2>(GetParam());

    int ddepth = -1;
-    if( CV_MAT_DEPTH(matType) < CV_32S && (reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG) )
+    if( CV_MAT_DEPTH(matType) < CV_32S && (reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_SUM2) )
        ddepth = CV_32S;

    Mat src(sz, matType);
@@ -35,7 +35,7 @@ PERF_TEST_P(Size_MatType_ROp, reduceR,
    int runs = 15;
    TEST_CYCLE_MULTIRUN(runs) reduce(src, vec, 0, reduceOp, ddepth);

-    SANITY_CHECK(vec, 1);
+    SANITY_CHECK_NOTHING();
 }

 PERF_TEST_P(Size_MatType_ROp, reduceC,
@@ -51,7 +51,7 @@ PERF_TEST_P(Size_MatType_ROp, reduceC,
    int reduceOp = get<2>(GetParam());

    int ddepth = -1;
-    if( CV_MAT_DEPTH(matType)< CV_32S && (reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG) )
+    if( CV_MAT_DEPTH(matType)< CV_32S && (reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_SUM2) )
        ddepth = CV_32S;

    Mat src(sz, matType);
@@ -62,7 +62,7 @@ PERF_TEST_P(Size_MatType_ROp, reduceC,

    TEST_CYCLE() reduce(src, vec, 1, reduceOp, ddepth);

-    SANITY_CHECK(vec, 1);
+    SANITY_CHECK_NOTHING();
 }

 typedef tuple<Size, MatType, int> Size_MatType_RMode_t;

--- a/modules/core/src/matrix_operations.cpp
+++ b/modules/core/src/matrix_operations.cpp
@@ -341,29 +341,32 @@ cv::Mat cv::Mat::cross(InputArray _m) const
 namespace cv
 {

-template<typename T, typename ST, class Op> static void
-reduceR_( const Mat& srcmat, Mat& dstmat )
+template<typename T, typename ST, typename WT, class Op, class OpInit>
+class ReduceR_Invoker : public ParallelLoopBody
 {
-    typedef typename Op::rtype WT;
-    Size size = srcmat.size();
-    size.width *= srcmat.channels();
-    AutoBuffer<WT> buffer(size.width);
+public:
+  ReduceR_Invoker(const Mat& aSrcmat, Mat& aDstmat, Op& aOp, OpInit& aOpInit)
+                 :srcmat(aSrcmat),dstmat(aDstmat),op(aOp),opInit(aOpInit),buffer(srcmat.size().width*srcmat.channels())
+  {
+  }
+  void operator()(const Range& range) const CV_OVERRIDE
+  {
+    const T* src = srcmat.ptr<T>();
+    const size_t srcstep = srcmat.step/sizeof(src[0]);
    WT* buf = buffer.data();
    ST* dst = dstmat.ptr<ST>();
-    const T* src = srcmat.ptr<T>();
-    size_t srcstep = srcmat.step/sizeof(src[0]);
-    int i;
-    Op op;
+    int i = 0;

-    for( i = 0; i < size.width; i++ )
-        buf[i] = src[i];
+    for( i = range.start ; i < range.end; i++ )
+        buf[i] = opInit(src[i]);

-    for( ; --size.height; )
+    int height = srcmat.size().height;
+    for( ; --height; )
    {
        src += srcstep;
-        i = 0;
+        i = range.start;
        #if CV_ENABLE_UNROLLED
-        for(; i <= size.width - 4; i += 4 )
+        for(; i <= range.end - 4; i += 4 )
        {
            WT s0, s1;
            s0 = op(buf[i], (WT)src[i]);
@@ -375,63 +378,94 @@ reduceR_( const Mat& srcmat, Mat& dstmat )
            buf[i+2] = s0; buf[i+3] = s1;
        }
        #endif
-        for( ; i < size.width; i++ )
+        for( ; i < range.end; i++ )
            buf[i] = op(buf[i], (WT)src[i]);
    }

-    for( i = 0; i < size.width; i++ )
+    for( i = range.start ; i < range.end; i++ )
        dst[i] = (ST)buf[i];
-}
-
+  }
+private:
+  const Mat& srcmat;
+  Mat& dstmat;
+  Op& op;
+  OpInit& opInit;
+  mutable AutoBuffer<WT> buffer;
+};

-template<typename T, typename ST, class Op> static void
-reduceC_( const Mat& srcmat, Mat& dstmat )
+template<typename T, typename ST, class Op, class OpInit = OpNop<ST> > static void // T: source element type, ST: destination element type, Op: binary accumulator, OpInit: unary transform applied to the first row
+reduceR_( const Mat& srcmat, Mat& dstmat) // collapses all rows of srcmat into a single row of dstmat; the work is split over column ranges by parallel_for_
 {
     typedef typename Op::rtype WT;
-    Size size = srcmat.size();
-    int cn = srcmat.channels();
-    size.width *= cn;
     Op op;
+    OpInit opInit;
+
+    ReduceR_Invoker<T, ST, WT, Op, OpInit> body(srcmat, dstmat, op, opInit); // body receives op and opInit by reference; both are locals of this function
+    //group columns by 64 bytes for data locality. NOTE(review): the last argument (presumably the nstripes hint) uses width without channels while the range is width*channels, and CV_ELEM_SIZE is given a depth rather than a full type - confirm this is intended
+    parallel_for_(Range(0, srcmat.size().width*srcmat.channels()), body, srcmat.size().width*CV_ELEM_SIZE(srcmat.depth())/64);
+}

-    for( int y = 0; y < size.height; y++ )
+template<typename T, typename ST, typename WT, class Op, class OpInit>
+class ReduceC_Invoker : public ParallelLoopBody // parallel body that reduces each row of srcmat to one value per channel in the same row of dstmat
+{
+public:
+  ReduceC_Invoker(const Mat& aSrcmat, Mat& aDstmat, Op& aOp, OpInit& aOpInit) // only references are stored, so all four arguments must outlive this object
+                 :srcmat(aSrcmat),dstmat(aDstmat),op(aOp),opInit(aOpInit)
+  {
+  }
+  void operator()(const Range& range) const CV_OVERRIDE // range is a half-open interval [start, end) of row indices
+  {
+    const int cn = srcmat.channels();
+    const int width = srcmat.size().width*cn; // scalar elements per row (columns * channels)
+    AutoBuffer<WT> cumul(cn); // one running accumulator per channel, reused for every row of this range
+    for( int y = range.start; y < range.end; y++ )
     {
         const T* src = srcmat.ptr<T>(y);
         ST* dst = dstmat.ptr<ST>(y);
-        if( size.width == cn )
-            for( int k = 0; k < cn; k++ )
-                dst[k] = src[k];
+        if( width == cn ) // single-column source: nothing to accumulate, only opInit is applied
+        {
+          for( int k = 0; k < cn; k++ )
+              dst[k] = (ST)opInit(src[k]);
+        }
         else
         {
-            for( int k = 0; k < cn; k++ )
+            for(int k = 0; k < cn ; ++k ) // seed every channel accumulator from the first pixel of the row
+              cumul[k] = opInit(src[k]);
+            for(int k = cn ; k < width ; k += cn ) // fold in the remaining pixels, one pixel (cn scalars) per step
             {
-                WT a0 = src[k], a1 = src[k+cn];
-                int i;
-                for( i = 2*cn; i <= size.width - 4*cn; i += 4*cn )
-                {
-                    a0 = op(a0, (WT)src[i+k]);
-                    a1 = op(a1, (WT)src[i+k+cn]);
-                    a0 = op(a0, (WT)src[i+k+cn*2]);
-                    a1 = op(a1, (WT)src[i+k+cn*3]);
-                }
-
-                for( ; i < size.width; i += cn )
-                {
-                    a0 = op(a0, (WT)src[i+k]);
-                }
-                a0 = op(a0, a1);
-                dst[k] = (ST)a0;
+                for (int c = 0 ; c < cn ; ++c)
+                  cumul[c] = op(cumul[c], src[k+c]);
             }
+            for(int k = 0 ; k < cn ; ++k ) // store the per-channel results for this row, cast to the destination type
+              dst[k] = (ST)cumul[k];
         }
     }
+  }
+private:
+  const Mat& srcmat;
+  Mat& dstmat;
+  Op& op;
+  OpInit& opInit;
+};
+
+template<typename T, typename ST, class Op, class OpInit = OpNop<ST> > static void // T: source element type, ST: destination element type, Op: binary accumulator, OpInit: unary transform applied to the first pixel of each row
+reduceC_( const Mat& srcmat, Mat& dstmat) // reduces every row of srcmat to a single value per channel in the same row of dstmat
+{
+    typedef typename Op::rtype WT; // accumulator type is taken from the operator's result type
+    Op op;
+    OpInit opInit;
+
+    ReduceC_Invoker<T, ST, WT, Op, OpInit> body(srcmat, dstmat, op, opInit); // body receives op and opInit by reference; both are locals of this function
+    parallel_for_(Range(0, srcmat.size().height), body); // the parallel range is the row index; each row writes only its own dst row
 }

 typedef void (*ReduceFunc)( const Mat& src, Mat& dst );

 }

-#define reduceSumR8u32s  reduceR_<uchar, int,   OpAdd<int> >
-#define reduceSumR8u32f  reduceR_<uchar, float, OpAdd<int> >
-#define reduceSumR8u64f  reduceR_<uchar, double,OpAdd<int> >
+#define reduceSumR8u32s  reduceR_<uchar, int,   OpAdd<int>, OpNop<int> >
+#define reduceSumR8u32f  reduceR_<uchar, float, OpAdd<int>, OpNop<int> >
+#define reduceSumR8u64f  reduceR_<uchar, double,OpAdd<int>, OpNop<int> >
 #define reduceSumR16u32f reduceR_<ushort,float, OpAdd<float> >
 #define reduceSumR16u64f reduceR_<ushort,double,OpAdd<double> >
 #define reduceSumR16s32f reduceR_<short, float, OpAdd<float> >
@@ -440,6 +474,17 @@ typedef void (*ReduceFunc)( const Mat& src, Mat& dst );
 #define reduceSumR32f64f reduceR_<float, double,OpAdd<double> >
 #define reduceSumR64f64f reduceR_<double,double,OpAdd<double> >

+#define reduceSum2R8u32s  reduceR_<uchar, int,   OpAddSqr<int>,   OpSqr<int> >
+#define reduceSum2R8u32f  reduceR_<uchar, float, OpAddSqr<int>,   OpSqr<int> >
+#define reduceSum2R8u64f  reduceR_<uchar, double,OpAddSqr<int>,   OpSqr<int> >
+#define reduceSum2R16u32f reduceR_<ushort,float, OpAddSqr<float>, OpSqr<float> >
+#define reduceSum2R16u64f reduceR_<ushort,double,OpAddSqr<double>,OpSqr<double> >
+#define reduceSum2R16s32f reduceR_<short, float, OpAddSqr<float>, OpSqr<float> >
+#define reduceSum2R16s64f reduceR_<short, double,OpAddSqr<double>,OpSqr<double> >
+#define reduceSum2R32f32f reduceR_<float, float, OpAddSqr<float>, OpSqr<float> >
+#define reduceSum2R32f64f reduceR_<float, double,OpAddSqr<double>,OpSqr<double> >
+#define reduceSum2R64f64f reduceR_<double,double,OpAddSqr<double>,OpSqr<double> >
+
 #define reduceMaxR8u  reduceR_<uchar, uchar, OpMax<uchar> >
 #define reduceMaxR16u reduceR_<ushort,ushort,OpMax<ushort> >
 #define reduceMaxR16s reduceR_<short, short, OpMax<short> >
@@ -527,23 +572,35 @@ static inline void reduceSumC_8u16u16s32f_64f(const cv::Mat& srcmat, cv::Mat& ds

 #endif

-#define reduceSumC8u32s  reduceC_<uchar, int,   OpAdd<int> >
-#define reduceSumC8u32f  reduceC_<uchar, float, OpAdd<int> >
+#define reduceSumC8u32s  reduceC_<uchar, int,   OpAdd<int>, OpNop<int> >
+#define reduceSumC8u32f  reduceC_<uchar, float, OpAdd<int>, OpNop<int> >
 #define reduceSumC16u32f reduceC_<ushort,float, OpAdd<float> >
 #define reduceSumC16s32f reduceC_<short, float, OpAdd<float> >
 #define reduceSumC32f32f reduceC_<float, float, OpAdd<float> >
 #define reduceSumC64f64f reduceC_<double,double,OpAdd<double> >

+#define reduceSum2C8u32s  reduceC_<uchar, int,   OpAddSqr<int>,   OpSqr<int> >
+#define reduceSum2C8u32f  reduceC_<uchar, float, OpAddSqr<int>,   OpSqr<int> >
+#define reduceSum2C16u32f reduceC_<ushort,float, OpAddSqr<float>, OpSqr<float> >
+#define reduceSum2C16s32f reduceC_<short, float, OpAddSqr<float>, OpSqr<float> >
+#define reduceSum2C32f32f reduceC_<float, float, OpAddSqr<float>, OpSqr<float> >
+#define reduceSum2C64f64f reduceC_<double,double,OpAddSqr<double>,OpSqr<double> >
+
 #ifdef HAVE_IPP
 #define reduceSumC8u64f  reduceSumC_8u16u16s32f_64f
 #define reduceSumC16u64f reduceSumC_8u16u16s32f_64f
 #define reduceSumC16s64f reduceSumC_8u16u16s32f_64f
 #define reduceSumC32f64f reduceSumC_8u16u16s32f_64f
 #else
-#define reduceSumC8u64f  reduceC_<uchar, double,OpAdd<int> >
+#define reduceSumC8u64f  reduceC_<uchar, double,OpAdd<int>, OpNop<int> >
 #define reduceSumC16u64f reduceC_<ushort,double,OpAdd<double> >
 #define reduceSumC16s64f reduceC_<short, double,OpAdd<double> >
 #define reduceSumC32f64f reduceC_<float, double,OpAdd<double> >
+
+#define reduceSum2C8u64f  reduceC_<uchar, double,OpAddSqr<int>,   OpSqr<int> >
+#define reduceSum2C16u64f reduceC_<ushort,double,OpAddSqr<double>,OpSqr<double> >
+#define reduceSum2C16s64f reduceC_<short, double,OpAddSqr<double>,OpSqr<double> >
+#define reduceSum2C32f64f reduceC_<float, double,OpAddSqr<double>,OpSqr<double> >
 #endif

 #ifdef HAVE_IPP
@@ -622,8 +679,9 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst,
            ddepth = CV_32S;
    }

-    const char * const ops[4] = { "OCL_CV_REDUCE_SUM", "OCL_CV_REDUCE_AVG",
-                                  "OCL_CV_REDUCE_MAX", "OCL_CV_REDUCE_MIN" };
+    const char * const ops[5] = { "OCL_CV_REDUCE_SUM", "OCL_CV_REDUCE_AVG",
+                                  "OCL_CV_REDUCE_MAX", "OCL_CV_REDUCE_MIN",
+                                  "OCL_CV_REDUCE_SUM2"};
    int wdepth = std::max(ddepth, CV_32F);
    if (useOptimized)
    {
@@ -718,7 +776,8 @@ void cv::reduce(InputArray _src, OutputArray _dst, int dim, int op, int dtype)

    CV_Assert( cn == CV_MAT_CN(dtype) );
    CV_Assert( op == REDUCE_SUM || op == REDUCE_MAX ||
-               op == REDUCE_MIN || op == REDUCE_AVG );
+               op == REDUCE_MIN || op == REDUCE_AVG ||
+               op == REDUCE_SUM2);

    CV_OCL_RUN(_dst.isUMat(),
               ocl_reduce(_src, _dst, dim, op, op0, stype, dtype))
@@ -748,7 +807,7 @@ void cv::reduce(InputArray _src, OutputArray _dst, int dim, int op, int dtype)
        if( op == REDUCE_SUM )
        {
            if(sdepth == CV_8U && ddepth == CV_32S)
-                func = GET_OPTIMIZED(reduceSumR8u32s);
+                func = reduceSumR8u32s;
            else if(sdepth == CV_8U && ddepth == CV_32F)
                func = reduceSumR8u32f;
            else if(sdepth == CV_8U && ddepth == CV_64F)
@@ -762,7 +821,7 @@ void cv::reduce(InputArray _src, OutputArray _dst, int dim, int op, int dtype)
            else if(sdepth == CV_16S && ddepth == CV_64F)
                func = reduceSumR16s64f;
            else if(sdepth == CV_32F && ddepth == CV_32F)
-                func = GET_OPTIMIZED(reduceSumR32f32f);
+                func = reduceSumR32f32f;
            else if(sdepth == CV_32F && ddepth == CV_64F)
                func = reduceSumR32f64f;
            else if(sdepth == CV_64F && ddepth == CV_64F)
@@ -771,36 +830,59 @@ void cv::reduce(InputArray _src, OutputArray _dst, int dim, int op, int dtype)
        else if(op == REDUCE_MAX)
        {
            if(sdepth == CV_8U && ddepth == CV_8U)
-                func = GET_OPTIMIZED(reduceMaxR8u);
+                func = reduceMaxR8u;
            else if(sdepth == CV_16U && ddepth == CV_16U)
                func = reduceMaxR16u;
            else if(sdepth == CV_16S && ddepth == CV_16S)
                func = reduceMaxR16s;
            else if(sdepth == CV_32F && ddepth == CV_32F)
-                func = GET_OPTIMIZED(reduceMaxR32f);
+                func = reduceMaxR32f;
            else if(sdepth == CV_64F && ddepth == CV_64F)
                func = reduceMaxR64f;
        }
        else if(op == REDUCE_MIN)
        {
            if(sdepth == CV_8U && ddepth == CV_8U)
-                func = GET_OPTIMIZED(reduceMinR8u);
+                func = reduceMinR8u;
            else if(sdepth == CV_16U && ddepth == CV_16U)
                func = reduceMinR16u;
            else if(sdepth == CV_16S && ddepth == CV_16S)
                func = reduceMinR16s;
            else if(sdepth == CV_32F && ddepth == CV_32F)
-                func = GET_OPTIMIZED(reduceMinR32f);
+                func = reduceMinR32f;
            else if(sdepth == CV_64F && ddepth == CV_64F)
                func = reduceMinR64f;
        }
+        else if( op == REDUCE_SUM2 )
+        {
+            if(sdepth == CV_8U && ddepth == CV_32S)
+                func = reduceSum2R8u32s;
+            else if(sdepth == CV_8U && ddepth == CV_32F)
+                func = reduceSum2R8u32f;
+            else if(sdepth == CV_8U && ddepth == CV_64F)
+                func = reduceSum2R8u64f;
+            else if(sdepth == CV_16U && ddepth == CV_32F)
+                func = reduceSum2R16u32f;
+            else if(sdepth == CV_16U && ddepth == CV_64F)
+                func = reduceSum2R16u64f;
+            else if(sdepth == CV_16S && ddepth == CV_32F)
+                func = reduceSum2R16s32f;
+            else if(sdepth == CV_16S && ddepth == CV_64F)
+                func = reduceSum2R16s64f;
+            else if(sdepth == CV_32F && ddepth == CV_32F)
+                func = reduceSum2R32f32f;
+            else if(sdepth == CV_32F && ddepth == CV_64F)
+                func = reduceSum2R32f64f;
+            else if(sdepth == CV_64F && ddepth == CV_64F)
+                func = reduceSum2R64f64f;
+        }
    }
    else
    {
        if(op == REDUCE_SUM)
        {
            if(sdepth == CV_8U && ddepth == CV_32S)
-                func = GET_OPTIMIZED(reduceSumC8u32s);
+                func = reduceSumC8u32s;
            else if(sdepth == CV_8U && ddepth == CV_32F)
                func = reduceSumC8u32f;
            else if(sdepth == CV_8U && ddepth == CV_64F)
@@ -814,7 +896,7 @@ void cv::reduce(InputArray _src, OutputArray _dst, int dim, int op, int dtype)
            else if(sdepth == CV_16S && ddepth == CV_64F)
                func = reduceSumC16s64f;
            else if(sdepth == CV_32F && ddepth == CV_32F)
-                func = GET_OPTIMIZED(reduceSumC32f32f);
+                func = reduceSumC32f32f;
            else if(sdepth == CV_32F && ddepth == CV_64F)
                func = reduceSumC32f64f;
            else if(sdepth == CV_64F && ddepth == CV_64F)
@@ -823,29 +905,52 @@ void cv::reduce(InputArray _src, OutputArray _dst, int dim, int op, int dtype)
        else if(op == REDUCE_MAX)
        {
            if(sdepth == CV_8U && ddepth == CV_8U)
-                func = GET_OPTIMIZED(reduceMaxC8u);
+                func = reduceMaxC8u;
            else if(sdepth == CV_16U && ddepth == CV_16U)
                func = reduceMaxC16u;
            else if(sdepth == CV_16S && ddepth == CV_16S)
                func = reduceMaxC16s;
            else if(sdepth == CV_32F && ddepth == CV_32F)
-                func = GET_OPTIMIZED(reduceMaxC32f);
+                func = reduceMaxC32f;
            else if(sdepth == CV_64F && ddepth == CV_64F)
                func = reduceMaxC64f;
        }
        else if(op == REDUCE_MIN)
        {
            if(sdepth == CV_8U && ddepth == CV_8U)
-                func = GET_OPTIMIZED(reduceMinC8u);
+                func = reduceMinC8u;
            else if(sdepth == CV_16U && ddepth == CV_16U)
                func = reduceMinC16u;
            else if(sdepth == CV_16S && ddepth == CV_16S)
                func = reduceMinC16s;
            else if(sdepth == CV_32F && ddepth == CV_32F)
-                func = GET_OPTIMIZED(reduceMinC32f);
+                func = reduceMinC32f;
            else if(sdepth == CV_64F && ddepth == CV_64F)
                func = reduceMinC64f;
        }
+        else if(op == REDUCE_SUM2)
+        {
+            if(sdepth == CV_8U && ddepth == CV_32S)
+                func = reduceSum2C8u32s;
+            else if(sdepth == CV_8U && ddepth == CV_32F)
+                func = reduceSum2C8u32f;
+            else if(sdepth == CV_8U && ddepth == CV_64F)
+                func = reduceSum2C8u64f;
+            else if(sdepth == CV_16U && ddepth == CV_32F)
+                func = reduceSum2C16u32f;
+            else if(sdepth == CV_16U && ddepth == CV_64F)
+                func = reduceSum2C16u64f;
+            else if(sdepth == CV_16S && ddepth == CV_32F)
+                func = reduceSum2C16s32f;
+            else if(sdepth == CV_16S && ddepth == CV_64F)
+                func = reduceSum2C16s64f;
+            else if(sdepth == CV_32F && ddepth == CV_32F)
+                func = reduceSum2C32f32f;
+            else if(sdepth == CV_32F && ddepth == CV_64F)
+                func = reduceSum2C32f64f;
+            else if(sdepth == CV_64F && ddepth == CV_64F)
+                func = reduceSum2C64f64f;
+        }
    }

    if( !func )

--- a/modules/core/src/opencl/reduce2.cl
+++ b/modules/core/src/opencl/reduce2.cl
@@ -85,6 +85,9 @@
 #elif defined OCL_CV_REDUCE_MIN
 #define INIT_VALUE MAX_VAL
 #define PROCESS_ELEM(acc, value) acc = min(value, acc)
+#elif defined OCL_CV_REDUCE_SUM2
+#define INIT_VALUE 0
+#define PROCESS_ELEM(acc, value) acc += value*value
 #else
 #error "No operation is specified"
 #endif

--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -108,6 +108,22 @@ extern const uchar g_Saturate8u[];
 #define CV_MIN_8U(a,b)       ((a) - CV_FAST_CAST_8U((a) - (b)))
 #define CV_MAX_8U(a,b)       ((a) + CV_FAST_CAST_8U((b) - (a)))

+template<typename T1, typename T2=T1, typename T3=T1> struct OpNop // unary functor that only converts its argument to T3
+{
+    typedef T1 type1;
+    typedef T2 type2; // not used by operator(); only exposed as a typedef
+    typedef T3 rtype;
+    T3 operator ()(const T1 a) const { return saturate_cast<T3>(a); } // the conversion goes through saturate_cast
+};
+
+template<typename T1, typename T2=T1, typename T3=T1> struct OpSqr // unary functor returning the square of its argument
+{
+    typedef T1 type1;
+    typedef T2 type2; // not used by operator(); only exposed as a typedef
+    typedef T3 rtype;
+    T3 operator ()(const T1 a) const { return saturate_cast<T3>(a)*saturate_cast<T3>(a); } // a is converted to T3 before multiplying; the product itself is not passed through saturate_cast
+};
+
 template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
 {
    typedef T1 type1;
@@ -116,6 +132,14 @@ template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a + b); }
 };

+template<typename T1, typename T2=T1, typename T3=T1> struct OpAddSqr // binary functor returning a + b*b
+{
+    typedef T1 type1;
+    typedef T2 type2;
+    typedef T3 rtype;
+    T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a + saturate_cast<T3>(b)*saturate_cast<T3>(b)); } // b is converted to T3 before squaring; the final sum is passed through saturate_cast<T3>
+};
+
 template<typename T1, typename T2=T1, typename T3=T1> struct OpSub
 {
    typedef T1 type1;

--- a/modules/core/test/ocl/test_arithm.cpp
+++ b/modules/core/test/ocl/test_arithm.cpp
@@ -1873,6 +1873,22 @@ OCL_TEST_P(ReduceAvg, Mat)
    }
 }

+typedef Reduce ReduceSum2; // alias of Reduce so that this test is registered under its own name
+
+OCL_TEST_P(ReduceSum2, Mat) // checks that cv::reduce with REDUCE_SUM2 gives matching results on the Mat and UMat inputs
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::reduce(src_roi, dst_roi, dim, REDUCE_SUM2, dtype)); // presumably the reference run with OpenCL disabled - confirm against the OCL_OFF macro
+        OCL_ON(cv::reduce(usrc_roi, udst_roi, dim, REDUCE_SUM2, dtype)); // presumably the OpenCL run on the UMat ROIs - confirm against the OCL_ON macro
+
+        double eps = ddepth <= CV_32S ? 1 : 6e-6; // tolerance is 1 for destination depths up to CV_32S, 6e-6 otherwise
+        OCL_EXPECT_MATS_NEAR(dst, eps);
+    }
+}
+
 //////////////////////////////////////// Instantiation /////////////////////////////////////////

 OCL_INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine(::testing::Values(CV_8U, CV_8S), OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool(), Bool()));

--- a/modules/core/test/test_mat.cpp
+++ b/modules/core/test/test_mat.cpp
@@ -26,7 +26,7 @@ protected:
 };

 template<class Type>
-void testReduce( const Mat& src, Mat& sum, Mat& avg, Mat& max, Mat& min, int dim )
+void testReduce( const Mat& src, Mat& sum, Mat& avg, Mat& max, Mat& min, Mat& sum2, int dim )
 {
    CV_Assert( src.channels() == 1 );
    if( dim == 0 ) // row
@@ -34,21 +34,25 @@ void testReduce( const Mat& src, Mat& sum, Mat& avg, Mat& max, Mat& min, int dim
        sum.create( 1, src.cols, CV_64FC1 );
        max.create( 1, src.cols, CV_64FC1 );
        min.create( 1, src.cols, CV_64FC1 );
+        sum2.create( 1, src.cols, CV_64FC1 );
    }
    else
    {
        sum.create( src.rows, 1, CV_64FC1 );
        max.create( src.rows, 1, CV_64FC1 );
        min.create( src.rows, 1, CV_64FC1 );
+        sum2.create( src.rows, 1, CV_64FC1 );
    }
    sum.setTo(Scalar(0));
    max.setTo(Scalar(-DBL_MAX));
    min.setTo(Scalar(DBL_MAX));
+    sum2.setTo(Scalar(0));

    const Mat_<Type>& src_ = src;
    Mat_<double>& sum_ = (Mat_<double>&)sum;
    Mat_<double>& min_ = (Mat_<double>&)min;
    Mat_<double>& max_ = (Mat_<double>&)max;
+    Mat_<double>& sum2_ = (Mat_<double>&)sum2;

    if( dim == 0 )
    {
@@ -59,6 +63,7 @@ void testReduce( const Mat& src, Mat& sum, Mat& avg, Mat& max, Mat& min, int dim
                sum_(0, ci) += src_(ri, ci);
                max_(0, ci) = std::max( max_(0, ci), (double)src_(ri, ci) );
                min_(0, ci) = std::min( min_(0, ci), (double)src_(ri, ci) );
+                sum2_(0, ci) += ((double)src_(ri, ci))*((double)src_(ri, ci));
            }
        }
    }
@@ -71,6 +76,7 @@ void testReduce( const Mat& src, Mat& sum, Mat& avg, Mat& max, Mat& min, int dim
                sum_(ri, 0) += src_(ri, ci);
                max_(ri, 0) = std::max( max_(ri, 0), (double)src_(ri, ci) );
                min_(ri, 0) = std::min( min_(ri, 0), (double)src_(ri, ci) );
+                sum2_(ri, 0) += ((double)src_(ri, ci))*((double)src_(ri, ci));
            }
        }
    }
@@ -93,7 +99,7 @@ int Core_ReduceTest::checkOp( const Mat& src, int dstType, int opType, const Mat
 {
    int srcType = src.type();
    bool support = false;
-    if( opType == REDUCE_SUM || opType == REDUCE_AVG )
+    if( opType == REDUCE_SUM || opType == REDUCE_AVG || opType == REDUCE_SUM2 )
    {
        if( srcType == CV_8U && (dstType == CV_32S || dstType == CV_32F || dstType == CV_64F) )
            support = true;
@@ -128,7 +134,7 @@ int Core_ReduceTest::checkOp( const Mat& src, int dstType, int opType, const Mat
        return cvtest::TS::OK;

    double eps = 0.0;
-    if ( opType == REDUCE_SUM || opType == REDUCE_AVG )
+    if ( opType == REDUCE_SUM || opType == REDUCE_AVG || opType == REDUCE_SUM2 )
    {
        if ( dstType == CV_32F )
            eps = 1.e-5;
@@ -152,10 +158,13 @@ int Core_ReduceTest::checkOp( const Mat& src, int dstType, int opType, const Mat
    if( check )
    {
        char msg[100];
-        const char* opTypeStr = opType == REDUCE_SUM ? "REDUCE_SUM" :
-        opType == REDUCE_AVG ? "REDUCE_AVG" :
-        opType == REDUCE_MAX ? "REDUCE_MAX" :
-        opType == REDUCE_MIN ? "REDUCE_MIN" : "unknown operation type";
+        const char* opTypeStr =
+          opType == REDUCE_SUM ? "REDUCE_SUM" :
+          opType == REDUCE_AVG ? "REDUCE_AVG" :
+          opType == REDUCE_MAX ? "REDUCE_MAX" :
+          opType == REDUCE_MIN ? "REDUCE_MIN" :
+          opType == REDUCE_SUM2 ? "REDUCE_SUM2" :
+          "unknown operation type";
        string srcTypeStr, dstTypeStr;
        getMatTypeStr( src.type(), srcTypeStr );
        getMatTypeStr( dstType, dstTypeStr );
@@ -172,25 +181,25 @@ int Core_ReduceTest::checkOp( const Mat& src, int dstType, int opType, const Mat
 int Core_ReduceTest::checkCase( int srcType, int dstType, int dim, Size sz )
 {
    int code = cvtest::TS::OK, tempCode;
-    Mat src, sum, avg, max, min;
+    Mat src, sum, avg, max, min, sum2;

    src.create( sz, srcType );
    randu( src, Scalar(0), Scalar(100) );

    if( srcType == CV_8UC1 )
-        testReduce<uchar>( src, sum, avg, max, min, dim );
+        testReduce<uchar>( src, sum, avg, max, min, sum2, dim );
    else if( srcType == CV_8SC1 )
-        testReduce<char>( src, sum, avg, max, min, dim );
+        testReduce<char>( src, sum, avg, max, min, sum2, dim );
    else if( srcType == CV_16UC1 )
-        testReduce<unsigned short int>( src, sum, avg, max, min, dim );
+        testReduce<unsigned short int>( src, sum, avg, max, min, sum2, dim );
    else if( srcType == CV_16SC1 )
-        testReduce<short int>( src, sum, avg, max, min, dim );
+        testReduce<short int>( src, sum, avg, max, min, sum2, dim );
    else if( srcType == CV_32SC1 )
-        testReduce<int>( src, sum, avg, max, min, dim );
+        testReduce<int>( src, sum, avg, max, min, sum2, dim );
    else if( srcType == CV_32FC1 )
-        testReduce<float>( src, sum, avg, max, min, dim );
+        testReduce<float>( src, sum, avg, max, min, sum2, dim );
    else if( srcType == CV_64FC1 )
-        testReduce<double>( src, sum, avg, max, min, dim );
+        testReduce<double>( src, sum, avg, max, min, sum2, dim );
    else
        CV_Assert( 0 );

@@ -210,6 +219,10 @@ int Core_ReduceTest::checkCase( int srcType, int dstType, int dim, Size sz )
    tempCode = checkOp( src, dstType, REDUCE_MIN, min, dim );
    code = tempCode != cvtest::TS::OK ? tempCode : code;

+    // 5. sum2
+    tempCode = checkOp( src, dstType, REDUCE_SUM2, sum2, dim );
+    code = tempCode != cvtest::TS::OK ? tempCode : code;
+
    return code;
 }

@@ -1563,6 +1576,7 @@ TEST(Reduce, regression_should_fail_bug_4594)
    EXPECT_THROW(cv::reduce(src, dst, 0, REDUCE_MAX, CV_32S), cv::Exception);
    EXPECT_NO_THROW(cv::reduce(src, dst, 0, REDUCE_SUM, CV_32S));
    EXPECT_NO_THROW(cv::reduce(src, dst, 0, REDUCE_AVG, CV_32S));
+    EXPECT_NO_THROW(cv::reduce(src, dst, 0, REDUCE_SUM2, CV_32S));
 }

 TEST(Mat, push_back_vector)

--- a/modules/core/test/test_math.cpp
+++ b/modules/core/test/test_math.cpp
@@ -3018,7 +3018,7 @@ TEST(CovariationMatrixVectorOfMatWithMean, accuracy)
    cv::randu(src,cv::Scalar(-128), cv::Scalar(128));
    cv::Mat goldMean;

-    cv::reduce(src,goldMean,0 ,REDUCE_AVG, CV_32F);
+    cv::reduce(src, goldMean, 0, REDUCE_AVG, CV_32F);

    cv::calcCovarMatrix(src,gold,goldMean,singleMatFlags,CV_32F);