diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp
index 7038da94b2d753bdb0e820485a807ead586cbae7..9b94c72a434c7f64ebe28f9bc0adb4ea8dd68e83 100644
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -230,7 +230,8 @@ enum KmeansFlags {
 enum ReduceTypes { REDUCE_SUM = 0, //!< the output is the sum of all rows/columns of the matrix.
                    REDUCE_AVG = 1, //!< the output is the mean vector of all rows/columns of the matrix.
                    REDUCE_MAX = 2, //!< the output is the maximum (column/row-wise) of all rows/columns of the matrix.
-                   REDUCE_MIN = 3  //!< the output is the minimum (column/row-wise) of all rows/columns of the matrix.
+                   REDUCE_MIN = 3,  //!< the output is the minimum (column/row-wise) of all rows/columns of the matrix.
+                   REDUCE_SUM2 = 4  //!< the output is the sum of the element-wise squares of all rows/columns of the matrix.
                  };
 
 //! @} core_array
@@ -903,7 +904,7 @@ The function #reduce reduces the matrix to a vector by treating the matrix rows/
 1D vectors and performing the specified operation on the vectors until a single row/column is
 obtained. For example, the function can be used to compute horizontal and vertical projections of a
 raster image. In case of #REDUCE_MAX and #REDUCE_MIN , the output image should have the same type as the source one.
-In case of #REDUCE_SUM and #REDUCE_AVG , the output may have a larger element bit-depth to preserve accuracy.
+In case of #REDUCE_SUM, #REDUCE_SUM2 and #REDUCE_AVG , the output may have a larger element bit-depth to preserve accuracy.
-And multi-channel arrays are also supported in these two reduction modes.
+And multi-channel arrays are also supported in these three reduction modes.
 
 The following code demonstrates its usage for a single channel matrix.
diff --git a/modules/core/perf/opencl/perf_arithm.cpp b/modules/core/perf/opencl/perf_arithm.cpp
index 0cbfc2d653d9b5da24b83ce4a72b4fe3e0b41c31..526bc4e87415dfdc5660295cc96a0f6f6e41d481 100644
--- a/modules/core/perf/opencl/perf_arithm.cpp
+++ b/modules/core/perf/opencl/perf_arithm.cpp
@@ -1150,7 +1150,7 @@ OCL_PERF_TEST_P(ReduceMinMaxFixture, Reduce,
     SANITY_CHECK(dst, eps);
 }
 
-CV_ENUM(ReduceAccOp, CV_REDUCE_SUM, CV_REDUCE_AVG)
+CV_ENUM(ReduceAccOp, REDUCE_SUM, REDUCE_AVG, REDUCE_SUM2)
 
 typedef tuple<Size, std::pair<MatType, MatType>, int, ReduceAccOp> ReduceAccParams;
 typedef TestBaseWithParam<ReduceAccParams> ReduceAccFixture;
@@ -1168,7 +1168,6 @@ OCL_PERF_TEST_P(ReduceAccFixture, Reduce,
             dim = get<2>(params), op = get<3>(params);
     const Size srcSize = get<0>(params),
             dstSize(dim == 0 ? srcSize.width : 1, dim == 0 ? 1 : srcSize.height);
-    const double eps = CV_MAT_DEPTH(dtype) <= CV_32S ? 1 : 3e-4;
 
     checkDeviceMaxMemoryAllocSize(srcSize, stype);
     checkDeviceMaxMemoryAllocSize(srcSize, dtype);
@@ -1178,7 +1177,7 @@ OCL_PERF_TEST_P(ReduceAccFixture, Reduce,
 
     OCL_TEST_CYCLE() cv::reduce(src, dst, dim, op, dtype);
 
-    SANITY_CHECK(dst, eps);
+    SANITY_CHECK_NOTHING();
 }
 
 } } // namespace opencv_test::ocl
diff --git a/modules/core/perf/perf_reduce.cpp b/modules/core/perf/perf_reduce.cpp
index dcc0205fdc04b106e80118e98288164fb2f41631..844303aa7de2d22e27dd00262461be4b684fa355 100644
--- a/modules/core/perf/perf_reduce.cpp
+++ b/modules/core/perf/perf_reduce.cpp
@@ -5,7 +5,7 @@ namespace opencv_test
 {
 using namespace perf;
 
-CV_ENUM(ROp, CV_REDUCE_SUM, CV_REDUCE_AVG, CV_REDUCE_MAX, CV_REDUCE_MIN)
+CV_ENUM(ROp, REDUCE_SUM, REDUCE_AVG, REDUCE_MAX, REDUCE_MIN, REDUCE_SUM2)
 typedef tuple<Size, MatType, ROp> Size_MatType_ROp_t;
 typedef perf::TestBaseWithParam<Size_MatType_ROp_t> Size_MatType_ROp;
 
@@ -23,7 +23,7 @@ PERF_TEST_P(Size_MatType_ROp, reduceR,
     int reduceOp = get<2>(GetParam());
 
     int ddepth = -1;
-    if( CV_MAT_DEPTH(matType) < CV_32S && (reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG) )
+    if( CV_MAT_DEPTH(matType) < CV_32S && (reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_SUM2) )
         ddepth = CV_32S;
 
     Mat src(sz, matType);
@@ -35,7 +35,7 @@ PERF_TEST_P(Size_MatType_ROp, reduceR,
     int runs = 15;
     TEST_CYCLE_MULTIRUN(runs) reduce(src, vec, 0, reduceOp, ddepth);
 
-    SANITY_CHECK(vec, 1);
+    SANITY_CHECK_NOTHING();
 }
 
 PERF_TEST_P(Size_MatType_ROp, reduceC,
@@ -51,7 +51,7 @@ PERF_TEST_P(Size_MatType_ROp, reduceC,
     int reduceOp = get<2>(GetParam());
 
     int ddepth = -1;
-    if( CV_MAT_DEPTH(matType)< CV_32S && (reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG) )
+    if( CV_MAT_DEPTH(matType)< CV_32S && (reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_SUM2) )
         ddepth = CV_32S;
 
     Mat src(sz, matType);
@@ -62,7 +62,7 @@ PERF_TEST_P(Size_MatType_ROp, reduceC,
 
     TEST_CYCLE() reduce(src, vec, 1, reduceOp, ddepth);
 
-    SANITY_CHECK(vec, 1);
+    SANITY_CHECK_NOTHING();
 }
 
 typedef tuple<Size, MatType, int> Size_MatType_RMode_t;
diff --git a/modules/core/src/matrix_operations.cpp b/modules/core/src/matrix_operations.cpp
index 4582451c1c836b30afe27ac33d37f405617bc4e8..94e0c2b50b89fbb5769ee10ff8c8e7ae05672f5c 100644
--- a/modules/core/src/matrix_operations.cpp
+++ b/modules/core/src/matrix_operations.cpp
@@ -341,29 +341,32 @@ cv::Mat cv::Mat::cross(InputArray _m) const
 namespace cv
 {
 
-template<typename T, typename ST, class Op> static void
-reduceR_( const Mat& srcmat, Mat& dstmat )
+template<typename T, typename ST, typename WT, class Op, class OpInit>
+class ReduceR_Invoker : public ParallelLoopBody
 {
-    typedef typename Op::rtype WT;
-    Size size = srcmat.size();
-    size.width *= srcmat.channels();
-    AutoBuffer<WT> buffer(size.width);
+public:
+  ReduceR_Invoker(const Mat& aSrcmat, Mat& aDstmat, Op& aOp, OpInit& aOpInit)
+                 :srcmat(aSrcmat),dstmat(aDstmat),op(aOp),opInit(aOpInit),buffer(srcmat.size().width*srcmat.channels())
+  {
+  }
+  void operator()(const Range& range) const CV_OVERRIDE
+  {
+    const T* src = srcmat.ptr<T>();
+    const size_t srcstep = srcmat.step/sizeof(src[0]);
     WT* buf = buffer.data();
     ST* dst = dstmat.ptr<ST>();
-    const T* src = srcmat.ptr<T>();
-    size_t srcstep = srcmat.step/sizeof(src[0]);
-    int i;
-    Op op;
+    int i = 0;
 
-    for( i = 0; i < size.width; i++ )
-        buf[i] = src[i];
+    for( i = range.start ; i < range.end; i++ )
+        buf[i] = opInit(src[i]);
 
-    for( ; --size.height; )
+    int height = srcmat.size().height;
+    for( ; --height; )
     {
         src += srcstep;
-        i = 0;
+        i = range.start;
         #if CV_ENABLE_UNROLLED
-        for(; i <= size.width - 4; i += 4 )
+        for(; i <= range.end - 4; i += 4 )
         {
             WT s0, s1;
             s0 = op(buf[i], (WT)src[i]);
@@ -375,63 +378,94 @@ reduceR_( const Mat& srcmat, Mat& dstmat )
             buf[i+2] = s0; buf[i+3] = s1;
         }
         #endif
-        for( ; i < size.width; i++ )
+        for( ; i < range.end; i++ )
             buf[i] = op(buf[i], (WT)src[i]);
     }
 
-    for( i = 0; i < size.width; i++ )
+    for( i = range.start ; i < range.end; i++ )
         dst[i] = (ST)buf[i];
-}
-
+  }
+private:
+  const Mat& srcmat;
+  Mat& dstmat;
+  Op& op;
+  OpInit& opInit;
+  mutable AutoBuffer<WT> buffer;
+};
 
-template<typename T, typename ST, class Op> static void
-reduceC_( const Mat& srcmat, Mat& dstmat )
+template<typename T, typename ST, class Op, class OpInit = OpNop<ST> > static void
+reduceR_( const Mat& srcmat, Mat& dstmat) // reduces the rows of srcmat into the first row of dstmat, splitting the columns across parallel stripes
 {
     typedef typename Op::rtype WT;
-    Size size = srcmat.size();
-    int cn = srcmat.channels();
-    size.width *= cn;
     Op op;
+    OpInit opInit;
+    const int total = srcmat.size().width*srcmat.channels(); // number of scalar columns, i.e. the length of the parallel range
+    ReduceR_Invoker<T, ST, WT, Op, OpInit> body(srcmat, dstmat, op, opInit);
+    //group columns by 64 bytes for data locality; count every channel and keep at least one stripe, because a value <= 0 makes parallel_for_ fall back to its default split
+    parallel_for_(Range(0, total), body, std::max(1, (int)(total*CV_ELEM_SIZE(srcmat.depth())/64)));
+}
 
-    for( int y = 0; y < size.height; y++ )
+template<typename T, typename ST, typename WT, class Op, class OpInit>
+class ReduceC_Invoker : public ParallelLoopBody // parallel body that reduces each row of srcmat to one value per channel in dstmat
+{
+public:
+  ReduceC_Invoker(const Mat& aSrcmat, Mat& aDstmat, Op& aOp, OpInit& aOpInit)
+                 :srcmat(aSrcmat),dstmat(aDstmat),op(aOp),opInit(aOpInit)
+  {
+  }
+  void operator()(const Range& range) const CV_OVERRIDE // range holds the source rows handled by this call
+  {
+    const int cn = srcmat.channels();
+    const int width = srcmat.size().width*cn; // row length counted in scalars, not pixels
+    AutoBuffer<WT> cumul(cn); // one running accumulator per channel, local to this call
+    for( int y = range.start; y < range.end; y++ )
     {
         const T* src = srcmat.ptr<T>(y);
         ST* dst = dstmat.ptr<ST>(y);
-        if( size.width == cn )
-            for( int k = 0; k < cn; k++ )
-                dst[k] = src[k];
+        if( width == cn ) // single-column source: only the initial transform is applied
+        {
+          for( int k = 0; k < cn; k++ )
+              dst[k] = (ST)opInit(src[k]);
+        }
         else
         {
-            for( int k = 0; k < cn; k++ )
+            for(int k = 0; k < cn ; ++k ) // seed the accumulators from the first pixel of the row
+              cumul[k] = opInit(src[k]);
+            for(int k = cn ; k < width ; k += cn ) // fold in the remaining pixels, one pixel (cn scalars) per step
             {
-                WT a0 = src[k], a1 = src[k+cn];
-                int i;
-                for( i = 2*cn; i <= size.width - 4*cn; i += 4*cn )
-                {
-                    a0 = op(a0, (WT)src[i+k]);
-                    a1 = op(a1, (WT)src[i+k+cn]);
-                    a0 = op(a0, (WT)src[i+k+cn*2]);
-                    a1 = op(a1, (WT)src[i+k+cn*3]);
-                }
-
-                for( ; i < size.width; i += cn )
-                {
-                    a0 = op(a0, (WT)src[i+k]);
-                }
-                a0 = op(a0, a1);
-                dst[k] = (ST)a0;
+                for (int c = 0 ; c < cn ; ++c)
+                  cumul[c] = op(cumul[c], src[k+c]);
             }
+            for(int k = 0 ; k < cn ; ++k ) // write the per-channel results, converted to the destination type
+              dst[k] = (ST)cumul[k];
         }
     }
+  }
+private:
+  const Mat& srcmat; // held by reference: the matrices and functors must outlive this invoker
+  Mat& dstmat;
+  Op& op;
+  OpInit& opInit;
+};
+
+template<typename T, typename ST, class Op, class OpInit = OpNop<ST> > static void
+reduceC_( const Mat& srcmat, Mat& dstmat) // reduces every row of srcmat to one element per channel, rows processed in parallel
+{
+    typedef typename Op::rtype WT;
+    typedef ReduceC_Invoker<T, ST, WT, Op, OpInit> RowReducer;
+    Op accumulate; OpInit seed; // the invoker stores references to both, so they must stay alive until parallel_for_ returns
+    RowReducer rowReducer(srcmat, dstmat, accumulate, seed);
+    const Range allRows(0, srcmat.size().height); // each row is reduced independently, so rows form the parallel range
+    parallel_for_(allRows, rowReducer);
 }
 
 typedef void (*ReduceFunc)( const Mat& src, Mat& dst );
 
 }
 
-#define reduceSumR8u32s  reduceR_<uchar, int,   OpAdd<int> >
-#define reduceSumR8u32f  reduceR_<uchar, float, OpAdd<int> >
-#define reduceSumR8u64f  reduceR_<uchar, double,OpAdd<int> >
+#define reduceSumR8u32s  reduceR_<uchar, int,   OpAdd<int>, OpNop<int> >
+#define reduceSumR8u32f  reduceR_<uchar, float, OpAdd<int>, OpNop<int> >
+#define reduceSumR8u64f  reduceR_<uchar, double,OpAdd<int>, OpNop<int> >
 #define reduceSumR16u32f reduceR_<ushort,float, OpAdd<float> >
 #define reduceSumR16u64f reduceR_<ushort,double,OpAdd<double> >
 #define reduceSumR16s32f reduceR_<short, float, OpAdd<float> >
@@ -440,6 +474,17 @@ typedef void (*ReduceFunc)( const Mat& src, Mat& dst );
 #define reduceSumR32f64f reduceR_<float, double,OpAdd<double> >
 #define reduceSumR64f64f reduceR_<double,double,OpAdd<double> >
 
+#define reduceSum2R8u32s  reduceR_<uchar, int,   OpAddSqr<int>,   OpSqr<int> > // REDUCE_SUM2 row reducers: OpSqr squares the first row to seed the buffer, OpAddSqr adds the squares of the following rows
+#define reduceSum2R8u32f  reduceR_<uchar, float, OpAddSqr<int>,   OpSqr<int> >
+#define reduceSum2R8u64f  reduceR_<uchar, double,OpAddSqr<int>,   OpSqr<int> >
+#define reduceSum2R16u32f reduceR_<ushort,float, OpAddSqr<float>, OpSqr<float> >
+#define reduceSum2R16u64f reduceR_<ushort,double,OpAddSqr<double>,OpSqr<double> >
+#define reduceSum2R16s32f reduceR_<short, float, OpAddSqr<float>, OpSqr<float> >
+#define reduceSum2R16s64f reduceR_<short, double,OpAddSqr<double>,OpSqr<double> >
+#define reduceSum2R32f32f reduceR_<float, float, OpAddSqr<float>, OpSqr<float> >
+#define reduceSum2R32f64f reduceR_<float, double,OpAddSqr<double>,OpSqr<double> >
+#define reduceSum2R64f64f reduceR_<double,double,OpAddSqr<double>,OpSqr<double> >
+
 #define reduceMaxR8u  reduceR_<uchar, uchar, OpMax<uchar> >
 #define reduceMaxR16u reduceR_<ushort,ushort,OpMax<ushort> >
 #define reduceMaxR16s reduceR_<short, short, OpMax<short> >
@@ -527,23 +572,35 @@ static inline void reduceSumC_8u16u16s32f_64f(const cv::Mat& srcmat, cv::Mat& ds
 
 #endif
 
-#define reduceSumC8u32s  reduceC_<uchar, int,   OpAdd<int> >
-#define reduceSumC8u32f  reduceC_<uchar, float, OpAdd<int> >
+#define reduceSumC8u32s  reduceC_<uchar, int,   OpAdd<int>, OpNop<int> >
+#define reduceSumC8u32f  reduceC_<uchar, float, OpAdd<int>, OpNop<int> >
 #define reduceSumC16u32f reduceC_<ushort,float, OpAdd<float> >
 #define reduceSumC16s32f reduceC_<short, float, OpAdd<float> >
 #define reduceSumC32f32f reduceC_<float, float, OpAdd<float> >
 #define reduceSumC64f64f reduceC_<double,double,OpAdd<double> >
 
+#define reduceSum2C8u32s  reduceC_<uchar, int,   OpAddSqr<int>,   OpSqr<int> > // REDUCE_SUM2 column reducers: OpSqr squares the first element of a row, OpAddSqr adds the squares of the remaining elements
+#define reduceSum2C8u32f  reduceC_<uchar, float, OpAddSqr<int>,   OpSqr<int> >
+#define reduceSum2C16u32f reduceC_<ushort,float, OpAddSqr<float>, OpSqr<float> >
+#define reduceSum2C16s32f reduceC_<short, float, OpAddSqr<float>, OpSqr<float> >
+#define reduceSum2C32f32f reduceC_<float, float, OpAddSqr<float>, OpSqr<float> >
+#define reduceSum2C64f64f reduceC_<double,double,OpAddSqr<double>,OpSqr<double> >
+
 #ifdef HAVE_IPP
 #define reduceSumC8u64f  reduceSumC_8u16u16s32f_64f
 #define reduceSumC16u64f reduceSumC_8u16u16s32f_64f
 #define reduceSumC16s64f reduceSumC_8u16u16s32f_64f
 #define reduceSumC32f64f reduceSumC_8u16u16s32f_64f
 #else
-#define reduceSumC8u64f  reduceC_<uchar, double,OpAdd<int> >
+#define reduceSumC8u64f  reduceC_<uchar, double,OpAdd<int>, OpNop<int> >
 #define reduceSumC16u64f reduceC_<ushort,double,OpAdd<double> >
 #define reduceSumC16s64f reduceC_<short, double,OpAdd<double> >
 #define reduceSumC32f64f reduceC_<float, double,OpAdd<double> >
+
+#define reduceSum2C8u64f  reduceC_<uchar, double,OpAddSqr<int>,   OpSqr<int> >
+#define reduceSum2C16u64f reduceC_<ushort,double,OpAddSqr<double>,OpSqr<double> >
+#define reduceSum2C16s64f reduceC_<short, double,OpAddSqr<double>,OpSqr<double> >
+#define reduceSum2C32f64f reduceC_<float, double,OpAddSqr<double>,OpSqr<double> >
 #endif
 
 #ifdef HAVE_IPP
@@ -622,8 +679,9 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst,
             ddepth = CV_32S;
     }
 
-    const char * const ops[4] = { "OCL_CV_REDUCE_SUM", "OCL_CV_REDUCE_AVG",
-                                  "OCL_CV_REDUCE_MAX", "OCL_CV_REDUCE_MIN" };
+    const char * const ops[5] = { "OCL_CV_REDUCE_SUM", "OCL_CV_REDUCE_AVG",
+                                  "OCL_CV_REDUCE_MAX", "OCL_CV_REDUCE_MIN",
+                                  "OCL_CV_REDUCE_SUM2"};
     int wdepth = std::max(ddepth, CV_32F);
     if (useOptimized)
     {
@@ -718,7 +776,8 @@ void cv::reduce(InputArray _src, OutputArray _dst, int dim, int op, int dtype)
 
     CV_Assert( cn == CV_MAT_CN(dtype) );
     CV_Assert( op == REDUCE_SUM || op == REDUCE_MAX ||
-               op == REDUCE_MIN || op == REDUCE_AVG );
+               op == REDUCE_MIN || op == REDUCE_AVG ||
+               op == REDUCE_SUM2);
 
     CV_OCL_RUN(_dst.isUMat(),
                ocl_reduce(_src, _dst, dim, op, op0, stype, dtype))
@@ -748,7 +807,7 @@ void cv::reduce(InputArray _src, OutputArray _dst, int dim, int op, int dtype)
         if( op == REDUCE_SUM )
         {
             if(sdepth == CV_8U && ddepth == CV_32S)
-                func = GET_OPTIMIZED(reduceSumR8u32s);
+                func = reduceSumR8u32s;
             else if(sdepth == CV_8U && ddepth == CV_32F)
                 func = reduceSumR8u32f;
             else if(sdepth == CV_8U && ddepth == CV_64F)
@@ -762,7 +821,7 @@ void cv::reduce(InputArray _src, OutputArray _dst, int dim, int op, int dtype)
             else if(sdepth == CV_16S && ddepth == CV_64F)
                 func = reduceSumR16s64f;
             else if(sdepth == CV_32F && ddepth == CV_32F)
-                func = GET_OPTIMIZED(reduceSumR32f32f);
+                func = reduceSumR32f32f;
             else if(sdepth == CV_32F && ddepth == CV_64F)
                 func = reduceSumR32f64f;
             else if(sdepth == CV_64F && ddepth == CV_64F)
@@ -771,36 +830,59 @@ void cv::reduce(InputArray _src, OutputArray _dst, int dim, int op, int dtype)
         else if(op == REDUCE_MAX)
         {
             if(sdepth == CV_8U && ddepth == CV_8U)
-                func = GET_OPTIMIZED(reduceMaxR8u);
+                func = reduceMaxR8u;
             else if(sdepth == CV_16U && ddepth == CV_16U)
                 func = reduceMaxR16u;
             else if(sdepth == CV_16S && ddepth == CV_16S)
                 func = reduceMaxR16s;
             else if(sdepth == CV_32F && ddepth == CV_32F)
-                func = GET_OPTIMIZED(reduceMaxR32f);
+                func = reduceMaxR32f;
             else if(sdepth == CV_64F && ddepth == CV_64F)
                 func = reduceMaxR64f;
         }
         else if(op == REDUCE_MIN)
         {
             if(sdepth == CV_8U && ddepth == CV_8U)
-                func = GET_OPTIMIZED(reduceMinR8u);
+                func = reduceMinR8u;
             else if(sdepth == CV_16U && ddepth == CV_16U)
                 func = reduceMinR16u;
             else if(sdepth == CV_16S && ddepth == CV_16S)
                 func = reduceMinR16s;
             else if(sdepth == CV_32F && ddepth == CV_32F)
-                func = GET_OPTIMIZED(reduceMinR32f);
+                func = reduceMinR32f;
             else if(sdepth == CV_64F && ddepth == CV_64F)
                 func = reduceMinR64f;
         }
+        else if( op == REDUCE_SUM2 )
+        {
+            if(sdepth == CV_8U && ddepth == CV_32S)
+                func = reduceSum2R8u32s;
+            else if(sdepth == CV_8U && ddepth == CV_32F)
+                func = reduceSum2R8u32f;
+            else if(sdepth == CV_8U && ddepth == CV_64F)
+                func = reduceSum2R8u64f;
+            else if(sdepth == CV_16U && ddepth == CV_32F)
+                func = reduceSum2R16u32f;
+            else if(sdepth == CV_16U && ddepth == CV_64F)
+                func = reduceSum2R16u64f;
+            else if(sdepth == CV_16S && ddepth == CV_32F)
+                func = reduceSum2R16s32f;
+            else if(sdepth == CV_16S && ddepth == CV_64F)
+                func = reduceSum2R16s64f;
+            else if(sdepth == CV_32F && ddepth == CV_32F)
+                func = reduceSum2R32f32f;
+            else if(sdepth == CV_32F && ddepth == CV_64F)
+                func = reduceSum2R32f64f;
+            else if(sdepth == CV_64F && ddepth == CV_64F)
+                func = reduceSum2R64f64f;
+        }
     }
     else
     {
         if(op == REDUCE_SUM)
         {
             if(sdepth == CV_8U && ddepth == CV_32S)
-                func = GET_OPTIMIZED(reduceSumC8u32s);
+                func = reduceSumC8u32s;
             else if(sdepth == CV_8U && ddepth == CV_32F)
                 func = reduceSumC8u32f;
             else if(sdepth == CV_8U && ddepth == CV_64F)
@@ -814,7 +896,7 @@ void cv::reduce(InputArray _src, OutputArray _dst, int dim, int op, int dtype)
             else if(sdepth == CV_16S && ddepth == CV_64F)
                 func = reduceSumC16s64f;
             else if(sdepth == CV_32F && ddepth == CV_32F)
-                func = GET_OPTIMIZED(reduceSumC32f32f);
+                func = reduceSumC32f32f;
             else if(sdepth == CV_32F && ddepth == CV_64F)
                 func = reduceSumC32f64f;
             else if(sdepth == CV_64F && ddepth == CV_64F)
@@ -823,29 +905,52 @@ void cv::reduce(InputArray _src, OutputArray _dst, int dim, int op, int dtype)
         else if(op == REDUCE_MAX)
         {
             if(sdepth == CV_8U && ddepth == CV_8U)
-                func = GET_OPTIMIZED(reduceMaxC8u);
+                func = reduceMaxC8u;
             else if(sdepth == CV_16U && ddepth == CV_16U)
                 func = reduceMaxC16u;
             else if(sdepth == CV_16S && ddepth == CV_16S)
                 func = reduceMaxC16s;
             else if(sdepth == CV_32F && ddepth == CV_32F)
-                func = GET_OPTIMIZED(reduceMaxC32f);
+                func = reduceMaxC32f;
             else if(sdepth == CV_64F && ddepth == CV_64F)
                 func = reduceMaxC64f;
         }
         else if(op == REDUCE_MIN)
         {
             if(sdepth == CV_8U && ddepth == CV_8U)
-                func = GET_OPTIMIZED(reduceMinC8u);
+                func = reduceMinC8u;
             else if(sdepth == CV_16U && ddepth == CV_16U)
                 func = reduceMinC16u;
             else if(sdepth == CV_16S && ddepth == CV_16S)
                 func = reduceMinC16s;
             else if(sdepth == CV_32F && ddepth == CV_32F)
-                func = GET_OPTIMIZED(reduceMinC32f);
+                func = reduceMinC32f;
             else if(sdepth == CV_64F && ddepth == CV_64F)
                 func = reduceMinC64f;
         }
+        else if(op == REDUCE_SUM2)
+        {
+            if(sdepth == CV_8U && ddepth == CV_32S)
+                func = reduceSum2C8u32s;
+            else if(sdepth == CV_8U && ddepth == CV_32F)
+                func = reduceSum2C8u32f;
+            else if(sdepth == CV_8U && ddepth == CV_64F)
+                func = reduceSum2C8u64f;
+            else if(sdepth == CV_16U && ddepth == CV_32F)
+                func = reduceSum2C16u32f;
+            else if(sdepth == CV_16U && ddepth == CV_64F)
+                func = reduceSum2C16u64f;
+            else if(sdepth == CV_16S && ddepth == CV_32F)
+                func = reduceSum2C16s32f;
+            else if(sdepth == CV_16S && ddepth == CV_64F)
+                func = reduceSum2C16s64f;
+            else if(sdepth == CV_32F && ddepth == CV_32F)
+                func = reduceSum2C32f32f;
+            else if(sdepth == CV_32F && ddepth == CV_64F)
+                func = reduceSum2C32f64f;
+            else if(sdepth == CV_64F && ddepth == CV_64F)
+                func = reduceSum2C64f64f;
+        }
     }
 
     if( !func )
diff --git a/modules/core/src/opencl/reduce2.cl b/modules/core/src/opencl/reduce2.cl
index 645d69867b8c31c06b82117587a1f2a0c5b78553..380305a6798b92136dc2ca683a8c21517eb644cf 100644
--- a/modules/core/src/opencl/reduce2.cl
+++ b/modules/core/src/opencl/reduce2.cl
@@ -85,6 +85,9 @@
 #elif defined OCL_CV_REDUCE_MIN
 #define INIT_VALUE MAX_VAL
 #define PROCESS_ELEM(acc, value) acc = min(value, acc)
+#elif defined OCL_CV_REDUCE_SUM2 // sum of squares: each element is squared before it is accumulated
+#define INIT_VALUE 0
+#define PROCESS_ELEM(acc, value) acc += (value) * (value)
 #else
 #error "No operation is specified"
 #endif
diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp
index 33c3b3291ce5f278b0183b3a91d5681bb638d1f2..1b0945d4127b9e9418652f77d086d2a12a6c8f51 100644
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -108,6 +108,22 @@ extern const uchar g_Saturate8u[];
 #define CV_MIN_8U(a,b)       ((a) - CV_FAST_CAST_8U((a) - (b)))
 #define CV_MAX_8U(a,b)       ((a) + CV_FAST_CAST_8U((b) - (a)))
 
+template<typename T1, typename T2=T1, typename T3=T1> struct OpNop // unary functor: converts its argument to T3 and does nothing else
+{
+    typedef T1 type1;
+    typedef T2 type2;
+    typedef T3 rtype;
+    rtype operator ()(const type1 value) const { return saturate_cast<rtype>(value); }
+};
+
+template<typename T1, typename T2=T1, typename T3=T1> struct OpSqr // unary functor: converts its argument to T3, then returns its square
+{
+    typedef T1 type1;
+    typedef T2 type2;
+    typedef T3 rtype;
+    rtype operator ()(const type1 value) const { const rtype v = saturate_cast<rtype>(value); return v*v; }
+};
+
 template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
 {
     typedef T1 type1;
@@ -116,6 +132,14 @@ template<typename T1, typename T2=T1, typename T3=T1> struct OpAdd
     T3 operator ()(const T1 a, const T2 b) const { return saturate_cast<T3>(a + b); }
 };
 
+template<typename T1, typename T2=T1, typename T3=T1> struct OpAddSqr // binary functor: adds the square of the second argument to the first
+{
+    typedef T1 type1;
+    typedef T2 type2;
+    typedef T3 rtype;
+    rtype operator ()(const type1 acc, const type2 value) const { const rtype v = saturate_cast<rtype>(value); return saturate_cast<rtype>(acc + v*v); }
+};
+
 template<typename T1, typename T2=T1, typename T3=T1> struct OpSub
 {
     typedef T1 type1;
diff --git a/modules/core/test/ocl/test_arithm.cpp b/modules/core/test/ocl/test_arithm.cpp
index 20e3a177553b1223577f17b6f0c470039240df63..da7a003f11fee82a4a3fee6cdb6cb722f8a23003 100644
--- a/modules/core/test/ocl/test_arithm.cpp
+++ b/modules/core/test/ocl/test_arithm.cpp
@@ -1873,6 +1873,22 @@ OCL_TEST_P(ReduceAvg, Mat)
     }
 }
 
+typedef Reduce ReduceSum2; // NOTE(review): needs a matching OCL_INSTANTIATE_TEST_CASE_P for ReduceSum2 - none is visible in this hunk, confirm
+
+OCL_TEST_P(ReduceSum2, Mat) // runs REDUCE_SUM2 under OCL_OFF and OCL_ON (presumably CPU vs. OpenCL) and compares the two results
+{
+    for (int iteration = 0; iteration < test_loop_times; ++iteration)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::reduce(src_roi, dst_roi, dim, REDUCE_SUM2, dtype));
+        OCL_ON(cv::reduce(usrc_roi, udst_roi, dim, REDUCE_SUM2, dtype));
+
+        const double eps = (ddepth > CV_32S) ? 6e-6 : 1; // depths up to CV_32S get a tolerance of 1, all others 6e-6
+        OCL_EXPECT_MATS_NEAR(dst, eps);
+    }
+}
+
 //////////////////////////////////////// Instantiation /////////////////////////////////////////
 
 OCL_INSTANTIATE_TEST_CASE_P(Arithm, Lut, Combine(::testing::Values(CV_8U, CV_8S), OCL_ALL_DEPTHS, OCL_ALL_CHANNELS, Bool(), Bool()));
diff --git a/modules/core/test/test_mat.cpp b/modules/core/test/test_mat.cpp
index e39b16a1e52d4202281eb24084af1cd225f93639..2d6019eac4d3e174df109f7e9c475f5478ccf5b5 100644
--- a/modules/core/test/test_mat.cpp
+++ b/modules/core/test/test_mat.cpp
@@ -26,7 +26,7 @@ protected:
 };
 
 template<class Type>
-void testReduce( const Mat& src, Mat& sum, Mat& avg, Mat& max, Mat& min, int dim )
+void testReduce( const Mat& src, Mat& sum, Mat& avg, Mat& max, Mat& min, Mat& sum2, int dim )
 {
     CV_Assert( src.channels() == 1 );
     if( dim == 0 ) // row
@@ -34,21 +34,25 @@ void testReduce( const Mat& src, Mat& sum, Mat& avg, Mat& max, Mat& min, int dim
         sum.create( 1, src.cols, CV_64FC1 );
         max.create( 1, src.cols, CV_64FC1 );
         min.create( 1, src.cols, CV_64FC1 );
+        sum2.create( 1, src.cols, CV_64FC1 );
     }
     else
     {
         sum.create( src.rows, 1, CV_64FC1 );
         max.create( src.rows, 1, CV_64FC1 );
         min.create( src.rows, 1, CV_64FC1 );
+        sum2.create( src.rows, 1, CV_64FC1 );
     }
     sum.setTo(Scalar(0));
     max.setTo(Scalar(-DBL_MAX));
     min.setTo(Scalar(DBL_MAX));
+    sum2.setTo(Scalar(0));
 
     const Mat_<Type>& src_ = src;
     Mat_<double>& sum_ = (Mat_<double>&)sum;
     Mat_<double>& min_ = (Mat_<double>&)min;
     Mat_<double>& max_ = (Mat_<double>&)max;
+    Mat_<double>& sum2_ = (Mat_<double>&)sum2;
 
     if( dim == 0 )
     {
@@ -59,6 +63,7 @@ void testReduce( const Mat& src, Mat& sum, Mat& avg, Mat& max, Mat& min, int dim
                 sum_(0, ci) += src_(ri, ci);
                 max_(0, ci) = std::max( max_(0, ci), (double)src_(ri, ci) );
                 min_(0, ci) = std::min( min_(0, ci), (double)src_(ri, ci) );
+                sum2_(0, ci) += ((double)src_(ri, ci))*((double)src_(ri, ci));
             }
         }
     }
@@ -71,6 +76,7 @@ void testReduce( const Mat& src, Mat& sum, Mat& avg, Mat& max, Mat& min, int dim
                 sum_(ri, 0) += src_(ri, ci);
                 max_(ri, 0) = std::max( max_(ri, 0), (double)src_(ri, ci) );
                 min_(ri, 0) = std::min( min_(ri, 0), (double)src_(ri, ci) );
+                sum2_(ri, 0) += ((double)src_(ri, ci))*((double)src_(ri, ci));
             }
         }
     }
@@ -93,7 +99,7 @@ int Core_ReduceTest::checkOp( const Mat& src, int dstType, int opType, const Mat
 {
     int srcType = src.type();
     bool support = false;
-    if( opType == REDUCE_SUM || opType == REDUCE_AVG )
+    if( opType == REDUCE_SUM || opType == REDUCE_AVG || opType == REDUCE_SUM2 )
     {
         if( srcType == CV_8U && (dstType == CV_32S || dstType == CV_32F || dstType == CV_64F) )
             support = true;
@@ -128,7 +134,7 @@ int Core_ReduceTest::checkOp( const Mat& src, int dstType, int opType, const Mat
         return cvtest::TS::OK;
 
     double eps = 0.0;
-    if ( opType == REDUCE_SUM || opType == REDUCE_AVG )
+    if ( opType == REDUCE_SUM || opType == REDUCE_AVG || opType == REDUCE_SUM2 )
     {
         if ( dstType == CV_32F )
             eps = 1.e-5;
@@ -152,10 +158,13 @@ int Core_ReduceTest::checkOp( const Mat& src, int dstType, int opType, const Mat
     if( check )
     {
         char msg[100];
-        const char* opTypeStr = opType == REDUCE_SUM ? "REDUCE_SUM" :
-        opType == REDUCE_AVG ? "REDUCE_AVG" :
-        opType == REDUCE_MAX ? "REDUCE_MAX" :
-        opType == REDUCE_MIN ? "REDUCE_MIN" : "unknown operation type";
+        const char* opTypeStr =
+          opType == REDUCE_SUM ? "REDUCE_SUM" :
+          opType == REDUCE_AVG ? "REDUCE_AVG" :
+          opType == REDUCE_MAX ? "REDUCE_MAX" :
+          opType == REDUCE_MIN ? "REDUCE_MIN" :
+          opType == REDUCE_SUM2 ? "REDUCE_SUM2" :
+          "unknown operation type";
         string srcTypeStr, dstTypeStr;
         getMatTypeStr( src.type(), srcTypeStr );
         getMatTypeStr( dstType, dstTypeStr );
@@ -172,25 +181,25 @@ int Core_ReduceTest::checkOp( const Mat& src, int dstType, int opType, const Mat
 int Core_ReduceTest::checkCase( int srcType, int dstType, int dim, Size sz )
 {
     int code = cvtest::TS::OK, tempCode;
-    Mat src, sum, avg, max, min;
+    Mat src, sum, avg, max, min, sum2;
 
     src.create( sz, srcType );
     randu( src, Scalar(0), Scalar(100) );
 
     if( srcType == CV_8UC1 )
-        testReduce<uchar>( src, sum, avg, max, min, dim );
+        testReduce<uchar>( src, sum, avg, max, min, sum2, dim );
     else if( srcType == CV_8SC1 )
-        testReduce<char>( src, sum, avg, max, min, dim );
+        testReduce<char>( src, sum, avg, max, min, sum2, dim );
     else if( srcType == CV_16UC1 )
-        testReduce<unsigned short int>( src, sum, avg, max, min, dim );
+        testReduce<unsigned short int>( src, sum, avg, max, min, sum2, dim );
     else if( srcType == CV_16SC1 )
-        testReduce<short int>( src, sum, avg, max, min, dim );
+        testReduce<short int>( src, sum, avg, max, min, sum2, dim );
     else if( srcType == CV_32SC1 )
-        testReduce<int>( src, sum, avg, max, min, dim );
+        testReduce<int>( src, sum, avg, max, min, sum2, dim );
     else if( srcType == CV_32FC1 )
-        testReduce<float>( src, sum, avg, max, min, dim );
+        testReduce<float>( src, sum, avg, max, min, sum2, dim );
     else if( srcType == CV_64FC1 )
-        testReduce<double>( src, sum, avg, max, min, dim );
+        testReduce<double>( src, sum, avg, max, min, sum2, dim );
     else
         CV_Assert( 0 );
 
@@ -210,6 +219,10 @@ int Core_ReduceTest::checkCase( int srcType, int dstType, int dim, Size sz )
     tempCode = checkOp( src, dstType, REDUCE_MIN, min, dim );
     code = tempCode != cvtest::TS::OK ? tempCode : code;
 
+    // 5. sum2
+    tempCode = checkOp( src, dstType, REDUCE_SUM2, sum2, dim );
+    code = tempCode != cvtest::TS::OK ? tempCode : code;
+
     return code;
 }
 
@@ -1563,6 +1576,7 @@ TEST(Reduce, regression_should_fail_bug_4594)
     EXPECT_THROW(cv::reduce(src, dst, 0, REDUCE_MAX, CV_32S), cv::Exception);
     EXPECT_NO_THROW(cv::reduce(src, dst, 0, REDUCE_SUM, CV_32S));
     EXPECT_NO_THROW(cv::reduce(src, dst, 0, REDUCE_AVG, CV_32S));
+    EXPECT_NO_THROW(cv::reduce(src, dst, 0, REDUCE_SUM2, CV_32S));
 }
 
 TEST(Mat, push_back_vector)
diff --git a/modules/core/test/test_math.cpp b/modules/core/test/test_math.cpp
index f15f9849578c183ee7a487ffba4bc4b9cacbd583..cffe1bb5378b8eac480bf517a05705d7ce0b2895 100644
--- a/modules/core/test/test_math.cpp
+++ b/modules/core/test/test_math.cpp
@@ -3018,7 +3018,7 @@ TEST(CovariationMatrixVectorOfMatWithMean, accuracy)
     cv::randu(src,cv::Scalar(-128), cv::Scalar(128));
     cv::Mat goldMean;
 
-    cv::reduce(src,goldMean,0 ,REDUCE_AVG, CV_32F);
+    cv::reduce(src, goldMean, 0, REDUCE_AVG, CV_32F);
 
     cv::calcCovarMatrix(src,gold,goldMean,singleMatFlags,CV_32F);