diff --git a/CMakeLists.txt b/CMakeLists.txt
index 05696527d59e32406c705f8785db749368a165f0..deed0a64851bd605156dbc427cf7b6dd39aa91cc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -587,6 +587,11 @@
 include(cmake/OpenCVFindMatlab.cmake)
 include(cmake/OpenCVDetectVTK.cmake)
+if (OPENCV_HAL_HEADERS AND OPENCV_HAL_LIBS)
+  get_filename_component(OPENCV_HAL_HEADERS "${OPENCV_HAL_HEADERS}" ABSOLUTE)
+  get_filename_component(OPENCV_HAL_LIBS "${OPENCV_HAL_LIBS}" ABSOLUTE)
+endif()
+
 
 # ----------------------------------------------------------------------------
 # Add CUDA libraries (needed for apps/tools, samples)
 # ----------------------------------------------------------------------------
diff --git a/cmake/templates/custom_hal.hpp.in b/cmake/templates/custom_hal.hpp.in
new file mode 100644
index 0000000000000000000000000000000000000000..b298a033ec0e8bc109e43ecd85bcbc8f88d11de2
--- /dev/null
+++ b/cmake/templates/custom_hal.hpp.in
@@ -0,0 +1,6 @@
+#ifndef _CUSTOM_HAL_INCLUDED_
+#define _CUSTOM_HAL_INCLUDED_
+
+@OPENCV_HAL_HEADERS_INCLUDES@
+
+#endif
diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp
index a3c40f56011b8a46af7b9071d46e3741ee07cc27..a8a0b23e1225b648dd0b695efcd271836bbdfb6a 100644
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -679,89 +679,8 @@ CV_EXPORTS void setUseIPP(bool flag);
 
 //! @} core_utils
 
-//! @addtogroup core_utils_neon
-//! @{
-
-#if CV_NEON
-
-inline int32x2_t cv_vrnd_s32_f32(float32x2_t v)
-{
-    static int32x2_t v_sign = vdup_n_s32(1 << 31),
-        v_05 = vreinterpret_s32_f32(vdup_n_f32(0.5f));
-
-    int32x2_t v_addition = vorr_s32(v_05, vand_s32(v_sign, vreinterpret_s32_f32(v)));
-    return vcvt_s32_f32(vadd_f32(v, vreinterpret_f32_s32(v_addition)));
-}
-
-inline int32x4_t cv_vrndq_s32_f32(float32x4_t v)
-{
-    static int32x4_t v_sign = vdupq_n_s32(1 << 31),
-        v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
-
-    int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(v)));
-    return vcvtq_s32_f32(vaddq_f32(v, vreinterpretq_f32_s32(v_addition)));
-}
-
-inline uint32x2_t cv_vrnd_u32_f32(float32x2_t v)
-{
-    static float32x2_t v_05 = vdup_n_f32(0.5f);
-    return vcvt_u32_f32(vadd_f32(v, v_05));
-}
-
-inline uint32x4_t cv_vrndq_u32_f32(float32x4_t v)
-{
-    static float32x4_t v_05 = vdupq_n_f32(0.5f);
-    return vcvtq_u32_f32(vaddq_f32(v, v_05));
-}
-
-inline float32x4_t cv_vrecpq_f32(float32x4_t val)
-{
-    float32x4_t reciprocal = vrecpeq_f32(val);
-    reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
-    reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal);
-    return reciprocal;
-}
-
-inline float32x2_t cv_vrecp_f32(float32x2_t val)
-{
-    float32x2_t reciprocal = vrecpe_f32(val);
-    reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
-    reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
-    return reciprocal;
-}
-
-inline float32x4_t cv_vrsqrtq_f32(float32x4_t val)
-{
-    float32x4_t e = vrsqrteq_f32(val);
-    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
-    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e);
-    return e;
-}
-
-inline float32x2_t cv_vrsqrt_f32(float32x2_t val)
-{
-    float32x2_t e = vrsqrte_f32(val);
-    e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
-    e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
-    return e;
-}
-
-inline float32x4_t cv_vsqrtq_f32(float32x4_t val)
-{
-    return cv_vrecpq_f32(cv_vrsqrtq_f32(val));
-}
-
-inline float32x2_t cv_vsqrt_f32(float32x2_t val)
-{
-    return 
cv_vrecp_f32(cv_vrsqrt_f32(val)); -} - -#endif - -//! @} core_utils_neon - } // cv -#include "sse_utils.hpp" +#include "opencv2/hal/neon_utils.hpp" #endif //__OPENCV_CORE_BASE_HPP__ diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp index 4d7d7df6682965e963e1650bccee658bd77420d6..b66ade5c1724ebfa7c732272da1cd518eed3d074 100644 --- a/modules/core/include/opencv2/core/utility.hpp +++ b/modules/core/include/opencv2/core/utility.hpp @@ -277,37 +277,6 @@ execution time. */ CV_EXPORTS_W int64 getCPUTickCount(); -/** @brief Available CPU features. - -remember to keep this list identical to the one in cvdef.h -*/ -enum CpuFeatures { - CPU_MMX = 1, - CPU_SSE = 2, - CPU_SSE2 = 3, - CPU_SSE3 = 4, - CPU_SSSE3 = 5, - CPU_SSE4_1 = 6, - CPU_SSE4_2 = 7, - CPU_POPCNT = 8, - - CPU_AVX = 10, - CPU_AVX2 = 11, - CPU_FMA3 = 12, - - CPU_AVX_512F = 13, - CPU_AVX_512BW = 14, - CPU_AVX_512CD = 15, - CPU_AVX_512DQ = 16, - CPU_AVX_512ER = 17, - CPU_AVX_512IFMA512 = 18, - CPU_AVX_512PF = 19, - CPU_AVX_512VBMI = 20, - CPU_AVX_512VL = 21, - - CPU_NEON = 100 -}; - /** @brief Returns true if the specified feature is supported by the host hardware. The function returns true if the host hardware supports the specified feature. When user calls diff --git a/modules/core/src/arithm.cpp b/modules/core/src/arithm.cpp index 6ad72461db7e2f9be277731dbe9d5f5ebe87520e..06cd7916e202c8fa4da4201331089fee6d4bfe59 100644 --- a/modules/core/src/arithm.cpp +++ b/modules/core/src/arithm.cpp @@ -53,1415 +53,462 @@ namespace cv { -struct NOP {}; +/****************************************************************************************\ +* logical operations * +\****************************************************************************************/ + +void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize ) +{ + int scn = (int)sc.total(), cn = CV_MAT_CN(buftype); + size_t esz = CV_ELEM_SIZE(buftype); + getConvertFunc(sc.depth(), buftype)(sc.ptr(), 1, 0, 1, scbuf, 1, Size(std::min(cn, scn), 1), 0); + // unroll the scalar + if( scn < cn ) + { + CV_Assert( scn == 1 ); + size_t esz1 = CV_ELEM_SIZE1(buftype); + for( size_t i = esz1; i < esz; i++ ) + scbuf[i] = scbuf[i - esz1]; + } + for( size_t i = esz; i < blocksize*esz; i++ ) + scbuf[i] = scbuf[i - esz]; +} -#if CV_SSE2 || CV_NEON -#define FUNCTOR_TEMPLATE(name) \ - template struct name {} +enum { OCL_OP_ADD=0, OCL_OP_SUB=1, OCL_OP_RSUB=2, OCL_OP_ABSDIFF=3, OCL_OP_MUL=4, + OCL_OP_MUL_SCALE=5, OCL_OP_DIV_SCALE=6, OCL_OP_RECIP_SCALE=7, OCL_OP_ADDW=8, + OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14, + OCL_OP_RDIV_SCALE=15 }; -FUNCTOR_TEMPLATE(VLoadStore128); -#if CV_SSE2 -FUNCTOR_TEMPLATE(VLoadStore64); -FUNCTOR_TEMPLATE(VLoadStore128Aligned); -#if CV_AVX2 -FUNCTOR_TEMPLATE(VLoadStore256); -FUNCTOR_TEMPLATE(VLoadStore256Aligned); -#endif -#endif +#ifdef HAVE_OPENCL -#endif +static const char* oclop2str[] = { "OP_ADD", "OP_SUB", "OP_RSUB", "OP_ABSDIFF", + "OP_MUL", "OP_MUL_SCALE", "OP_DIV_SCALE", "OP_RECIP_SCALE", + "OP_ADDW", "OP_AND", "OP_OR", "OP_XOR", "OP_NOT", "OP_MIN", "OP_MAX", "OP_RDIV_SCALE", 0 }; -template -void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size sz) +static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, + InputArray _mask, bool bitwise, int oclop, bool haveScalar ) { -#if CV_SSE2 || CV_NEON - VOp vop; -#endif - Op op; + bool haveMask = !_mask.empty(); + int srctype = _src1.type(); 
+ int srcdepth = CV_MAT_DEPTH(srctype); + int cn = CV_MAT_CN(srctype); - for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1), - src2 = (const T *)((const uchar *)src2 + step2), - dst = (T *)((uchar *)dst + step) ) - { - int x = 0; + const ocl::Device d = ocl::Device::getDefault(); + bool doubleSupport = d.doubleFPConfig() > 0; + if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) || + (!doubleSupport && srcdepth == CV_64F && !bitwise)) + return false; -#if CV_NEON || CV_SSE2 -#if CV_AVX2 - if( USE_AVX2 ) - { - for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) ) - { - typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); - r0 = vop(r0, VLoadStore256::load(src2 + x)); - VLoadStore256::store(dst + x, r0); - } - } -#else -#if CV_SSE2 - if( USE_SSE2 ) - { -#endif // CV_SSE2 - for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) ) - { - typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); - typename VLoadStore128::reg_type r1 = VLoadStore128::load(src1 + x + 16/sizeof(T)); - r0 = vop(r0, VLoadStore128::load(src2 + x )); - r1 = vop(r1, VLoadStore128::load(src2 + x + 16/sizeof(T))); - VLoadStore128::store(dst + x , r0); - VLoadStore128::store(dst + x + 16/sizeof(T), r1); - } -#if CV_SSE2 - } -#endif // CV_SSE2 -#endif // CV_AVX2 -#endif // CV_NEON || CV_SSE2 - -#if CV_AVX2 - // nothing -#elif CV_SSE2 - if( USE_SSE2 ) - { - for( ; x <= sz.width - 8/(int)sizeof(T); x += 8/sizeof(T) ) - { - typename VLoadStore64::reg_type r = VLoadStore64::load(src1 + x); - r = vop(r, VLoadStore64::load(src2 + x)); - VLoadStore64::store(dst + x, r); - } - } -#endif + char opts[1024]; + int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); + int scalarcn = kercn == 3 ? 4 : kercn; + int rowsPerWI = d.isIntel() ? 4 : 1; -#if CV_ENABLE_UNROLLED - for( ; x <= sz.width - 4; x += 4 ) - { - T v0 = op(src1[x], src2[x]); - T v1 = op(src1[x+1], src2[x+1]); - dst[x] = v0; dst[x+1] = v1; - v0 = op(src1[x+2], src2[x+2]); - v1 = op(src1[x+3], src2[x+3]); - dst[x+2] = v0; dst[x+3] = v1; - } -#endif + sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d -D rowsPerWI=%d", + haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop], + bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) : + ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "", + bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, 1)) : + ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)), + bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) : + ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)), + kercn, rowsPerWI); - for( ; x < sz.width; x++ ) - dst[x] = op(src1[x], src2[x]); - } -} + ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); + if (k.empty()) + return false; -template -void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size sz) -{ -#if CV_SSE2 || CV_NEON - Op32 op32; -#endif - Op op; + UMat src1 = _src1.getUMat(), src2; + UMat dst = _dst.getUMat(), mask = _mask.getUMat(); + + ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn); + ocl::KernelArg dstarg = haveMask ? 
ocl::KernelArg::ReadWrite(dst, cn, kercn) : + ocl::KernelArg::WriteOnly(dst, cn, kercn); + ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1); - for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1), - src2 = (const T *)((const uchar *)src2 + step2), - dst = (T *)((uchar *)dst + step) ) + if( haveScalar ) { - int x = 0; + size_t esz = CV_ELEM_SIZE1(srctype)*scalarcn; + double buf[4] = {0,0,0,0}; -#if CV_AVX2 - if( USE_AVX2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) - { - for( ; x <= sz.width - 8; x += 8 ) - { - typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); - r0 = op32(r0, VLoadStore256Aligned::load(src2 + x)); - VLoadStore256Aligned::store(dst + x, r0); - } - } - } -#elif CV_SSE2 - if( USE_SSE2 ) + if( oclop != OCL_OP_NOT ) { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) - { - for( ; x <= sz.width - 8; x += 8 ) - { - typename VLoadStore128Aligned::reg_type r0 = VLoadStore128Aligned::load(src1 + x ); - typename VLoadStore128Aligned::reg_type r1 = VLoadStore128Aligned::load(src1 + x + 4); - r0 = op32(r0, VLoadStore128Aligned::load(src2 + x )); - r1 = op32(r1, VLoadStore128Aligned::load(src2 + x + 4)); - VLoadStore128Aligned::store(dst + x , r0); - VLoadStore128Aligned::store(dst + x + 4, r1); - } - } + Mat src2sc = _src2.getMat(); + convertAndUnrollScalar(src2sc, srctype, (uchar*)buf, 1); } -#endif // CV_AVX2 -#if CV_NEON || CV_SSE2 -#if CV_AVX2 - if( USE_AVX2 ) - { - for( ; x <= sz.width - 8; x += 8 ) - { - typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); - r0 = op32(r0, VLoadStore256::load(src2 + x)); - VLoadStore256::store(dst + x, r0); - } - } -#else -#if CV_SSE2 - if( USE_SSE2 ) - { -#endif // CV_SSE2 - for( ; x <= sz.width - 8; x += 8 ) - { - typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); - typename VLoadStore128::reg_type r1 = VLoadStore128::load(src1 + x + 4); - r0 = op32(r0, VLoadStore128::load(src2 + x )); - r1 = op32(r1, VLoadStore128::load(src2 + x + 4)); - VLoadStore128::store(dst + x , r0); - VLoadStore128::store(dst + x + 4, r1); - } -#if CV_SSE2 - } -#endif // CV_SSE2 -#endif // CV_AVX2 -#endif // CV_NEON || CV_SSE2 + ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz); -#if CV_ENABLE_UNROLLED - for( ; x <= sz.width - 4; x += 4 ) - { - T v0 = op(src1[x], src2[x]); - T v1 = op(src1[x+1], src2[x+1]); - dst[x] = v0; dst[x+1] = v1; - v0 = op(src1[x+2], src2[x+2]); - v1 = op(src1[x+3], src2[x+3]); - dst[x+2] = v0; dst[x+3] = v1; - } -#endif + if( !haveMask ) + k.args(src1arg, dstarg, scalararg); + else + k.args(src1arg, maskarg, dstarg, scalararg); + } + else + { + src2 = _src2.getUMat(); + ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn); - for( ; x < sz.width; x++ ) - dst[x] = op(src1[x], src2[x]); + if( !haveMask ) + k.args(src1arg, src2arg, dstarg); + else + k.args(src1arg, src2arg, maskarg, dstarg); } + + size_t globalsize[] = { (size_t)src1.cols * cn / kercn, ((size_t)src1.rows + rowsPerWI - 1) / rowsPerWI }; + return k.run(2, globalsize, 0, false); } +#endif -template -void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size sz) +static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst, + InputArray _mask, const BinaryFuncC* tab, + bool bitwise, int oclop ) { -#if CV_SSE2 - Op64 op64; + const _InputArray *psrc1 = &_src1, *psrc2 = &_src2; + int kind1 = psrc1->kind(), kind2 = psrc2->kind(); + int type1 = psrc1->type(), depth1 = 
CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); + int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2); + int dims1 = psrc1->dims(), dims2 = psrc2->dims(); + Size sz1 = dims1 <= 2 ? psrc1->size() : Size(); + Size sz2 = dims2 <= 2 ? psrc2->size() : Size(); +#ifdef HAVE_OPENCL + bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) && + dims1 <= 2 && dims2 <= 2; #endif - Op op; + bool haveMask = !_mask.empty(), haveScalar = false; + BinaryFuncC func; - for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1), - src2 = (const T *)((const uchar *)src2 + step2), - dst = (T *)((uchar *)dst + step) ) + if( dims1 <= 2 && dims2 <= 2 && kind1 == kind2 && sz1 == sz2 && type1 == type2 && !haveMask ) { - int x = 0; + _dst.create(sz1, type1); + CV_OCL_RUN(use_opencl, + ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, false)) -#if CV_AVX2 - if( USE_AVX2 ) - { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) - { - for( ; x <= sz.width - 4; x += 4 ) - { - typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); - r0 = op64(r0, VLoadStore256Aligned::load(src2 + x)); - VLoadStore256Aligned::store(dst + x, r0); - } - } - } -#elif CV_SSE2 - if( USE_SSE2 ) + if( bitwise ) { - if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) - { - for( ; x <= sz.width - 4; x += 4 ) - { - typename VLoadStore128Aligned::reg_type r0 = VLoadStore128Aligned::load(src1 + x ); - typename VLoadStore128Aligned::reg_type r1 = VLoadStore128Aligned::load(src1 + x + 2); - r0 = op64(r0, VLoadStore128Aligned::load(src2 + x )); - r1 = op64(r1, VLoadStore128Aligned::load(src2 + x + 2)); - VLoadStore128Aligned::store(dst + x , r0); - VLoadStore128Aligned::store(dst + x + 2, r1); - } - } + func = *tab; + cn = (int)CV_ELEM_SIZE(type1); } -#endif + else + func = tab[depth1]; - for( ; x <= sz.width - 4; x += 4 ) + Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(); + Size sz = getContinuousSize(src1, src2, dst); + size_t len = sz.width*(size_t)cn; + if( len == (size_t)(int)len ) { - T v0 = op(src1[x], src2[x]); - T v1 = op(src1[x+1], src2[x+1]); - dst[x] = v0; dst[x+1] = v1; - v0 = op(src1[x+2], src2[x+2]); - v1 = op(src1[x+3], src2[x+3]); - dst[x+2] = v0; dst[x+3] = v1; + sz.width = (int)len; + func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, 0); + return; } - - for( ; x < sz.width; x++ ) - dst[x] = op(src1[x], src2[x]); - } -} - -#if CV_AVX2 - -#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body) \ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ - static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ - } - -#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body (p); } \ - static void store(template_arg * p, reg_type v) { store_body (p, v); } \ - } - -#define FUNCTOR_CLOSURE_2arg(name, template_arg, body) \ - template<> \ - struct name \ - { \ - VLoadStore256::reg_type operator()( \ - const VLoadStore256::reg_type & a, \ - const VLoadStore256::reg_type & b) const \ - { \ - body; \ - } \ } -#define FUNCTOR_CLOSURE_1arg(name, template_arg, body) \ - template<> \ - struct name \ - { \ - VLoadStore256::reg_type operator()( \ - const 
VLoadStore256::reg_type & a, \ - const VLoadStore256::reg_type & ) const \ - { \ - body; \ - } \ + if( oclop == OCL_OP_NOT ) + haveScalar = true; + else if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 || + !psrc1->sameSize(*psrc2) || type1 != type2 ) + { + if( checkScalar(*psrc1, type2, kind1, kind2) ) + { + // src1 is a scalar; swap it with src2 + swap(psrc1, psrc2); + swap(type1, type2); + swap(depth1, depth2); + swap(cn, cn2); + swap(sz1, sz2); + } + else if( !checkScalar(*psrc2, type1, kind2, kind1) ) + CV_Error( CV_StsUnmatchedSizes, + "The operation is neither 'array op array' (where arrays have the same size and type), " + "nor 'array op scalar', nor 'scalar op array'" ); + haveScalar = true; } - -FUNCTOR_LOADSTORE_CAST(VLoadStore256, uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE_CAST(VLoadStore256, int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); -FUNCTOR_LOADSTORE( VLoadStore256, float, __m256 , _mm256_loadu_ps , _mm256_storeu_ps ); -FUNCTOR_LOADSTORE( VLoadStore256, double, __m256d, _mm256_loadu_pd , _mm256_storeu_pd ); - -FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned, int, __m256i, _mm256_load_si256, _mm256_store_si256); -FUNCTOR_LOADSTORE( VLoadStore256Aligned, float, __m256 , _mm256_load_ps , _mm256_store_ps ); -FUNCTOR_LOADSTORE( VLoadStore256Aligned, double, __m256d, _mm256_load_pd , _mm256_store_pd ); - -FUNCTOR_TEMPLATE(VAdd); -FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm256_adds_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm256_adds_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm256_add_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd (a, b)); - -FUNCTOR_TEMPLATE(VSub); -FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm256_subs_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm256_subs_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, short, return _mm256_subs_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, int, return _mm256_sub_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, float, return _mm256_sub_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd (a, b)); - -FUNCTOR_TEMPLATE(VMin); -FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm256_min_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, schar, return _mm256_min_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, short, return _mm256_min_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, int, return _mm256_min_epi32(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, float, return _mm256_min_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd (a, b)); - -FUNCTOR_TEMPLATE(VMax); -FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm256_max_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, schar, return _mm256_max_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, short, return _mm256_max_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, int, return _mm256_max_epi32(a, b)); 
-FUNCTOR_CLOSURE_2arg(VMax, float, return _mm256_max_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd (a, b)); - - -static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, - 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; -static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff, - 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; - -FUNCTOR_TEMPLATE(VAbsDiff); -FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, - return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, - __m256i d = _mm256_subs_epi8(a, b); - __m256i m = _mm256_cmpgt_epi8(b, a); - return _mm256_subs_epi8(_mm256_xor_si256(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, - return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, short, - __m256i M = _mm256_max_epi16(a, b); - __m256i m = _mm256_min_epi16(a, b); - return _mm256_subs_epi16(M, m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, int, - __m256i d = _mm256_sub_epi32(a, b); - __m256i m = _mm256_cmpgt_epi32(b, a); - return _mm256_sub_epi32(_mm256_xor_si256(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, float, - return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, double, - return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask); - ); - -FUNCTOR_TEMPLATE(VAnd); -FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b)); -FUNCTOR_TEMPLATE(VOr); -FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b)); -FUNCTOR_TEMPLATE(VXor); -FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b)); -FUNCTOR_TEMPLATE(VNot); -FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a)); - -#elif CV_SSE2 - -#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ - static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ + else + { + CV_Assert( psrc1->sameSize(*psrc2) && type1 == type2 ); } -#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body (p); } \ - static void store(template_arg * p, reg_type v) { store_body (p, v); } \ - } + size_t esz = CV_ELEM_SIZE(type1); + size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz; + BinaryFunc copymask = 0; + bool reallocate = false; -#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - const VLoadStore128::reg_type & a, \ - const VLoadStore128::reg_type & b) const \ - { \ - body; \ - } \ + if( haveMask ) + { + int mtype = _mask.type(); + CV_Assert( (mtype == CV_8U || mtype == CV_8S) && _mask.sameSize(*psrc1)); + copymask = getCopyMaskFunc(esz); + reallocate = !_dst.sameSize(*psrc1) || _dst.type() != type1; } -#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - const VLoadStore128::reg_type & a, \ - const VLoadStore128::reg_type & ) const \ - { \ - body; \ - } \ - } + AutoBuffer _buf; + uchar *scbuf = 0, *maskbuf = 0; -FUNCTOR_LOADSTORE_CAST(VLoadStore128, uchar, __m128i, 
_mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, schar, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, short, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE_CAST(VLoadStore128, int, __m128i, _mm_loadu_si128, _mm_storeu_si128); -FUNCTOR_LOADSTORE( VLoadStore128, float, __m128 , _mm_loadu_ps , _mm_storeu_ps ); -FUNCTOR_LOADSTORE( VLoadStore128, double, __m128d, _mm_loadu_pd , _mm_storeu_pd ); - -FUNCTOR_LOADSTORE_CAST(VLoadStore64, uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); -FUNCTOR_LOADSTORE_CAST(VLoadStore64, schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); -FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64); -FUNCTOR_LOADSTORE_CAST(VLoadStore64, short, __m128i, _mm_loadl_epi64, _mm_storel_epi64); - -FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned, int, __m128i, _mm_load_si128, _mm_store_si128); -FUNCTOR_LOADSTORE( VLoadStore128Aligned, float, __m128 , _mm_load_ps , _mm_store_ps ); -FUNCTOR_LOADSTORE( VLoadStore128Aligned, double, __m128d, _mm_load_pd , _mm_store_pd ); - -FUNCTOR_TEMPLATE(VAdd); -FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm_adds_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm_adds_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm_adds_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm_add_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm_add_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd (a, b)); - -FUNCTOR_TEMPLATE(VSub); -FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm_subs_epu8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm_subs_epi8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, short, return _mm_subs_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, int, return _mm_sub_epi32 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, float, return _mm_sub_ps (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd (a, b)); - -FUNCTOR_TEMPLATE(VMin); -FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, schar, - __m128i m = _mm_cmpgt_epi8(a, b); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b))); -FUNCTOR_CLOSURE_2arg(VMin, short, return _mm_min_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, int, - __m128i m = _mm_cmpgt_epi32(a, b); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b)); - -FUNCTOR_TEMPLATE(VMax); -FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, schar, - __m128i m = _mm_cmpgt_epi8(b, a); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b)); -FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, int, - __m128i m = _mm_cmpgt_epi32(b, a); - return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); - ); -FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b)); - - -static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff 
}; -static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; - -FUNCTOR_TEMPLATE(VAbsDiff); -FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, - return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, - __m128i d = _mm_subs_epi8(a, b); - __m128i m = _mm_cmpgt_epi8(b, a); - return _mm_subs_epi8(_mm_xor_si128(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, - return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, short, - __m128i M = _mm_max_epi16(a, b); - __m128i m = _mm_min_epi16(a, b); - return _mm_subs_epi16(M, m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, int, - __m128i d = _mm_sub_epi32(a, b); - __m128i m = _mm_cmpgt_epi32(b, a); - return _mm_sub_epi32(_mm_xor_si128(d, m), m); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, float, - return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask); - ); -FUNCTOR_CLOSURE_2arg(VAbsDiff, double, - return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask); - ); - -FUNCTOR_TEMPLATE(VAnd); -FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b)); -FUNCTOR_TEMPLATE(VOr); -FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b)); -FUNCTOR_TEMPLATE(VXor); -FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b)); -FUNCTOR_TEMPLATE(VNot); -FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a)); -#endif + _dst.createSameSize(*psrc1, type1); + // if this is mask operation and dst has been reallocated, + // we have to clear the destination + if( haveMask && reallocate ) + _dst.setTo(0.); -#if CV_NEON + CV_OCL_RUN(use_opencl, + ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, haveScalar)) -#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ - template <> \ - struct name{ \ - typedef register_type reg_type; \ - static reg_type load(const template_arg * p) { return load_body (p);}; \ - static void store(template_arg * p, reg_type v) { store_body (p, v);}; \ - } -#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - VLoadStore128::reg_type a, \ - VLoadStore128::reg_type b) const \ - { \ - return body; \ - }; \ - } + Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(); + Mat dst = _dst.getMat(), mask = _mask.getMat(); -#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ - template<> \ - struct name \ - { \ - VLoadStore128::reg_type operator()( \ - VLoadStore128::reg_type a, \ - VLoadStore128::reg_type ) const \ - { \ - return body; \ - }; \ + if( bitwise ) + { + func = *tab; + cn = (int)esz; } + else + func = tab[depth1]; -FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 ); -FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 ); -FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, vst1q_u16); -FUNCTOR_LOADSTORE(VLoadStore128, short, int16x8_t, vld1q_s16, vst1q_s16); -FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32); -FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32); - -FUNCTOR_TEMPLATE(VAdd); -FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b)); - -FUNCTOR_TEMPLATE(VSub); 
-FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b)); -FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b)); - -FUNCTOR_TEMPLATE(VMin); -FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b)); -FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b)); - -FUNCTOR_TEMPLATE(VMax); -FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b)); -FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b)); -FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b)); - -FUNCTOR_TEMPLATE(VAbsDiff); -FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b)); -FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b))); -FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b)); -FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b))); -FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b)); -FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b)); - -FUNCTOR_TEMPLATE(VAnd); -FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b)); -FUNCTOR_TEMPLATE(VOr); -FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b)); -FUNCTOR_TEMPLATE(VXor); -FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b)); -FUNCTOR_TEMPLATE(VNot); -FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a )); -#endif + if( !haveScalar ) + { + const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 }; + uchar* ptrs[4]; -#if CV_SSE2 || CV_NEON -#define IF_SIMD(op) op -#else -#define IF_SIMD(op) NOP -#endif + NAryMatIterator it(arrays, ptrs); + size_t total = it.size, blocksize = total; -template<> inline uchar OpAdd::operator ()(uchar a, uchar b) const -{ return CV_FAST_CAST_8U(a + b); } -template<> inline uchar OpSub::operator ()(uchar a, uchar b) const -{ return CV_FAST_CAST_8U(a - b); } + if( blocksize*cn > INT_MAX ) + blocksize = INT_MAX/cn; -template struct OpAbsDiff -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()(T a, T b) const { return (T)std::abs(a - b); } -}; - -template<> inline short OpAbsDiff::operator ()(short a, short b) const -{ return saturate_cast(std::abs(a - b)); } - -template<> inline schar OpAbsDiff::operator ()(schar a, schar b) const -{ return saturate_cast(std::abs(a - b)); } - -template struct OpAbsDiffS -{ - typedef T type1; - typedef WT type2; - typedef T rtype; - T operator()(T a, WT b) const { return saturate_cast(std::abs(a - b)); } -}; - -template struct OpAnd -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T b ) const { return a & b; } -}; - -template struct OpOr -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T b ) const { return a | b; } -}; - -template struct OpXor -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T b ) const { return a ^ b; } -}; - -template struct OpNot -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator()( T a, T ) const { return ~a; } -}; - -#if (ARITHM_USE_IPP == 1) -static inline void fixSteps(Size sz, size_t elemSize, size_t& step1, size_t& step2, size_t& step) -{ - if( sz.height == 1 ) - step1 = step2 = step = 
sz.width*elemSize; -} -#endif - -static void add8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAdd_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0)) + if( haveMask ) { - CV_IMPL_ADD(CV_IMPL_IPP); - return; + blocksize = std::min(blocksize, blocksize0); + _buf.allocate(blocksize*esz); + maskbuf = _buf; } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz)); -} -static void add8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* ) -{ - vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz); -} - -static void add16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAdd_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0)) + for( size_t i = 0; i < it.nplanes; i++, ++it ) { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz)); -} + for( size_t j = 0; j < total; j += blocksize ) + { + int bsz = (int)MIN(total - j, blocksize); -static void add16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAdd_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; + func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, bsz*cn, 1, 0 ); + if( haveMask ) + { + copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz ); + ptrs[3] += bsz; + } + + bsz *= (int)esz; + ptrs[0] += bsz; ptrs[1] += bsz; ptrs[2] += bsz; + } } - setIppErrorStatus(); } -#endif - (vBinOp, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz)); -} + else + { + const Mat* arrays[] = { &src1, &dst, &mask, 0 }; + uchar* ptrs[3]; -static void add32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* ) -{ - vBinOp32, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz); -} + NAryMatIterator it(arrays, ptrs); + size_t total = it.size, blocksize = std::min(total, blocksize0); -static void add32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAdd_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp32, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz)); -} + _buf.allocate(blocksize*(haveMask ? 
2 : 1)*esz + 32); + scbuf = _buf; + maskbuf = alignPtr(scbuf + blocksize*esz, 16); -static void add64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* ) -{ - vBinOp64, IF_SIMD(VAdd)>(src1, step1, src2, step2, dst, step, sz); -} + convertAndUnrollScalar( src2, src1.type(), scbuf, blocksize); -static void sub8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiSub_8u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0)) + for( size_t i = 0; i < it.nplanes; i++, ++it ) { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz)); -} + for( size_t j = 0; j < total; j += blocksize ) + { + int bsz = (int)MIN(total - j, blocksize); -static void sub8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* ) -{ - vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz); -} + func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, bsz*cn, 1, 0 ); + if( haveMask ) + { + copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz ); + ptrs[2] += bsz; + } -static void sub16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiSub_16u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; + bsz *= (int)esz; + ptrs[0] += bsz; ptrs[1] += bsz; + } } - setIppErrorStatus(); } -#endif - (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz)); } -static void sub16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* ) +static BinaryFuncC* getMaxTab() { -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() + static BinaryFuncC maxTab[] = { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiSub_16s_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz)); -} + (BinaryFuncC)GET_OPTIMIZED(cv::hal::max8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::max8s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::max16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::max16s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::max32s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::max32f), (BinaryFuncC)cv::hal::max64f, + 0 + }; -static void sub32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* ) -{ - vBinOp32, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz); + return maxTab; } -static void sub32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* ) +static BinaryFuncC* getMinTab() { -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() + static BinaryFuncC minTab[] = { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiSub_32f_C1R(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp32, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, 
sz)); -} + (BinaryFuncC)GET_OPTIMIZED(cv::hal::min8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::min8s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::min16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::min16s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::min32s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::min32f), (BinaryFuncC)cv::hal::min64f, + 0 + }; -static void sub64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* ) -{ - vBinOp64, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, sz); + return minTab; } -template<> inline uchar OpMin::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); } -template<> inline uchar OpMax::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); } - -static void max8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - uchar* s1 = (uchar*)src1; - uchar* s2 = (uchar*)src2; - uchar* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMaxEvery_8u(s1, s2, d, sz.width)) - break; - s1 += step1; - s2 += step2; - d += step; - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); } -static void max8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* ) +void cv::bitwise_and(InputArray a, InputArray b, OutputArray c, InputArray mask) { - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); + BinaryFuncC f = (BinaryFuncC)GET_OPTIMIZED(cv::hal::and8u); + binary_op(a, b, c, mask, &f, true, OCL_OP_AND); } -static void max16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* ) +void cv::bitwise_or(InputArray a, InputArray b, OutputArray c, InputArray mask) { -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - ushort* s1 = (ushort*)src1; - ushort* s2 = (ushort*)src2; - ushort* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMaxEvery_16u(s1, s2, d, sz.width)) - break; - s1 = (ushort*)((uchar*)s1 + step1); - s2 = (ushort*)((uchar*)s2 + step2); - d = (ushort*)((uchar*)d + step); - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); + BinaryFuncC f = (BinaryFuncC)GET_OPTIMIZED(cv::hal::or8u); + binary_op(a, b, c, mask, &f, true, OCL_OP_OR); } -static void max16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* ) +void cv::bitwise_xor(InputArray a, InputArray b, OutputArray c, InputArray mask) { - vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); + BinaryFuncC f = (BinaryFuncC)GET_OPTIMIZED(cv::hal::xor8u); + binary_op(a, b, c, mask, &f, true, OCL_OP_XOR); } -static void max32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* ) +void cv::bitwise_not(InputArray a, OutputArray c, InputArray mask) { - vBinOp32, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); + BinaryFuncC f = (BinaryFuncC)GET_OPTIMIZED(cv::hal::not8u); + binary_op(a, a, c, mask, &f, true, OCL_OP_NOT); } -static void max32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, 
size_t step, Size sz, void* ) +void cv::max( InputArray src1, InputArray src2, OutputArray dst ) { -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - float* s1 = (float*)src1; - float* s2 = (float*)src2; - float* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMaxEvery_32f(s1, s2, d, sz.width)) - break; - s1 = (float*)((uchar*)s1 + step1); - s2 = (float*)((uchar*)s2 + step2); - d = (float*)((uchar*)d + step); - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp32, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); + binary_op(src1, src2, dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); } -static void max64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* ) +void cv::min( InputArray src1, InputArray src2, OutputArray dst ) { -#if ARITHM_USE_IPP == 1 - CV_IPP_CHECK() - { - double* s1 = (double*)src1; - double* s2 = (double*)src2; - double* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMaxEvery_64f(s1, s2, d, sz.width)) - break; - s1 = (double*)((uchar*)s1 + step1); - s2 = (double*)((uchar*)s2 + step2); - d = (double*)((uchar*)d + step); - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp64, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, sz); + binary_op(src1, src2, dst, noArray(), getMinTab(), false, OCL_OP_MIN ); } -static void min8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) +void cv::max(const Mat& src1, const Mat& src2, Mat& dst) { -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - uchar* s1 = (uchar*)src1; - uchar* s2 = (uchar*)src2; - uchar* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMinEvery_8u(s1, s2, d, sz.width)) - break; - s1 += step1; - s2 += step2; - d += step; - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); + OutputArray _dst(dst); + binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); } -static void min8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* ) +void cv::min(const Mat& src1, const Mat& src2, Mat& dst) { - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); + OutputArray _dst(dst); + binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN ); } -static void min16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* ) +void cv::max(const UMat& src1, const UMat& src2, UMat& dst) { -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - ushort* s1 = (ushort*)src1; - ushort* s2 = (ushort*)src2; - ushort* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMinEvery_16u(s1, s2, d, sz.width)) - break; - s1 = (ushort*)((uchar*)s1 + step1); - s2 = (ushort*)((uchar*)s2 + step2); - d = (ushort*)((uchar*)d + step); - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); + OutputArray _dst(dst); + binary_op(src1, src2, _dst, 
noArray(), getMaxTab(), false, OCL_OP_MAX ); } -static void min16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* ) +void cv::min(const UMat& src1, const UMat& src2, UMat& dst) { - vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); + OutputArray _dst(dst); + binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN ); } -static void min32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* ) -{ - vBinOp32, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); -} -static void min32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - float* s1 = (float*)src1; - float* s2 = (float*)src2; - float* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMinEvery_32f(s1, s2, d, sz.width)) - break; - s1 = (float*)((uchar*)s1 + step1); - s2 = (float*)((uchar*)s2 + step2); - d = (float*)((uchar*)d + step); - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp32, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); -} +/****************************************************************************************\ +* add/subtract * +\****************************************************************************************/ -static void min64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* ) +namespace cv { -#if ARITHM_USE_IPP == 1 - CV_IPP_CHECK() - { - double* s1 = (double*)src1; - double* s2 = (double*)src2; - double* d = dst; - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - int i = 0; - for(; i < sz.height; i++) - { - if (0 > ippsMinEvery_64f(s1, s2, d, sz.width)) - break; - s1 = (double*)((uchar*)s1 + step1); - s2 = (double*)((uchar*)s2 + step2); - d = (double*)((uchar*)d + step); - } - if (i == sz.height) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - vBinOp64, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, sz); -} -static void absdiff8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) +static int actualScalarDepth(const double* data, int len) { -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() + int i = 0, minval = INT_MAX, maxval = INT_MIN; + for(; i < len; ++i) { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAbsDiff_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); + int ival = cvRound(data[i]); + if( ival != data[i] ) + break; + minval = MIN(minval, ival); + maxval = MAX(maxval, ival); } -#endif - (vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz)); + return i < len ? CV_64F : + minval >= 0 && maxval <= (int)UCHAR_MAX ? CV_8U : + minval >= (int)SCHAR_MIN && maxval <= (int)SCHAR_MAX ? CV_8S : + minval >= 0 && maxval <= (int)USHRT_MAX ? CV_16U : + minval >= (int)SHRT_MIN && maxval <= (int)SHRT_MAX ? 
CV_16S : + CV_32S; } -static void absdiff8s( const schar* src1, size_t step1, - const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* ) -{ - vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz); -} +#ifdef HAVE_OPENCL -static void absdiff16u( const ushort* src1, size_t step1, - const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* ) +static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, + InputArray _mask, int wtype, + void* usrdata, int oclop, + bool haveScalar ) { -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAbsDiff_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz)); -} - -static void absdiff16s( const short* src1, size_t step1, - const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* ) -{ - vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz); -} - -static void absdiff32s( const int* src1, size_t step1, - const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* ) -{ - vBinOp32, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz); -} - -static void absdiff32f( const float* src1, size_t step1, - const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAbsDiff_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp32, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz)); -} - -static void absdiff64f( const double* src1, size_t step1, - const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* ) -{ - vBinOp64, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, sz); -} - + const ocl::Device d = ocl::Device::getDefault(); + bool doubleSupport = d.doubleFPConfig() > 0; + int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); + bool haveMask = !_mask.empty(); -static void and8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiAnd_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VAnd)>(src1, step1, src2, step2, dst, step, sz)); -} + if ( (haveMask || haveScalar) && cn > 4 ) + return false; -static void or8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiOr_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VOr)>(src1, step1, src2, step2, dst, step, sz)); -} + int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32S, CV_MAT_DEPTH(wtype)); + if (!doubleSupport) + wdepth = std::min(wdepth, CV_32F); -static void xor8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* 
dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiXor_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VXor)>(src1, step1, src2, step2, dst, step, sz)); -} + wtype = CV_MAKETYPE(wdepth, cn); + int type2 = haveScalar ? wtype : _src2.type(), depth2 = CV_MAT_DEPTH(type2); + if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F)) + return false; -static void not8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* ) -{ -#if (ARITHM_USE_IPP == 1) - CV_IPP_CHECK() - { - fixSteps(sz, sizeof(dst[0]), step1, step2, step); (void)src2; - if (0 <= ippiNot_8u_C1R(src1, (int)step1, dst, (int)step, ippiSize(sz))) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } -#endif - (vBinOp, IF_SIMD(VNot)>(src1, step1, src2, step2, dst, step, sz)); -} + int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); + int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1; -/****************************************************************************************\ -* logical operations * -\****************************************************************************************/ + char cvtstr[4][32], opts[1024]; + sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s " + "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s " + "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d -D convertFromU=%s", + (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), + oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)), + ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)), + ocl::typeToStr(depth2), ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)), + ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)), + ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)), + ocl::typeToStr(wdepth), wdepth, + ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]), + ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]), + ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]), + doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI, + oclop == OCL_OP_ABSDIFF && wdepth == CV_32S && ddepth == wdepth ? + ocl::convertTypeStr(CV_8U, ddepth, kercn, cvtstr[3]) : "noconvert"); -void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize ) -{ - int scn = (int)sc.total(), cn = CV_MAT_CN(buftype); - size_t esz = CV_ELEM_SIZE(buftype); - getConvertFunc(sc.depth(), buftype)(sc.ptr(), 1, 0, 1, scbuf, 1, Size(std::min(cn, scn), 1), 0); - // unroll the scalar - if( scn < cn ) + size_t usrdata_esz = CV_ELEM_SIZE(wdepth); + const uchar* usrdata_p = (const uchar*)usrdata; + const double* usrdata_d = (const double*)usrdata; + float usrdata_f[3]; + int i, n = oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE || + oclop == OCL_OP_RDIV_SCALE || oclop == OCL_OP_RECIP_SCALE ? 1 : oclop == OCL_OP_ADDW ? 
3 : 0; + if( n > 0 && wdepth == CV_32F ) { - CV_Assert( scn == 1 ); - size_t esz1 = CV_ELEM_SIZE1(buftype); - for( size_t i = esz1; i < esz; i++ ) - scbuf[i] = scbuf[i - esz1]; + for( i = 0; i < n; i++ ) + usrdata_f[i] = (float)usrdata_d[i]; + usrdata_p = (const uchar*)usrdata_f; } - for( size_t i = esz; i < blocksize*esz; i++ ) - scbuf[i] = scbuf[i - esz]; -} - - -enum { OCL_OP_ADD=0, OCL_OP_SUB=1, OCL_OP_RSUB=2, OCL_OP_ABSDIFF=3, OCL_OP_MUL=4, - OCL_OP_MUL_SCALE=5, OCL_OP_DIV_SCALE=6, OCL_OP_RECIP_SCALE=7, OCL_OP_ADDW=8, - OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14, - OCL_OP_RDIV_SCALE=15 }; - -#ifdef HAVE_OPENCL - -static const char* oclop2str[] = { "OP_ADD", "OP_SUB", "OP_RSUB", "OP_ABSDIFF", - "OP_MUL", "OP_MUL_SCALE", "OP_DIV_SCALE", "OP_RECIP_SCALE", - "OP_ADDW", "OP_AND", "OP_OR", "OP_XOR", "OP_NOT", "OP_MIN", "OP_MAX", "OP_RDIV_SCALE", 0 }; - -static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, - InputArray _mask, bool bitwise, int oclop, bool haveScalar ) -{ - bool haveMask = !_mask.empty(); - int srctype = _src1.type(); - int srcdepth = CV_MAT_DEPTH(srctype); - int cn = CV_MAT_CN(srctype); - - const ocl::Device d = ocl::Device::getDefault(); - bool doubleSupport = d.doubleFPConfig() > 0; - if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) || - (!doubleSupport && srcdepth == CV_64F && !bitwise)) - return false; - - char opts[1024]; - int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); - int scalarcn = kercn == 3 ? 4 : kercn; - int rowsPerWI = d.isIntel() ? 4 : 1; - - sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d -D rowsPerWI=%d", - haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop], - bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) : - ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "", - bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, 1)) : - ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)), - bitwise ? 
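// Side note on the usrdata handling just above: the scale factor (for the
// *_SCALE ops) or the three addWeighted coefficients arrive as doubles, and
// when the kernel's work depth is CV_32F they are narrowed to float before
// being bound as raw ocl::KernelArg buffers, so a device without double
// support never sees a 64-bit value. The narrowing itself is nothing more than
// the hypothetical helper below:

static void narrow_usrdata(const double* src, float* dst, int n)
{
    // n is 1 for the *_SCALE operations and 3 for addWeighted (alpha, beta, gamma)
    for (int i = 0; i < n; i++)
        dst[i] = (float)src[i];
}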
ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) : - ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)), - kercn, rowsPerWI); ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); if (k.empty()) @@ -1477,19 +524,24 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, if( haveScalar ) { - size_t esz = CV_ELEM_SIZE1(srctype)*scalarcn; - double buf[4] = {0,0,0,0}; - - if( oclop != OCL_OP_NOT ) - { - Mat src2sc = _src2.getMat(); - convertAndUnrollScalar(src2sc, srctype, (uchar*)buf, 1); - } + size_t esz = CV_ELEM_SIZE1(wtype)*scalarcn; + double buf[4]={0,0,0,0}; + Mat src2sc = _src2.getMat(); + if( !src2sc.empty() ) + convertAndUnrollScalar(src2sc, wtype, (uchar*)buf, 1); ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz); if( !haveMask ) - k.args(src1arg, dstarg, scalararg); + { + if(n == 0) + k.args(src1arg, dstarg, scalararg); + else if(n == 1) + k.args(src1arg, dstarg, scalararg, + ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz)); + else + CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters"); + } else k.args(src1arg, maskarg, dstarg, scalararg); } @@ -1499,121 +551,176 @@ static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst, ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn); if( !haveMask ) - k.args(src1arg, src2arg, dstarg); + { + if (n == 0) + k.args(src1arg, src2arg, dstarg); + else if (n == 1) + k.args(src1arg, src2arg, dstarg, + ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz)); + else if (n == 3) + k.args(src1arg, src2arg, dstarg, + ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz), + ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz), + ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz)); + else + CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters"); + } else k.args(src1arg, src2arg, maskarg, dstarg); } size_t globalsize[] = { (size_t)src1.cols * cn / kercn, ((size_t)src1.rows + rowsPerWI - 1) / rowsPerWI }; - return k.run(2, globalsize, 0, false); + return k.run(2, globalsize, NULL, false); } #endif -static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst, - InputArray _mask, const BinaryFunc* tab, - bool bitwise, int oclop ) +static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, + InputArray _mask, int dtype, BinaryFuncC* tab, bool muldiv=false, + void* usrdata=0, int oclop=-1 ) { const _InputArray *psrc1 = &_src1, *psrc2 = &_src2; int kind1 = psrc1->kind(), kind2 = psrc2->kind(); + bool haveMask = !_mask.empty(); + bool reallocate = false; int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2); - int dims1 = psrc1->dims(), dims2 = psrc2->dims(); + int wtype, dims1 = psrc1->dims(), dims2 = psrc2->dims(); Size sz1 = dims1 <= 2 ? psrc1->size() : Size(); Size sz2 = dims2 <= 2 ? 
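// The same-size/same-type fast path above collapses the matrices with
// getContinuousSize() and hands the whole (possibly single-row) block to the
// row-based function from the table in one call. A compact stand-alone sketch
// of that idea, assuming the OpenCV headers; add8u_rows is a hypothetical row
// kernel, not the real HAL entry point:

#include <opencv2/core.hpp>

static void add8u_rows(const unsigned char* a, const unsigned char* b,
                       unsigned char* c, int len)
{
    for (int i = 0; i < len; i++)
        c[i] = cv::saturate_cast<unsigned char>(a[i] + b[i]);
}

static void add8u_fast(const cv::Mat& a, const cv::Mat& b, cv::Mat& c)
{
    CV_Assert(a.type() == CV_8UC1 && a.type() == b.type() &&
              a.rows == b.rows && a.cols == b.cols);
    c.create(a.rows, a.cols, a.type());

    // If all matrices are continuous, treat them as one long row.
    int rows = a.rows, len = a.cols;
    if (a.isContinuous() && b.isContinuous() && c.isContinuous())
    {
        len *= rows;
        rows = 1;
    }
    for (int y = 0; y < rows; y++)
        add8u_rows(a.ptr<unsigned char>(y), b.ptr<unsigned char>(y),
                   c.ptr<unsigned char>(y), len);
}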
psrc2->size() : Size(); #ifdef HAVE_OPENCL - bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) && - dims1 <= 2 && dims2 <= 2; + bool use_opencl = OCL_PERFORMANCE_CHECK(_dst.isUMat()) && dims1 <= 2 && dims2 <= 2; #endif - bool haveMask = !_mask.empty(), haveScalar = false; - BinaryFunc func; + bool src1Scalar = checkScalar(*psrc1, type2, kind1, kind2); + bool src2Scalar = checkScalar(*psrc2, type1, kind2, kind1); - if( dims1 <= 2 && dims2 <= 2 && kind1 == kind2 && sz1 == sz2 && type1 == type2 && !haveMask ) + if( (kind1 == kind2 || cn == 1) && sz1 == sz2 && dims1 <= 2 && dims2 <= 2 && type1 == type2 && + !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == depth1)) || + (_dst.fixedType() && _dst.type() == type1)) && + ((src1Scalar && src2Scalar) || (!src1Scalar && !src2Scalar)) ) { - _dst.create(sz1, type1); + _dst.createSameSize(*psrc1, type1); CV_OCL_RUN(use_opencl, - ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, false)) - - if( bitwise ) - { - func = *tab; - cn = (int)CV_ELEM_SIZE(type1); - } - else - func = tab[depth1]; + ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, + (!usrdata ? type1 : std::max(depth1, CV_32F)), + usrdata, oclop, false)) Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(); - Size sz = getContinuousSize(src1, src2, dst); - size_t len = sz.width*(size_t)cn; - if( len == (size_t)(int)len ) - { - sz.width = (int)len; - func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, 0); - return; - } + Size sz = getContinuousSize(src1, src2, dst, src1.channels()); + tab[depth1](src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, usrdata); + return; } - if( oclop == OCL_OP_NOT ) - haveScalar = true; - else if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 || - !psrc1->sameSize(*psrc2) || type1 != type2 ) + bool haveScalar = false, swapped12 = false; + + if( dims1 != dims2 || sz1 != sz2 || cn != cn2 || + (kind1 == _InputArray::MATX && (sz1 == Size(1,4) || sz1 == Size(1,1))) || + (kind2 == _InputArray::MATX && (sz2 == Size(1,4) || sz2 == Size(1,1))) ) { if( checkScalar(*psrc1, type2, kind1, kind2) ) { // src1 is a scalar; swap it with src2 swap(psrc1, psrc2); + swap(sz1, sz2); swap(type1, type2); swap(depth1, depth2); swap(cn, cn2); - swap(sz1, sz2); + swap(dims1, dims2); + swapped12 = true; + if( oclop == OCL_OP_SUB ) + oclop = OCL_OP_RSUB; + if ( oclop == OCL_OP_DIV_SCALE ) + oclop = OCL_OP_RDIV_SCALE; } else if( !checkScalar(*psrc2, type1, kind2, kind1) ) CV_Error( CV_StsUnmatchedSizes, - "The operation is neither 'array op array' (where arrays have the same size and type), " - "nor 'array op scalar', nor 'scalar op array'" ); + "The operation is neither 'array op array' " + "(where arrays have the same size and the same number of channels), " + "nor 'array op scalar', nor 'scalar op array'" ); haveScalar = true; + CV_Assert(type2 == CV_64F && (sz2.height == 1 || sz2.height == 4)); + + if (!muldiv) + { + Mat sc = psrc2->getMat(); + depth2 = actualScalarDepth(sc.ptr(), cn); + if( depth2 == CV_64F && (depth1 < CV_32S || depth1 == CV_32F) ) + depth2 = CV_32F; + } + else + depth2 = CV_64F; + } + + if( dtype < 0 ) + { + if( _dst.fixedType() ) + dtype = _dst.type(); + else + { + if( !haveScalar && type1 != type2 ) + CV_Error(CV_StsBadArg, + "When the input arrays in add/subtract/multiply/divide functions have different types, " + "the output array type must be explicitly specified"); + dtype = type1; + } + } + dtype = 
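// Why the SUB -> RSUB / DIV_SCALE -> RDIV_SCALE switch above is needed: when
// the *first* operand is the scalar, the operands are swapped so that src1 is
// always the array, and the op code is reversed so the kernel still computes
// "scalar - array" (or "scalar / array") rather than the other way round.
// A tiny scalar-only illustration (SketchOp/apply are hypothetical names):

enum SketchOp { SKETCH_OP_SUB, SKETCH_OP_RSUB };

static int apply(SketchOp op, int array_elem, int scalar)
{
    // SKETCH_OP_SUB computes array - scalar, SKETCH_OP_RSUB computes scalar - array.
    return op == SKETCH_OP_SUB ? array_elem - scalar : scalar - array_elem;
}

// apply(SKETCH_OP_SUB, 12, 5)  == 7    ("array - 5" after the operand swap)
// apply(SKETCH_OP_RSUB, 12, 5) == -7   ("5 - array" after the operand swap)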
CV_MAT_DEPTH(dtype); + + if( depth1 == depth2 && dtype == depth1 ) + wtype = dtype; + else if( !muldiv ) + { + wtype = depth1 <= CV_8S && depth2 <= CV_8S ? CV_16S : + depth1 <= CV_32S && depth2 <= CV_32S ? CV_32S : std::max(depth1, depth2); + wtype = std::max(wtype, dtype); + + // when the result of addition should be converted to an integer type, + // and just one of the input arrays is floating-point, it makes sense to convert that input to integer type before the operation, + // instead of converting the other input to floating-point and then converting the operation result back to integers. + if( dtype < CV_32F && (depth1 < CV_32F || depth2 < CV_32F) ) + wtype = CV_32S; } else { - CV_Assert( psrc1->sameSize(*psrc2) && type1 == type2 ); + wtype = std::max(depth1, std::max(depth2, CV_32F)); + wtype = std::max(wtype, dtype); } - size_t esz = CV_ELEM_SIZE(type1); - size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz; - BinaryFunc copymask = 0; - bool reallocate = false; + dtype = CV_MAKETYPE(dtype, cn); + wtype = CV_MAKETYPE(wtype, cn); if( haveMask ) { int mtype = _mask.type(); - CV_Assert( (mtype == CV_8U || mtype == CV_8S) && _mask.sameSize(*psrc1)); - copymask = getCopyMaskFunc(esz); - reallocate = !_dst.sameSize(*psrc1) || _dst.type() != type1; + CV_Assert( (mtype == CV_8UC1 || mtype == CV_8SC1) && _mask.sameSize(*psrc1) ); + reallocate = !_dst.sameSize(*psrc1) || _dst.type() != dtype; } - AutoBuffer _buf; - uchar *scbuf = 0, *maskbuf = 0; - - _dst.createSameSize(*psrc1, type1); - // if this is mask operation and dst has been reallocated, - // we have to clear the destination - if( haveMask && reallocate ) + _dst.createSameSize(*psrc1, dtype); + if( reallocate ) _dst.setTo(0.); CV_OCL_RUN(use_opencl, - ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, haveScalar)) + ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype, + usrdata, oclop, haveScalar)) + BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype); + BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype); + BinaryFunc cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype); - Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(); - Mat dst = _dst.getMat(), mask = _mask.getMat(); + size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2); + size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype); + size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz; + BinaryFunc copymask = getCopyMaskFunc(dsz); + Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(), mask = _mask.getMat(); - if( bitwise ) - { - func = *tab; - cn = (int)esz; - } - else - func = tab[depth1]; + AutoBuffer _buf; + uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0; + size_t bufesz = (cvtsrc1 ? wsz : 0) + + (cvtsrc2 || haveScalar ? wsz : 0) + + (cvtdst ? wsz : 0) + + (haveMask ? 
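// The wtype selection above picks an intermediate "work" depth wide enough to
// hold the unsaturated intermediate result: two 8-bit inputs accumulate in
// 16-bit signed, anything up to CV_32S accumulates in CV_32S, and
// multiply/divide always work in at least CV_32F. The helper below merely
// paraphrases those rules with OpenCV's depth constants as a reading aid; it
// is not the exact code path.

#include <opencv2/core.hpp>
#include <algorithm>

static int pick_work_depth(int depth1, int depth2, int ddepth, bool muldiv)
{
    if (depth1 == depth2 && ddepth == depth1)
        return ddepth;                          // nothing to widen

    if (muldiv)
        return std::max(std::max(depth1, depth2), std::max(ddepth, (int)CV_32F));

    int w = (depth1 <= CV_8S && depth2 <= CV_8S) ? CV_16S :
            (depth1 <= CV_32S && depth2 <= CV_32S) ? CV_32S :
            std::max(depth1, depth2);
    w = std::max(w, ddepth);

    // Integer destination with at least one integer input: stay integral
    // instead of bouncing through floating point.
    if (ddepth < CV_32F && (depth1 < CV_32F || depth2 < CV_32F))
        w = CV_32S;
    return w;
}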
dsz : 0); + BinaryFuncC func = tab[CV_MAT_DEPTH(wtype)]; if( !haveScalar ) { @@ -1623,31 +730,62 @@ static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst, NAryMatIterator it(arrays, ptrs); size_t total = it.size, blocksize = total; - if( blocksize*cn > INT_MAX ) - blocksize = INT_MAX/cn; + if( haveMask || cvtsrc1 || cvtsrc2 || cvtdst ) + blocksize = std::min(blocksize, blocksize0); + _buf.allocate(bufesz*blocksize + 64); + buf = _buf; + if( cvtsrc1 ) + buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16); + if( cvtsrc2 ) + buf2 = buf, buf = alignPtr(buf + blocksize*wsz, 16); + wbuf = maskbuf = buf; + if( cvtdst ) + buf = alignPtr(buf + blocksize*wsz, 16); if( haveMask ) - { - blocksize = std::min(blocksize, blocksize0); - _buf.allocate(blocksize*esz); - maskbuf = _buf; - } + maskbuf = buf; for( size_t i = 0; i < it.nplanes; i++, ++it ) { for( size_t j = 0; j < total; j += blocksize ) { int bsz = (int)MIN(total - j, blocksize); - - func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*cn, 1), 0 ); - if( haveMask ) + Size bszn(bsz*cn, 1); + const uchar *sptr1 = ptrs[0], *sptr2 = ptrs[1]; + uchar* dptr = ptrs[2]; + if( cvtsrc1 ) { - copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz ); - ptrs[3] += bsz; + cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 ); + sptr1 = buf1; + } + if( ptrs[0] == ptrs[1] ) + sptr2 = sptr1; + else if( cvtsrc2 ) + { + cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 ); + sptr2 = buf2; } - bsz *= (int)esz; - ptrs[0] += bsz; ptrs[1] += bsz; ptrs[2] += bsz; + if( !haveMask && !cvtdst ) + func( sptr1, 1, sptr2, 1, dptr, 1, bszn.width, bszn.height, usrdata ); + else + { + func( sptr1, 1, sptr2, 1, wbuf, 0, bszn.width, bszn.height, usrdata ); + if( !haveMask ) + cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 ); + else if( !cvtdst ) + { + copymask( wbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz ); + ptrs[3] += bsz; + } + else + { + cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 ); + copymask( maskbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz ); + ptrs[3] += bsz; + } + } + ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz2; ptrs[2] += bsz*dsz; } } } @@ -1659,3213 +797,285 @@ static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst, NAryMatIterator it(arrays, ptrs); size_t total = it.size, blocksize = std::min(total, blocksize0); - _buf.allocate(blocksize*(haveMask ? 2 : 1)*esz + 32); - scbuf = _buf; - maskbuf = alignPtr(scbuf + blocksize*esz, 16); - - convertAndUnrollScalar( src2, src1.type(), scbuf, blocksize); - - for( size_t i = 0; i < it.nplanes; i++, ++it ) - { - for( size_t j = 0; j < total; j += blocksize ) - { - int bsz = (int)MIN(total - j, blocksize); - - func( ptrs[0], 0, scbuf, 0, haveMask ? 
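// The plane/block loop above keeps temporary memory bounded: the AutoBuffer is
// sized from BLOCK_SIZE and the element sizes, each block of source data is
// converted to the work type (when needed), the kernel runs on the block, and
// the result is converted back and optionally merged through the mask. With
// the conversion and mask plumbing stripped away, the blocking itself looks
// like this (process_block is a hypothetical stand-in for the table function):

#include <algorithm>
#include <cstddef>
#include <vector>

static void process_block(const float* a, const float* b, float* c, size_t n)
{
    for (size_t i = 0; i < n; i++)
        c[i] = a[i] + b[i];
}

static void process_in_blocks(const float* a, const float* b, float* c,
                              size_t total, size_t block_size)
{
    std::vector<float> work(block_size);        // plays the role of the AutoBuffer

    for (size_t j = 0; j < total; j += block_size)
    {
        size_t bsz = std::min(block_size, total - j);
        process_block(a + j, b + j, work.data(), bsz);     // op into the work buffer
        std::copy(work.data(), work.data() + bsz, c + j);  // copy / "convert back"
    }
}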
maskbuf : ptrs[1], 0, Size(bsz*cn, 1), 0 ); - if( haveMask ) - { - copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz ); - ptrs[2] += bsz; - } - - bsz *= (int)esz; - ptrs[0] += bsz; ptrs[1] += bsz; - } - } - } -} - -static BinaryFunc* getMaxTab() -{ - static BinaryFunc maxTab[] = - { - (BinaryFunc)GET_OPTIMIZED(max8u), (BinaryFunc)GET_OPTIMIZED(max8s), - (BinaryFunc)GET_OPTIMIZED(max16u), (BinaryFunc)GET_OPTIMIZED(max16s), - (BinaryFunc)GET_OPTIMIZED(max32s), - (BinaryFunc)GET_OPTIMIZED(max32f), (BinaryFunc)max64f, - 0 - }; - - return maxTab; -} - -static BinaryFunc* getMinTab() -{ - static BinaryFunc minTab[] = - { - (BinaryFunc)GET_OPTIMIZED(min8u), (BinaryFunc)GET_OPTIMIZED(min8s), - (BinaryFunc)GET_OPTIMIZED(min16u), (BinaryFunc)GET_OPTIMIZED(min16s), - (BinaryFunc)GET_OPTIMIZED(min32s), - (BinaryFunc)GET_OPTIMIZED(min32f), (BinaryFunc)min64f, - 0 - }; - - return minTab; -} - -} - -void cv::bitwise_and(InputArray a, InputArray b, OutputArray c, InputArray mask) -{ - BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(and8u); - binary_op(a, b, c, mask, &f, true, OCL_OP_AND); -} - -void cv::bitwise_or(InputArray a, InputArray b, OutputArray c, InputArray mask) -{ - BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(or8u); - binary_op(a, b, c, mask, &f, true, OCL_OP_OR); -} - -void cv::bitwise_xor(InputArray a, InputArray b, OutputArray c, InputArray mask) -{ - BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(xor8u); - binary_op(a, b, c, mask, &f, true, OCL_OP_XOR); -} - -void cv::bitwise_not(InputArray a, OutputArray c, InputArray mask) -{ - BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(not8u); - binary_op(a, a, c, mask, &f, true, OCL_OP_NOT); -} - -void cv::max( InputArray src1, InputArray src2, OutputArray dst ) -{ - binary_op(src1, src2, dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); -} - -void cv::min( InputArray src1, InputArray src2, OutputArray dst ) -{ - binary_op(src1, src2, dst, noArray(), getMinTab(), false, OCL_OP_MIN ); -} - -void cv::max(const Mat& src1, const Mat& src2, Mat& dst) -{ - OutputArray _dst(dst); - binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); -} - -void cv::min(const Mat& src1, const Mat& src2, Mat& dst) -{ - OutputArray _dst(dst); - binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN ); -} - -void cv::max(const UMat& src1, const UMat& src2, UMat& dst) -{ - OutputArray _dst(dst); - binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX ); -} - -void cv::min(const UMat& src1, const UMat& src2, UMat& dst) -{ - OutputArray _dst(dst); - binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN ); -} - - -/****************************************************************************************\ -* add/subtract * -\****************************************************************************************/ - -namespace cv -{ - -static int actualScalarDepth(const double* data, int len) -{ - int i = 0, minval = INT_MAX, maxval = INT_MIN; - for(; i < len; ++i) - { - int ival = cvRound(data[i]); - if( ival != data[i] ) - break; - minval = MIN(minval, ival); - maxval = MAX(maxval, ival); - } - return i < len ? CV_64F : - minval >= 0 && maxval <= (int)UCHAR_MAX ? CV_8U : - minval >= (int)SCHAR_MIN && maxval <= (int)SCHAR_MAX ? CV_8S : - minval >= 0 && maxval <= (int)USHRT_MAX ? CV_16U : - minval >= (int)SHRT_MIN && maxval <= (int)SHRT_MAX ? 
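// actualScalarDepth() above inspects the scalar's channel values and returns
// the narrowest depth that represents all of them exactly (CV_64F as soon as
// any value is non-integral), so that "array op scalar" can reuse the same
// narrow kernels a small array would use. Expected results for a 4-channel
// scalar, as a usage note:
//
//   { 1, 2, 3, 0 }      ->  CV_8U   (all values fit in [0, 255])
//   { -1, 2, 3, 0 }     ->  CV_8S   (fits in [-128, 127])
//   { 70000, 0, 0, 0 }  ->  CV_32S  (exceeds both 16-bit ranges)
//   { 0.5, 0, 0, 0 }    ->  CV_64F  (non-integral value)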
CV_16S : - CV_32S; -} - -#ifdef HAVE_OPENCL - -static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, - InputArray _mask, int wtype, - void* usrdata, int oclop, - bool haveScalar ) -{ - const ocl::Device d = ocl::Device::getDefault(); - bool doubleSupport = d.doubleFPConfig() > 0; - int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); - bool haveMask = !_mask.empty(); - - if ( (haveMask || haveScalar) && cn > 4 ) - return false; - - int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32S, CV_MAT_DEPTH(wtype)); - if (!doubleSupport) - wdepth = std::min(wdepth, CV_32F); - - wtype = CV_MAKETYPE(wdepth, cn); - int type2 = haveScalar ? wtype : _src2.type(), depth2 = CV_MAT_DEPTH(type2); - if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F)) - return false; - - int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst); - int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1; - - char cvtstr[4][32], opts[1024]; - sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s " - "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s " - "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d -D convertFromU=%s", - (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), - oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)), - ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)), - ocl::typeToStr(depth2), ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)), - ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)), - ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)), - ocl::typeToStr(wdepth), wdepth, - ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]), - ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]), - ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]), - doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI, - oclop == OCL_OP_ABSDIFF && wdepth == CV_32S && ddepth == wdepth ? - ocl::convertTypeStr(CV_8U, ddepth, kercn, cvtstr[3]) : "noconvert"); - - size_t usrdata_esz = CV_ELEM_SIZE(wdepth); - const uchar* usrdata_p = (const uchar*)usrdata; - const double* usrdata_d = (const double*)usrdata; - float usrdata_f[3]; - int i, n = oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE || - oclop == OCL_OP_RDIV_SCALE || oclop == OCL_OP_RECIP_SCALE ? 1 : oclop == OCL_OP_ADDW ? 3 : 0; - if( n > 0 && wdepth == CV_32F ) - { - for( i = 0; i < n; i++ ) - usrdata_f[i] = (float)usrdata_d[i]; - usrdata_p = (const uchar*)usrdata_f; - } - - ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts); - if (k.empty()) - return false; - - UMat src1 = _src1.getUMat(), src2; - UMat dst = _dst.getUMat(), mask = _mask.getUMat(); - - ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn); - ocl::KernelArg dstarg = haveMask ? 
ocl::KernelArg::ReadWrite(dst, cn, kercn) : - ocl::KernelArg::WriteOnly(dst, cn, kercn); - ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1); - - if( haveScalar ) - { - size_t esz = CV_ELEM_SIZE1(wtype)*scalarcn; - double buf[4]={0,0,0,0}; - Mat src2sc = _src2.getMat(); - - if( !src2sc.empty() ) - convertAndUnrollScalar(src2sc, wtype, (uchar*)buf, 1); - ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz); - - if( !haveMask ) - { - if(n == 0) - k.args(src1arg, dstarg, scalararg); - else if(n == 1) - k.args(src1arg, dstarg, scalararg, - ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz)); - else - CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters"); - } - else - k.args(src1arg, maskarg, dstarg, scalararg); - } - else - { - src2 = _src2.getUMat(); - ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn); - - if( !haveMask ) - { - if (n == 0) - k.args(src1arg, src2arg, dstarg); - else if (n == 1) - k.args(src1arg, src2arg, dstarg, - ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz)); - else if (n == 3) - k.args(src1arg, src2arg, dstarg, - ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz), - ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz), - ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz)); - else - CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters"); - } - else - k.args(src1arg, src2arg, maskarg, dstarg); - } - - size_t globalsize[] = { (size_t)src1.cols * cn / kercn, ((size_t)src1.rows + rowsPerWI - 1) / rowsPerWI }; - return k.run(2, globalsize, NULL, false); -} - -#endif - -static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, - InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false, - void* usrdata=0, int oclop=-1 ) -{ - const _InputArray *psrc1 = &_src1, *psrc2 = &_src2; - int kind1 = psrc1->kind(), kind2 = psrc2->kind(); - bool haveMask = !_mask.empty(); - bool reallocate = false; - int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1); - int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2); - int wtype, dims1 = psrc1->dims(), dims2 = psrc2->dims(); - Size sz1 = dims1 <= 2 ? psrc1->size() : Size(); - Size sz2 = dims2 <= 2 ? psrc2->size() : Size(); -#ifdef HAVE_OPENCL - bool use_opencl = OCL_PERFORMANCE_CHECK(_dst.isUMat()) && dims1 <= 2 && dims2 <= 2; -#endif - bool src1Scalar = checkScalar(*psrc1, type2, kind1, kind2); - bool src2Scalar = checkScalar(*psrc2, type1, kind2, kind1); - - if( (kind1 == kind2 || cn == 1) && sz1 == sz2 && dims1 <= 2 && dims2 <= 2 && type1 == type2 && - !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == depth1)) || - (_dst.fixedType() && _dst.type() == type1)) && - ((src1Scalar && src2Scalar) || (!src1Scalar && !src2Scalar)) ) - { - _dst.createSameSize(*psrc1, type1); - CV_OCL_RUN(use_opencl, - ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, - (!usrdata ? 
type1 : std::max(depth1, CV_32F)), - usrdata, oclop, false)) - - Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(); - Size sz = getContinuousSize(src1, src2, dst, src1.channels()); - tab[depth1](src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, usrdata); - return; - } - - bool haveScalar = false, swapped12 = false; - - if( dims1 != dims2 || sz1 != sz2 || cn != cn2 || - (kind1 == _InputArray::MATX && (sz1 == Size(1,4) || sz1 == Size(1,1))) || - (kind2 == _InputArray::MATX && (sz2 == Size(1,4) || sz2 == Size(1,1))) ) - { - if( checkScalar(*psrc1, type2, kind1, kind2) ) - { - // src1 is a scalar; swap it with src2 - swap(psrc1, psrc2); - swap(sz1, sz2); - swap(type1, type2); - swap(depth1, depth2); - swap(cn, cn2); - swap(dims1, dims2); - swapped12 = true; - if( oclop == OCL_OP_SUB ) - oclop = OCL_OP_RSUB; - if ( oclop == OCL_OP_DIV_SCALE ) - oclop = OCL_OP_RDIV_SCALE; - } - else if( !checkScalar(*psrc2, type1, kind2, kind1) ) - CV_Error( CV_StsUnmatchedSizes, - "The operation is neither 'array op array' " - "(where arrays have the same size and the same number of channels), " - "nor 'array op scalar', nor 'scalar op array'" ); - haveScalar = true; - CV_Assert(type2 == CV_64F && (sz2.height == 1 || sz2.height == 4)); - - if (!muldiv) - { - Mat sc = psrc2->getMat(); - depth2 = actualScalarDepth(sc.ptr(), cn); - if( depth2 == CV_64F && (depth1 < CV_32S || depth1 == CV_32F) ) - depth2 = CV_32F; - } - else - depth2 = CV_64F; - } - - if( dtype < 0 ) - { - if( _dst.fixedType() ) - dtype = _dst.type(); - else - { - if( !haveScalar && type1 != type2 ) - CV_Error(CV_StsBadArg, - "When the input arrays in add/subtract/multiply/divide functions have different types, " - "the output array type must be explicitly specified"); - dtype = type1; - } - } - dtype = CV_MAT_DEPTH(dtype); - - if( depth1 == depth2 && dtype == depth1 ) - wtype = dtype; - else if( !muldiv ) - { - wtype = depth1 <= CV_8S && depth2 <= CV_8S ? CV_16S : - depth1 <= CV_32S && depth2 <= CV_32S ? CV_32S : std::max(depth1, depth2); - wtype = std::max(wtype, dtype); - - // when the result of addition should be converted to an integer type, - // and just one of the input arrays is floating-point, it makes sense to convert that input to integer type before the operation, - // instead of converting the other input to floating-point and then converting the operation result back to integers. - if( dtype < CV_32F && (depth1 < CV_32F || depth2 < CV_32F) ) - wtype = CV_32S; - } - else - { - wtype = std::max(depth1, std::max(depth2, CV_32F)); - wtype = std::max(wtype, dtype); - } - - dtype = CV_MAKETYPE(dtype, cn); - wtype = CV_MAKETYPE(wtype, cn); - - if( haveMask ) - { - int mtype = _mask.type(); - CV_Assert( (mtype == CV_8UC1 || mtype == CV_8SC1) && _mask.sameSize(*psrc1) ); - reallocate = !_dst.sameSize(*psrc1) || _dst.type() != dtype; - } - - _dst.createSameSize(*psrc1, dtype); - if( reallocate ) - _dst.setTo(0.); - - CV_OCL_RUN(use_opencl, - ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype, - usrdata, oclop, haveScalar)) - - BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype); - BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype); - BinaryFunc cvtdst = dtype == wtype ? 
0 : getConvertFunc(wtype, dtype); - - size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2); - size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype); - size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz; - BinaryFunc copymask = getCopyMaskFunc(dsz); - Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(), mask = _mask.getMat(); - - AutoBuffer _buf; - uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0; - size_t bufesz = (cvtsrc1 ? wsz : 0) + - (cvtsrc2 || haveScalar ? wsz : 0) + - (cvtdst ? wsz : 0) + - (haveMask ? dsz : 0); - BinaryFunc func = tab[CV_MAT_DEPTH(wtype)]; - - if( !haveScalar ) - { - const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 }; - uchar* ptrs[4]; - - NAryMatIterator it(arrays, ptrs); - size_t total = it.size, blocksize = total; - - if( haveMask || cvtsrc1 || cvtsrc2 || cvtdst ) - blocksize = std::min(blocksize, blocksize0); - - _buf.allocate(bufesz*blocksize + 64); - buf = _buf; - if( cvtsrc1 ) - buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16); - if( cvtsrc2 ) - buf2 = buf, buf = alignPtr(buf + blocksize*wsz, 16); - wbuf = maskbuf = buf; - if( cvtdst ) - buf = alignPtr(buf + blocksize*wsz, 16); - if( haveMask ) - maskbuf = buf; - - for( size_t i = 0; i < it.nplanes; i++, ++it ) - { - for( size_t j = 0; j < total; j += blocksize ) - { - int bsz = (int)MIN(total - j, blocksize); - Size bszn(bsz*cn, 1); - const uchar *sptr1 = ptrs[0], *sptr2 = ptrs[1]; - uchar* dptr = ptrs[2]; - if( cvtsrc1 ) - { - cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 ); - sptr1 = buf1; - } - if( ptrs[0] == ptrs[1] ) - sptr2 = sptr1; - else if( cvtsrc2 ) - { - cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 ); - sptr2 = buf2; - } - - if( !haveMask && !cvtdst ) - func( sptr1, 1, sptr2, 1, dptr, 1, bszn, usrdata ); - else - { - func( sptr1, 1, sptr2, 1, wbuf, 0, bszn, usrdata ); - if( !haveMask ) - cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 ); - else if( !cvtdst ) - { - copymask( wbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz ); - ptrs[3] += bsz; - } - else - { - cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 ); - copymask( maskbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz ); - ptrs[3] += bsz; - } - } - ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz2; ptrs[2] += bsz*dsz; - } - } - } - else - { - const Mat* arrays[] = { &src1, &dst, &mask, 0 }; - uchar* ptrs[3]; - - NAryMatIterator it(arrays, ptrs); - size_t total = it.size, blocksize = std::min(total, blocksize0); - - _buf.allocate(bufesz*blocksize + 64); - buf = _buf; - if( cvtsrc1 ) - buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16); - buf2 = buf; buf = alignPtr(buf + blocksize*wsz, 16); - wbuf = maskbuf = buf; - if( cvtdst ) - buf = alignPtr(buf + blocksize*wsz, 16); - if( haveMask ) - maskbuf = buf; - - convertAndUnrollScalar( src2, wtype, buf2, blocksize); - - for( size_t i = 0; i < it.nplanes; i++, ++it ) - { - for( size_t j = 0; j < total; j += blocksize ) - { - int bsz = (int)MIN(total - j, blocksize); - Size bszn(bsz*cn, 1); - const uchar *sptr1 = ptrs[0]; - const uchar* sptr2 = buf2; - uchar* dptr = ptrs[1]; - - if( cvtsrc1 ) - { - cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 ); - sptr1 = buf1; - } - - if( swapped12 ) - std::swap(sptr1, sptr2); - - if( !haveMask && !cvtdst ) - func( sptr1, 1, sptr2, 1, dptr, 1, bszn, usrdata ); - else - { - func( sptr1, 1, sptr2, 1, wbuf, 1, bszn, usrdata ); - if( !haveMask ) - cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 ); - else if( !cvtdst ) - { - copymask( wbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz ); - ptrs[2] += bsz; - } - else - { - cvtdst( wbuf, 1, 0, 1, maskbuf, 1, 
bszn, 0 ); - copymask( maskbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz ); - ptrs[2] += bsz; - } - } - ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz; - } - } - } -} - -static BinaryFunc* getAddTab() -{ - static BinaryFunc addTab[] = - { - (BinaryFunc)GET_OPTIMIZED(add8u), (BinaryFunc)GET_OPTIMIZED(add8s), - (BinaryFunc)GET_OPTIMIZED(add16u), (BinaryFunc)GET_OPTIMIZED(add16s), - (BinaryFunc)GET_OPTIMIZED(add32s), - (BinaryFunc)GET_OPTIMIZED(add32f), (BinaryFunc)add64f, - 0 - }; - - return addTab; -} - -static BinaryFunc* getSubTab() -{ - static BinaryFunc subTab[] = - { - (BinaryFunc)GET_OPTIMIZED(sub8u), (BinaryFunc)GET_OPTIMIZED(sub8s), - (BinaryFunc)GET_OPTIMIZED(sub16u), (BinaryFunc)GET_OPTIMIZED(sub16s), - (BinaryFunc)GET_OPTIMIZED(sub32s), - (BinaryFunc)GET_OPTIMIZED(sub32f), (BinaryFunc)sub64f, - 0 - }; - - return subTab; -} - -static BinaryFunc* getAbsDiffTab() -{ - static BinaryFunc absDiffTab[] = - { - (BinaryFunc)GET_OPTIMIZED(absdiff8u), (BinaryFunc)GET_OPTIMIZED(absdiff8s), - (BinaryFunc)GET_OPTIMIZED(absdiff16u), (BinaryFunc)GET_OPTIMIZED(absdiff16s), - (BinaryFunc)GET_OPTIMIZED(absdiff32s), - (BinaryFunc)GET_OPTIMIZED(absdiff32f), (BinaryFunc)absdiff64f, - 0 - }; - - return absDiffTab; -} - -} - -void cv::add( InputArray src1, InputArray src2, OutputArray dst, - InputArray mask, int dtype ) -{ - arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD ); -} - -void cv::subtract( InputArray _src1, InputArray _src2, OutputArray _dst, - InputArray mask, int dtype ) -{ -#ifdef HAVE_TEGRA_OPTIMIZATION - if (tegra::useTegra()) - { - int kind1 = _src1.kind(), kind2 = _src2.kind(); - Mat src1 = _src1.getMat(), src2 = _src2.getMat(); - bool src1Scalar = checkScalar(src1, _src2.type(), kind1, kind2); - bool src2Scalar = checkScalar(src2, _src1.type(), kind2, kind1); - - if (!src1Scalar && !src2Scalar && - src1.depth() == CV_8U && src2.type() == src1.type() && - src1.dims == 2 && src2.size() == src1.size() && - mask.empty()) - { - if (dtype < 0) - { - if (_dst.fixedType()) - { - dtype = _dst.depth(); - } - else - { - dtype = src1.depth(); - } - } - - dtype = CV_MAT_DEPTH(dtype); - - if (!_dst.fixedType() || dtype == _dst.depth()) - { - _dst.create(src1.size(), CV_MAKE_TYPE(dtype, src1.channels())); - - if (dtype == CV_16S) - { - Mat dst = _dst.getMat(); - if(tegra::subtract_8u8u16s(src1, src2, dst)) - return; - } - else if (dtype == CV_32F) - { - Mat dst = _dst.getMat(); - if(tegra::subtract_8u8u32f(src1, src2, dst)) - return; - } - else if (dtype == CV_8S) - { - Mat dst = _dst.getMat(); - if(tegra::subtract_8u8u8s(src1, src2, dst)) - return; - } - } - } - } -#endif - arithm_op(_src1, _src2, _dst, mask, dtype, getSubTab(), false, 0, OCL_OP_SUB ); -} - -void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst ) -{ - arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF); -} - -/****************************************************************************************\ -* multiply/divide * -\****************************************************************************************/ - -namespace cv -{ - -template -struct Mul_SIMD -{ - int operator() (const T *, const T *, T *, int, WT) const - { - return 0; - } -}; - -#if CV_NEON - -template <> -struct Mul_SIMD -{ - int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); - uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + 
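// Every Mul_SIMD specialisation in this block follows one pattern: widen the
// 8/16-bit lanes, multiply (and optionally scale) in float, round to nearest,
// then saturate back down to the destination type. Per element, and without
// any intrinsics, the same computation is simply:

#include <opencv2/core.hpp>   // cv::saturate_cast, cvRound

static unsigned char mul_scaled_u8(unsigned char a, unsigned char b, float scale)
{
    // widen -> multiply in float -> scale -> round -> saturate to [0, 255]
    return cv::saturate_cast<unsigned char>(cvRound((float)a * (float)b * scale));
}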
x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); - uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1_u8(dst + x, vqmovn_u16(v_dst)); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); - int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); - int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1_s8(dst + x, vqmovn_s16(v_dst)); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); - - float32x4_t v_dst1 = 
vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), - vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), - vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); - vst1q_u16(dst + x, v_dst); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); - - float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); - v_dst1 = vmulq_f32(v_dst1, v_scale); - float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), - vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), - vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); - vst1q_s16(dst + x, v_dst); - } - } - - return x; - } -}; - -template <> -struct Mul_SIMD -{ - int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const - { - int x = 0; - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1q_f32(dst + x, v_dst1); - vst1q_f32(dst + x + 4, v_dst2); - } - else - { - float32x4_t v_scale = vdupq_n_f32(scale); - for ( ; x <= width - 8; x += 8) - { - float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - v_dst1 = vmulq_f32(v_dst1, v_scale); - - float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - v_dst2 = vmulq_f32(v_dst2, v_scale); - - vst1q_f32(dst + x, v_dst1); - vst1q_f32(dst + x + 4, v_dst2); - } - } - - return x; - } -}; - -#elif CV_SSE2 - -#if CV_SSE4_1 - -template <> -struct Mul_SIMD -{ - Mul_SIMD() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); - } - - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const - { - int x = 0; - - if (!haveSSE) - return x; - - __m128i v_zero = _mm_setzero_si128(); - - if( scale != 1.0f ) - { - __m128 v_scale = _mm_set1_ps(scale); - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), - _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero))); - v_dst1 = _mm_mul_ps(v_dst1, v_scale); - - __m128 v_dst2 = 
_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), - _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero))); - v_dst2 = _mm_mul_ps(v_dst2, v_scale); - - __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); - _mm_storeu_si128((__m128i *)(dst + x), v_dsti); - } - } - - return x; - } - - bool haveSSE; -}; - -#endif - -template <> -struct Mul_SIMD -{ - Mul_SIMD() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const - { - int x = 0; - - if (!haveSSE) - return x; - - __m128i v_zero = _mm_setzero_si128(); - - if( scale == 1.0f ) - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); - - v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); - v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); - - __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); - - __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); - } - else - { - __m128 v_scale = _mm_set1_ps(scale); - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); - - v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); - v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); - v_dst1 = _mm_mul_ps(v_dst1, v_scale); - - __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); - v_dst2 = _mm_mul_ps(v_dst2, v_scale); - - __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); - } - } - - return x; - } - - bool haveSSE; -}; - -template <> -struct Mul_SIMD -{ - Mul_SIMD() - { - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const - { - int x = 0; - - if (!haveSSE) - return x; - - __m128i v_zero = _mm_setzero_si128(); - - if( scale != 1.0f ) - { - __m128 v_scale = _mm_set1_ps(scale); - for ( ; x <= width - 8; x += 8) - { - __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); - - __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); - v_dst1 = _mm_mul_ps(v_dst1, v_scale); - - __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), - _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); - v_dst2 = _mm_mul_ps(v_dst2, v_scale); - - __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), 
_mm_cvtps_epi32(v_dst2)); - _mm_storeu_si128((__m128i *)(dst + x), v_dsti); - } - } - - return x; - } - - bool haveSSE; -}; - -#endif - -template static void -mul_( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size size, WT scale ) -{ - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Mul_SIMD vop; - - if( scale == (WT)1. ) - { - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, size.width, scale); - #if CV_ENABLE_UNROLLED - for(; i <= size.width - 4; i += 4 ) - { - T t0; - T t1; - t0 = saturate_cast(src1[i ] * src2[i ]); - t1 = saturate_cast(src1[i+1] * src2[i+1]); - dst[i ] = t0; - dst[i+1] = t1; - - t0 = saturate_cast(src1[i+2] * src2[i+2]); - t1 = saturate_cast(src1[i+3] * src2[i+3]); - dst[i+2] = t0; - dst[i+3] = t1; - } - #endif - for( ; i < size.width; i++ ) - dst[i] = saturate_cast(src1[i] * src2[i]); - } - } - else - { - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, size.width, scale); - #if CV_ENABLE_UNROLLED - for(; i <= size.width - 4; i += 4 ) - { - T t0 = saturate_cast(scale*(WT)src1[i]*src2[i]); - T t1 = saturate_cast(scale*(WT)src1[i+1]*src2[i+1]); - dst[i] = t0; dst[i+1] = t1; - - t0 = saturate_cast(scale*(WT)src1[i+2]*src2[i+2]); - t1 = saturate_cast(scale*(WT)src1[i+3]*src2[i+3]); - dst[i+2] = t0; dst[i+3] = t1; - } - #endif - for( ; i < size.width; i++ ) - dst[i] = saturate_cast(scale*(WT)src1[i]*src2[i]); - } - } -} - -template -struct Div_SIMD -{ - int operator() (const T *, const T *, T *, int, double) const - { - return 0; - } -}; - -template -struct Recip_SIMD -{ - int operator() (const T *, T *, int, double) const - { - return 0; - } -}; - - -#if CV_SIMD128 - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src1 = v_load_expand(src1 + x); - v_uint16x8 v_src2 = v_load_expand(src2 + x); - - v_uint32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); - v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src1 = v_load_expand(src1 + x); - v_int16x8 v_src2 = v_load_expand(src2 + x); - - v_int32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = 
v_cvt_f32(t1); - - v_float32x4 f2 = v_cvt_f32(t2); - v_float32x4 f3 = v_cvt_f32(t3); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src1 = v_load(src1 + x); - v_uint16x8 v_src2 = v_load(src2 + x); - - v_uint32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); - v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src1 = v_load(src1 + x); - v_int16x8 v_src2 = v_load(src2 + x); - - v_int32x4 t0, t1, t2, t3; - v_expand(v_src1, t0, t1); - v_expand(v_src2, t2, t3); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - v_float32x4 f2 = v_cvt_f32(t2); - v_float32x4 f3 = v_cvt_f32(t3); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int32x4 v_zero = v_setzero_s32(); - - for ( ; x <= width - 8; x += 8) - { - v_int32x4 t0 = v_load(src1 + x); - v_int32x4 t1 = v_load(src1 + x + 4); - v_int32x4 t2 = v_load(src2 + x); - v_int32x4 t3 = v_load(src2 + x + 4); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - v_float32x4 f2 = v_cvt_f32(t2); - v_float32x4 f3 = v_cvt_f32(t3); - - f0 = f0 * v_scale / f2; - f1 = f1 * v_scale / f3; - - v_int32x4 res0 = v_round(f0), res1 = v_round(f1); - - res0 = v_select(t2 == v_zero, v_zero, res0); - res1 = v_select(t3 == v_zero, v_zero, res1); - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int 
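// All of the Div_SIMD / Recip_SIMD specialisations enforce the cv::divide()
// convention that a zero denominator yields 0: the vector lanes compute the
// quotient unconditionally and then select zero wherever the denominator lane
// was zero, exactly as the scalar tails do with a ternary:

static float div_scaled(float num, float denom, float scale)
{
    return denom != 0.f ? num * scale / denom : 0.f;
}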
operator() (const float * src1, const float * src2, float * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_float32x4 v_zero = v_setzero_f32(); - - for ( ; x <= width - 8; x += 8) - { - v_float32x4 f0 = v_load(src1 + x); - v_float32x4 f1 = v_load(src1 + x + 4); - v_float32x4 f2 = v_load(src2 + x); - v_float32x4 f3 = v_load(src2 + x + 4); - - v_float32x4 res0 = f0 * v_scale / f2; - v_float32x4 res1 = f1 * v_scale / f3; - - res0 = v_select(f2 == v_zero, v_zero, res0); - res1 = v_select(f3 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - - -///////////////////////// RECIPROCAL ////////////////////// - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const uchar * src2, uchar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src2 = v_load_expand(src2 + x); - - v_uint32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const schar * src2, schar * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src2 = v_load_expand(src2 + x); - - v_int32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_pack_store(dst + x, res); - } - - return x; - } -}; - - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const ushort * src2, ushort * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_uint16x8 v_zero = v_setzero_u16(); - - for ( ; x <= width - 8; x += 8) - { - v_uint16x8 v_src2 = v_load(src2 + x); - - v_uint32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); - v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_uint16x8 res = v_pack_u(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const short * src2, short * dst, int width, double scale) 
const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int16x8 v_zero = v_setzero_s16(); - - for ( ; x <= width - 8; x += 8) - { - v_int16x8 v_src2 = v_load(src2 + x); - - v_int32x4 t0, t1; - v_expand(v_src2, t0, t1); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 i0 = v_round(f0), i1 = v_round(f1); - v_int16x8 res = v_pack(i0, i1); - - res = v_select(v_src2 == v_zero, v_zero, res); - v_store(dst + x, res); - } - - return x; - } -}; - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const int * src2, int * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_int32x4 v_zero = v_setzero_s32(); - - for ( ; x <= width - 8; x += 8) - { - v_int32x4 t0 = v_load(src2 + x); - v_int32x4 t1 = v_load(src2 + x + 4); - - v_float32x4 f0 = v_cvt_f32(t0); - v_float32x4 f1 = v_cvt_f32(t1); - - f0 = v_scale / f0; - f1 = v_scale / f1; - - v_int32x4 res0 = v_round(f0), res1 = v_round(f1); - - res0 = v_select(t0 == v_zero, v_zero, res0); - res1 = v_select(t1 == v_zero, v_zero, res1); - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const float * src2, float * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float32x4 v_scale = v_setall_f32((float)scale); - v_float32x4 v_zero = v_setzero_f32(); - - for ( ; x <= width - 8; x += 8) - { - v_float32x4 f0 = v_load(src2 + x); - v_float32x4 f1 = v_load(src2 + x + 4); - - v_float32x4 res0 = v_scale / f0; - v_float32x4 res1 = v_scale / f1; - - res0 = v_select(f0 == v_zero, v_zero, res0); - res1 = v_select(f1 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 4, res1); - } - - return x; - } -}; - -#if CV_SIMD128_64F - -template <> -struct Div_SIMD -{ - bool haveSIMD; - Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float64x2 v_scale = v_setall_f64(scale); - v_float64x2 v_zero = v_setzero_f64(); - - for ( ; x <= width - 4; x += 4) - { - v_float64x2 f0 = v_load(src1 + x); - v_float64x2 f1 = v_load(src1 + x + 2); - v_float64x2 f2 = v_load(src2 + x); - v_float64x2 f3 = v_load(src2 + x + 2); - - v_float64x2 res0 = f0 * v_scale / f2; - v_float64x2 res1 = f1 * v_scale / f3; - - res0 = v_select(f0 == v_zero, v_zero, res0); - res1 = v_select(f1 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 2, res1); - } - - return x; - } -}; - -template <> -struct Recip_SIMD -{ - bool haveSIMD; - Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } - - int operator() (const double * src2, double * dst, int width, double scale) const - { - int x = 0; - - if (!haveSIMD) - return x; - - v_float64x2 v_scale = v_setall_f64(scale); - v_float64x2 v_zero = v_setzero_f64(); - - for ( ; x <= width - 4; x += 4) - { - v_float64x2 f0 = v_load(src2 + x); - v_float64x2 f1 = v_load(src2 + x + 2); - - v_float64x2 res0 
= v_scale / f0; - v_float64x2 res1 = v_scale / f1; - - res0 = v_select(f0 == v_zero, v_zero, res0); - res1 = v_select(f1 == v_zero, v_zero, res1); - - v_store(dst + x, res0); - v_store(dst + x + 2, res1); - } - - return x; - } -}; - -#endif - -#endif - -template static void -div_i( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size size, double scale ) -{ - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Div_SIMD vop; - float scale_f = (float)scale; - - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, size.width, scale); - for( ; i < size.width; i++ ) - { - T num = src1[i], denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; - } - } -} - -template static void -div_f( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size size, double scale ) -{ - T scale_f = (T)scale; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Div_SIMD vop; - - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int i = vop(src1, src2, dst, size.width, scale); - for( ; i < size.width; i++ ) - { - T num = src1[i], denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; - } - } -} - -template static void -recip_i( const T*, size_t, const T* src2, size_t step2, - T* dst, size_t step, Size size, double scale ) -{ - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Recip_SIMD vop; - float scale_f = (float)scale; - - for( ; size.height--; src2 += step2, dst += step ) - { - int i = vop(src2, dst, size.width, scale); - for( ; i < size.width; i++ ) - { - T denom = src2[i]; - dst[i] = denom != 0 ? saturate_cast(scale_f/denom) : (T)0; - } - } -} - -template static void -recip_f( const T*, size_t, const T* src2, size_t step2, - T* dst, size_t step, Size size, double scale ) -{ - T scale_f = (T)scale; - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - Recip_SIMD vop; - - for( ; size.height--; src2 += step2, dst += step ) - { - int i = vop(src2, dst, size.width, scale); - for( ; i < size.width; i++ ) - { - T denom = src2[i]; - dst[i] = denom != 0 ? 
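// Illustrative sketch, not from the patch: div_i/div_f/recip_i/recip_f above pair a SIMD
// main loop (Div_SIMD / Recip_SIMD) with a scalar tail, and both halves honour the same
// per-element contract -- a zero denominator yields 0, everything else is scaled and
// saturated to the destination type.  A scalar reference of that contract, using only
// cv::saturate_cast<> from the public core headers:
#include <opencv2/core.hpp>

template <typename T>
static void div_scale_reference(const T* num, const T* den, T* dst, int len, double scale)
{
    for (int i = 0; i < len; i++)
        dst[i] = den[i] != 0 ? cv::saturate_cast<T>(num[i] * scale / den[i]) : T(0);
}
// e.g. with T = uchar: num = 200, den = 1, scale = 2.0  ->  saturate_cast<uchar>(400.0) == 255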
saturate_cast(scale_f/denom) : (T)0; - } - } -} - - -static void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* scale) -{ - float fscale = (float)*(const double*)scale; -#if defined HAVE_IPP - CV_IPP_CHECK() - { - if (std::fabs(fscale - 1) <= FLT_EPSILON) - { - if (ippiMul_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - mul_(src1, step1, src2, step2, dst, step, sz, fscale); -} - -static void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* scale) -{ - mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale); -} - -static void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* scale) -{ - float fscale = (float)*(const double*)scale; -#if defined HAVE_IPP - CV_IPP_CHECK() - { - if (std::fabs(fscale - 1) <= FLT_EPSILON) - { - if (ippiMul_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - mul_(src1, step1, src2, step2, dst, step, sz, fscale); -} - -static void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* scale) -{ - float fscale = (float)*(const double*)scale; -#if defined HAVE_IPP - CV_IPP_CHECK() - { - if (std::fabs(fscale - 1) <= FLT_EPSILON) - { - if (ippiMul_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - mul_(src1, step1, src2, step2, dst, step, sz, fscale); -} - -static void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* scale) -{ - mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* scale) -{ - float fscale = (float)*(const double*)scale; -#if defined HAVE_IPP - CV_IPP_CHECK() - { - if (std::fabs(fscale - 1) <= FLT_EPSILON) - { - if (ippiMul_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)) >= 0) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - mul_(src1, step1, src2, step2, dst, step, sz, fscale); -} - -static void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* scale) -{ - mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* scale) -{ - if( src1 ) - div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); - else - recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* scale) -{ - div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* scale) -{ - div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void 
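// Illustrative sketch, not from the patch: each of the mul* wrappers above tries the IPP
// primitive only when the scale is effectively 1 (tested as |scale - 1| <= FLT_EPSILON,
// since the scale arrives as a double) and otherwise falls through to the generic mul_
// template.  The control shape, with a hypothetical fast_mul_row() standing in for the
// accelerated call:
#include <cfloat>
#include <cmath>

static bool fast_mul_row(const unsigned char*, const unsigned char*, unsigned char*, int)
{
    return false;   // placeholder: a real accelerated path reports success or failure
}

static void mul_row_like(const unsigned char* a, const unsigned char* b,
                         unsigned char* dst, int len, double scale)
{
    if (std::fabs(scale - 1.0) <= FLT_EPSILON && fast_mul_row(a, b, dst, len))
        return;                                    // accelerated path handled the row
    for (int i = 0; i < len; i++)                  // portable fallback with saturation
    {
        int v = (int)(a[i] * b[i] * scale + 0.5);
        dst[i] = (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
}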
div16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* scale) -{ - div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void div32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* scale) -{ - div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void div32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* scale) -{ - div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void div64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* scale) -{ - div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, Size sz, void* scale) -{ - recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* scale) -{ - recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* scale) -{ - recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* scale) -{ - recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* scale) -{ - recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* scale) -{ - recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - -static void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* scale) -{ - recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale); -} - - -static BinaryFunc* getMulTab() -{ - static BinaryFunc mulTab[] = - { - (BinaryFunc)mul8u, (BinaryFunc)mul8s, (BinaryFunc)mul16u, - (BinaryFunc)mul16s, (BinaryFunc)mul32s, (BinaryFunc)mul32f, - (BinaryFunc)mul64f, 0 - }; - - return mulTab; -} - -static BinaryFunc* getDivTab() -{ - static BinaryFunc divTab[] = - { - (BinaryFunc)div8u, (BinaryFunc)div8s, (BinaryFunc)div16u, - (BinaryFunc)div16s, (BinaryFunc)div32s, (BinaryFunc)div32f, - (BinaryFunc)div64f, 0 - }; - - return divTab; -} - -static BinaryFunc* getRecipTab() -{ - static BinaryFunc recipTab[] = - { - (BinaryFunc)recip8u, (BinaryFunc)recip8s, (BinaryFunc)recip16u, - (BinaryFunc)recip16s, (BinaryFunc)recip32s, (BinaryFunc)recip32f, - (BinaryFunc)recip64f, 0 - }; - - return recipTab; -} - -} - -void cv::multiply(InputArray src1, InputArray src2, - OutputArray dst, double scale, int dtype) -{ - arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(), - true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? 
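// Illustrative sketch, not from the patch: getMulTab()/getDivTab()/getRecipTab() above are
// plain per-depth dispatch tables; callers index them with the Mat depth code
// (CV_8U == 0 ... CV_64F == 6, seven real entries plus a terminating 0).  The same idiom
// with ordinary function pointers:
#include <cstdio>

typedef void (*RowKernel)(const void* a, const void* b, void* dst, int len);

static void kernel_8u (const void*, const void*, void*, int) { std::puts("8u kernel");  }
static void kernel_32f(const void*, const void*, void*, int) { std::puts("32f kernel"); }

static RowKernel depthTab[] = { kernel_8u, 0, 0, 0, 0, kernel_32f, 0, 0 };

int main()
{
    const int depth_32f = 5;               // matches OpenCV's depth enumeration
    if (depthTab[depth_32f])
        depthTab[depth_32f](0, 0, 0, 0);   // prints "32f kernel"
}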
OCL_OP_MUL : OCL_OP_MUL_SCALE); -} - -void cv::divide(InputArray src1, InputArray src2, - OutputArray dst, double scale, int dtype) -{ - arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE); -} - -void cv::divide(double scale, InputArray src2, - OutputArray dst, int dtype) -{ - arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE); -} - -/****************************************************************************************\ -* addWeighted * -\****************************************************************************************/ - -namespace cv -{ - -template -struct AddWeighted_SIMD -{ - int operator() (const T *, const T *, T *, int, WT, WT, WT) const - { - return 0; - } -}; - -#if CV_SSE2 - -template <> -struct AddWeighted_SIMD -{ - AddWeighted_SIMD() - { - haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - if (!haveSSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), - v_gamma = _mm_set1_ps(gamma); - - for( ; x <= width - 8; x += 8 ) - { - __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); - - __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); - __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); - - __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha); - v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta)); - - __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha); - v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta)); - - __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), - _mm_cvtps_epi32(v_dstf1)); - - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero)); - } - - return x; - } - - bool haveSSE2; -}; - -template <> -struct AddWeighted_SIMD -{ - AddWeighted_SIMD() - { - haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); - } - - int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - if (!haveSSE2) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), - v_gamma = _mm_set1_ps(gamma); - - for( ; x <= width - 8; x += 8 ) - { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); - - __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha); - v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta)); - - __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha); - v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta)); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), - _mm_cvtps_epi32(v_dstf1))); - } - - 
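// Illustrative sketch, not from the patch: seen from the public API, the plumbing above
// implements the documented element-wise operations.  A short usage example against the
// stable cv:: interface:
#include <opencv2/core.hpp>

static void arithmetic_demo(const cv::Mat& a, const cv::Mat& b, cv::Mat& out)
{
    cv::multiply(a, b, out, 1.0 / 255.0);        // out = a .* b * scale
    cv::divide(a, b, out);                       // out = a ./ b, with 0 where b == 0
    cv::divide(2.0, b, out);                     // reciprocal form: out = 2 / b
    cv::addWeighted(a, 0.7, b, 0.3, 10.0, out);  // out = 0.7*a + 0.3*b + 10
}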
return x; - } - - bool haveSSE2; -}; - -#if CV_SSE4_1 - -template <> -struct AddWeighted_SIMD -{ - AddWeighted_SIMD() - { - haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); - } - - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - if (!haveSSE4_1) - return x; - - __m128i v_zero = _mm_setzero_si128(); - __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), - v_gamma = _mm_set1_ps(gamma); - - for( ; x <= width - 8; x += 8 ) - { - __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); - __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); - - __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha); - v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta)); - - __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha); - v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), - _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta)); - - _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0), - _mm_cvtps_epi32(v_dstf1))); - } - - return x; - } - - bool haveSSE4_1; -}; - -#endif - -#elif CV_NEON - -template <> -struct AddWeighted_SIMD -{ - int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - float32x4_t g = vdupq_n_f32 (gamma); - - for( ; x <= width - 8; x += 8 ) - { - int8x8_t in1 = vld1_s8(src1 + x); - int16x8_t in1_16 = vmovl_s8(in1); - float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16))); - float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16))); - - int8x8_t in2 = vld1_s8(src2+x); - int16x8_t in2_16 = vmovl_s8(in2); - float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16))); - float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16))); - - float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); - float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); - out_f_l = vaddq_f32(out_f_l, g); - out_f_h = vaddq_f32(out_f_h, g); - - int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l)); - int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h)); - - int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h); - int8x8_t out = vqmovn_s16(out_16); - - vst1_s8(dst + x, out); - } - - return x; - } -}; - -template <> -struct AddWeighted_SIMD -{ - int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - float32x4_t g = vdupq_n_f32(gamma); - - for( ; x <= width - 8; x += 8 ) - { - uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); - - float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha); - float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta); - uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha); - v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta); - uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2)); - } - - return x; - } -}; - -template <> -struct AddWeighted_SIMD -{ - int operator() (const short * src1, 
const short * src2, short * dst, int width, float alpha, float beta, float gamma) const - { - int x = 0; - - float32x4_t g = vdupq_n_f32(gamma); - - for( ; x <= width - 8; x += 8 ) - { - int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); - - float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha); - float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta); - int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha); - v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta); - int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); - - vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2)); - } - - return x; - } -}; - -#endif - -template static void -addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2, - T* dst, size_t step, Size size, void* _scalars ) -{ - const double* scalars = (const double*)_scalars; - WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2]; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - step /= sizeof(dst[0]); - - AddWeighted_SIMD vop; - - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x = vop(src1, src2, dst, size.width, alpha, beta, gamma); - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) - { - T t0 = saturate_cast(src1[x]*alpha + src2[x]*beta + gamma); - T t1 = saturate_cast(src1[x+1]*alpha + src2[x+1]*beta + gamma); - dst[x] = t0; dst[x+1] = t1; - - t0 = saturate_cast(src1[x+2]*alpha + src2[x+2]*beta + gamma); - t1 = saturate_cast(src1[x+3]*alpha + src2[x+3]*beta + gamma); - dst[x+2] = t0; dst[x+3] = t1; - } - #endif - for( ; x < size.width; x++ ) - dst[x] = saturate_cast(src1[x]*alpha + src2[x]*beta + gamma); - } -} - - -static void -addWeighted8u( const uchar* src1, size_t step1, - const uchar* src2, size_t step2, - uchar* dst, size_t step, Size size, - void* _scalars ) -{ - const double* scalars = (const double*)_scalars; - float alpha = (float)scalars[0], beta = (float)scalars[1], gamma = (float)scalars[2]; - - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; - -#if CV_SSE2 - if( USE_SSE2 ) - { - __m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma); - __m128i z = _mm_setzero_si128(); - - for( ; x <= size.width - 8; x += 8 ) - { - __m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z); - __m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z); - - __m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z)); - __m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z)); - __m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z)); - __m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z)); - - u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4)); - u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4)); - u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4); - - u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1)); - u = _mm_packus_epi16(u, u); - - _mm_storel_epi64((__m128i*)(dst + x), u); - } - } -#elif CV_NEON - float32x4_t g = vdupq_n_f32 (gamma); - - for( ; x <= size.width - 8; x += 8 ) - { - uint8x8_t in1 = vld1_u8(src1+x); - uint16x8_t in1_16 = vmovl_u8(in1); - float32x4_t in1_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in1_16))); - float32x4_t in1_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in1_16))); - - uint8x8_t in2 = vld1_u8(src2+x); - uint16x8_t 
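// Illustrative sketch, not from the patch: every addWeighted specialization above (SSE2,
// SSE4.1, NEON and the unrolled scalar tail) evaluates the same per-element expression in
// the wider working type WT and saturates back to T.  A scalar reference:
#include <opencv2/core.hpp>

template <typename T, typename WT>
static void addWeighted_reference(const T* s1, const T* s2, T* dst, int len,
                                  WT alpha, WT beta, WT gamma)
{
    for (int i = 0; i < len; i++)
        dst[i] = cv::saturate_cast<T>(s1[i] * alpha + s2[i] * beta + gamma);
}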
in2_16 = vmovl_u8(in2); - float32x4_t in2_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in2_16))); - float32x4_t in2_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in2_16))); - - float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); - float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); - out_f_l = vaddq_f32(out_f_l, g); - out_f_h = vaddq_f32(out_f_h, g); - - uint16x4_t out_16_l = vqmovun_s32(cv_vrndq_s32_f32(out_f_l)); - uint16x4_t out_16_h = vqmovun_s32(cv_vrndq_s32_f32(out_f_h)); - - uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h); - uint8x8_t out = vqmovn_u16(out_16); - - vst1_u8(dst+x, out); - } -#endif - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) - { - float t0, t1; - t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; - t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma; - - dst[x] = saturate_cast(t0); - dst[x+1] = saturate_cast(t1); - - t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma; - t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma; - - dst[x+2] = saturate_cast(t0); - dst[x+3] = saturate_cast(t1); - } - #endif - - for( ; x < size.width; x++ ) - { - float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; - dst[x] = saturate_cast(t0); - } - } -} - -static void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, - schar* dst, size_t step, Size sz, void* scalars ) -{ - addWeighted_(src1, step1, src2, step2, dst, step, sz, scalars); -} - -static void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, - ushort* dst, size_t step, Size sz, void* scalars ) -{ - addWeighted_(src1, step1, src2, step2, dst, step, sz, scalars); -} - -static void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, - short* dst, size_t step, Size sz, void* scalars ) -{ - addWeighted_(src1, step1, src2, step2, dst, step, sz, scalars); -} - -static void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, - int* dst, size_t step, Size sz, void* scalars ) -{ - addWeighted_(src1, step1, src2, step2, dst, step, sz, scalars); -} - -static void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, - float* dst, size_t step, Size sz, void* scalars ) -{ - addWeighted_(src1, step1, src2, step2, dst, step, sz, scalars); -} - -static void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, - double* dst, size_t step, Size sz, void* scalars ) -{ - addWeighted_(src1, step1, src2, step2, dst, step, sz, scalars); -} - -static BinaryFunc* getAddWeightedTab() -{ - static BinaryFunc addWeightedTab[] = - { - (BinaryFunc)GET_OPTIMIZED(addWeighted8u), (BinaryFunc)GET_OPTIMIZED(addWeighted8s), (BinaryFunc)GET_OPTIMIZED(addWeighted16u), - (BinaryFunc)GET_OPTIMIZED(addWeighted16s), (BinaryFunc)GET_OPTIMIZED(addWeighted32s), (BinaryFunc)addWeighted32f, - (BinaryFunc)addWeighted64f, 0 - }; - - return addWeightedTab; -} - -} - -void cv::addWeighted( InputArray src1, double alpha, InputArray src2, - double beta, double gamma, OutputArray dst, int dtype ) -{ - double scalars[] = {alpha, beta, gamma}; - arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars, OCL_OP_ADDW); -} - - -/****************************************************************************************\ -* compare * 
-\****************************************************************************************/ - -namespace cv -{ - -template -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int) - { - } - - int operator () (const T *, const T *, uchar *, int) const - { - return 0; - } -}; - -#if CV_NEON - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - CV_Assert(code == CMP_GT || code == CMP_LE || - code == CMP_EQ || code == CMP_NE); - - v_mask = vdupq_n_u8(255); - } - - int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); - else if (code == CMP_LE) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); - else if (code == CMP_EQ) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); - else if (code == CMP_NE) - for ( ; x <= width - 16; x += 16) - vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask)); - - return x; - } - - int code; - uint8x16_t v_mask; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - CV_Assert(code == CMP_GT || code == CMP_LE || - code == CMP_EQ || code == CMP_NE); - - v_mask = vdup_n_u8(255); - } - - int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, vmovn_u16(v_dst)); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, vmovn_u16(v_dst)); - } - else if (code == CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, vmovn_u16(v_dst)); - } - else if (code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); - vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask)); - } - - return x; - } - - int code; - uint8x8_t v_mask; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - CV_Assert(code == CMP_GT || code == CMP_LE || - code == CMP_EQ || code == CMP_NE); - - v_mask = vdup_n_u8(255); - } - - int operator () (const int * src1, const int * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if 
(code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); - uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); - uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); - vst1_u8(dst + x, veor_u8(v_dst, v_mask)); - } - - return x; - } - - int code; - uint8x8_t v_mask; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - CV_Assert(code == CMP_GT || code == CMP_LE || - code == CMP_EQ || code == CMP_NE); - - v_mask = vdup_n_u8(255); - } - - int operator () (const float * src1, const float * src2, uchar * dst, int width) const - { - int x = 0; - - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); - } - else if (code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); - uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); - uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); - vst1_u8(dst + x, veor_u8(v_dst, v_mask)); - } - - return x; - } - - int code; - uint8x8_t v_mask; -}; - -#elif CV_SSE2 - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int code_) : - code(code_) - { - CV_Assert(code == CMP_GT || code == CMP_LE || - code == CMP_EQ || code == CMP_NE); - - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - - v_mask = _mm_set1_epi8(-1); - } - - int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const - { - int x = 0; - - if (!haveSSE) - return x; - - if (code == CMP_GT) - for ( ; x <= width - 16; x += 16) - _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x)))); - else if (code == CMP_LE) - for ( ; x <= width - 16; x += 16) - { - __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt)); - } - else if (code == CMP_EQ) - for ( ; x <= width - 16; x += 16) - _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x)))); - else if (code == CMP_NE) - for ( ; x <= width - 16; x += 16) - { - __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq)); - } - - return x; - } - - int code; - __m128i v_mask; - bool haveSSE; -}; - -template <> -struct Cmp_SIMD -{ - explicit Cmp_SIMD(int 
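// Illustrative sketch, not from the patch: all of the Cmp_SIMD specializations above
// produce the 0/255 mask that cv::compare() documents.  At the public-API level:
#include <opencv2/core.hpp>

static void compare_demo(const cv::Mat& a, const cv::Mat& b)
{
    cv::Mat mask;
    cv::compare(a, b, mask, cv::CMP_GT);    // CV_8U mask: 255 where a > b, 0 elsewhere
    cv::compare(a, 5.0, mask, cv::CMP_EQ);  // a scalar second operand is also accepted
}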
code_) : - code(code_) - { - CV_Assert(code == CMP_GT || code == CMP_LE || - code == CMP_EQ || code == CMP_NE); - - haveSSE = checkHardwareSupport(CV_CPU_SSE2); - - v_mask = _mm_set1_epi32(0xffffffff); - } - - int operator () (const int * src1, const int * src2, uchar * dst, int width) const - { - int x = 0; - - if (!haveSSE) - return x; + _buf.allocate(bufesz*blocksize + 64); + buf = _buf; + if( cvtsrc1 ) + buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16); + buf2 = buf; buf = alignPtr(buf + blocksize*wsz, 16); + wbuf = maskbuf = buf; + if( cvtdst ) + buf = alignPtr(buf + blocksize*wsz, 16); + if( haveMask ) + maskbuf = buf; - if (code == CMP_GT) - for ( ; x <= width - 8; x += 8) - { - __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + convertAndUnrollScalar( src2, wtype, buf2, blocksize); - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); - } - else if (code == CMP_LE) - for ( ; x <= width - 8; x += 8) + for( size_t i = 0; i < it.nplanes; i++, ++it ) + { + for( size_t j = 0; j < total; j += blocksize ) { - __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + int bsz = (int)MIN(total - j, blocksize); + Size bszn(bsz*cn, 1); + const uchar *sptr1 = ptrs[0]; + const uchar* sptr2 = buf2; + uchar* dptr = ptrs[1]; - _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask)); - } - else if (code == CMP_EQ) - for ( ; x <= width - 8; x += 8) - { - __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + if( cvtsrc1 ) + { + cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 ); + sptr1 = buf1; + } - _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); - } - else if (code == CMP_NE) - for ( ; x <= width - 8; x += 8) - { - __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), - _mm_loadu_si128((const __m128i *)(src2 + x))); - __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), - _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + if( swapped12 ) + std::swap(sptr1, sptr2); - _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask))); + if( !haveMask && !cvtdst ) + func( sptr1, 1, sptr2, 1, dptr, 1, bszn.width, bszn.height, usrdata ); + else + { + func( sptr1, 1, sptr2, 1, wbuf, 1, bszn.width, bszn.height, usrdata ); + if( !haveMask ) + cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 ); + else if( !cvtdst ) + { + copymask( wbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz ); + ptrs[2] += bsz; + } + else + { + cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 ); + copymask( maskbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz ); + ptrs[2] += bsz; + } + } + ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz; } - - return x; + } } +} - int code; - __m128i v_mask; - bool haveSSE; -}; +static BinaryFuncC* getAddTab() +{ + static BinaryFuncC 
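// Illustrative sketch, not from the patch: the plane/block loop above follows the usual
// arithmetic-core shape -- the scalar operand is expanded once into buf2, then each plane
// is cut into blocks of at most `blocksize` elements so the temporaries stay cache-sized.
// The block walk on its own:
#include <algorithm>
#include <cstddef>

static void walk_in_blocks(size_t total, size_t blocksize)
{
    for (size_t j = 0; j < total; j += blocksize)
    {
        int bsz = (int)std::min(total - j, blocksize);   // the last block may be short
        (void)bsz;  // a real kernel call would process bsz elements starting at offset j
    }
}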
addTab[] = + { + (BinaryFuncC)GET_OPTIMIZED(cv::hal::add8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::add8s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::add16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::add16s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::add32s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::add32f), (BinaryFuncC)cv::hal::add64f, + 0 + }; -#endif + return addTab; +} -template static void -cmp_(const T* src1, size_t step1, const T* src2, size_t step2, - uchar* dst, size_t step, Size size, int code) +static BinaryFuncC* getSubTab() { - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - if( code == CMP_GE || code == CMP_LT ) + static BinaryFuncC subTab[] = { - std::swap(src1, src2); - std::swap(step1, step2); - code = code == CMP_GE ? CMP_LE : CMP_GT; - } + (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub8s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub16s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub32s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::sub32f), (BinaryFuncC)cv::hal::sub64f, + 0 + }; - Cmp_SIMD vop(code); + return subTab; +} - if( code == CMP_GT || code == CMP_LE ) - { - int m = code == CMP_GT ? 0 : 255; - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x = vop(src1, src2, dst, size.width); - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) - { - int t0, t1; - t0 = -(src1[x] > src2[x]) ^ m; - t1 = -(src1[x+1] > src2[x+1]) ^ m; - dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; - t0 = -(src1[x+2] > src2[x+2]) ^ m; - t1 = -(src1[x+3] > src2[x+3]) ^ m; - dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; - } - #endif - for( ; x < size.width; x++ ) - dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } - } - else if( code == CMP_EQ || code == CMP_NE ) +static BinaryFuncC* getAbsDiffTab() +{ + static BinaryFuncC absDiffTab[] = { - int m = code == CMP_EQ ? 0 : 255; - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; - #if CV_ENABLE_UNROLLED - for( ; x <= size.width - 4; x += 4 ) - { - int t0, t1; - t0 = -(src1[x] == src2[x]) ^ m; - t1 = -(src1[x+1] == src2[x+1]) ^ m; - dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; - t0 = -(src1[x+2] == src2[x+2]) ^ m; - t1 = -(src1[x+3] == src2[x+3]) ^ m; - dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; - } - #endif - for( ; x < size.width; x++ ) - dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); - } - } + (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff8s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff16s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff32s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::absdiff32f), (BinaryFuncC)cv::hal::absdiff64f, + 0 + }; + + return absDiffTab; +} + } -#if ARITHM_USE_IPP -inline static IppCmpOp convert_cmp(int _cmpop) +void cv::add( InputArray src1, InputArray src2, OutputArray dst, + InputArray mask, int dtype ) { - return _cmpop == CMP_EQ ? ippCmpEq : - _cmpop == CMP_GT ? ippCmpGreater : - _cmpop == CMP_GE ? ippCmpGreaterEq : - _cmpop == CMP_LT ? ippCmpLess : - _cmpop == CMP_LE ? 
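// Illustrative sketch, not from the patch: the new tables dispatch to the HAL kernels
// (cv::hal::add8u and friends), which take raw row pointers, byte steps and an explicit
// width/height pair instead of a cv::Size.  The exact BinaryFuncC typedef lives in the
// module's private headers, so the signature below only shows that calling shape:
#include <cstddef>

static void add8u_like(const unsigned char* src1, size_t step1,
                       const unsigned char* src2, size_t step2,
                       unsigned char* dst, size_t step,
                       int width, int height)
{
    for (; height--; src1 += step1, src2 += step2, dst += step)
        for (int x = 0; x < width; x++)
        {
            int v = src1[x] + src2[x];
            dst[x] = (unsigned char)(v > 255 ? 255 : v);   // saturating 8-bit add
        }
}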
ippCmpLessEq : - (IppCmpOp)-1; + arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD ); } -#endif -static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) +void cv::subtract( InputArray _src1, InputArray _src2, OutputArray _dst, + InputArray mask, int dtype ) { -#if ARITHM_USE_IPP - CV_IPP_CHECK() - { - IppCmpOp op = convert_cmp(*(int *)_cmpop); - if( op >= 0 ) - { - fixSteps(size, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiCompare_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - //vz optimized cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); - int code = *(int*)_cmpop; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - if( code == CMP_GE || code == CMP_LT ) +#ifdef HAVE_TEGRA_OPTIMIZATION + if (tegra::useTegra()) { - std::swap(src1, src2); - std::swap(step1, step2); - code = code == CMP_GE ? CMP_LE : CMP_GT; - } + int kind1 = _src1.kind(), kind2 = _src2.kind(); + Mat src1 = _src1.getMat(), src2 = _src2.getMat(); + bool src1Scalar = checkScalar(src1, _src2.type(), kind1, kind2); + bool src2Scalar = checkScalar(src2, _src1.type(), kind2, kind1); - if( code == CMP_GT || code == CMP_LE ) - { - int m = code == CMP_GT ? 0 : 255; - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) + if (!src1Scalar && !src2Scalar && + src1.depth() == CV_8U && src2.type() == src1.type() && + src1.dims == 2 && src2.size() == src1.size() && + mask.empty()) { - int x =0; - #if CV_SSE2 - if( USE_SSE2 ) + if (dtype < 0) { - __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1); - __m128i c128 = _mm_set1_epi8 (-128); - for( ; x <= size.width - 16; x += 16 ) + if (_dst.fixedType()) { - __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); - __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); - // no simd for 8u comparison, that's why we need the trick - r00 = _mm_sub_epi8(r00,c128); - r10 = _mm_sub_epi8(r10,c128); - - r00 =_mm_xor_si128(_mm_cmpgt_epi8(r00, r10), m128); - _mm_storeu_si128((__m128i*)(dst + x),r00); - + dtype = _dst.depth(); + } + else + { + dtype = src1.depth(); } - } - #elif CV_NEON - uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); - - for( ; x <= size.width - 16; x += 16 ) - { - vst1q_u8(dst+x, veorq_u8(vcgtq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask)); } - #endif + dtype = CV_MAT_DEPTH(dtype); - for( ; x < size.width; x++ ){ - dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } - } - } - else if( code == CMP_EQ || code == CMP_NE ) - { - int m = code == CMP_EQ ? 0 : 255; - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; - #if CV_SSE2 - if( USE_SSE2 ) + if (!_dst.fixedType() || dtype == _dst.depth()) { - __m128i m128 = code == CMP_EQ ? 
_mm_setzero_si128() : _mm_set1_epi8 (-1); - for( ; x <= size.width - 16; x += 16 ) + _dst.create(src1.size(), CV_MAKE_TYPE(dtype, src1.channels())); + + if (dtype == CV_16S) { - __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); - __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); - r00 = _mm_xor_si128 ( _mm_cmpeq_epi8 (r00, r10), m128); - _mm_storeu_si128((__m128i*)(dst + x), r00); + Mat dst = _dst.getMat(); + if(tegra::subtract_8u8u16s(src1, src2, dst)) + return; + } + else if (dtype == CV_32F) + { + Mat dst = _dst.getMat(); + if(tegra::subtract_8u8u32f(src1, src2, dst)) + return; + } + else if (dtype == CV_8S) + { + Mat dst = _dst.getMat(); + if(tegra::subtract_8u8u8s(src1, src2, dst)) + return; } } - #elif CV_NEON - uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255); - - for( ; x <= size.width - 16; x += 16 ) - { - vst1q_u8(dst+x, veorq_u8(vceqq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask)); - } - #endif - for( ; x < size.width; x++ ) - dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); } } +#endif + arithm_op(_src1, _src2, _dst, mask, dtype, getSubTab(), false, 0, OCL_OP_SUB ); } -static void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) +void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst ) { - cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); + arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF); } -static void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) +/****************************************************************************************\ +* multiply/divide * +\****************************************************************************************/ + +namespace cv +{ + +static BinaryFuncC* getMulTab() { -#if ARITHM_USE_IPP - CV_IPP_CHECK() + static BinaryFuncC mulTab[] = { - IppCmpOp op = convert_cmp(*(int *)_cmpop); - if( op >= 0 ) - { - fixSteps(size, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiCompare_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); + (BinaryFuncC)cv::hal::mul8u, (BinaryFuncC)cv::hal::mul8s, (BinaryFuncC)cv::hal::mul16u, + (BinaryFuncC)cv::hal::mul16s, (BinaryFuncC)cv::hal::mul32s, (BinaryFuncC)cv::hal::mul32f, + (BinaryFuncC)cv::hal::mul64f, 0 + }; + + return mulTab; } -static void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) +static BinaryFuncC* getDivTab() { -#if ARITHM_USE_IPP - CV_IPP_CHECK() + static BinaryFuncC divTab[] = { - IppCmpOp op = convert_cmp(*(int *)_cmpop); - if( op >= 0 ) - { - fixSteps(size, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiCompare_16s_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - //vz optimized cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); + (BinaryFuncC)cv::hal::div8u, (BinaryFuncC)cv::hal::div8s, (BinaryFuncC)cv::hal::div16u, + (BinaryFuncC)cv::hal::div16s, (BinaryFuncC)cv::hal::div32s, (BinaryFuncC)cv::hal::div32f, + (BinaryFuncC)cv::hal::div64f, 0 + }; - int code = *(int*)_cmpop; - step1 /= sizeof(src1[0]); - step2 /= sizeof(src2[0]); - if( code == CMP_GE || code == 
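// Illustrative sketch, not from the patch: the scalar tails above lean on one bit trick --
// (a > b) is 0 or 1 in C++, so -(a > b) is 0x00 or 0xFF once truncated to uchar, and
// XOR-ing with m (0 for CMP_GT/CMP_EQ, 255 for CMP_LE/CMP_NE) yields the complementary
// predicate without a branch.  A quick check of that identity:
#include <cassert>

static unsigned char cmp_mask(int a, int b, int m)
{
    return (unsigned char)(-(a > b) ^ m);
}

int main()
{
    assert(cmp_mask(3, 2,   0) == 255);   // GT holds
    assert(cmp_mask(2, 3,   0) ==   0);   // GT fails
    assert(cmp_mask(3, 2, 255) ==   0);   // LE is the complement
    assert(cmp_mask(2, 3, 255) == 255);
    return 0;
}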
CMP_LT ) - { - std::swap(src1, src2); - std::swap(step1, step2); - code = code == CMP_GE ? CMP_LE : CMP_GT; - } + return divTab; +} - if( code == CMP_GT || code == CMP_LE ) +static BinaryFuncC* getRecipTab() +{ + static BinaryFuncC recipTab[] = { - int m = code == CMP_GT ? 0 : 255; - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x =0; - #if CV_SSE2 - if( USE_SSE2) - { - __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1); - for( ; x <= size.width - 16; x += 16 ) - { - __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); - __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); - r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128); - __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8)); - __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8)); - r01 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r01, r11), m128); - r11 = _mm_packs_epi16(r00, r01); - _mm_storeu_si128((__m128i*)(dst + x), r11); - } - if( x <= size.width-8) - { - __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); - __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); - r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128); - r10 = _mm_packs_epi16(r00, r00); - _mm_storel_epi64((__m128i*)(dst + x), r10); + (BinaryFuncC)cv::hal::recip8u, (BinaryFuncC)cv::hal::recip8s, (BinaryFuncC)cv::hal::recip16u, + (BinaryFuncC)cv::hal::recip16s, (BinaryFuncC)cv::hal::recip32s, (BinaryFuncC)cv::hal::recip32f, + (BinaryFuncC)cv::hal::recip64f, 0 + }; - x += 8; - } - } - #elif CV_NEON - uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); + return recipTab; +} - for( ; x <= size.width - 16; x += 16 ) - { - int16x8_t in1 = vld1q_s16(src1 + x); - int16x8_t in2 = vld1q_s16(src2 + x); - uint8x8_t t1 = vmovn_u16(vcgtq_s16(in1, in2)); +} - in1 = vld1q_s16(src1 + x + 8); - in2 = vld1q_s16(src2 + x + 8); - uint8x8_t t2 = vmovn_u16(vcgtq_s16(in1, in2)); +void cv::multiply(InputArray src1, InputArray src2, + OutputArray dst, double scale, int dtype) +{ + arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(), + true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE); +} - vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); - } - #endif +void cv::divide(InputArray src1, InputArray src2, + OutputArray dst, double scale, int dtype) +{ + arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE); +} - for( ; x < size.width; x++ ){ - dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); - } - } - } - else if( code == CMP_EQ || code == CMP_NE ) - { - int m = code == CMP_EQ ? 0 : 255; - for( ; size.height--; src1 += step1, src2 += step2, dst += step ) - { - int x = 0; - #if CV_SSE2 - if( USE_SSE2 ) - { - __m128i m128 = code == CMP_EQ ? 
_mm_setzero_si128() : _mm_set1_epi16 (-1); - for( ; x <= size.width - 16; x += 16 ) - { - __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); - __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); - r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128); - __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8)); - __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8)); - r01 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r01, r11), m128); - r11 = _mm_packs_epi16(r00, r01); - _mm_storeu_si128((__m128i*)(dst + x), r11); - } - if( x <= size.width - 8) - { - __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); - __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); - r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128); - r10 = _mm_packs_epi16(r00, r00); - _mm_storel_epi64((__m128i*)(dst + x), r10); +void cv::divide(double scale, InputArray src2, + OutputArray dst, int dtype) +{ + arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE); +} - x += 8; - } - } - #elif CV_NEON - uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255); +/****************************************************************************************\ +* addWeighted * +\****************************************************************************************/ - for( ; x <= size.width - 16; x += 16 ) - { - int16x8_t in1 = vld1q_s16(src1 + x); - int16x8_t in2 = vld1q_s16(src2 + x); - uint8x8_t t1 = vmovn_u16(vceqq_s16(in1, in2)); +namespace cv +{ - in1 = vld1q_s16(src1 + x + 8); - in2 = vld1q_s16(src2 + x + 8); - uint8x8_t t2 = vmovn_u16(vceqq_s16(in1, in2)); +static BinaryFuncC* getAddWeightedTab() +{ + static BinaryFuncC addWeightedTab[] = + { + (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted8s), (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted16u), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted16s), (BinaryFuncC)GET_OPTIMIZED(cv::hal::addWeighted32s), (BinaryFuncC)cv::hal::addWeighted32f, + (BinaryFuncC)cv::hal::addWeighted64f, 0 + }; - vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); - } - #endif - for( ; x < size.width; x++ ) - dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); - } - } + return addWeightedTab; } -static void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) -{ - cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); } -static void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) +void cv::addWeighted( InputArray src1, double alpha, InputArray src2, + double beta, double gamma, OutputArray dst, int dtype ) { -#if ARITHM_USE_IPP - CV_IPP_CHECK() - { - IppCmpOp op = convert_cmp(*(int *)_cmpop); - if( op >= 0 ) - { - fixSteps(size, sizeof(dst[0]), step1, step2, step); - if (0 <= ippiCompare_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op)) - { - CV_IMPL_ADD(CV_IMPL_IPP); - return; - } - setIppErrorStatus(); - } - } -#endif - cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); + double scalars[] = {alpha, beta, gamma}; + arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars, OCL_OP_ADDW); } -static void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, - uchar* dst, size_t step, Size size, void* _cmpop) + +/****************************************************************************************\ +* compare * 
+\****************************************************************************************/ + +namespace cv { - cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop); -} -static BinaryFunc getCmpFunc(int depth) +static BinaryFuncC getCmpFunc(int depth) { - static BinaryFunc cmpTab[] = + static BinaryFuncC cmpTab[] = { - (BinaryFunc)GET_OPTIMIZED(cmp8u), (BinaryFunc)GET_OPTIMIZED(cmp8s), - (BinaryFunc)GET_OPTIMIZED(cmp16u), (BinaryFunc)GET_OPTIMIZED(cmp16s), - (BinaryFunc)GET_OPTIMIZED(cmp32s), - (BinaryFunc)GET_OPTIMIZED(cmp32f), (BinaryFunc)cmp64f, + (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp8u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp8s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp16u), (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp16s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp32s), + (BinaryFuncC)GET_OPTIMIZED(cv::hal::cmp32f), (BinaryFuncC)cv::hal::cmp64f, 0 }; @@ -5020,7 +1230,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op) _dst.create(src1.size(), CV_8UC(cn)); Mat dst = _dst.getMat(); Size sz = getContinuousSize(src1, src2, dst, src1.channels()); - getCmpFunc(src1.depth())(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, &op); + getCmpFunc(src1.depth())(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz.width, sz.height, &op); return; } @@ -5032,7 +1242,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op) size_t esz = src1.elemSize(); size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz; - BinaryFunc func = getCmpFunc(depth1); + BinaryFuncC func = getCmpFunc(depth1); if( !haveScalar ) { @@ -5043,7 +1253,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op) size_t total = it.size; for( size_t i = 0; i < it.nplanes; i++, ++it ) - func( ptrs[0], 0, ptrs[1], 0, ptrs[2], 0, Size((int)total, 1), &op ); + func( ptrs[0], 0, ptrs[1], 0, ptrs[2], 0, (int)total, 1, &op ); } else { @@ -5095,7 +1305,7 @@ void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op) for( size_t j = 0; j < total; j += blocksize ) { int bsz = (int)MIN(total - j, blocksize); - func( ptrs[0], 0, buf, 0, ptrs[1], 0, Size(bsz, 1), &op); + func( ptrs[0], 0, buf, 0, ptrs[1], 0, bsz, 1, &op); ptrs[0] += bsz*esz; ptrs[1] += bsz; } diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp index fbbea5e1b169374ea2a00fe5fa1b9157ebf593ed..6c693a43a07b19bbbead6abf287acb9a38ab0249 100644 --- a/modules/core/src/convert.cpp +++ b/modules/core/src/convert.cpp @@ -42,6 +42,7 @@ //M*/ #include "precomp.hpp" + #include "opencl_kernels_core.hpp" #ifdef __APPLE__ @@ -49,776 +50,37 @@ #define CV_NEON 0 #endif -namespace cv -{ /****************************************************************************************\ * split & merge * \****************************************************************************************/ -#if CV_NEON -template struct VSplit2; -template struct VSplit3; -template struct VSplit4; - -#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name \ - { \ - void operator()(const data_type* src, data_type* dst0, \ - data_type* dst1) const \ - { \ - reg_type r = load_func(src); \ - store_func(dst0, r.val[0]); \ - store_func(dst1, r.val[1]); \ - } \ - } - -#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name \ - { \ - void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ - data_type* dst2) const \ - { \ - reg_type r = 
load_func(src); \ - store_func(dst0, r.val[0]); \ - store_func(dst1, r.val[1]); \ - store_func(dst2, r.val[2]); \ - } \ - } - -#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name \ - { \ - void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ - data_type* dst2, data_type* dst3) const \ - { \ - reg_type r = load_func(src); \ - store_func(dst0, r.val[0]); \ - store_func(dst1, r.val[1]); \ - store_func(dst2, r.val[2]); \ - store_func(dst3, r.val[3]); \ - } \ - } - -SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 ); -SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16); -SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32); -SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 ); - -SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 ); -SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16); -SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32); -SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 ); - -SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 ); -SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16); -SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32); -SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 ); - -#elif CV_SSE2 - -template -struct VSplit2 -{ - VSplit2() : support(false) { } - void operator()(const T *, T *, T *) const { } - - bool support; -}; - -template -struct VSplit3 -{ - VSplit3() : support(false) { } - void operator()(const T *, T *, T *, T *) const { } - - bool support; -}; - -template -struct VSplit4 -{ - VSplit4() : support(false) { } - void operator()(const T *, T *, T *, T *, T *) const { } - - bool support; -}; - -#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ -template <> \ -struct VSplit2 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VSplit2() \ - { \ - support = checkHardwareSupport(CV_CPU_SSE2); \ - } \ - \ - void operator()(const data_type * src, \ - data_type * dst0, data_type * dst1) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ - reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ - reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ - \ - _mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \ - \ - _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ - } \ - \ - bool support; \ -} - -#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ -template <> \ -struct VSplit3 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VSplit3() \ - { \ - support = checkHardwareSupport(CV_CPU_SSE2); \ - } \ - \ - void operator()(const data_type * src, \ - data_type * dst0, data_type * dst1, data_type * dst2) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ - reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src 
+ ELEMS_IN_VEC * 2)); \ - reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ - reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ - reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ - \ - _mm_deinterleave(v_src0, v_src1, v_src2, \ - v_src3, v_src4, v_src5); \ - \ - _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ - } \ - \ - bool support; \ -} - -#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ -template <> \ -struct VSplit4 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VSplit4() \ - { \ - support = checkHardwareSupport(CV_CPU_SSE2); \ - } \ - \ - void operator()(const data_type * src, data_type * dst0, data_type * dst1, \ - data_type * dst2, data_type * dst3) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ - reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ - reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ - reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ - reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ - reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \ - reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \ - \ - _mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \ - v_src4, v_src5, v_src6, v_src7); \ - \ - _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ - _mm_storeu_##flavor((cast_type *)(dst3), v_src6); \ - _mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \ - } \ - \ - bool support; \ -} - -SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); -SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); -SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); - -SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); -SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); -SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); - -SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); -SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); -SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); - -#endif - -template static void -split_( const T* src, T** dst, int len, int cn ) -{ - int k = cn % 4 ? 
cn % 4 : 4; - int i, j; - if( k == 1 ) - { - T* dst0 = dst[0]; - - if(cn == 1) - { - memcpy(dst0, src, len * sizeof(T)); - } - else - { - for( i = 0, j = 0 ; i < len; i++, j += cn ) - dst0[i] = src[j]; - } - } - else if( k == 2 ) - { - T *dst0 = dst[0], *dst1 = dst[1]; - i = j = 0; - -#if CV_NEON - if(cn == 2) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 2 * inc_i; - - VSplit2 vsplit; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i); - } -#elif CV_SSE2 - if (cn == 2) - { - int inc_i = 32/sizeof(T); - int inc_j = 2 * inc_i; - - VSplit2 vsplit; - if (vsplit.support) - { - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i); - } - } -#endif - for( ; i < len; i++, j += cn ) - { - dst0[i] = src[j]; - dst1[i] = src[j+1]; - } - } - else if( k == 3 ) - { - T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2]; - i = j = 0; - -#if CV_NEON - if(cn == 3) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 3 * inc_i; - - VSplit3 vsplit; - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); - } -#elif CV_SSE2 - if (cn == 3) - { - int inc_i = 32/sizeof(T); - int inc_j = 3 * inc_i; - - VSplit3 vsplit; - - if (vsplit.support) - { - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); - } - } -#endif - for( ; i < len; i++, j += cn ) - { - dst0[i] = src[j]; - dst1[i] = src[j+1]; - dst2[i] = src[j+2]; - } - } - else - { - T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3]; - i = j = 0; - -#if CV_NEON - if(cn == 4) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 4 * inc_i; - - VSplit4 vsplit; - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); - } -#elif CV_SSE2 - if (cn == 4) - { - int inc_i = 32/sizeof(T); - int inc_j = 4 * inc_i; - - VSplit4 vsplit; - if (vsplit.support) - { - for( ; i <= len - inc_i; i += inc_i, j += inc_j) - vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); - } - } -#endif - for( ; i < len; i++, j += cn ) - { - dst0[i] = src[j]; dst1[i] = src[j+1]; - dst2[i] = src[j+2]; dst3[i] = src[j+3]; - } - } - - for( ; k < cn; k += 4 ) - { - T *dst0 = dst[k], *dst1 = dst[k+1], *dst2 = dst[k+2], *dst3 = dst[k+3]; - for( i = 0, j = k; i < len; i++, j += cn ) - { - dst0[i] = src[j]; dst1[i] = src[j+1]; - dst2[i] = src[j+2]; dst3[i] = src[j+3]; - } - } -} - - -#if CV_NEON -template struct VMerge2; -template struct VMerge3; -template struct VMerge4; - -#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name{ \ - void operator()(const data_type* src0, const data_type* src1, \ - data_type* dst){ \ - reg_type r; \ - r.val[0] = load_func(src0); \ - r.val[1] = load_func(src1); \ - store_func(dst, r); \ - } \ - } - -#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name{ \ - void operator()(const data_type* src0, const data_type* src1, \ - const data_type* src2, data_type* dst){ \ - reg_type r; \ - r.val[0] = load_func(src0); \ - r.val[1] = load_func(src1); \ - r.val[2] = load_func(src2); \ - store_func(dst, r); \ - } \ - } - -#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ - template<> \ - struct name{ \ - void operator()(const data_type* src0, const data_type* src1, \ - const data_type* src2, const data_type* src3, \ - data_type* dst){ \ - reg_type r; \ - r.val[0] = 
load_func(src0); \ - r.val[1] = load_func(src1); \ - r.val[2] = load_func(src2); \ - r.val[3] = load_func(src3); \ - store_func(dst, r); \ - } \ - } - -MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 ); -MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16); -MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32); -MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 ); - -MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 ); -MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16); -MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32); -MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 ); - -MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 ); -MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16); -MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32); -MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 ); - -#elif CV_SSE2 - -template -struct VMerge2 -{ - VMerge2() : support(false) { } - void operator()(const T *, const T *, T *) const { } - - bool support; -}; - -template -struct VMerge3 -{ - VMerge3() : support(false) { } - void operator()(const T *, const T *, const T *, T *) const { } - - bool support; -}; - -template -struct VMerge4 -{ - VMerge4() : support(false) { } - void operator()(const T *, const T *, const T *, const T *, T *) const { } - - bool support; -}; - -#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ -template <> \ -struct VMerge2 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VMerge2() \ - { \ - support = checkHardwareSupport(se); \ - } \ - \ - void operator()(const data_type * src0, const data_type * src1, \ - data_type * dst) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ - reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ - reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ - \ - _mm_interleave(v_src0, v_src1, v_src2, v_src3); \ - \ - _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ - } \ - \ - bool support; \ -} - -#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ -template <> \ -struct VMerge3 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VMerge3() \ - { \ - support = checkHardwareSupport(se); \ - } \ - \ - void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\ - data_type * dst) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ - reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ - reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ - reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ - reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ - \ - _mm_interleave(v_src0, v_src1, v_src2, \ - v_src3, v_src4, v_src5); \ - \ - _mm_storeu_##flavor((cast_type 
*)(dst), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ - } \ - \ - bool support; \ -} - -#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ -template <> \ -struct VMerge4 \ -{ \ - enum \ - { \ - ELEMS_IN_VEC = 16 / sizeof(data_type) \ - }; \ - \ - VMerge4() \ - { \ - support = checkHardwareSupport(se); \ - } \ - \ - void operator()(const data_type * src0, const data_type * src1, \ - const data_type * src2, const data_type * src3, \ - data_type * dst) const \ - { \ - reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ - reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ - reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ - reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ - reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ - reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ - reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \ - reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \ - \ - _mm_interleave(v_src0, v_src1, v_src2, v_src3, \ - v_src4, v_src5, v_src6, v_src7); \ - \ - _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \ - _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \ - } \ - \ - bool support; \ -} - -MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); -MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); -MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); - -#if CV_SSE4_1 -MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); -MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); -MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); -#endif - -MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); -MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); -MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); - -#endif - -template static void -merge_( const T** src, T* dst, int len, int cn ) -{ - int k = cn % 4 ? cn % 4 : 4; - int i, j; - if( k == 1 ) - { - const T* src0 = src[0]; - for( i = j = 0; i < len; i++, j += cn ) - dst[j] = src0[i]; - } - else if( k == 2 ) - { - const T *src0 = src[0], *src1 = src[1]; - i = j = 0; -#if CV_NEON - if(cn == 2) - { - int inc_i = (sizeof(T) == 8)? 
1: 16/sizeof(T); - int inc_j = 2 * inc_i; - - VMerge2 vmerge; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, dst + j); - } -#elif CV_SSE2 - if(cn == 2) - { - int inc_i = 32/sizeof(T); - int inc_j = 2 * inc_i; - - VMerge2 vmerge; - if (vmerge.support) - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, dst + j); - } -#endif - for( ; i < len; i++, j += cn ) - { - dst[j] = src0[i]; - dst[j+1] = src1[i]; - } - } - else if( k == 3 ) - { - const T *src0 = src[0], *src1 = src[1], *src2 = src[2]; - i = j = 0; -#if CV_NEON - if(cn == 3) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 3 * inc_i; - - VMerge3 vmerge; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, dst + j); - } -#elif CV_SSE2 - if(cn == 3) - { - int inc_i = 32/sizeof(T); - int inc_j = 3 * inc_i; - - VMerge3 vmerge; - if (vmerge.support) - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, dst + j); - } -#endif - for( ; i < len; i++, j += cn ) - { - dst[j] = src0[i]; - dst[j+1] = src1[i]; - dst[j+2] = src2[i]; - } - } - else - { - const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3]; - i = j = 0; -#if CV_NEON - if(cn == 4) - { - int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); - int inc_j = 4 * inc_i; - - VMerge4 vmerge; - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); - } -#elif CV_SSE2 - if(cn == 4) - { - int inc_i = 32/sizeof(T); - int inc_j = 4 * inc_i; - - VMerge4 vmerge; - if (vmerge.support) - for( ; i < len - inc_i; i += inc_i, j += inc_j) - vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); - } -#endif - for( ; i < len; i++, j += cn ) - { - dst[j] = src0[i]; dst[j+1] = src1[i]; - dst[j+2] = src2[i]; dst[j+3] = src3[i]; - } - } - - for( ; k < cn; k += 4 ) - { - const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3]; - for( i = 0, j = k; i < len; i++, j += cn ) - { - dst[j] = src0[i]; dst[j+1] = src1[i]; - dst[j+2] = src2[i]; dst[j+3] = src3[i]; - } - } -} - -static void split8u(const uchar* src, uchar** dst, int len, int cn ) -{ - split_(src, dst, len, cn); -} - -static void split16u(const ushort* src, ushort** dst, int len, int cn ) -{ - split_(src, dst, len, cn); -} - -static void split32s(const int* src, int** dst, int len, int cn ) -{ - split_(src, dst, len, cn); -} - -static void split64s(const int64* src, int64** dst, int len, int cn ) -{ - split_(src, dst, len, cn); -} - -static void merge8u(const uchar** src, uchar* dst, int len, int cn ) -{ - merge_(src, dst, len, cn); -} - -static void merge16u(const ushort** src, ushort* dst, int len, int cn ) -{ - merge_(src, dst, len, cn); -} - -static void merge32s(const int** src, int* dst, int len, int cn ) -{ - merge_(src, dst, len, cn); -} - -static void merge64s(const int64** src, int64* dst, int len, int cn ) -{ - merge_(src, dst, len, cn); -} - typedef void (*SplitFunc)(const uchar* src, uchar** dst, int len, int cn); -typedef void (*MergeFunc)(const uchar** src, uchar* dst, int len, int cn); static SplitFunc getSplitFunc(int depth) { static SplitFunc splitTab[] = { - (SplitFunc)GET_OPTIMIZED(split8u), (SplitFunc)GET_OPTIMIZED(split8u), (SplitFunc)GET_OPTIMIZED(split16u), (SplitFunc)GET_OPTIMIZED(split16u), - (SplitFunc)GET_OPTIMIZED(split32s), (SplitFunc)GET_OPTIMIZED(split32s), (SplitFunc)GET_OPTIMIZED(split64s), 0 + (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), (SplitFunc)GET_OPTIMIZED(cv::hal::split8u), 
(SplitFunc)GET_OPTIMIZED(cv::hal::split16u), (SplitFunc)GET_OPTIMIZED(cv::hal::split16u), + (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split32s), (SplitFunc)GET_OPTIMIZED(cv::hal::split64s), 0 }; return splitTab[depth]; } +typedef void (*MergeFunc)(const uchar** src, uchar* dst, int len, int cn); + static MergeFunc getMergeFunc(int depth) { static MergeFunc mergeTab[] = { - (MergeFunc)GET_OPTIMIZED(merge8u), (MergeFunc)GET_OPTIMIZED(merge8u), (MergeFunc)GET_OPTIMIZED(merge16u), (MergeFunc)GET_OPTIMIZED(merge16u), - (MergeFunc)GET_OPTIMIZED(merge32s), (MergeFunc)GET_OPTIMIZED(merge32s), (MergeFunc)GET_OPTIMIZED(merge64s), 0 + (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge8u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), (MergeFunc)GET_OPTIMIZED(cv::hal::merge16u), + (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge32s), (MergeFunc)GET_OPTIMIZED(cv::hal::merge64s), 0 }; return mergeTab[depth]; } -} - void cv::split(const Mat& src, Mat* mv) { int k, depth = src.depth(), cn = src.channels(); diff --git a/modules/core/src/precomp.hpp b/modules/core/src/precomp.hpp index 6d19744820826569589bbd87783584d64dadb485..d1f2ec22e19c6e502f307b0b705da5183a464222 100644 --- a/modules/core/src/precomp.hpp +++ b/modules/core/src/precomp.hpp @@ -83,6 +83,11 @@ typedef void (*BinaryFunc)(const uchar* src1, size_t step1, uchar* dst, size_t step, Size sz, void*); +typedef void (*BinaryFuncC)(const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, + void*); + BinaryFunc getConvertFunc(int sdepth, int ddepth); BinaryFunc getCopyMaskFunc(size_t esz); @@ -114,46 +119,6 @@ extern const uchar g_Saturate8u[]; void deleteThreadAllocData(); #endif -template struct OpAdd -{ - typedef T1 type1; - typedef T2 type2; - typedef T3 rtype; - T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a + b); } -}; - -template struct OpSub -{ - typedef T1 type1; - typedef T2 type2; - typedef T3 rtype; - T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a - b); } -}; - -template struct OpRSub -{ - typedef T1 type1; - typedef T2 type2; - typedef T3 rtype; - T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(b - a); } -}; - -template struct OpMin -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator ()(const T a, const T b) const { return std::min(a, b); } -}; - -template struct OpMax -{ - typedef T type1; - typedef T type2; - typedef T rtype; - T operator ()(const T a, const T b) const { return std::max(a, b); } -}; - inline Size getContinuousSize_( int flags, int cols, int rows, int widthScale ) { int64 sz = (int64)cols * rows * widthScale; @@ -201,11 +166,6 @@ struct NoVec size_t operator()(const void*, const void*, void*, size_t) const { return 0; } }; -extern volatile bool USE_SSE2; -extern volatile bool USE_SSE4_2; -extern volatile bool USE_AVX; -extern volatile bool USE_AVX2; - enum { BLOCK_SIZE = 1024 }; #if defined HAVE_IPP && (IPP_VERSION_X100 >= 700) diff --git a/modules/core/src/system.cpp b/modules/core/src/system.cpp index dbe35ebfa40a21c21fc0305ac1b200d50d48099c..ba2c9d536f50c46736a0a9ca59a8762ea85b4239 100644 --- a/modules/core/src/system.cpp +++ b/modules/core/src/system.cpp @@ -86,45 +86,6 @@ Mutex* __initialization_mutex_initializer = &getInitializationMutex(); #undef max #undef abs #include -#if defined _MSC_VER - #if _MSC_VER >= 1400 - #include - #elif defined _M_IX86 - static void 
__cpuid(int* cpuid_data, int) - { - __asm - { - push ebx - push edi - mov edi, cpuid_data - mov eax, 1 - cpuid - mov [edi], eax - mov [edi + 4], ebx - mov [edi + 8], ecx - mov [edi + 12], edx - pop edi - pop ebx - } - } - static void __cpuidex(int* cpuid_data, int, int) - { - __asm - { - push edi - mov edi, cpuid_data - mov eax, 7 - mov ecx, 0 - cpuid - mov [edi], eax - mov [edi + 4], ebx - mov [edi + 8], ecx - mov [edi + 12], edx - pop edi - } - } - #endif -#endif #ifdef WINRT #include @@ -237,160 +198,15 @@ void Exception::formatMessage() msg = format("%s:%d: error: (%d) %s\n", file.c_str(), line, code, err.c_str()); } -struct HWFeatures -{ - enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE }; - - HWFeatures(void) - { - memset( have, 0, sizeof(have) ); - x86_family = 0; - } - - static HWFeatures initialize(void) - { - HWFeatures f; - int cpuid_data[4] = { 0, 0, 0, 0 }; - - #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64) - __cpuid(cpuid_data, 1); - #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) - #ifdef __x86_64__ - asm __volatile__ - ( - "movl $1, %%eax\n\t" - "cpuid\n\t" - :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3]) - : - : "cc" - ); - #else - asm volatile - ( - "pushl %%ebx\n\t" - "movl $1,%%eax\n\t" - "cpuid\n\t" - "popl %%ebx\n\t" - : "=a"(cpuid_data[0]), "=c"(cpuid_data[2]), "=d"(cpuid_data[3]) - : - : "cc" - ); - #endif - #endif - - f.x86_family = (cpuid_data[0] >> 8) & 15; - if( f.x86_family >= 6 ) - { - f.have[CV_CPU_MMX] = (cpuid_data[3] & (1 << 23)) != 0; - f.have[CV_CPU_SSE] = (cpuid_data[3] & (1<<25)) != 0; - f.have[CV_CPU_SSE2] = (cpuid_data[3] & (1<<26)) != 0; - f.have[CV_CPU_SSE3] = (cpuid_data[2] & (1<<0)) != 0; - f.have[CV_CPU_SSSE3] = (cpuid_data[2] & (1<<9)) != 0; - f.have[CV_CPU_FMA3] = (cpuid_data[2] & (1<<12)) != 0; - f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0; - f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0; - f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0; - f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX - - // make the second call to the cpuid command in order to get - // information about extended features like AVX2 - #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64) - __cpuidex(cpuid_data, 7, 0); - #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) - #ifdef __x86_64__ - asm __volatile__ - ( - "movl $7, %%eax\n\t" - "movl $0, %%ecx\n\t" - "cpuid\n\t" - :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3]) - : - : "cc" - ); - #else - asm volatile - ( - "pushl %%ebx\n\t" - "movl $7,%%eax\n\t" - "movl $0,%%ecx\n\t" - "cpuid\n\t" - "movl %%ebx, %0\n\t" - "popl %%ebx\n\t" - : "=r"(cpuid_data[1]), "=c"(cpuid_data[2]) - : - : "cc" - ); - #endif - #endif - f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0; - - f.have[CV_CPU_AVX_512F] = (cpuid_data[1] & (1<<16)) != 0; - f.have[CV_CPU_AVX_512DQ] = (cpuid_data[1] & (1<<17)) != 0; - f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0; - f.have[CV_CPU_AVX_512PF] = (cpuid_data[1] & (1<<26)) != 0; - f.have[CV_CPU_AVX_512ER] = (cpuid_data[1] & (1<<27)) != 0; - f.have[CV_CPU_AVX_512CD] = (cpuid_data[1] & (1<<28)) != 0; - f.have[CV_CPU_AVX_512BW] = (cpuid_data[1] & (1<<30)) != 0; - f.have[CV_CPU_AVX_512VL] = (cpuid_data[1] & (1<<31)) != 0; - f.have[CV_CPU_AVX_512VBMI] = (cpuid_data[2] & (1<<1)) != 0; - } - - #if defined ANDROID || defined __linux__ 
- #ifdef __aarch64__ - f.have[CV_CPU_NEON] = true; - #else - int cpufile = open("/proc/self/auxv", O_RDONLY); - - if (cpufile >= 0) - { - Elf32_auxv_t auxv; - const size_t size_auxv_t = sizeof(auxv); - - while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t) - { - if (auxv.a_type == AT_HWCAP) - { - f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0; - break; - } - } - - close(cpufile); - } - #endif - #elif (defined __clang__ || defined __APPLE__) && (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__)) - f.have[CV_CPU_NEON] = true; - #endif - - return f; - } - - int x86_family; - bool have[MAX_FEATURE+1]; -}; - -static HWFeatures featuresEnabled = HWFeatures::initialize(), featuresDisabled = HWFeatures(); -static HWFeatures* currentFeatures = &featuresEnabled; - bool checkHardwareSupport(int feature) { CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE ); - return currentFeatures->have[feature]; + return cv::hal::checkHardwareSupport(feature); } - -volatile bool useOptimizedFlag = true; - -volatile bool USE_SSE2 = featuresEnabled.have[CV_CPU_SSE2]; -volatile bool USE_SSE4_2 = featuresEnabled.have[CV_CPU_SSE4_2]; -volatile bool USE_AVX = featuresEnabled.have[CV_CPU_AVX]; -volatile bool USE_AVX2 = featuresEnabled.have[CV_CPU_AVX2]; - void setUseOptimized( bool flag ) { - useOptimizedFlag = flag; - currentFeatures = flag ? &featuresEnabled : &featuresDisabled; - USE_SSE2 = currentFeatures->have[CV_CPU_SSE2]; + cv::hal::setUseOptimized(flag); ipp::setUseIPP(flag); #ifdef HAVE_OPENCL @@ -403,7 +219,7 @@ void setUseOptimized( bool flag ) bool useOptimized(void) { - return useOptimizedFlag; + return cv::hal::useOptimized(); } int64 getTickCount(void) @@ -683,12 +499,12 @@ redirectError( CvErrorCallback errCallback, void* userdata, void** prevUserdata) CV_IMPL int cvCheckHardwareSupport(int feature) { CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE ); - return cv::currentFeatures->have[feature]; + return cv::hal::checkHardwareSupport(feature); } CV_IMPL int cvUseOptimized( int flag ) { - int prevMode = cv::useOptimizedFlag; + int prevMode = cv::useOptimized(); cv::setUseOptimized( flag != 0 ); return prevMode; } diff --git a/modules/hal/CMakeLists.txt b/modules/hal/CMakeLists.txt index b04e96b9e7f92feb9b0e67025c30bf1582048cf3..982913dba7aab638abfecfabd23e728bee5d6f24 100644 --- a/modules/hal/CMakeLists.txt +++ b/modules/hal/CMakeLists.txt @@ -2,10 +2,20 @@ set(the_description "The Hardware Acceleration Layer (HAL) module") set(OPENCV_MODULE_TYPE STATIC) +if(OPENCV_HAL_HEADERS AND OPENCV_HAL_LIBS) + set(OPENCV_HAL_HEADERS_INCLUDES "#include \"${OPENCV_HAL_HEADERS}\"") + set(DEPS "${OPENCV_HAL_LIBS}") +else() + set(OPENCV_HAL_HEADERS_INCLUDES "// using default HAL") + set(DEPS "") +endif() + +configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/custom_hal.hpp.in" "${CMAKE_BINARY_DIR}/custom_hal.hpp" @ONLY) + if(UNIX) if(CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") endif() endif() -ocv_define_module(hal) +ocv_define_module(hal ${DEPS}) diff --git a/modules/hal/include/opencv2/hal.hpp b/modules/hal/include/opencv2/hal.hpp index 9d448757d282431dd3ef4a41ebc8d5fe665f84cc..125bbc824811cb4b22ddfdccf3aca8d400e2228c 100644 --- a/modules/hal/include/opencv2/hal.hpp +++ b/modules/hal/include/opencv2/hal.hpp @@ -46,6 +46,7 @@ #define __OPENCV_HAL_HPP__ #include "opencv2/hal/defs.h" +#include "opencv2/hal/interface.hpp" /** @defgroup hal Hardware Acceleration Layer @@ -58,22 +59,19 @@ @} */ - namespace cv { namespace 
hal { //! @addtogroup hal //! @{ -namespace Error { - -enum +class Failure { - Ok = 0, - Unknown = -1 +public: + Failure(int code_ = Error::Unknown) : code(code_) {} +public: + int code; }; -} - int normHamming(const uchar* a, int n); int normHamming(const uchar* a, const uchar* b, int n); @@ -104,8 +102,186 @@ void sqrt(const double* src, double* dst, int len); void invSqrt(const float* src, float* dst, int len); void invSqrt(const double* src, double* dst, int len); +void split8u(const uchar* src, uchar** dst, int len, int cn ); +void split16u(const ushort* src, ushort** dst, int len, int cn ); +void split32s(const int* src, int** dst, int len, int cn ); +void split64s(const int64* src, int64** dst, int len, int cn ); + +void merge8u(const uchar** src, uchar* dst, int len, int cn ); +void merge16u(const ushort** src, ushort* dst, int len, int cn ); +void merge32s(const int** src, int* dst, int len, int cn ); +void merge64s(const int64** src, int64* dst, int len, int cn ); + +void add8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void add8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* ); +void add16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* ); +void add16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* ); +void add32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* ); +void add32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* ); +void add64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* ); + +void sub8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void sub8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* ); +void sub16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* ); +void sub16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* ); +void sub32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* ); +void sub32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* ); +void sub64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* ); + +void max8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void max8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* ); +void max16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* ); +void max16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* ); +void max32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t 
step, int width, int height, void* ); +void max32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* ); +void max64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* ); + +void min8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void min8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* ); +void min16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* ); +void min16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* ); +void min32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* ); +void min32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* ); +void min64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* ); + +void absdiff8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void absdiff8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* ); +void absdiff16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* ); +void absdiff16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* ); +void absdiff32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* ); +void absdiff32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* ); +void absdiff64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* ); + +void and8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void or8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void xor8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); +void not8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* ); + +void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); +void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); +void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); +void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); +void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); +void cmp32f(const float* src1, size_t 
step1, const float* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); +void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _cmpop); + +void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale); +void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale); +void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale); +void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale); +void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale); +void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale); +void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale); + +void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale); +void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale); +void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale); +void div16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale); +void div32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale); +void div32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale); +void div64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale); + +void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* scale); +void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scale); +void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scale); +void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scale); +void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scale); +void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scale); +void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scale); + +void addWeighted8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height, void* _scalars ); +void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, schar* dst, size_t step, int width, int height, void* scalars ); 
+void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, ushort* dst, size_t step, int width, int height, void* scalars ); +void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, short* dst, size_t step, int width, int height, void* scalars ); +void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, int* dst, size_t step, int width, int height, void* scalars ); +void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, float* dst, size_t step, int width, int height, void* scalars ); +void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, double* dst, size_t step, int width, int height, void* scalars ); //! @} }} //cv::hal +namespace cv { + +template struct OpAdd +{ + typedef T1 type1; + typedef T2 type2; + typedef T3 rtype; + T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a + b); } +}; + +template struct OpSub +{ + typedef T1 type1; + typedef T2 type2; + typedef T3 rtype; + T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(a - b); } +}; + +template struct OpRSub +{ + typedef T1 type1; + typedef T2 type2; + typedef T3 rtype; + T3 operator ()(const T1 a, const T2 b) const { return saturate_cast(b - a); } +}; + +template struct OpMin +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator ()(const T a, const T b) const { return std::min(a, b); } +}; + +template struct OpMax +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator ()(const T a, const T b) const { return std::max(a, b); } +}; + +template struct OpAbsDiff +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()(T a, T b) const { return a > b ? a - b : b - a; } +}; + +template struct OpAnd +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T b ) const { return a & b; } +}; + +template struct OpOr +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T b ) const { return a | b; } +}; + +template struct OpXor +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T b ) const { return a ^ b; } +}; + +template struct OpNot +{ + typedef T type1; + typedef T type2; + typedef T rtype; + T operator()( T a, T ) const { return ~a; } +}; + +} + #endif //__OPENCV_HAL_HPP__ diff --git a/modules/hal/include/opencv2/hal/defs.h b/modules/hal/include/opencv2/hal/defs.h index d04f003879c290fb2361c01fba1f73825bce6a00..117ec6046921195b2f24bad86e8daecedf8d12a3 100644 --- a/modules/hal/include/opencv2/hal/defs.h +++ b/modules/hal/include/opencv2/hal/defs.h @@ -53,6 +53,7 @@ #endif #include +#include "opencv2/hal/interface.hpp" #if defined __ICL # define CV_ICC __ICL @@ -117,9 +118,38 @@ #define CV_CPU_NEON 100 -// when adding to this list remember to update the enum in core/utility.cpp +// when adding to this list remember to update the following enum #define CV_HARDWARE_MAX_FEATURE 255 +/** @brief Available CPU features. 
+*/ +enum CpuFeatures { + CPU_MMX = 1, + CPU_SSE = 2, + CPU_SSE2 = 3, + CPU_SSE3 = 4, + CPU_SSSE3 = 5, + CPU_SSE4_1 = 6, + CPU_SSE4_2 = 7, + CPU_POPCNT = 8, + + CPU_AVX = 10, + CPU_AVX2 = 11, + CPU_FMA3 = 12, + + CPU_AVX_512F = 13, + CPU_AVX_512BW = 14, + CPU_AVX_512CD = 15, + CPU_AVX_512DQ = 16, + CPU_AVX_512ER = 17, + CPU_AVX_512IFMA512 = 18, + CPU_AVX_512PF = 19, + CPU_AVX_512VBMI = 20, + CPU_AVX_512VL = 21, + + CPU_NEON = 100 +}; + // do not include SSE/AVX/NEON headers for NVCC compiler #ifndef __CUDACC__ @@ -257,49 +287,6 @@ # define CV_VFP 0 #endif -/* primitive types */ -/* - schar - signed 1 byte integer - uchar - unsigned 1 byte integer - short - signed 2 byte integer - ushort - unsigned 2 byte integer - int - signed 4 byte integer - uint - unsigned 4 byte integer - int64 - signed 8 byte integer - uint64 - unsigned 8 byte integer -*/ - -#if !defined _MSC_VER && !defined __BORLANDC__ -# if defined __cplusplus && __cplusplus >= 201103L && !defined __APPLE__ -# include - typedef std::uint32_t uint; -# else -# include - typedef uint32_t uint; -# endif -#else - typedef unsigned uint; -#endif - -typedef signed char schar; - -#ifndef __IPL_H__ - typedef unsigned char uchar; - typedef unsigned short ushort; -#endif - -#if defined _MSC_VER || defined __BORLANDC__ - typedef __int64 int64; - typedef unsigned __int64 uint64; -# define CV_BIG_INT(n) n##I64 -# define CV_BIG_UINT(n) n##UI64 -#else - typedef int64_t int64; - typedef uint64_t uint64; -# define CV_BIG_INT(n) n##LL -# define CV_BIG_UINT(n) n##ULL -#endif - /* fundamental constants */ #define CV_PI 3.1415926535897932384626433832795 #define CV_2PI 6.283185307179586476925286766559 @@ -321,6 +308,19 @@ typedef union Cv64suf } Cv64suf; +namespace cv { namespace hal { + +bool checkHardwareSupport(int feature); +void setUseOptimized(bool onoff); +bool useOptimized(); + +}} + +#define USE_SSE2 (cv::hal::checkHardwareSupport(CV_CPU_SSE)) +#define USE_SSE4_2 (cv::hal::checkHardwareSupport(CV_CPU_SSE4_2)) +#define USE_AVX (cv::hal::checkHardwareSupport(CV_CPU_AVX)) +#define USE_AVX2 (cv::hal::checkHardwareSupport(CV_CPU_AVX2)) + /****************************************************************************************\ * fast math * diff --git a/modules/hal/include/opencv2/hal/interface.hpp b/modules/hal/include/opencv2/hal/interface.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2a5bff04d7e5eb86f5df94c97ffa83e5eab02821 --- /dev/null +++ b/modules/hal/include/opencv2/hal/interface.hpp @@ -0,0 +1,91 @@ +#ifndef _HAL_INTERFACE_HPP_INCLUDED_ +#define _HAL_INTERFACE_HPP_INCLUDED_ + +#define CV_HAL_ERROR_OK 0 +#define CV_HAL_ERROR_NI 1 +#define CV_HAL_ERROR_UNKNOWN -1 + +#define CV_HAL_CMP_EQ 0 +#define CV_HAL_CMP_GT 1 +#define CV_HAL_CMP_GE 2 +#define CV_HAL_CMP_LT 3 +#define CV_HAL_CMP_LE 4 +#define CV_HAL_CMP_NE 5 + +#ifdef __cplusplus +namespace cv { namespace hal { + +namespace Error { + +enum +{ + Ok = 0, + NotImplemented = 1, + Unknown = -1 +}; + +} + +enum +{ + CMP_EQ = 0, + CMP_GT = 1, + CMP_GE = 2, + CMP_LT = 3, + CMP_LE = 4, + CMP_NE = 5 +}; + +}} +#endif + +#ifdef __cplusplus +#include +#else +#include +#endif + +/* primitive types */ +/* + schar - signed 1 byte integer + uchar - unsigned 1 byte integer + short - signed 2 byte integer + ushort - unsigned 2 byte integer + int - signed 4 byte integer + uint - unsigned 4 byte integer + int64 - signed 8 byte integer + uint64 - unsigned 8 byte integer +*/ + +#if !defined _MSC_VER && !defined __BORLANDC__ +# if defined __cplusplus && __cplusplus >= 201103L && 
!defined __APPLE__ +# include + typedef std::uint32_t uint; +# else +# include + typedef uint32_t uint; +# endif +#else + typedef unsigned uint; +#endif + +typedef signed char schar; + +#ifndef __IPL_H__ + typedef unsigned char uchar; + typedef unsigned short ushort; +#endif + +#if defined _MSC_VER || defined __BORLANDC__ + typedef __int64 int64; + typedef unsigned __int64 uint64; +# define CV_BIG_INT(n) n##I64 +# define CV_BIG_UINT(n) n##UI64 +#else + typedef int64_t int64; + typedef uint64_t uint64; +# define CV_BIG_INT(n) n##LL +# define CV_BIG_UINT(n) n##ULL +#endif + +#endif diff --git a/modules/hal/include/opencv2/hal/neon_utils.hpp b/modules/hal/include/opencv2/hal/neon_utils.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6026777a6f8591891010335cde349ac4117df493 --- /dev/null +++ b/modules/hal/include/opencv2/hal/neon_utils.hpp @@ -0,0 +1,127 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_HAL_NEON_UTILS_HPP__ +#define __OPENCV_HAL_NEON_UTILS_HPP__ + +#include "opencv2/hal/defs.h" + +namespace cv { + +#if CV_NEON + +inline int32x2_t cv_vrnd_s32_f32(float32x2_t v) +{ + static int32x2_t v_sign = vdup_n_s32(1 << 31), + v_05 = vreinterpret_s32_f32(vdup_n_f32(0.5f)); + + int32x2_t v_addition = vorr_s32(v_05, vand_s32(v_sign, vreinterpret_s32_f32(v))); + return vcvt_s32_f32(vadd_f32(v, vreinterpret_f32_s32(v_addition))); +} + +inline int32x4_t cv_vrndq_s32_f32(float32x4_t v) +{ + static int32x4_t v_sign = vdupq_n_s32(1 << 31), + v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f)); + + int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(v))); + return vcvtq_s32_f32(vaddq_f32(v, vreinterpretq_f32_s32(v_addition))); +} + +inline uint32x2_t cv_vrnd_u32_f32(float32x2_t v) +{ + static float32x2_t v_05 = vdup_n_f32(0.5f); + return vcvt_u32_f32(vadd_f32(v, v_05)); +} + +inline uint32x4_t cv_vrndq_u32_f32(float32x4_t v) +{ + static float32x4_t v_05 = vdupq_n_f32(0.5f); + return vcvtq_u32_f32(vaddq_f32(v, v_05)); +} + +inline float32x4_t cv_vrecpq_f32(float32x4_t val) +{ + float32x4_t reciprocal = vrecpeq_f32(val); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + return reciprocal; +} + +inline float32x2_t cv_vrecp_f32(float32x2_t val) +{ + float32x2_t reciprocal = vrecpe_f32(val); + reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal); + reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal); + return reciprocal; +} + +inline float32x4_t cv_vrsqrtq_f32(float32x4_t val) +{ + float32x4_t e = vrsqrteq_f32(val); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e); + e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(e, e), val), e); + return e; +} + +inline float32x2_t cv_vrsqrt_f32(float32x2_t val) +{ + float32x2_t e = vrsqrte_f32(val); + e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e); + e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e); + return e; +} + +inline float32x4_t cv_vsqrtq_f32(float32x4_t val) +{ + return cv_vrecpq_f32(cv_vrsqrtq_f32(val)); +} + +inline float32x2_t cv_vsqrt_f32(float32x2_t val) +{ + return cv_vrecp_f32(cv_vrsqrt_f32(val)); +} + +#endif + +} + +#endif // __OPENCV_HAL_NEON_UTILS_HPP__ diff --git a/modules/core/include/opencv2/core/sse_utils.hpp b/modules/hal/include/opencv2/hal/sse_utils.hpp similarity index 99% rename from modules/core/include/opencv2/core/sse_utils.hpp rename to modules/hal/include/opencv2/hal/sse_utils.hpp index e0283eb3f33d911e782e86dd3044b93d730ae4e7..9ce4098bad6f49473bcdc886c881d5fe6d7e895f 100644 --- a/modules/core/include/opencv2/core/sse_utils.hpp +++ b/modules/hal/include/opencv2/hal/sse_utils.hpp @@ -46,6 +46,8 @@ # error sse_utils.hpp header must be compiled as C++ #endif +#include "opencv2/hal/defs.h" + #if CV_SSE2 inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1) diff --git a/modules/hal/samples/simple_hal/CMakeLists.txt b/modules/hal/samples/simple_hal/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd0be70f2fa7a334916a3a83edbd3c92015689a2 --- /dev/null +++ b/modules/hal/samples/simple_hal/CMakeLists.txt @@ -0,0 +1,11 @@ +cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR) + +if(UNIX) + if(CMAKE_COMPILER_IS_GNUCXX OR CV_ICC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") + endif() +endif() + +add_library(simple_hal simple.cpp) +set(OPENCV_HAL_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../..") 
+target_include_directories(simple_hal PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${OPENCV_HAL_DIR}/include) diff --git a/modules/hal/samples/simple_hal/simple.cpp b/modules/hal/samples/simple_hal/simple.cpp new file mode 100644 index 0000000000000000000000000000000000000000..564a611a5a1f0b07a01baf94baf752ca5309ad28 --- /dev/null +++ b/modules/hal/samples/simple_hal/simple.cpp @@ -0,0 +1,33 @@ +#include "simple.hpp" + +int slow_and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step) + for(int x = 0 ; x < width; x++ ) + dst[x] = src1[x] & src2[x]; + return cv::hal::Error::Ok; +} + +int slow_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step) + for(int x = 0 ; x < width; x++ ) + dst[x] = src1[x] | src2[x]; + return cv::hal::Error::Ok; +} + +int slow_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step) + for(int x = 0 ; x < width; x++ ) + dst[x] = src1[x] ^ src2[x]; + return cv::hal::Error::Ok; +} + +int slow_not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height) +{ + for(; height--; src1 = src1 + step1, src2 = src2 + step2, dst = dst + step) + for(int x = 0 ; x < width; x++ ) + dst[x] = ~src1[x]; + return cv::hal::Error::Ok; +} diff --git a/modules/hal/samples/simple_hal/simple.hpp b/modules/hal/samples/simple_hal/simple.hpp new file mode 100644 index 0000000000000000000000000000000000000000..85a16535de798bc31ad3985319186363ef15af84 --- /dev/null +++ b/modules/hal/samples/simple_hal/simple.hpp @@ -0,0 +1,20 @@ +#ifndef _SIMPLE_HPP_INCLUDED_ +#define _SIMPLE_HPP_INCLUDED_ + +#include "opencv2/hal/interface.hpp" + +int slow_and8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); +int slow_or8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); +int slow_xor8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); +int slow_not8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, uchar* dst, size_t step, int width, int height); + +#undef hal_and8u +#define hal_and8u slow_and8u +#undef hal_or8u +#define hal_or8u slow_or8u +#undef hal_xor8u +#define hal_xor8u slow_xor8u +#undef hal_not8u +#define hal_not8u slow_not8u + +#endif diff --git a/modules/hal/src/arithm.cpp b/modules/hal/src/arithm.cpp index a3f69facca087baa4149132fc7e457a2e1833847..e30cd7d9e5e759d887dba4cbce6a6dfe0cea3ea8 100644 --- a/modules/hal/src/arithm.cpp +++ b/modules/hal/src/arithm.cpp @@ -7,11 +7,13 @@ // copy or use the software. // // -// License Agreement +// License Agreement // For Open Source Computer Vision Library // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. // Third party copyrights are property of their respective owners. 
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -41,7 +43,1089 @@
 //M*/
 
 #include "precomp.hpp"
+#include "arithm_simd.hpp"
+#include "arithm_core.hpp"
+#include "replacement.hpp"
 
 namespace cv { namespace hal {
 
-}}
+//=======================================
+
+#undef CALL_HAL
+#define CALL_HAL(fun) \
+    int res = fun(src1, step1, src2, step2, dst, step, width, height); \
+    if (res == Error::Ok) \
+        return; \
+    else if (res != Error::NotImplemented) \
+        throw Failure(res);
+
+#if (ARITHM_USE_IPP == 1)
+static inline void fixSteps(int width, int height, size_t elemSize, size_t& step1, size_t& step2, size_t& step)
+{
+    if( height == 1 )
+        step1 = step2 = step = width*elemSize;
+}
+#define CALL_IPP_BIN_12(fun) \
+    CV_IPP_CHECK() \
+    { \
+        fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
+        if (0 <= fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0)) \
+        { \
+            CV_IMPL_ADD(CV_IMPL_IPP); \
+            return; \
+        } \
+        setIppErrorStatus(); \
+    }
+#else
+#define CALL_IPP_BIN_12(fun)
+#endif
+
+//=======================================
+// Add
+//=======================================
+
+void add8u( const uchar* src1, size_t step1,
+            const uchar* src2, size_t step2,
+            uchar* dst, size_t step, int width, int height, void* )
+{
+    CALL_HAL(hal_add8u)
+    CALL_IPP_BIN_12(ippiAdd_8u_C1RSfs)
+    (vBinOp<uchar, OpAdd<uchar>, IF_SIMD(VAdd<uchar>)>(src1, step1, src2, step2, dst, step, width, height));
+}
+
+void add8s( const schar* src1, size_t step1,
+            const schar* src2, size_t step2,
+            schar* dst, size_t step, int width, int height, void* )
+{
+    CALL_HAL(hal_add8s)
+    vBinOp<schar, OpAdd<schar>, IF_SIMD(VAdd<schar>)>(src1, step1, src2, step2, dst, step, width, height);
+}
+
+void add16u( const ushort* src1, size_t step1,
+             const ushort* src2, size_t step2,
+             ushort* dst, size_t step, int width, int height, void* )
+{
+    CALL_HAL(hal_add16u)
+    CALL_IPP_BIN_12(ippiAdd_16u_C1RSfs)
+    (vBinOp<ushort, OpAdd<ushort>, IF_SIMD(VAdd<ushort>)>(src1, step1, src2, step2, dst, step, width, height));
+}
+
+void add16s( const short* src1, size_t step1,
+             const short* src2, size_t step2,
+             short* dst, size_t step, int width, int height, void* )
+{
+    CALL_HAL(hal_add16s)
+    CALL_IPP_BIN_12(ippiAdd_16s_C1RSfs)
+    (vBinOp<short, OpAdd<short>, IF_SIMD(VAdd<short>)>(src1, step1, src2, step2, dst, step, width, height));
+}
+
+void add32s( const int* src1, size_t step1,
+             const int* src2, size_t step2,
+             int* dst, size_t step, int width, int height, void* )
+{
+    CALL_HAL(hal_add32s)
+    vBinOp32<int, OpAdd<int>, IF_SIMD(VAdd<int>)>(src1, step1, src2, step2, dst, step, width, height);
+}
+
+void add32f( const float* src1, size_t step1,
+             const float* src2, size_t step2,
+             float* dst, size_t step, int width, int height, void* )
+{
+    CALL_HAL(hal_add32f)
+    CALL_IPP_BIN_12(ippiAdd_32f_C1R)
+    (vBinOp32<float, OpAdd<float>, IF_SIMD(VAdd<float>)>(src1, step1, src2, step2, dst, step, width, height));
+}
+
+void add64f( const double* src1, size_t step1,
+             const double* src2, size_t step2,
+             double* dst, size_t step, int width, int height, void* )
+{
+    CALL_HAL(hal_add64f)
+    vBinOp64<double, OpAdd<double>, IF_SIMD(VAdd<double>)>(src1, step1, src2, step2, dst, step, width, height);
+}
+
+//=======================================
+
+#if (ARITHM_USE_IPP == 1)
+#define CALL_IPP_BIN_21(fun) \
+    CV_IPP_CHECK() \
+    { \
+        fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \
+        if (0 <= fun(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(width, height), 0)) \
+        { \
+            CV_IMPL_ADD(CV_IMPL_IPP); \
+            return; \
+        } \
+        setIppErrorStatus(); \
+    }
+#else
+#define CALL_IPP_BIN_21(fun)
+#endif
+
+//=======================================
+// Subtract
+//======================================= + +void sub8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub8u) + CALL_IPP_BIN_21(ippiSub_8u_C1RSfs) + (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void sub8s( const schar* src1, size_t step1, + const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub8s) + vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height); +} + +void sub16u( const ushort* src1, size_t step1, + const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub16u) + CALL_IPP_BIN_21(ippiSub_16u_C1RSfs) + (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void sub16s( const short* src1, size_t step1, + const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub16s) + CALL_IPP_BIN_21(ippiSub_16s_C1RSfs) + (vBinOp, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void sub32s( const int* src1, size_t step1, + const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub32s) + vBinOp32, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height); +} + +void sub32f( const float* src1, size_t step1, + const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub32f) + CALL_IPP_BIN_21(ippiSub_32f_C1R) + (vBinOp32, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void sub64f( const double* src1, size_t step1, + const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_sub64f) + vBinOp64, IF_SIMD(VSub)>(src1, step1, src2, step2, dst, step, width, height); +} + +//======================================= + +#if (ARITHM_USE_IPP == 1) +#define CALL_IPP_MIN_MAX(fun, type) \ + CV_IPP_CHECK() \ + { \ + type* s1 = (type*)src1; \ + type* s2 = (type*)src2; \ + type* d = dst; \ + fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ + int i = 0; \ + for(; i < height; i++) \ + { \ + if (0 > fun(s1, s2, d, width)) \ + break; \ + s1 = (type*)((uchar*)s1 + step1); \ + s2 = (type*)((uchar*)s2 + step2); \ + d = (type*)((uchar*)d + step); \ + } \ + if (i == height) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return; \ + } \ + setIppErrorStatus(); \ + } +#else +#define CALL_IPP_MIN_MAX(fun, type) +#endif + +//======================================= +// Max +//======================================= + +void max8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max8u) + CALL_IPP_MIN_MAX(ippsMaxEvery_8u, uchar) + vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +void max8s( const schar* src1, size_t step1, + const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max8s) + vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +void max16u( const ushort* src1, size_t step1, + const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max16u) + CALL_IPP_MIN_MAX(ippsMaxEvery_16u, ushort) + vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +void max16s( const short* src1, 
size_t step1, + const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max16s) + vBinOp, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +void max32s( const int* src1, size_t step1, + const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max32s) + vBinOp32, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +void max32f( const float* src1, size_t step1, + const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max32f) + CALL_IPP_MIN_MAX(ippsMaxEvery_32f, float) + vBinOp32, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +void max64f( const double* src1, size_t step1, + const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_max64f) + CALL_IPP_MIN_MAX(ippsMaxEvery_64f, double) + vBinOp64, IF_SIMD(VMax)>(src1, step1, src2, step2, dst, step, width, height); +} + +//======================================= +// Min +//======================================= + +void min8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min8u) + CALL_IPP_MIN_MAX(ippsMinEvery_8u, uchar) + vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +void min8s( const schar* src1, size_t step1, + const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min8s) + vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +void min16u( const ushort* src1, size_t step1, + const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min16u) + CALL_IPP_MIN_MAX(ippsMinEvery_16u, ushort) + vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +void min16s( const short* src1, size_t step1, + const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min16s) + vBinOp, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +void min32s( const int* src1, size_t step1, + const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min32s) + vBinOp32, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +void min32f( const float* src1, size_t step1, + const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min32f) + CALL_IPP_MIN_MAX(ippsMinEvery_32f, float) + vBinOp32, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +void min64f( const double* src1, size_t step1, + const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_min64f) + CALL_IPP_MIN_MAX(ippsMinEvery_64f, double) + vBinOp64, IF_SIMD(VMin)>(src1, step1, src2, step2, dst, step, width, height); +} + +//======================================= +// AbsDiff +//======================================= + +void absdiff8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff8u) + CALL_IPP_BIN_12(ippiAbsDiff_8u_C1R) + (vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void absdiff8s( const schar* src1, size_t step1, + const schar* src2, size_t 
step2, + schar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff8s) + vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); +} + +void absdiff16u( const ushort* src1, size_t step1, + const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff16u) + CALL_IPP_BIN_12(ippiAbsDiff_16u_C1R) + (vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void absdiff16s( const short* src1, size_t step1, + const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff16s) + vBinOp, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); +} + +void absdiff32s( const int* src1, size_t step1, + const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff32s) + vBinOp32, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); +} + +void absdiff32f( const float* src1, size_t step1, + const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff32f) + CALL_IPP_BIN_12(ippiAbsDiff_32f_C1R) + (vBinOp32, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void absdiff64f( const double* src1, size_t step1, + const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_absdiff64f) + vBinOp64, IF_SIMD(VAbsDiff)>(src1, step1, src2, step2, dst, step, width, height); +} + +//======================================= +// Logical +//======================================= + +void and8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_and8u) + CALL_IPP_BIN_12(ippiAnd_8u_C1R) + (vBinOp, IF_SIMD(VAnd)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void or8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_or8u) + CALL_IPP_BIN_12(ippiOr_8u_C1R) + (vBinOp, IF_SIMD(VOr)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void xor8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_xor8u) + CALL_IPP_BIN_12(ippiXor_8u_C1R) + (vBinOp, IF_SIMD(VXor)>(src1, step1, src2, step2, dst, step, width, height)); +} + +void not8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* ) +{ + CALL_HAL(hal_not8u) + CALL_IPP_BIN_12(ippiNot_8u_C1R) + (vBinOp, IF_SIMD(VNot)>(src1, step1, src2, step2, dst, step, width, height)); +} + +//======================================= + +#undef CALL_HAL +#define CALL_HAL(fun) \ + int res = fun(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); \ + if (res == Error::Ok) \ + return; \ + else if (res != Error::NotImplemented) \ + throw Failure(res); + +#if ARITHM_USE_IPP +inline static IppCmpOp convert_cmp(int _cmpop) +{ + return _cmpop == CMP_EQ ? ippCmpEq : + _cmpop == CMP_GT ? ippCmpGreater : + _cmpop == CMP_GE ? ippCmpGreaterEq : + _cmpop == CMP_LT ? ippCmpLess : + _cmpop == CMP_LE ? 
ippCmpLessEq : + (IppCmpOp)-1; +} +#define CALL_IPP_CMP(fun) \ + CV_IPP_CHECK() \ + { \ + IppCmpOp op = convert_cmp(*(int *)_cmpop); \ + if( op >= 0 ) \ + { \ + fixSteps(width, height, sizeof(dst[0]), step1, step2, step); \ + if (0 <= fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), op)) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return; \ + } \ + setIppErrorStatus(); \ + } \ + } +#else +#define CALL_IPP_CMP(fun) +#endif + +//======================================= +// Compare +//======================================= + +void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp8u) + CALL_IPP_CMP(ippiCompare_8u_C1R) + //vz optimized cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); + int code = *(int*)_cmpop; + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + if( code == CMP_GE || code == CMP_LT ) + { + std::swap(src1, src2); + std::swap(step1, step2); + code = code == CMP_GE ? CMP_LE : CMP_GT; + } + + if( code == CMP_GT || code == CMP_LE ) + { + int m = code == CMP_GT ? 0 : 255; + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x =0; + #if CV_SSE2 + if( USE_SSE2 ) + { + __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1); + __m128i c128 = _mm_set1_epi8 (-128); + for( ; x <= width - 16; x += 16 ) + { + __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); + __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); + // no simd for 8u comparison, that's why we need the trick + r00 = _mm_sub_epi8(r00,c128); + r10 = _mm_sub_epi8(r10,c128); + + r00 =_mm_xor_si128(_mm_cmpgt_epi8(r00, r10), m128); + _mm_storeu_si128((__m128i*)(dst + x),r00); + + } + } + #elif CV_NEON + uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); + + for( ; x <= width - 16; x += 16 ) + { + vst1q_u8(dst+x, veorq_u8(vcgtq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask)); + } + + #endif + + for( ; x < width; x++ ){ + dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); + } + } + } + else if( code == CMP_EQ || code == CMP_NE ) + { + int m = code == CMP_EQ ? 0 : 255; + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x = 0; + #if CV_SSE2 + if( USE_SSE2 ) + { + __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (-1); + for( ; x <= width - 16; x += 16 ) + { + __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); + __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); + r00 = _mm_xor_si128 ( _mm_cmpeq_epi8 (r00, r10), m128); + _mm_storeu_si128((__m128i*)(dst + x), r00); + } + } + #elif CV_NEON + uint8x16_t mask = code == CMP_EQ ? 
vdupq_n_u8(0) : vdupq_n_u8(255); + + for( ; x <= width - 16; x += 16 ) + { + vst1q_u8(dst+x, veorq_u8(vceqq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask)); + } + #endif + for( ; x < width; x++ ) + dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); + } + } +} + +void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp8s) + cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); +} + +void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp16u) + CALL_IPP_CMP(ippiCompare_16u_C1R) + cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); +} + +void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp16s) + CALL_IPP_CMP(ippiCompare_16s_C1R) + //vz optimized cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); + + int code = *(int*)_cmpop; + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + if( code == CMP_GE || code == CMP_LT ) + { + std::swap(src1, src2); + std::swap(step1, step2); + code = code == CMP_GE ? CMP_LE : CMP_GT; + } + + if( code == CMP_GT || code == CMP_LE ) + { + int m = code == CMP_GT ? 0 : 255; + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x =0; + #if CV_SSE2 + if( USE_SSE2) + { + __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1); + for( ; x <= width - 16; x += 16 ) + { + __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); + __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); + r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128); + __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8)); + __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8)); + r01 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r01, r11), m128); + r11 = _mm_packs_epi16(r00, r01); + _mm_storeu_si128((__m128i*)(dst + x), r11); + } + if( x <= width-8) + { + __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); + __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); + r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128); + r10 = _mm_packs_epi16(r00, r00); + _mm_storel_epi64((__m128i*)(dst + x), r10); + + x += 8; + } + } + #elif CV_NEON + uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255); + + for( ; x <= width - 16; x += 16 ) + { + int16x8_t in1 = vld1q_s16(src1 + x); + int16x8_t in2 = vld1q_s16(src2 + x); + uint8x8_t t1 = vmovn_u16(vcgtq_s16(in1, in2)); + + in1 = vld1q_s16(src1 + x + 8); + in2 = vld1q_s16(src2 + x + 8); + uint8x8_t t2 = vmovn_u16(vcgtq_s16(in1, in2)); + + vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); + } + #endif + + for( ; x < width; x++ ){ + dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); + } + } + } + else if( code == CMP_EQ || code == CMP_NE ) + { + int m = code == CMP_EQ ? 0 : 255; + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x = 0; + #if CV_SSE2 + if( USE_SSE2 ) + { + __m128i m128 = code == CMP_EQ ? 
_mm_setzero_si128() : _mm_set1_epi16 (-1); + for( ; x <= width - 16; x += 16 ) + { + __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); + __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); + r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128); + __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8)); + __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8)); + r01 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r01, r11), m128); + r11 = _mm_packs_epi16(r00, r01); + _mm_storeu_si128((__m128i*)(dst + x), r11); + } + if( x <= width - 8) + { + __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x)); + __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x)); + r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128); + r10 = _mm_packs_epi16(r00, r00); + _mm_storel_epi64((__m128i*)(dst + x), r10); + + x += 8; + } + } + #elif CV_NEON + uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255); + + for( ; x <= width - 16; x += 16 ) + { + int16x8_t in1 = vld1q_s16(src1 + x); + int16x8_t in2 = vld1q_s16(src2 + x); + uint8x8_t t1 = vmovn_u16(vceqq_s16(in1, in2)); + + in1 = vld1q_s16(src1 + x + 8); + in2 = vld1q_s16(src2 + x + 8); + uint8x8_t t2 = vmovn_u16(vceqq_s16(in1, in2)); + + vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask)); + } + #endif + for( ; x < width; x++ ) + dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); + } + } +} + +void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp32s) + cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); +} + +void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp32f) + CALL_IPP_CMP(ippiCompare_32f_C1R) + cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); +} + +void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* _cmpop) +{ + CALL_HAL(hal_cmp64f) + cmp_(src1, step1, src2, step2, dst, step, width, height, *(int*)_cmpop); +} + +//======================================= + +#undef CALL_HAL +#define CALL_HAL(fun) \ + int res = fun(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); \ + if (res == Error::Ok) \ + return; \ + else if (res != Error::NotImplemented) \ + throw Failure(res); + +#if defined HAVE_IPP +#define CALL_IPP_MUL(fun) \ + CV_IPP_CHECK() \ + { \ + if (std::fabs(fscale - 1) <= FLT_EPSILON) \ + { \ + if (fun(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(width, height), 0) >= 0) \ + { \ + CV_IMPL_ADD(CV_IMPL_IPP); \ + return; \ + } \ + setIppErrorStatus(); \ + } \ + } +#else +#define CALL_IPP_MUL(fun) +#endif + +//======================================= +// Multilpy +//======================================= + +void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul8u) + float fscale = (float)*(const double*)scale; + CALL_IPP_MUL(ippiMul_8u_C1RSfs) + mul_(src1, step1, src2, step2, dst, step, width, height, fscale); +} + +void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul8s) + mul_(src1, step1, src2, step2, dst, step, width, height, (float)*(const double*)scale); +} + +void mul16u( const ushort* src1, size_t step1, const ushort* src2, 
size_t step2, + ushort* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul16u) + float fscale = (float)*(const double*)scale; + CALL_IPP_MUL(ippiMul_16u_C1RSfs) + mul_(src1, step1, src2, step2, dst, step, width, height, fscale); +} + +void mul16s( const short* src1, size_t step1, const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul16s) + float fscale = (float)*(const double*)scale; + CALL_IPP_MUL(ippiMul_16s_C1RSfs) + mul_(src1, step1, src2, step2, dst, step, width, height, fscale); +} + +void mul32s( const int* src1, size_t step1, const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul32s) + mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void mul32f( const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul32f) + float fscale = (float)*(const double*)scale; + CALL_IPP_MUL(ippiMul_32f_C1R) + mul_(src1, step1, src2, step2, dst, step, width, height, fscale); +} + +void mul64f( const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_mul64f) + mul_(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +//======================================= +// Divide +//======================================= + +void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div8u) + if( src1 ) + div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); + else + recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div8s) + div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div16u) + div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void div16s( const short* src1, size_t step1, const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div16s) + div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void div32s( const int* src1, size_t step1, const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div32s) + div_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void div32f( const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div32f) + div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void div64f( const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_div64f) + div_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +//======================================= +// Reciprocial +//======================================= + +void recip8u( const uchar* 
src1, size_t step1, const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip8u) + recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip8s) + recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip16u) + recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void recip16s( const short* src1, size_t step1, const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip16s) + recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void recip32s( const int* src1, size_t step1, const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip32s) + recip_i(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void recip32f( const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip32f) + recip_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +void recip64f( const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* scale) +{ + CALL_HAL(hal_recip64f) + recip_f(src1, step1, src2, step2, dst, step, width, height, *(const double*)scale); +} + +//======================================= + +#undef CALL_HAL +#define CALL_HAL(fun) \ + int res = fun(src1, step1, src2, step2, dst, step, width, height, scalars); \ + if (res == Error::Ok) \ + return; \ + else if (res != Error::NotImplemented) \ + throw Failure(res); + +//======================================= +// Add weighted +//======================================= + +void +addWeighted8u( const uchar* src1, size_t step1, + const uchar* src2, size_t step2, + uchar* dst, size_t step, int width, int height, + void* scalars ) +{ + CALL_HAL(hal_addWeighted8u) + const double* scalars_ = (const double*)scalars; + float alpha = (float)scalars_[0], beta = (float)scalars_[1], gamma = (float)scalars_[2]; + + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x = 0; + +#if CV_SSE2 + if( USE_SSE2 ) + { + __m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma); + __m128i z = _mm_setzero_si128(); + + for( ; x <= width - 8; x += 8 ) + { + __m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z); + __m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z); + + __m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z)); + __m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z)); + __m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z)); + __m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z)); + + u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4)); + u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4)); + u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4); + + u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1)); + u = _mm_packus_epi16(u, u); + + _mm_storel_epi64((__m128i*)(dst + x), u); + } + } +#elif CV_NEON + float32x4_t g = 
vdupq_n_f32 (gamma); + + for( ; x <= width - 8; x += 8 ) + { + uint8x8_t in1 = vld1_u8(src1+x); + uint16x8_t in1_16 = vmovl_u8(in1); + float32x4_t in1_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in1_16))); + float32x4_t in1_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in1_16))); + + uint8x8_t in2 = vld1_u8(src2+x); + uint16x8_t in2_16 = vmovl_u8(in2); + float32x4_t in2_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in2_16))); + float32x4_t in2_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in2_16))); + + float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); + float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); + out_f_l = vaddq_f32(out_f_l, g); + out_f_h = vaddq_f32(out_f_h, g); + + uint16x4_t out_16_l = vqmovun_s32(cv_vrndq_s32_f32(out_f_l)); + uint16x4_t out_16_h = vqmovun_s32(cv_vrndq_s32_f32(out_f_h)); + + uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h); + uint8x8_t out = vqmovn_u16(out_16); + + vst1_u8(dst+x, out); + } +#endif + #if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + float t0, t1; + t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; + t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma; + + dst[x] = saturate_cast(t0); + dst[x+1] = saturate_cast(t1); + + t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma; + t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma; + + dst[x+2] = saturate_cast(t0); + dst[x+3] = saturate_cast(t1); + } + #endif + + for( ; x < width; x++ ) + { + float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma; + dst[x] = saturate_cast(t0); + } + } +} + +void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2, + schar* dst, size_t step, int width, int height, void* scalars ) +{ + CALL_HAL(hal_addWeighted8s) + addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); +} + +void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2, + ushort* dst, size_t step, int width, int height, void* scalars ) +{ + CALL_HAL(hal_addWeighted16u) + addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); +} + +void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2, + short* dst, size_t step, int width, int height, void* scalars ) +{ + CALL_HAL(hal_addWeighted16s) + addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); +} + +void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2, + int* dst, size_t step, int width, int height, void* scalars ) +{ + CALL_HAL(hal_addWeighted32s) + addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); +} + +void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2, + float* dst, size_t step, int width, int height, void* scalars ) +{ + CALL_HAL(hal_addWeighted32f) + addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); +} + +void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2, + double* dst, size_t step, int width, int height, void* scalars ) +{ + CALL_HAL(hal_addWeighted64f) + addWeighted_(src1, step1, src2, step2, dst, step, width, height, scalars); +} + +}} // cv::hal:: diff --git a/modules/hal/src/arithm_core.hpp b/modules/hal/src/arithm_core.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a65e74c3812623111318fedd7c4f46a99e6dbc65 --- /dev/null +++ b/modules/hal/src/arithm_core.hpp @@ -0,0 
+1,657 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#ifndef __OPENCV_HAL_ARITHM_CORE_HPP__ +#define __OPENCV_HAL_ARITHM_CORE_HPP__ + +#include "arithm_simd.hpp" + +const uchar g_Saturate8u[] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255 +}; + + +#define CV_FAST_CAST_8U(t) (assert(-256 <= (t) && (t) <= 512), 
g_Saturate8u[(t)+256]) +#define CV_MIN_8U(a,b) ((a) - CV_FAST_CAST_8U((a) - (b))) +#define CV_MAX_8U(a,b) ((a) + CV_FAST_CAST_8U((b) - (a))) + +const float g_8x32fTab[] = +{ + -128.f, -127.f, -126.f, -125.f, -124.f, -123.f, -122.f, -121.f, + -120.f, -119.f, -118.f, -117.f, -116.f, -115.f, -114.f, -113.f, + -112.f, -111.f, -110.f, -109.f, -108.f, -107.f, -106.f, -105.f, + -104.f, -103.f, -102.f, -101.f, -100.f, -99.f, -98.f, -97.f, + -96.f, -95.f, -94.f, -93.f, -92.f, -91.f, -90.f, -89.f, + -88.f, -87.f, -86.f, -85.f, -84.f, -83.f, -82.f, -81.f, + -80.f, -79.f, -78.f, -77.f, -76.f, -75.f, -74.f, -73.f, + -72.f, -71.f, -70.f, -69.f, -68.f, -67.f, -66.f, -65.f, + -64.f, -63.f, -62.f, -61.f, -60.f, -59.f, -58.f, -57.f, + -56.f, -55.f, -54.f, -53.f, -52.f, -51.f, -50.f, -49.f, + -48.f, -47.f, -46.f, -45.f, -44.f, -43.f, -42.f, -41.f, + -40.f, -39.f, -38.f, -37.f, -36.f, -35.f, -34.f, -33.f, + -32.f, -31.f, -30.f, -29.f, -28.f, -27.f, -26.f, -25.f, + -24.f, -23.f, -22.f, -21.f, -20.f, -19.f, -18.f, -17.f, + -16.f, -15.f, -14.f, -13.f, -12.f, -11.f, -10.f, -9.f, + -8.f, -7.f, -6.f, -5.f, -4.f, -3.f, -2.f, -1.f, + 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, + 24.f, 25.f, 26.f, 27.f, 28.f, 29.f, 30.f, 31.f, + 32.f, 33.f, 34.f, 35.f, 36.f, 37.f, 38.f, 39.f, + 40.f, 41.f, 42.f, 43.f, 44.f, 45.f, 46.f, 47.f, + 48.f, 49.f, 50.f, 51.f, 52.f, 53.f, 54.f, 55.f, + 56.f, 57.f, 58.f, 59.f, 60.f, 61.f, 62.f, 63.f, + 64.f, 65.f, 66.f, 67.f, 68.f, 69.f, 70.f, 71.f, + 72.f, 73.f, 74.f, 75.f, 76.f, 77.f, 78.f, 79.f, + 80.f, 81.f, 82.f, 83.f, 84.f, 85.f, 86.f, 87.f, + 88.f, 89.f, 90.f, 91.f, 92.f, 93.f, 94.f, 95.f, + 96.f, 97.f, 98.f, 99.f, 100.f, 101.f, 102.f, 103.f, + 104.f, 105.f, 106.f, 107.f, 108.f, 109.f, 110.f, 111.f, + 112.f, 113.f, 114.f, 115.f, 116.f, 117.f, 118.f, 119.f, + 120.f, 121.f, 122.f, 123.f, 124.f, 125.f, 126.f, 127.f, + 128.f, 129.f, 130.f, 131.f, 132.f, 133.f, 134.f, 135.f, + 136.f, 137.f, 138.f, 139.f, 140.f, 141.f, 142.f, 143.f, + 144.f, 145.f, 146.f, 147.f, 148.f, 149.f, 150.f, 151.f, + 152.f, 153.f, 154.f, 155.f, 156.f, 157.f, 158.f, 159.f, + 160.f, 161.f, 162.f, 163.f, 164.f, 165.f, 166.f, 167.f, + 168.f, 169.f, 170.f, 171.f, 172.f, 173.f, 174.f, 175.f, + 176.f, 177.f, 178.f, 179.f, 180.f, 181.f, 182.f, 183.f, + 184.f, 185.f, 186.f, 187.f, 188.f, 189.f, 190.f, 191.f, + 192.f, 193.f, 194.f, 195.f, 196.f, 197.f, 198.f, 199.f, + 200.f, 201.f, 202.f, 203.f, 204.f, 205.f, 206.f, 207.f, + 208.f, 209.f, 210.f, 211.f, 212.f, 213.f, 214.f, 215.f, + 216.f, 217.f, 218.f, 219.f, 220.f, 221.f, 222.f, 223.f, + 224.f, 225.f, 226.f, 227.f, 228.f, 229.f, 230.f, 231.f, + 232.f, 233.f, 234.f, 235.f, 236.f, 237.f, 238.f, 239.f, + 240.f, 241.f, 242.f, 243.f, 244.f, 245.f, 246.f, 247.f, + 248.f, 249.f, 250.f, 251.f, 252.f, 253.f, 254.f, 255.f +}; + +#define CV_8TO32F(x) g_8x32fTab[(x)+128] + +namespace cv { + +template<> inline uchar OpAdd::operator ()(uchar a, uchar b) const +{ return CV_FAST_CAST_8U(a + b); } + +template<> inline uchar OpSub::operator ()(uchar a, uchar b) const +{ return CV_FAST_CAST_8U(a - b); } + +template<> inline short OpAbsDiff::operator ()(short a, short b) const +{ return saturate_cast(std::abs(a - b)); } + +template<> inline schar OpAbsDiff::operator ()(schar a, schar b) const +{ return saturate_cast(std::abs(a - b)); } + +template<> inline uchar OpMin::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); } + +template<> inline uchar OpMax::operator ()(uchar a, uchar b) const { 
return CV_MAX_8U(a, b); } + +} + +namespace cv { namespace hal { + +template +void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, int width, int height) +{ +#if CV_SSE2 || CV_NEON + VOp vop; +#endif + Op op; + + for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), + src2 = (const T *)((const uchar *)src2 + step2), + dst = (T *)((uchar *)dst + step) ) + { + int x = 0; + +#if CV_NEON || CV_SSE2 +#if CV_AVX2 + if( USE_AVX2 ) + { + for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) ) + { + typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); + r0 = vop(r0, VLoadStore256::load(src2 + x)); + VLoadStore256::store(dst + x, r0); + } + } +#else +#if CV_SSE2 + if( USE_SSE2 ) + { +#endif // CV_SSE2 + for( ; x <= width - 32/(int)sizeof(T); x += 32/sizeof(T) ) + { + typename VLoadStore128::reg_type r0 = VLoadStore128::load(src1 + x ); + typename VLoadStore128::reg_type r1 = VLoadStore128::load(src1 + x + 16/sizeof(T)); + r0 = vop(r0, VLoadStore128::load(src2 + x )); + r1 = vop(r1, VLoadStore128::load(src2 + x + 16/sizeof(T))); + VLoadStore128::store(dst + x , r0); + VLoadStore128::store(dst + x + 16/sizeof(T), r1); + } +#if CV_SSE2 + } +#endif // CV_SSE2 +#endif // CV_AVX2 +#endif // CV_NEON || CV_SSE2 + +#if CV_AVX2 + // nothing +#elif CV_SSE2 + if( USE_SSE2 ) + { + for( ; x <= width - 8/(int)sizeof(T); x += 8/sizeof(T) ) + { + typename VLoadStore64::reg_type r = VLoadStore64::load(src1 + x); + r = vop(r, VLoadStore64::load(src2 + x)); + VLoadStore64::store(dst + x, r); + } + } +#endif + +#if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + T v0 = op(src1[x], src2[x]); + T v1 = op(src1[x+1], src2[x+1]); + dst[x] = v0; dst[x+1] = v1; + v0 = op(src1[x+2], src2[x+2]); + v1 = op(src1[x+3], src2[x+3]); + dst[x+2] = v0; dst[x+3] = v1; + } +#endif + + for( ; x < width; x++ ) + dst[x] = op(src1[x], src2[x]); + } +} + +template +void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, int width, int height) +{ +#if CV_SSE2 || CV_NEON + Op32 op32; +#endif + Op op; + + for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), + src2 = (const T *)((const uchar *)src2 + step2), + dst = (T *)((uchar *)dst + step) ) + { + int x = 0; + +#if CV_AVX2 + if( USE_AVX2 ) + { + if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) + { + for( ; x <= width - 8; x += 8 ) + { + typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); + r0 = op32(r0, VLoadStore256Aligned::load(src2 + x)); + VLoadStore256Aligned::store(dst + x, r0); + } + } + } +#elif CV_SSE2 + if( USE_SSE2 ) + { + if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) + { + for( ; x <= width - 8; x += 8 ) + { + typename VLoadStore128Aligned::reg_type r0 = VLoadStore128Aligned::load(src1 + x ); + typename VLoadStore128Aligned::reg_type r1 = VLoadStore128Aligned::load(src1 + x + 4); + r0 = op32(r0, VLoadStore128Aligned::load(src2 + x )); + r1 = op32(r1, VLoadStore128Aligned::load(src2 + x + 4)); + VLoadStore128Aligned::store(dst + x , r0); + VLoadStore128Aligned::store(dst + x + 4, r1); + } + } + } +#endif // CV_AVX2 + +#if CV_NEON || CV_SSE2 +#if CV_AVX2 + if( USE_AVX2 ) + { + for( ; x <= width - 8; x += 8 ) + { + typename VLoadStore256::reg_type r0 = VLoadStore256::load(src1 + x); + r0 = op32(r0, VLoadStore256::load(src2 + x)); + VLoadStore256::store(dst + x, r0); + } + } +#else +#if CV_SSE2 + if( USE_SSE2 ) + { +#endif // CV_SSE2 + for( ; x <= width - 8; x += 8 ) + { + typename VLoadStore128::reg_type r0 = 
VLoadStore128::load(src1 + x ); + typename VLoadStore128::reg_type r1 = VLoadStore128::load(src1 + x + 4); + r0 = op32(r0, VLoadStore128::load(src2 + x )); + r1 = op32(r1, VLoadStore128::load(src2 + x + 4)); + VLoadStore128::store(dst + x , r0); + VLoadStore128::store(dst + x + 4, r1); + } +#if CV_SSE2 + } +#endif // CV_SSE2 +#endif // CV_AVX2 +#endif // CV_NEON || CV_SSE2 + +#if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + T v0 = op(src1[x], src2[x]); + T v1 = op(src1[x+1], src2[x+1]); + dst[x] = v0; dst[x+1] = v1; + v0 = op(src1[x+2], src2[x+2]); + v1 = op(src1[x+3], src2[x+3]); + dst[x+2] = v0; dst[x+3] = v1; + } +#endif + + for( ; x < width; x++ ) + dst[x] = op(src1[x], src2[x]); + } +} + + +template +void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, int width, int height) +{ +#if CV_SSE2 + Op64 op64; +#endif + Op op; + + for( ; height--; src1 = (const T *)((const uchar *)src1 + step1), + src2 = (const T *)((const uchar *)src2 + step2), + dst = (T *)((uchar *)dst + step) ) + { + int x = 0; + +#if CV_AVX2 + if( USE_AVX2 ) + { + if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 ) + { + for( ; x <= width - 4; x += 4 ) + { + typename VLoadStore256Aligned::reg_type r0 = VLoadStore256Aligned::load(src1 + x); + r0 = op64(r0, VLoadStore256Aligned::load(src2 + x)); + VLoadStore256Aligned::store(dst + x, r0); + } + } + } +#elif CV_SSE2 + if( USE_SSE2 ) + { + if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 ) + { + for( ; x <= width - 4; x += 4 ) + { + typename VLoadStore128Aligned::reg_type r0 = VLoadStore128Aligned::load(src1 + x ); + typename VLoadStore128Aligned::reg_type r1 = VLoadStore128Aligned::load(src1 + x + 2); + r0 = op64(r0, VLoadStore128Aligned::load(src2 + x )); + r1 = op64(r1, VLoadStore128Aligned::load(src2 + x + 2)); + VLoadStore128Aligned::store(dst + x , r0); + VLoadStore128Aligned::store(dst + x + 2, r1); + } + } + } +#endif + + for( ; x <= width - 4; x += 4 ) + { + T v0 = op(src1[x], src2[x]); + T v1 = op(src1[x+1], src2[x+1]); + dst[x] = v0; dst[x+1] = v1; + v0 = op(src1[x+2], src2[x+2]); + v1 = op(src1[x+3], src2[x+3]); + dst[x+2] = v0; dst[x+3] = v1; + } + + for( ; x < width; x++ ) + dst[x] = op(src1[x], src2[x]); + } +} + +template static void +cmp_(const T* src1, size_t step1, const T* src2, size_t step2, + uchar* dst, size_t step, int width, int height, int code) +{ + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + if( code == CMP_GE || code == CMP_LT ) + { + std::swap(src1, src2); + std::swap(step1, step2); + code = code == CMP_GE ? CMP_LE : CMP_GT; + } + + Cmp_SIMD vop(code); + + if( code == CMP_GT || code == CMP_LE ) + { + int m = code == CMP_GT ? 0 : 255; + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x = vop(src1, src2, dst, width); + #if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + int t0, t1; + t0 = -(src1[x] > src2[x]) ^ m; + t1 = -(src1[x+1] > src2[x+1]) ^ m; + dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; + t0 = -(src1[x+2] > src2[x+2]) ^ m; + t1 = -(src1[x+3] > src2[x+3]) ^ m; + dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; + } + #endif + for( ; x < width; x++ ) + dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m); + } + } + else if( code == CMP_EQ || code == CMP_NE ) + { + int m = code == CMP_EQ ? 
0 : 255; + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x = 0; + #if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + int t0, t1; + t0 = -(src1[x] == src2[x]) ^ m; + t1 = -(src1[x+1] == src2[x+1]) ^ m; + dst[x] = (uchar)t0; dst[x+1] = (uchar)t1; + t0 = -(src1[x+2] == src2[x+2]) ^ m; + t1 = -(src1[x+3] == src2[x+3]) ^ m; + dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1; + } + #endif + for( ; x < width; x++ ) + dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m); + } + } +} + +template static void +mul_( const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, int width, int height, WT scale ) +{ + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Mul_SIMD vop; + + if( scale == (WT)1. ) + { + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int i = vop(src1, src2, dst, width, scale); + #if CV_ENABLE_UNROLLED + for(; i <= width - 4; i += 4 ) + { + T t0; + T t1; + t0 = saturate_cast(src1[i ] * src2[i ]); + t1 = saturate_cast(src1[i+1] * src2[i+1]); + dst[i ] = t0; + dst[i+1] = t1; + + t0 = saturate_cast(src1[i+2] * src2[i+2]); + t1 = saturate_cast(src1[i+3] * src2[i+3]); + dst[i+2] = t0; + dst[i+3] = t1; + } + #endif + for( ; i < width; i++ ) + dst[i] = saturate_cast(src1[i] * src2[i]); + } + } + else + { + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int i = vop(src1, src2, dst, width, scale); + #if CV_ENABLE_UNROLLED + for(; i <= width - 4; i += 4 ) + { + T t0 = saturate_cast(scale*(WT)src1[i]*src2[i]); + T t1 = saturate_cast(scale*(WT)src1[i+1]*src2[i+1]); + dst[i] = t0; dst[i+1] = t1; + + t0 = saturate_cast(scale*(WT)src1[i+2]*src2[i+2]); + t1 = saturate_cast(scale*(WT)src1[i+3]*src2[i+3]); + dst[i+2] = t0; dst[i+3] = t1; + } + #endif + for( ; i < width; i++ ) + dst[i] = saturate_cast(scale*(WT)src1[i]*src2[i]); + } + } +} + + +template static void +div_i( const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, int width, int height, double scale ) +{ + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Div_SIMD vop; + float scale_f = (float)scale; + + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int i = vop(src1, src2, dst, width, scale); + for( ; i < width; i++ ) + { + T num = src1[i], denom = src2[i]; + dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; + } + } +} + +template static void +div_f( const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, int width, int height, double scale ) +{ + T scale_f = (T)scale; + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Div_SIMD vop; + + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int i = vop(src1, src2, dst, width, scale); + for( ; i < width; i++ ) + { + T num = src1[i], denom = src2[i]; + dst[i] = denom != 0 ? saturate_cast(num*scale_f/denom) : (T)0; + } + } +} + +template static void +recip_i( const T*, size_t, const T* src2, size_t step2, + T* dst, size_t step, int width, int height, double scale ) +{ + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Recip_SIMD vop; + float scale_f = (float)scale; + + for( ; height--; src2 += step2, dst += step ) + { + int i = vop(src2, dst, width, scale); + for( ; i < width; i++ ) + { + T denom = src2[i]; + dst[i] = denom != 0 ? 
saturate_cast(scale_f/denom) : (T)0; + } + } +} + +template static void +recip_f( const T*, size_t, const T* src2, size_t step2, + T* dst, size_t step, int width, int height, double scale ) +{ + T scale_f = (T)scale; + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + Recip_SIMD vop; + + for( ; height--; src2 += step2, dst += step ) + { + int i = vop(src2, dst, width, scale); + for( ; i < width; i++ ) + { + T denom = src2[i]; + dst[i] = denom != 0 ? saturate_cast(scale_f/denom) : (T)0; + } + } +} + +template static void +addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2, + T* dst, size_t step, int width, int height, void* _scalars ) +{ + const double* scalars = (const double*)_scalars; + WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2]; + step1 /= sizeof(src1[0]); + step2 /= sizeof(src2[0]); + step /= sizeof(dst[0]); + + AddWeighted_SIMD vop; + + for( ; height--; src1 += step1, src2 += step2, dst += step ) + { + int x = vop(src1, src2, dst, width, alpha, beta, gamma); + #if CV_ENABLE_UNROLLED + for( ; x <= width - 4; x += 4 ) + { + T t0 = saturate_cast(src1[x]*alpha + src2[x]*beta + gamma); + T t1 = saturate_cast(src1[x+1]*alpha + src2[x+1]*beta + gamma); + dst[x] = t0; dst[x+1] = t1; + + t0 = saturate_cast(src1[x+2]*alpha + src2[x+2]*beta + gamma); + t1 = saturate_cast(src1[x+3]*alpha + src2[x+3]*beta + gamma); + dst[x+2] = t0; dst[x+3] = t1; + } + #endif + for( ; x < width; x++ ) + dst[x] = saturate_cast(src1[x]*alpha + src2[x]*beta + gamma); + } +} + +}} // cv::hal:: + + +#endif // __OPENCV_HAL_ARITHM_CORE_HPP__ diff --git a/modules/hal/src/arithm_simd.hpp b/modules/hal/src/arithm_simd.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4e4029875c23dd973b6df691e377705717d98664 --- /dev/null +++ b/modules/hal/src/arithm_simd.hpp @@ -0,0 +1,2025 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_HAL_ARITHM_SIMD_HPP__ +#define __OPENCV_HAL_ARITHM_SIMD_HPP__ + +namespace cv { namespace hal { + +struct NOP {}; + +#if CV_SSE2 || CV_NEON +#define IF_SIMD(op) op +#else +#define IF_SIMD(op) NOP +#endif + + +#if CV_SSE2 || CV_NEON + +#define FUNCTOR_TEMPLATE(name) \ + template struct name {} + +FUNCTOR_TEMPLATE(VLoadStore128); +#if CV_SSE2 +FUNCTOR_TEMPLATE(VLoadStore64); +FUNCTOR_TEMPLATE(VLoadStore128Aligned); +#if CV_AVX2 +FUNCTOR_TEMPLATE(VLoadStore256); +FUNCTOR_TEMPLATE(VLoadStore256Aligned); +#endif +#endif + +#endif + +#if CV_AVX2 + +#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body) \ + template <> \ + struct name{ \ + typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ + static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ + } + +#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \ + template <> \ + struct name{ \ + typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body (p); } \ + static void store(template_arg * p, reg_type v) { store_body (p, v); } \ + } + +#define FUNCTOR_CLOSURE_2arg(name, template_arg, body) \ + template<> \ + struct name \ + { \ + VLoadStore256::reg_type operator()( \ + const VLoadStore256::reg_type & a, \ + const VLoadStore256::reg_type & b) const \ + { \ + body; \ + } \ + } + +#define FUNCTOR_CLOSURE_1arg(name, template_arg, body) \ + template<> \ + struct name \ + { \ + VLoadStore256::reg_type operator()( \ + const VLoadStore256::reg_type & a, \ + const VLoadStore256::reg_type & ) const \ + { \ + body; \ + } \ + } + +FUNCTOR_LOADSTORE_CAST(VLoadStore256, uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE_CAST(VLoadStore256, int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256); +FUNCTOR_LOADSTORE( VLoadStore256, float, __m256 , _mm256_loadu_ps , _mm256_storeu_ps ); +FUNCTOR_LOADSTORE( VLoadStore256, double, __m256d, _mm256_loadu_pd , _mm256_storeu_pd ); + +FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned, int, __m256i, _mm256_load_si256, _mm256_store_si256); +FUNCTOR_LOADSTORE( VLoadStore256Aligned, float, __m256 , _mm256_load_ps , _mm256_store_ps ); +FUNCTOR_LOADSTORE( VLoadStore256Aligned, double, __m256d, _mm256_load_pd , _mm256_store_pd ); + +FUNCTOR_TEMPLATE(VAdd); +FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm256_adds_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm256_adds_epi16(a, b)); 
+FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm256_add_epi32 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd (a, b)); + +FUNCTOR_TEMPLATE(VSub); +FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm256_subs_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm256_subs_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, short, return _mm256_subs_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, int, return _mm256_sub_epi32 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, float, return _mm256_sub_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd (a, b)); + +FUNCTOR_TEMPLATE(VMin); +FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm256_min_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMin, schar, return _mm256_min_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, short, return _mm256_min_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, int, return _mm256_min_epi32(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, float, return _mm256_min_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd (a, b)); + +FUNCTOR_TEMPLATE(VMax); +FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm256_max_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMax, schar, return _mm256_max_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, short, return _mm256_max_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, int, return _mm256_max_epi32(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, float, return _mm256_max_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd (a, b)); + + +static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, + 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; +static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff, + 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; + +FUNCTOR_TEMPLATE(VAbsDiff); +FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, + return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a)); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, + __m256i d = _mm256_subs_epi8(a, b); + __m256i m = _mm256_cmpgt_epi8(b, a); + return _mm256_subs_epi8(_mm256_xor_si256(d, m), m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, + return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a)); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, short, + __m256i M = _mm256_max_epi16(a, b); + __m256i m = _mm256_min_epi16(a, b); + return _mm256_subs_epi16(M, m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, int, + __m256i d = _mm256_sub_epi32(a, b); + __m256i m = _mm256_cmpgt_epi32(b, a); + return _mm256_sub_epi32(_mm256_xor_si256(d, m), m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, float, + return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, double, + return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask); + ); + +FUNCTOR_TEMPLATE(VAnd); +FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b)); +FUNCTOR_TEMPLATE(VOr); +FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b)); +FUNCTOR_TEMPLATE(VXor); +FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b)); +FUNCTOR_TEMPLATE(VNot); +FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a)); + +#elif CV_SSE2 + +#define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\ + template <> \ + struct name{ \ + 
typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \ + static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \ + } + +#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ + template <> \ + struct name{ \ + typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body (p); } \ + static void store(template_arg * p, reg_type v) { store_body (p, v); } \ + } + +#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ + template<> \ + struct name \ + { \ + VLoadStore128::reg_type operator()( \ + const VLoadStore128::reg_type & a, \ + const VLoadStore128::reg_type & b) const \ + { \ + body; \ + } \ + } + +#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ + template<> \ + struct name \ + { \ + VLoadStore128::reg_type operator()( \ + const VLoadStore128::reg_type & a, \ + const VLoadStore128::reg_type & ) const \ + { \ + body; \ + } \ + } + +FUNCTOR_LOADSTORE_CAST(VLoadStore128, uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128); +FUNCTOR_LOADSTORE_CAST(VLoadStore128, schar, __m128i, _mm_loadu_si128, _mm_storeu_si128); +FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128); +FUNCTOR_LOADSTORE_CAST(VLoadStore128, short, __m128i, _mm_loadu_si128, _mm_storeu_si128); +FUNCTOR_LOADSTORE_CAST(VLoadStore128, int, __m128i, _mm_loadu_si128, _mm_storeu_si128); +FUNCTOR_LOADSTORE( VLoadStore128, float, __m128 , _mm_loadu_ps , _mm_storeu_ps ); +FUNCTOR_LOADSTORE( VLoadStore128, double, __m128d, _mm_loadu_pd , _mm_storeu_pd ); + +FUNCTOR_LOADSTORE_CAST(VLoadStore64, uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); +FUNCTOR_LOADSTORE_CAST(VLoadStore64, schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64); +FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64); +FUNCTOR_LOADSTORE_CAST(VLoadStore64, short, __m128i, _mm_loadl_epi64, _mm_storel_epi64); + +FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned, int, __m128i, _mm_load_si128, _mm_store_si128); +FUNCTOR_LOADSTORE( VLoadStore128Aligned, float, __m128 , _mm_load_ps , _mm_store_ps ); +FUNCTOR_LOADSTORE( VLoadStore128Aligned, double, __m128d, _mm_load_pd , _mm_store_pd ); + +FUNCTOR_TEMPLATE(VAdd); +FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm_adds_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm_adds_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm_adds_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm_add_epi32 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm_add_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd (a, b)); + +FUNCTOR_TEMPLATE(VSub); +FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm_subs_epu8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm_subs_epi8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, short, return _mm_subs_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, int, return _mm_sub_epi32 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, float, return _mm_sub_ps (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd (a, b)); + +FUNCTOR_TEMPLATE(VMin); +FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, schar, + __m128i m = _mm_cmpgt_epi8(a, b); + return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); + ); +FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b))); 
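// [Editorial note, not part of the patch] SSE2 only provides _mm_min_epu8 and
// _mm_min_epi16 (and their max counterparts), so the remaining VMin/VMax
// specializations are emulated:
//   * the ushort min above uses saturating subtraction: a -sat (a -sat b) == min(a, b),
//     since (a -sat b) is a - b when a > b and 0 otherwise; the ushort max below
//     is the mirror image, (a -sat b) +sat b == max(a, b).
//   * the schar/int specializations use compare-and-blend: with m = (a > b) ? ~0 : 0,
//     a ^ ((a ^ b) & m) selects b where a > b (min), and swapping the comparison
//     operands gives max. Native epi8/epu16/epi32 min/max only arrive with SSE4.1.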
+FUNCTOR_CLOSURE_2arg(VMin, short, return _mm_min_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, int, + __m128i m = _mm_cmpgt_epi32(a, b); + return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); + ); +FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b)); + +FUNCTOR_TEMPLATE(VMax); +FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, schar, + __m128i m = _mm_cmpgt_epi8(b, a); + return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); + ); +FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b)); +FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, int, + __m128i m = _mm_cmpgt_epi32(b, a); + return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m)); + ); +FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b)); + + +static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; +static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff }; + +FUNCTOR_TEMPLATE(VAbsDiff); +FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, + return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, + __m128i d = _mm_subs_epi8(a, b); + __m128i m = _mm_cmpgt_epi8(b, a); + return _mm_subs_epi8(_mm_xor_si128(d, m), m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, + return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, short, + __m128i M = _mm_max_epi16(a, b); + __m128i m = _mm_min_epi16(a, b); + return _mm_subs_epi16(M, m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, int, + __m128i d = _mm_sub_epi32(a, b); + __m128i m = _mm_cmpgt_epi32(b, a); + return _mm_sub_epi32(_mm_xor_si128(d, m), m); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, float, + return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask); + ); +FUNCTOR_CLOSURE_2arg(VAbsDiff, double, + return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask); + ); + +FUNCTOR_TEMPLATE(VAnd); +FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b)); +FUNCTOR_TEMPLATE(VOr); +FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b)); +FUNCTOR_TEMPLATE(VXor); +FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b)); +FUNCTOR_TEMPLATE(VNot); +FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a)); +#endif + +#if CV_NEON + +#define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\ + template <> \ + struct name{ \ + typedef register_type reg_type; \ + static reg_type load(const template_arg * p) { return load_body (p);}; \ + static void store(template_arg * p, reg_type v) { store_body (p, v);}; \ + } + +#define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\ + template<> \ + struct name \ + { \ + VLoadStore128::reg_type operator()( \ + VLoadStore128::reg_type a, \ + VLoadStore128::reg_type b) const \ + { \ + return body; \ + }; \ + } + +#define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\ + template<> \ + struct name \ + { \ + VLoadStore128::reg_type operator()( \ + VLoadStore128::reg_type a, \ + VLoadStore128::reg_type ) const \ + { \ + return body; \ + }; \ + } + +FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 ); +FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 ); +FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, 
vst1q_u16); +FUNCTOR_LOADSTORE(VLoadStore128, short, int16x8_t, vld1q_s16, vst1q_s16); +FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32); +FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32); + +FUNCTOR_TEMPLATE(VAdd); +FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b)); +FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b)); + +FUNCTOR_TEMPLATE(VSub); +FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b)); +FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b)); +FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b)); + +FUNCTOR_TEMPLATE(VMin); +FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b)); +FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b)); + +FUNCTOR_TEMPLATE(VMax); +FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b)); +FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b)); +FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b)); + +FUNCTOR_TEMPLATE(VAbsDiff); +FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b)); +FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b))); +FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b)); +FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b))); +FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b)); +FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b)); + +FUNCTOR_TEMPLATE(VAnd); +FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b)); +FUNCTOR_TEMPLATE(VOr); +FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b)); +FUNCTOR_TEMPLATE(VXor); +FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b)); +FUNCTOR_TEMPLATE(VNot); +FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a )); +#endif + + +template +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int) + { + } + + int operator () (const T *, const T *, uchar *, int) const + { + return 0; + } +}; + +#if CV_NEON + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + // CV_Assert(code == CMP_GT || code == CMP_LE || + // code == CMP_EQ || code == CMP_NE); + + v_mask = vdupq_n_u8(255); + } + + int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); + else if (code == CMP_LE) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); + else if (code == CMP_EQ) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x))); + else if (code == CMP_NE) + for ( ; x <= width - 16; x += 16) + vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask)); + + return x; + } + + int code; + uint8x16_t v_mask; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + // CV_Assert(code == CMP_GT || 
code == CMP_LE || + // code == CMP_EQ || code == CMP_NE); + + v_mask = vdup_n_u8(255); + } + + int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, vmovn_u16(v_dst)); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, vmovn_u16(v_dst)); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, vmovn_u16(v_dst)); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x)); + vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask)); + } + + return x; + } + + int code; + uint8x8_t v_mask; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + // CV_Assert(code == CMP_GT || code == CMP_LE || + // code == CMP_EQ || code == CMP_NE); + + v_mask = vdup_n_u8(255); + } + + int operator () (const int * src1, const int * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x)); + uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4)); + uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); + vst1_u8(dst + x, veor_u8(v_dst, v_mask)); + } + + return x; + } + + int code; + uint8x8_t v_mask; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + // CV_Assert(code == CMP_GT || code == CMP_LE || + // code == CMP_EQ || code == CMP_NE); + + v_mask = vdup_n_u8(255); + } + + int operator () (const float * src1, const float * src2, uchar * dst, int width) const + { + int x = 0; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), 
vmovn_u32(v_dst2)))); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)))); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))); + vst1_u8(dst + x, veor_u8(v_dst, v_mask)); + } + + return x; + } + + int code; + uint8x8_t v_mask; +}; + +#elif CV_SSE2 + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + // CV_Assert(code == CMP_GT || code == CMP_LE || + // code == CMP_EQ || code == CMP_NE); + + haveSSE = checkHardwareSupport(CV_CPU_SSE2); + + v_mask = _mm_set1_epi8(-1); + } + + int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const + { + int x = 0; + + if (!haveSSE) + return x; + + if (code == CMP_GT) + for ( ; x <= width - 16; x += 16) + _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x)))); + else if (code == CMP_LE) + for ( ; x <= width - 16; x += 16) + { + __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt)); + } + else if (code == CMP_EQ) + for ( ; x <= width - 16; x += 16) + _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x)))); + else if (code == CMP_NE) + for ( ; x <= width - 16; x += 16) + { + __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq)); + } + + return x; + } + + int code; + __m128i v_mask; + bool haveSSE; +}; + +template <> +struct Cmp_SIMD +{ + explicit Cmp_SIMD(int code_) : + code(code_) + { + // CV_Assert(code == CMP_GT || code == CMP_LE || + // code == CMP_EQ || code == CMP_NE); + + haveSSE = checkHardwareSupport(CV_CPU_SSE2); + + v_mask = _mm_set1_epi32(0xffffffff); + } + + int operator () (const int * src1, const int * src2, uchar * dst, int width) const + { + int x = 0; + + if (!haveSSE) + return x; + + if (code == CMP_GT) + for ( ; x <= width - 8; x += 8) + { + __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); + } + else if (code == CMP_LE) + for ( ; x <= width - 8; x += 8) + { + __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask)); + } + else if (code == CMP_EQ) + for ( ; x <= width - 8; 
x += 8) + { + __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)); + } + else if (code == CMP_NE) + for ( ; x <= width - 8; x += 8) + { + __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)), + _mm_loadu_si128((const __m128i *)(src2 + x))); + __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)), + _mm_loadu_si128((const __m128i *)(src2 + x + 4))); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask))); + } + + return x; + } + + int code; + __m128i v_mask; + bool haveSSE; +}; + +#endif + + +template +struct Mul_SIMD +{ + int operator() (const T *, const T *, T *, int, WT) const + { + return 0; + } +}; + +#if CV_NEON + +template <> +struct Mul_SIMD +{ + int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); + uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x)); + uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1_u8(dst + x, vqmovn_u16(v_dst)); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); + int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x)); + int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x)); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + 
vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1_s8(dst + x, vqmovn_s16(v_dst)); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)), + vqmovn_u32(cv_vrndq_u32_f32(v_dst2))); + vst1q_u16(dst + x, v_dst); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); + + float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2)))); + v_dst1 = vmulq_f32(v_dst1, v_scale); + float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2)))); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)), + vqmovn_s32(cv_vrndq_s32_f32(v_dst2))); + vst1q_s16(dst + x, v_dst); + } + } + + return x; + } +}; + +template <> +struct Mul_SIMD +{ + int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const + { + int x = 0; + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + float32x4_t v_dst2 = 
vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + vst1q_f32(dst + x, v_dst1); + vst1q_f32(dst + x + 4, v_dst2); + } + else + { + float32x4_t v_scale = vdupq_n_f32(scale); + for ( ; x <= width - 8; x += 8) + { + float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x)); + v_dst1 = vmulq_f32(v_dst1, v_scale); + + float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)); + v_dst2 = vmulq_f32(v_dst2, v_scale); + + vst1q_f32(dst + x, v_dst1); + vst1q_f32(dst + x + 4, v_dst2); + } + } + + return x; + } +}; + +#elif CV_SSE2 + +#if CV_SSE4_1 + +template <> +struct Mul_SIMD +{ + Mul_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128i v_zero = _mm_setzero_si128(); + + if( scale != 1.0f ) + { + __m128 v_scale = _mm_set1_ps(scale); + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); + __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); + + __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), + _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero))); + v_dst1 = _mm_mul_ps(v_dst1, v_scale); + + __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), + _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero))); + v_dst2 = _mm_mul_ps(v_dst2, v_scale); + + __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); + _mm_storeu_si128((__m128i *)(dst + x), v_dsti); + } + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template <> +struct Mul_SIMD +{ + Mul_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128i v_zero = _mm_setzero_si128(); + + if( scale == 1.0f ) + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); + __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); + + v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); + v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); + + __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), + _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); + + __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), + _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); + + __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); + } + else + { + __m128 v_scale = _mm_set1_ps(scale); + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x)); + __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x)); + + v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); + v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); + + __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), + _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); + v_dst1 = _mm_mul_ps(v_dst1, v_scale); + + __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), + 
_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); + v_dst2 = _mm_mul_ps(v_dst2, v_scale); + + __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero)); + } + } + + return x; + } + + bool haveSSE; +}; + +template <> +struct Mul_SIMD +{ + Mul_SIMD() + { + haveSSE = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const + { + int x = 0; + + if (!haveSSE) + return x; + + __m128i v_zero = _mm_setzero_si128(); + + if( scale != 1.0f ) + { + __m128 v_scale = _mm_set1_ps(scale); + for ( ; x <= width - 8; x += 8) + { + __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x)); + __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x)); + + __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), + _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16))); + v_dst1 = _mm_mul_ps(v_dst1, v_scale); + + __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), + _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16))); + v_dst2 = _mm_mul_ps(v_dst2, v_scale); + + __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)); + _mm_storeu_si128((__m128i *)(dst + x), v_dsti); + } + } + + return x; + } + + bool haveSSE; +}; + +#endif + +template +struct Div_SIMD +{ + int operator() (const T *, const T *, T *, int, double) const + { + return 0; + } +}; + +template +struct Recip_SIMD +{ + int operator() (const T *, T *, int, double) const + { + return 0; + } +}; + + +#if CV_SIMD128 + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); + + for ( ; x <= width - 8; x += 8) + { + v_uint16x8 v_src1 = v_load_expand(src1 + x); + v_uint16x8 v_src2 = v_load_expand(src2 + x); + + v_uint32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); + + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + + v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); + v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); + } + + return x; + } +}; + + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); + + for ( ; x <= width - 8; x += 8) + { + v_int16x8 v_src1 = v_load_expand(src1 + x); + v_int16x8 v_src2 = v_load_expand(src2 + x); + + v_int32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + v_float32x4 f2 = 
v_cvt_f32(t2); + v_float32x4 f3 = v_cvt_f32(t3); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); + } + + return x; + } +}; + + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); + + for ( ; x <= width - 8; x += 8) + { + v_uint16x8 v_src1 = v_load(src1 + x); + v_uint16x8 v_src2 = v_load(src2 + x); + + v_uint32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); + + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + + v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2)); + v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3)); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); + } + + return x; + } +}; + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); + + for ( ; x <= width - 8; x += 8) + { + v_int16x8 v_src1 = v_load(src1 + x); + v_int16x8 v_src2 = v_load(src2 + x); + + v_int32x4 t0, t1, t2, t3; + v_expand(v_src1, t0, t1); + v_expand(v_src2, t2, t3); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + v_float32x4 f2 = v_cvt_f32(t2); + v_float32x4 f3 = v_cvt_f32(t3); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); + } + + return x; + } +}; + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int32x4 v_zero = v_setzero_s32(); + + for ( ; x <= width - 8; x += 8) + { + v_int32x4 t0 = v_load(src1 + x); + v_int32x4 t1 = v_load(src1 + x + 4); + v_int32x4 t2 = v_load(src2 + x); + v_int32x4 t3 = v_load(src2 + x + 4); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + v_float32x4 f2 = v_cvt_f32(t2); + v_float32x4 f3 = v_cvt_f32(t3); + + f0 = f0 * v_scale / f2; + f1 = f1 * v_scale / f3; + + v_int32x4 res0 = v_round(f0), res1 = v_round(f1); + + res0 = v_select(t2 == v_zero, v_zero, res0); + res1 = v_select(t3 == v_zero, v_zero, res1); + v_store(dst + x, res0); + v_store(dst + x + 4, res1); + } + + return x; + } +}; + + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const float * src1, 
const float * src2, float * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_float32x4 v_zero = v_setzero_f32(); + + for ( ; x <= width - 8; x += 8) + { + v_float32x4 f0 = v_load(src1 + x); + v_float32x4 f1 = v_load(src1 + x + 4); + v_float32x4 f2 = v_load(src2 + x); + v_float32x4 f3 = v_load(src2 + x + 4); + + v_float32x4 res0 = f0 * v_scale / f2; + v_float32x4 res1 = f1 * v_scale / f3; + + res0 = v_select(f2 == v_zero, v_zero, res0); + res1 = v_select(f3 == v_zero, v_zero, res1); + + v_store(dst + x, res0); + v_store(dst + x + 4, res1); + } + + return x; + } +}; + + +///////////////////////// RECIPROCAL ////////////////////// + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const uchar * src2, uchar * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); + + for ( ; x <= width - 8; x += 8) + { + v_uint16x8 v_src2 = v_load_expand(src2 + x); + + v_uint32x4 t0, t1; + v_expand(v_src2, t0, t1); + + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); + } + + return x; + } +}; + + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const schar * src2, schar * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); + + for ( ; x <= width - 8; x += 8) + { + v_int16x8 v_src2 = v_load_expand(src2 + x); + + v_int32x4 t0, t1; + v_expand(v_src2, t0, t1); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_pack_store(dst + x, res); + } + + return x; + } +}; + + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const ushort * src2, ushort * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_uint16x8 v_zero = v_setzero_u16(); + + for ( ; x <= width - 8; x += 8) + { + v_uint16x8 v_src2 = v_load(src2 + x); + + v_uint32x4 t0, t1; + v_expand(v_src2, t0, t1); + + v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0)); + v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1)); + + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_uint16x8 res = v_pack_u(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); + } + + return x; + } +}; + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const short * src2, short * dst, int width, double scale) const + { + int x = 0; + + if 
(!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int16x8 v_zero = v_setzero_s16(); + + for ( ; x <= width - 8; x += 8) + { + v_int16x8 v_src2 = v_load(src2 + x); + + v_int32x4 t0, t1; + v_expand(v_src2, t0, t1); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 i0 = v_round(f0), i1 = v_round(f1); + v_int16x8 res = v_pack(i0, i1); + + res = v_select(v_src2 == v_zero, v_zero, res); + v_store(dst + x, res); + } + + return x; + } +}; + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const int * src2, int * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_int32x4 v_zero = v_setzero_s32(); + + for ( ; x <= width - 8; x += 8) + { + v_int32x4 t0 = v_load(src2 + x); + v_int32x4 t1 = v_load(src2 + x + 4); + + v_float32x4 f0 = v_cvt_f32(t0); + v_float32x4 f1 = v_cvt_f32(t1); + + f0 = v_scale / f0; + f1 = v_scale / f1; + + v_int32x4 res0 = v_round(f0), res1 = v_round(f1); + + res0 = v_select(t0 == v_zero, v_zero, res0); + res1 = v_select(t1 == v_zero, v_zero, res1); + v_store(dst + x, res0); + v_store(dst + x + 4, res1); + } + + return x; + } +}; + + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const float * src2, float * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float32x4 v_scale = v_setall_f32((float)scale); + v_float32x4 v_zero = v_setzero_f32(); + + for ( ; x <= width - 8; x += 8) + { + v_float32x4 f0 = v_load(src2 + x); + v_float32x4 f1 = v_load(src2 + x + 4); + + v_float32x4 res0 = v_scale / f0; + v_float32x4 res1 = v_scale / f1; + + res0 = v_select(f0 == v_zero, v_zero, res0); + res1 = v_select(f1 == v_zero, v_zero, res1); + + v_store(dst + x, res0); + v_store(dst + x + 4, res1); + } + + return x; + } +}; + +#if CV_SIMD128_64F + +template <> +struct Div_SIMD +{ + bool haveSIMD; + Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float64x2 v_scale = v_setall_f64(scale); + v_float64x2 v_zero = v_setzero_f64(); + + for ( ; x <= width - 4; x += 4) + { + v_float64x2 f0 = v_load(src1 + x); + v_float64x2 f1 = v_load(src1 + x + 2); + v_float64x2 f2 = v_load(src2 + x); + v_float64x2 f3 = v_load(src2 + x + 2); + + v_float64x2 res0 = f0 * v_scale / f2; + v_float64x2 res1 = f1 * v_scale / f3; + + res0 = v_select(f0 == v_zero, v_zero, res0); + res1 = v_select(f1 == v_zero, v_zero, res1); + + v_store(dst + x, res0); + v_store(dst + x + 2, res1); + } + + return x; + } +}; + +template <> +struct Recip_SIMD +{ + bool haveSIMD; + Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); } + + int operator() (const double * src2, double * dst, int width, double scale) const + { + int x = 0; + + if (!haveSIMD) + return x; + + v_float64x2 v_scale = v_setall_f64(scale); + v_float64x2 v_zero = v_setzero_f64(); + + for ( ; x <= width - 4; x += 4) + { + v_float64x2 f0 = v_load(src2 + x); + v_float64x2 f1 = v_load(src2 + x + 2); + + v_float64x2 res0 = v_scale / f0; + v_float64x2 
res1 = v_scale / f1; + + res0 = v_select(f0 == v_zero, v_zero, res0); + res1 = v_select(f1 == v_zero, v_zero, res1); + + v_store(dst + x, res0); + v_store(dst + x + 2, res1); + } + + return x; + } +}; + +#endif + +#endif + + +template +struct AddWeighted_SIMD +{ + int operator() (const T *, const T *, T *, int, WT, WT, WT) const + { + return 0; + } +}; + +#if CV_SSE2 + +template <> +struct AddWeighted_SIMD +{ + AddWeighted_SIMD() + { + haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + if (!haveSSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), + v_gamma = _mm_set1_ps(gamma); + + for( ; x <= width - 8; x += 8 ) + { + __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x)); + __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x)); + + __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8); + __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8); + + __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha); + v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), + _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta)); + + __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha); + v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), + _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta)); + + __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), + _mm_cvtps_epi32(v_dstf1)); + + _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero)); + } + + return x; + } + + bool haveSSE2; +}; + +template <> +struct AddWeighted_SIMD +{ + AddWeighted_SIMD() + { + haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + if (!haveSSE2) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), + v_gamma = _mm_set1_ps(gamma); + + for( ; x <= width - 8; x += 8 ) + { + __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); + __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + + __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha); + v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), + _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta)); + + __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha); + v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), + _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta)); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0), + _mm_cvtps_epi32(v_dstf1))); + } + + return x; + } + + bool haveSSE2; +}; + +#if CV_SSE4_1 + +template <> +struct AddWeighted_SIMD +{ + AddWeighted_SIMD() + { + haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); + } + + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + if (!haveSSE4_1) + return x; + + __m128i v_zero = _mm_setzero_si128(); + __m128 
v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta), + v_gamma = _mm_set1_ps(gamma); + + for( ; x <= width - 8; x += 8 ) + { + __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x)); + __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x)); + + __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha); + v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma), + _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta)); + + __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha); + v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma), + _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta)); + + _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0), + _mm_cvtps_epi32(v_dstf1))); + } + + return x; + } + + bool haveSSE4_1; +}; + +#endif + +#elif CV_NEON + +template <> +struct AddWeighted_SIMD +{ + int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + float32x4_t g = vdupq_n_f32 (gamma); + + for( ; x <= width - 8; x += 8 ) + { + int8x8_t in1 = vld1_s8(src1 + x); + int16x8_t in1_16 = vmovl_s8(in1); + float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16))); + float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16))); + + int8x8_t in2 = vld1_s8(src2+x); + int16x8_t in2_16 = vmovl_s8(in2); + float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16))); + float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16))); + + float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta)); + float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta)); + out_f_l = vaddq_f32(out_f_l, g); + out_f_h = vaddq_f32(out_f_h, g); + + int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l)); + int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h)); + + int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h); + int8x8_t out = vqmovn_s16(out_16); + + vst1_s8(dst + x, out); + } + + return x; + } +}; + +template <> +struct AddWeighted_SIMD +{ + int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + float32x4_t g = vdupq_n_f32(gamma); + + for( ; x <= width - 8; x += 8 ) + { + uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x); + + float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha); + float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta); + uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha); + v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta); + uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2)); + } + + return x; + } +}; + +template <> +struct AddWeighted_SIMD +{ + int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const + { + int x = 0; + + float32x4_t g = vdupq_n_f32(gamma); + + for( ; x <= width - 8; x += 8 ) + { + int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x); + + float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha); + float32x4_t v_s2 = 
vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta); + int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha); + v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta); + int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g))); + + vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2)); + } + + return x; + } +}; + +#endif + +}} + +#endif // __OPENCV_HAL_ARITHM_SIMD_HPP__ diff --git a/modules/hal/src/hardware.cpp b/modules/hal/src/hardware.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6a08b9f44af965c192174ebc06b363e1fab8a7ef --- /dev/null +++ b/modules/hal/src/hardware.cpp @@ -0,0 +1,221 @@ +#include "precomp.hpp" + +#if defined WIN32 || defined _WIN32 || defined WINCE +#include +#if defined _MSC_VER + #if _MSC_VER >= 1400 + #include + #elif defined _M_IX86 + static void __cpuid(int* cpuid_data, int) + { + __asm + { + push ebx + push edi + mov edi, cpuid_data + mov eax, 1 + cpuid + mov [edi], eax + mov [edi + 4], ebx + mov [edi + 8], ecx + mov [edi + 12], edx + pop edi + pop ebx + } + } + static void __cpuidex(int* cpuid_data, int, int) + { + __asm + { + push edi + mov edi, cpuid_data + mov eax, 7 + mov ecx, 0 + cpuid + mov [edi], eax + mov [edi + 4], ebx + mov [edi + 8], ecx + mov [edi + 12], edx + pop edi + } + } + #endif +#endif +#endif + +#if defined ANDROID || defined __linux__ +# include +# include +# include +# include +#endif + +#if defined __linux__ || defined __APPLE__ || defined __EMSCRIPTEN__ +#include +#include +#include +#if defined ANDROID +#include +#endif +#endif + +#ifdef ANDROID +# include +#endif + +struct HWFeatures +{ + enum { MAX_FEATURE = CV_HARDWARE_MAX_FEATURE }; + + HWFeatures(void) + { + memset( have, 0, sizeof(have) ); + x86_family = 0; + } + + static HWFeatures initialize(void) + { + HWFeatures f; + int cpuid_data[4] = { 0, 0, 0, 0 }; + + #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64) + __cpuid(cpuid_data, 1); + #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) + #ifdef __x86_64__ + asm __volatile__ + ( + "movl $1, %%eax\n\t" + "cpuid\n\t" + :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3]) + : + : "cc" + ); + #else + asm volatile + ( + "pushl %%ebx\n\t" + "movl $1,%%eax\n\t" + "cpuid\n\t" + "popl %%ebx\n\t" + : "=a"(cpuid_data[0]), "=c"(cpuid_data[2]), "=d"(cpuid_data[3]) + : + : "cc" + ); + #endif + #endif + + f.x86_family = (cpuid_data[0] >> 8) & 15; + if( f.x86_family >= 6 ) + { + f.have[CV_CPU_MMX] = (cpuid_data[3] & (1 << 23)) != 0; + f.have[CV_CPU_SSE] = (cpuid_data[3] & (1<<25)) != 0; + f.have[CV_CPU_SSE2] = (cpuid_data[3] & (1<<26)) != 0; + f.have[CV_CPU_SSE3] = (cpuid_data[2] & (1<<0)) != 0; + f.have[CV_CPU_SSSE3] = (cpuid_data[2] & (1<<9)) != 0; + f.have[CV_CPU_FMA3] = (cpuid_data[2] & (1<<12)) != 0; + f.have[CV_CPU_SSE4_1] = (cpuid_data[2] & (1<<19)) != 0; + f.have[CV_CPU_SSE4_2] = (cpuid_data[2] & (1<<20)) != 0; + f.have[CV_CPU_POPCNT] = (cpuid_data[2] & (1<<23)) != 0; + f.have[CV_CPU_AVX] = (((cpuid_data[2] & (1<<28)) != 0)&&((cpuid_data[2] & (1<<27)) != 0));//OS uses XSAVE_XRSTORE and CPU support AVX + + // make the second call to the cpuid command in order to get + // information about extended features like AVX2 + #if defined _MSC_VER && (defined _M_IX86 || defined _M_X64) + __cpuidex(cpuid_data, 7, 0); + #elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) + 
#ifdef __x86_64__ + asm __volatile__ + ( + "movl $7, %%eax\n\t" + "movl $0, %%ecx\n\t" + "cpuid\n\t" + :[eax]"=a"(cpuid_data[0]),[ebx]"=b"(cpuid_data[1]),[ecx]"=c"(cpuid_data[2]),[edx]"=d"(cpuid_data[3]) + : + : "cc" + ); + #else + asm volatile + ( + "pushl %%ebx\n\t" + "movl $7,%%eax\n\t" + "movl $0,%%ecx\n\t" + "cpuid\n\t" + "movl %%ebx, %0\n\t" + "popl %%ebx\n\t" + : "=r"(cpuid_data[1]), "=c"(cpuid_data[2]) + : + : "cc" + ); + #endif + #endif + f.have[CV_CPU_AVX2] = (cpuid_data[1] & (1<<5)) != 0; + + f.have[CV_CPU_AVX_512F] = (cpuid_data[1] & (1<<16)) != 0; + f.have[CV_CPU_AVX_512DQ] = (cpuid_data[1] & (1<<17)) != 0; + f.have[CV_CPU_AVX_512IFMA512] = (cpuid_data[1] & (1<<21)) != 0; + f.have[CV_CPU_AVX_512PF] = (cpuid_data[1] & (1<<26)) != 0; + f.have[CV_CPU_AVX_512ER] = (cpuid_data[1] & (1<<27)) != 0; + f.have[CV_CPU_AVX_512CD] = (cpuid_data[1] & (1<<28)) != 0; + f.have[CV_CPU_AVX_512BW] = (cpuid_data[1] & (1<<30)) != 0; + f.have[CV_CPU_AVX_512VL] = (cpuid_data[1] & (1<<31)) != 0; + f.have[CV_CPU_AVX_512VBMI] = (cpuid_data[2] & (1<<1)) != 0; + } + + #if defined ANDROID || defined __linux__ + #ifdef __aarch64__ + f.have[CV_CPU_NEON] = true; + #else + int cpufile = open("/proc/self/auxv", O_RDONLY); + + if (cpufile >= 0) + { + Elf32_auxv_t auxv; + const size_t size_auxv_t = sizeof(auxv); + + while ((size_t)read(cpufile, &auxv, size_auxv_t) == size_auxv_t) + { + if (auxv.a_type == AT_HWCAP) + { + f.have[CV_CPU_NEON] = (auxv.a_un.a_val & 4096) != 0; + break; + } + } + + close(cpufile); + } + #endif + #elif (defined __clang__ || defined __APPLE__) && (defined __ARM_NEON__ || (defined __ARM_NEON && defined __aarch64__)) + f.have[CV_CPU_NEON] = true; + #endif + + return f; + } + + int x86_family; + bool have[MAX_FEATURE+1]; +}; + +static HWFeatures featuresEnabled = HWFeatures::initialize(), featuresDisabled = HWFeatures(); +static HWFeatures* currentFeatures = &featuresEnabled; +volatile bool useOptimizedFlag = true; + +namespace cv { namespace hal { + +bool checkHardwareSupport(int feature) +{ +// CV_DbgAssert( 0 <= feature && feature <= CV_HARDWARE_MAX_FEATURE ); + return currentFeatures->have[feature]; +} + +void setUseOptimized( bool flag ) +{ + useOptimizedFlag = flag; + currentFeatures = flag ? &featuresEnabled : &featuresDisabled; +} + +bool useOptimized(void) +{ + return useOptimizedFlag; +} + +}} diff --git a/modules/hal/src/merge.cpp b/modules/hal/src/merge.cpp new file mode 100644 index 0000000000000000000000000000000000000000..982b24c2505cffaf33c02e46d23da62a4b93d14c --- /dev/null +++ b/modules/hal/src/merge.cpp @@ -0,0 +1,408 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +#if CV_NEON +template struct VMerge2; +template struct VMerge3; +template struct VMerge4; + +#define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name{ \ + void operator()(const data_type* src0, const data_type* src1, \ + data_type* dst){ \ + reg_type r; \ + r.val[0] = load_func(src0); \ + r.val[1] = load_func(src1); \ + store_func(dst, r); \ + } \ + } + +#define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name{ \ + void operator()(const data_type* src0, const data_type* src1, \ + const data_type* src2, data_type* dst){ \ + reg_type r; \ + r.val[0] = load_func(src0); \ + r.val[1] = load_func(src1); \ + r.val[2] = load_func(src2); \ + store_func(dst, r); \ + } \ + } + +#define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name{ \ + void operator()(const data_type* src0, const data_type* src1, \ + const data_type* src2, const data_type* src3, \ + data_type* dst){ \ + reg_type r; \ + r.val[0] = load_func(src0); \ + r.val[1] = load_func(src1); \ + r.val[2] = load_func(src2); \ + r.val[3] = load_func(src3); \ + store_func(dst, r); \ + } \ + } + +MERGE2_KERNEL_TEMPLATE(VMerge2, uchar , uint8x16x2_t, vld1q_u8 , vst2q_u8 ); +MERGE2_KERNEL_TEMPLATE(VMerge2, ushort, uint16x8x2_t, vld1q_u16, vst2q_u16); +MERGE2_KERNEL_TEMPLATE(VMerge2, int , int32x4x2_t, vld1q_s32, vst2q_s32); +MERGE2_KERNEL_TEMPLATE(VMerge2, int64 , int64x1x2_t, vld1_s64 , vst2_s64 ); + +MERGE3_KERNEL_TEMPLATE(VMerge3, uchar , uint8x16x3_t, vld1q_u8 , vst3q_u8 ); +MERGE3_KERNEL_TEMPLATE(VMerge3, ushort, uint16x8x3_t, vld1q_u16, vst3q_u16); +MERGE3_KERNEL_TEMPLATE(VMerge3, int , int32x4x3_t, vld1q_s32, vst3q_s32); +MERGE3_KERNEL_TEMPLATE(VMerge3, int64 , int64x1x3_t, vld1_s64 , vst3_s64 ); + +MERGE4_KERNEL_TEMPLATE(VMerge4, uchar , uint8x16x4_t, vld1q_u8 , vst4q_u8 ); +MERGE4_KERNEL_TEMPLATE(VMerge4, ushort, uint16x8x4_t, vld1q_u16, vst4q_u16); 
+MERGE4_KERNEL_TEMPLATE(VMerge4, int , int32x4x4_t, vld1q_s32, vst4q_s32); +MERGE4_KERNEL_TEMPLATE(VMerge4, int64 , int64x1x4_t, vld1_s64 , vst4_s64 ); + +#elif CV_SSE2 + +template +struct VMerge2 +{ + VMerge2() : support(false) { } + void operator()(const T *, const T *, T *) const { } + + bool support; +}; + +template +struct VMerge3 +{ + VMerge3() : support(false) { } + void operator()(const T *, const T *, const T *, T *) const { } + + bool support; +}; + +template +struct VMerge4 +{ + VMerge4() : support(false) { } + void operator()(const T *, const T *, const T *, const T *, T *) const { } + + bool support; +}; + +#define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ +template <> \ +struct VMerge2 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VMerge2() \ + { \ + support = checkHardwareSupport(se); \ + } \ + \ + void operator()(const data_type * src0, const data_type * src1, \ + data_type * dst) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ + reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ + reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ + \ + _mm_interleave(v_src0, v_src1, v_src2, v_src3); \ + \ + _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ + } \ + \ + bool support; \ +} + +#define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ +template <> \ +struct VMerge3 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VMerge3() \ + { \ + support = checkHardwareSupport(se); \ + } \ + \ + void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\ + data_type * dst) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ + reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ + reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ + reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ + reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ + \ + _mm_interleave(v_src0, v_src1, v_src2, \ + v_src3, v_src4, v_src5); \ + \ + _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ + } \ + \ + bool support; \ +} + +#define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \ +template <> \ +struct VMerge4 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VMerge4() \ + { \ + support = checkHardwareSupport(se); \ + } \ + \ + void operator()(const data_type * src0, const data_type * src1, \ + const data_type * src2, const data_type * src3, \ + data_type * dst) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0)); \ + reg_type v_src1 = 
_mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1)); \ + reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC)); \ + reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2)); \ + reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC)); \ + reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3)); \ + reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC)); \ + \ + _mm_interleave(v_src0, v_src1, v_src2, v_src3, \ + v_src4, v_src5, v_src6, v_src7); \ + \ + _mm_storeu_##flavor((cast_type *)(dst), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6); \ + _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7); \ + } \ + \ + bool support; \ +} + +MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); +MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); +MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2); + +#if CV_SSE4_1 +MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); +MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); +MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1); +#endif + +MERGE2_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); +MERGE3_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); +MERGE4_KERNEL_TEMPLATE( int, __m128, float, _mm_interleave_ps, ps, CV_CPU_SSE2); + +#endif + +template static void +merge_( const T** src, T* dst, int len, int cn ) +{ + int k = cn % 4 ? cn % 4 : 4; + int i, j; + if( k == 1 ) + { + const T* src0 = src[0]; + for( i = j = 0; i < len; i++, j += cn ) + dst[j] = src0[i]; + } + else if( k == 2 ) + { + const T *src0 = src[0], *src1 = src[1]; + i = j = 0; +#if CV_NEON + if(cn == 2) + { + int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); + int inc_j = 2 * inc_i; + + VMerge2 vmerge; + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, dst + j); + } +#elif CV_SSE2 + if(cn == 2) + { + int inc_i = 32/sizeof(T); + int inc_j = 2 * inc_i; + + VMerge2 vmerge; + if (vmerge.support) + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, dst + j); + } +#endif + for( ; i < len; i++, j += cn ) + { + dst[j] = src0[i]; + dst[j+1] = src1[i]; + } + } + else if( k == 3 ) + { + const T *src0 = src[0], *src1 = src[1], *src2 = src[2]; + i = j = 0; +#if CV_NEON + if(cn == 3) + { + int inc_i = (sizeof(T) == 8)? 
1: 16/sizeof(T); + int inc_j = 3 * inc_i; + + VMerge3 vmerge; + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, src2 + i, dst + j); + } +#elif CV_SSE2 + if(cn == 3) + { + int inc_i = 32/sizeof(T); + int inc_j = 3 * inc_i; + + VMerge3 vmerge; + if (vmerge.support) + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, src2 + i, dst + j); + } +#endif + for( ; i < len; i++, j += cn ) + { + dst[j] = src0[i]; + dst[j+1] = src1[i]; + dst[j+2] = src2[i]; + } + } + else + { + const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3]; + i = j = 0; +#if CV_NEON + if(cn == 4) + { + int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); + int inc_j = 4 * inc_i; + + VMerge4 vmerge; + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); + } +#elif CV_SSE2 + if(cn == 4) + { + int inc_i = 32/sizeof(T); + int inc_j = 4 * inc_i; + + VMerge4 vmerge; + if (vmerge.support) + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j); + } +#endif + for( ; i < len; i++, j += cn ) + { + dst[j] = src0[i]; dst[j+1] = src1[i]; + dst[j+2] = src2[i]; dst[j+3] = src3[i]; + } + } + + for( ; k < cn; k += 4 ) + { + const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3]; + for( i = 0, j = k; i < len; i++, j += cn ) + { + dst[j] = src0[i]; dst[j+1] = src1[i]; + dst[j+2] = src2[i]; dst[j+3] = src3[i]; + } + } +} + + +void merge8u(const uchar** src, uchar* dst, int len, int cn ) +{ + merge_(src, dst, len, cn); +} + +void merge16u(const ushort** src, ushort* dst, int len, int cn ) +{ + merge_(src, dst, len, cn); +} + +void merge32s(const int** src, int* dst, int len, int cn ) +{ + merge_(src, dst, len, cn); +} + +void merge64s(const int64** src, int64* dst, int len, int cn ) +{ + merge_(src, dst, len, cn); +} + +}} diff --git a/modules/hal/src/precomp.hpp b/modules/hal/src/precomp.hpp index 630565bec3851e0b5780c3c44d3fac006bb7a1c9..16586368e4aadaa7ff9691149190a82359804afe 100644 --- a/modules/hal/src/precomp.hpp +++ b/modules/hal/src/precomp.hpp @@ -47,3 +47,14 @@ #include #include #include +#include +#include + +#include "opencv2/hal/sse_utils.hpp" +#include "opencv2/hal/neon_utils.hpp" + +#if defined HAVE_IPP && (IPP_VERSION_X100 >= 700) +#define ARITHM_USE_IPP 1 +#else +#define ARITHM_USE_IPP 0 +#endif diff --git a/modules/hal/src/replacement.hpp b/modules/hal/src/replacement.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c8cc19224e5ea595e4b708ebeae8707d14a3c239 --- /dev/null +++ b/modules/hal/src/replacement.hpp @@ -0,0 +1,208 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. 
+// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_HAL_REPLACEMENT_HPP__ +#define __OPENCV_HAL_REPLACEMENT_HPP__ + +#include "opencv2/hal.hpp" + +inline int hal_t_add8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_add8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_add16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_add16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_add32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_add32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_add64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_sub64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int 
hal_t_max8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_max64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_min64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff16s(const short*, size_t, const short*, size_t, short*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff32s(const int*, size_t, const int*, size_t, int*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff32f(const float*, size_t, const float*, size_t, float*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_absdiff64f(const double*, size_t, const double*, size_t, double*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_and8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_or8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_xor8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_not8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int) { return cv::hal::Error::NotImplemented; } + +#define hal_add8u hal_t_add8u +#define hal_add8s hal_t_add8s +#define hal_add16u hal_t_add16u 
+#define hal_add16s hal_t_add16s +#define hal_add32s hal_t_add32s +#define hal_add32f hal_t_add32f +#define hal_add64f hal_t_add64f +#define hal_sub8u hal_t_sub8u +#define hal_sub8s hal_t_sub8s +#define hal_sub16u hal_t_sub16u +#define hal_sub16s hal_t_sub16s +#define hal_sub32s hal_t_sub32s +#define hal_sub32f hal_t_sub32f +#define hal_sub64f hal_t_sub64f +#define hal_max8u hal_t_max8u +#define hal_max8s hal_t_max8s +#define hal_max16u hal_t_max16u +#define hal_max16s hal_t_max16s +#define hal_max32s hal_t_max32s +#define hal_max32f hal_t_max32f +#define hal_max64f hal_t_max64f +#define hal_min8u hal_t_min8u +#define hal_min8s hal_t_min8s +#define hal_min16u hal_t_min16u +#define hal_min16s hal_t_min16s +#define hal_min32s hal_t_min32s +#define hal_min32f hal_t_min32f +#define hal_min64f hal_t_min64f +#define hal_absdiff8u hal_t_absdiff8u +#define hal_absdiff8s hal_t_absdiff8s +#define hal_absdiff16u hal_t_absdiff16u +#define hal_absdiff16s hal_t_absdiff16s +#define hal_absdiff32s hal_t_absdiff32s +#define hal_absdiff32f hal_t_absdiff32f +#define hal_absdiff64f hal_t_absdiff64f +#define hal_and8u hal_t_and8u +#define hal_or8u hal_t_or8u +#define hal_xor8u hal_t_xor8u +#define hal_not8u hal_t_not8u + +inline int hal_t_cmp8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_cmp8s(const schar*, size_t, const schar*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_cmp16u(const ushort*, size_t, const ushort*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_cmp16s(const short*, size_t, const short*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_cmp32s(const int*, size_t, const int*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_cmp32f(const float*, size_t, const float*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } +inline int hal_t_cmp64f(const double*, size_t, const double*, size_t, uchar*, size_t, int, int, int) { return cv::hal::Error::NotImplemented; } + +#define hal_cmp8u hal_t_cmp8u +#define hal_cmp8s hal_t_cmp8s +#define hal_cmp16u hal_t_cmp16u +#define hal_cmp16s hal_t_cmp16s +#define hal_cmp32s hal_t_cmp32s +#define hal_cmp32f hal_t_cmp32f +#define hal_cmp64f hal_t_cmp64f + +inline int hal_t_mul8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_mul8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_mul16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_mul16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_mul32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_mul32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_mul64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, 
int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_div64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } +inline int hal_t_recip64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, double) { return cv::hal::Error::NotImplemented; } + +#define hal_mul8u hal_t_mul8u +#define hal_mul8s hal_t_mul8s +#define hal_mul16u hal_t_mul16u +#define hal_mul16s hal_t_mul16s +#define hal_mul32s hal_t_mul32s +#define hal_mul32f hal_t_mul32f +#define hal_mul64f hal_t_mul64f +#define hal_div8u hal_t_div8u +#define hal_div8s hal_t_div8s +#define hal_div16u hal_t_div16u +#define hal_div16s hal_t_div16s +#define hal_div32s hal_t_div32s +#define hal_div32f hal_t_div32f +#define hal_div64f hal_t_div64f +#define hal_recip8u hal_t_recip8u +#define hal_recip8s hal_t_recip8s +#define hal_recip16u hal_t_recip16u +#define hal_recip16s hal_t_recip16s +#define hal_recip32s hal_t_recip32s +#define hal_recip32f hal_t_recip32f +#define hal_recip64f hal_t_recip64f + +inline int hal_t_addWeighted8u(const uchar*, size_t, const uchar*, size_t, uchar*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } +inline int hal_t_addWeighted8s(const schar*, size_t, const schar*, size_t, schar*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } +inline int hal_t_addWeighted16u(const ushort*, size_t, const ushort*, size_t, ushort*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } +inline int hal_t_addWeighted16s(const short*, size_t, const short*, size_t, short*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } +inline int hal_t_addWeighted32s(const int*, size_t, const int*, size_t, int*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } +inline int hal_t_addWeighted32f(const float*, size_t, const float*, size_t, float*, size_t, int, int, void*) { 
return cv::hal::Error::NotImplemented; } +inline int hal_t_addWeighted64f(const double*, size_t, const double*, size_t, double*, size_t, int, int, void*) { return cv::hal::Error::NotImplemented; } + +#define hal_addWeighted8u hal_t_addWeighted8u +#define hal_addWeighted8s hal_t_addWeighted8s +#define hal_addWeighted16u hal_t_addWeighted16u +#define hal_addWeighted16s hal_t_addWeighted16s +#define hal_addWeighted32s hal_t_addWeighted32s +#define hal_addWeighted32f hal_t_addWeighted32f +#define hal_addWeighted64f hal_t_addWeighted64f + +#include "custom_hal.hpp" + +#endif diff --git a/modules/hal/src/split.cpp b/modules/hal/src/split.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c31bf8cc44e9cfc2b3c7b73106c89ab8c4bf6bc9 --- /dev/null +++ b/modules/hal/src/split.cpp @@ -0,0 +1,424 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +// Copyright (C) 2014-2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "precomp.hpp" + +namespace cv { namespace hal { + +#if CV_NEON +template struct VSplit2; +template struct VSplit3; +template struct VSplit4; + +#define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name \ + { \ + void operator()(const data_type* src, data_type* dst0, \ + data_type* dst1) const \ + { \ + reg_type r = load_func(src); \ + store_func(dst0, r.val[0]); \ + store_func(dst1, r.val[1]); \ + } \ + } + +#define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name \ + { \ + void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ + data_type* dst2) const \ + { \ + reg_type r = load_func(src); \ + store_func(dst0, r.val[0]); \ + store_func(dst1, r.val[1]); \ + store_func(dst2, r.val[2]); \ + } \ + } + +#define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func) \ + template<> \ + struct name \ + { \ + void operator()(const data_type* src, data_type* dst0, data_type* dst1, \ + data_type* dst2, data_type* dst3) const \ + { \ + reg_type r = load_func(src); \ + store_func(dst0, r.val[0]); \ + store_func(dst1, r.val[1]); \ + store_func(dst2, r.val[2]); \ + store_func(dst3, r.val[3]); \ + } \ + } + +SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar , uint8x16x2_t, vld2q_u8 , vst1q_u8 ); +SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort, uint16x8x2_t, vld2q_u16, vst1q_u16); +SPLIT2_KERNEL_TEMPLATE(VSplit2, int , int32x4x2_t, vld2q_s32, vst1q_s32); +SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 , int64x1x2_t, vld2_s64 , vst1_s64 ); + +SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar , uint8x16x3_t, vld3q_u8 , vst1q_u8 ); +SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort, uint16x8x3_t, vld3q_u16, vst1q_u16); +SPLIT3_KERNEL_TEMPLATE(VSplit3, int , int32x4x3_t, vld3q_s32, vst1q_s32); +SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 , int64x1x3_t, vld3_s64 , vst1_s64 ); + +SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar , uint8x16x4_t, vld4q_u8 , vst1q_u8 ); +SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort, uint16x8x4_t, vld4q_u16, vst1q_u16); +SPLIT4_KERNEL_TEMPLATE(VSplit4, int , int32x4x4_t, vld4q_s32, vst1q_s32); +SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 , int64x1x4_t, vld4_s64 , vst1_s64 ); + +#elif CV_SSE2 + +template +struct VSplit2 +{ + VSplit2() : support(false) { } + void operator()(const T *, T *, T *) const { } + + bool support; +}; + +template +struct VSplit3 +{ + VSplit3() : support(false) { } + void operator()(const T *, T *, T *, T *) const { } + + bool support; +}; + +template +struct VSplit4 +{ + VSplit4() : support(false) { } + void operator()(const T *, T *, T *, T *, T *) const { } + + bool support; +}; + +#define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ +template <> \ +struct VSplit2 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VSplit2() \ + { \ + support = checkHardwareSupport(CV_CPU_SSE2); \ + } \ + \ + void operator()(const data_type * src, \ + data_type * dst0, data_type * dst1) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ + reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ + reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ + \ + _mm_deinterleave(v_src0, v_src1, v_src2, v_src3); \ + \ + _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ + 
_mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ + } \ + \ + bool support; \ +} + +#define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ +template <> \ +struct VSplit3 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VSplit3() \ + { \ + support = checkHardwareSupport(CV_CPU_SSE2); \ + } \ + \ + void operator()(const data_type * src, \ + data_type * dst0, data_type * dst1, data_type * dst2) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ + reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ + reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ + reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ + reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ + \ + _mm_deinterleave(v_src0, v_src1, v_src2, \ + v_src3, v_src4, v_src5); \ + \ + _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ + } \ + \ + bool support; \ +} + +#define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor) \ +template <> \ +struct VSplit4 \ +{ \ + enum \ + { \ + ELEMS_IN_VEC = 16 / sizeof(data_type) \ + }; \ + \ + VSplit4() \ + { \ + support = checkHardwareSupport(CV_CPU_SSE2); \ + } \ + \ + void operator()(const data_type * src, data_type * dst0, data_type * dst1, \ + data_type * dst2, data_type * dst3) const \ + { \ + reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src)); \ + reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC)); \ + reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \ + reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \ + reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \ + reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \ + reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \ + reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \ + \ + _mm_deinterleave(v_src0, v_src1, v_src2, v_src3, \ + v_src4, v_src5, v_src6, v_src7); \ + \ + _mm_storeu_##flavor((cast_type *)(dst0), v_src0); \ + _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1); \ + _mm_storeu_##flavor((cast_type *)(dst1), v_src2); \ + _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3); \ + _mm_storeu_##flavor((cast_type *)(dst2), v_src4); \ + _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5); \ + _mm_storeu_##flavor((cast_type *)(dst3), v_src6); \ + _mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7); \ + } \ + \ + bool support; \ +} + +SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); +SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); +SPLIT2_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); + +SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); 
+SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); +SPLIT3_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); + +SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128); +SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128); +SPLIT4_KERNEL_TEMPLATE( int, __m128, float, _mm_deinterleave_ps, ps); + +#endif + +template static void +split_( const T* src, T** dst, int len, int cn ) +{ + int k = cn % 4 ? cn % 4 : 4; + int i, j; + if( k == 1 ) + { + T* dst0 = dst[0]; + + if(cn == 1) + { + memcpy(dst0, src, len * sizeof(T)); + } + else + { + for( i = 0, j = 0 ; i < len; i++, j += cn ) + dst0[i] = src[j]; + } + } + else if( k == 2 ) + { + T *dst0 = dst[0], *dst1 = dst[1]; + i = j = 0; + +#if CV_NEON + if(cn == 2) + { + int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); + int inc_j = 2 * inc_i; + + VSplit2 vsplit; + for( ; i < len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i); + } +#elif CV_SSE2 + if (cn == 2) + { + int inc_i = 32/sizeof(T); + int inc_j = 2 * inc_i; + + VSplit2 vsplit; + if (vsplit.support) + { + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i); + } + } +#endif + for( ; i < len; i++, j += cn ) + { + dst0[i] = src[j]; + dst1[i] = src[j+1]; + } + } + else if( k == 3 ) + { + T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2]; + i = j = 0; + +#if CV_NEON + if(cn == 3) + { + int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T); + int inc_j = 3 * inc_i; + + VSplit3 vsplit; + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); + } +#elif CV_SSE2 + if (cn == 3) + { + int inc_i = 32/sizeof(T); + int inc_j = 3 * inc_i; + + VSplit3 vsplit; + + if (vsplit.support) + { + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i, dst2 + i); + } + } +#endif + for( ; i < len; i++, j += cn ) + { + dst0[i] = src[j]; + dst1[i] = src[j+1]; + dst2[i] = src[j+2]; + } + } + else + { + T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3]; + i = j = 0; + +#if CV_NEON + if(cn == 4) + { + int inc_i = (sizeof(T) == 8)? 
1: 16/sizeof(T); + int inc_j = 4 * inc_i; + + VSplit4 vsplit; + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); + } +#elif CV_SSE2 + if (cn == 4) + { + int inc_i = 32/sizeof(T); + int inc_j = 4 * inc_i; + + VSplit4 vsplit; + if (vsplit.support) + { + for( ; i <= len - inc_i; i += inc_i, j += inc_j) + vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i); + } + } +#endif + for( ; i < len; i++, j += cn ) + { + dst0[i] = src[j]; dst1[i] = src[j+1]; + dst2[i] = src[j+2]; dst3[i] = src[j+3]; + } + } + + for( ; k < cn; k += 4 ) + { + T *dst0 = dst[k], *dst1 = dst[k+1], *dst2 = dst[k+2], *dst3 = dst[k+3]; + for( i = 0, j = k; i < len; i++, j += cn ) + { + dst0[i] = src[j]; dst1[i] = src[j+1]; + dst2[i] = src[j+2]; dst3[i] = src[j+3]; + } + } +} + +void split8u(const uchar* src, uchar** dst, int len, int cn ) +{ + split_(src, dst, len, cn); +} + +void split16u(const ushort* src, ushort** dst, int len, int cn ) +{ + split_(src, dst, len, cn); +} + +void split32s(const int* src, int** dst, int len, int cn ) +{ + split_(src, dst, len, cn); +} + +void split64s(const int64* src, int64** dst, int len, int cn ) +{ + split_(src, dst, len, cn); +} + +}} diff --git a/modules/imgproc/src/precomp.hpp b/modules/imgproc/src/precomp.hpp index 7a0cece2f27c5b78810143f0f53d19409d239eeb..3bb8d8e760f6a2cf65c68c02dfdf52bf10231b82 100644 --- a/modules/imgproc/src/precomp.hpp +++ b/modules/imgproc/src/precomp.hpp @@ -94,4 +94,6 @@ extern const float icv8x32fSqrTab[]; #include "_geom.h" #include "filterengine.hpp" +#include "opencv2/hal/sse_utils.hpp" + #endif /*__OPENCV_CV_INTERNAL_H_*/