gpuarithm module for arithmetics operations on matrices

31c8b527 · Vladislav Vinogradov · 1b00a3ed · 31c8b527 · 31c8b527 · 31c8b527
63 changed file
--- a/modules/core/include/opencv2/core/cuda/emulation.hpp
+++ b/modules/core/include/opencv2/core/cuda/emulation.hpp
@@ -43,6 +43,7 @@
 #ifndef OPENCV_GPU_EMULATION_HPP_
 #define OPENCV_GPU_EMULATION_HPP_

+#include "common.hpp"
 #include "warp_reduce.hpp"

 namespace cv { namespace gpu { namespace cudev
@@ -131,8 +132,130 @@ namespace cv { namespace gpu { namespace cudev
                return ::atomicMin(address, val);
 #endif
            }
+        }; // struct cmem
+
+        struct glob
+        {
+            static __device__ __forceinline__ int atomicAdd(int* address, int val)
+            {
+                return ::atomicAdd(address, val);
+            }
+            static __device__ __forceinline__ unsigned int atomicAdd(unsigned int* address, unsigned int val)
+            {
+                return ::atomicAdd(address, val);
+            }
+            static __device__ __forceinline__ float atomicAdd(float* address, float val)
+            {
+            #if __CUDA_ARCH__ >= 200
+                return ::atomicAdd(address, val);
+            #else
+                int* address_as_i = (int*) address;
+                int old = *address_as_i, assumed;
+                do {
+                    assumed = old;
+                    old = ::atomicCAS(address_as_i, assumed,
+                        __float_as_int(val + __int_as_float(assumed)));
+                } while (assumed != old);
+                return __int_as_float(old);
+            #endif
+            }
+            static __device__ __forceinline__ double atomicAdd(double* address, double val)
+            {
+            #if __CUDA_ARCH__ >= 130
+                unsigned long long int* address_as_ull = (unsigned long long int*) address;
+                unsigned long long int old = *address_as_ull, assumed;
+                do {
+                    assumed = old;
+                    old = ::atomicCAS(address_as_ull, assumed,
+                        __double_as_longlong(val + __longlong_as_double(assumed)));
+                } while (assumed != old);
+                return __longlong_as_double(old);
+            #else
+                (void) address;
+                (void) val;
+                return 0.0;
+            #endif
+            }
+
+            static __device__ __forceinline__ int atomicMin(int* address, int val)
+            {
+                return ::atomicMin(address, val);
+            }
+            static __device__ __forceinline__ float atomicMin(float* address, float val)
+            {
+            #if __CUDA_ARCH__ >= 120
+                int* address_as_i = (int*) address;
+                int old = *address_as_i, assumed;
+                do {
+                    assumed = old;
+                    old = ::atomicCAS(address_as_i, assumed,
+                        __float_as_int(::fminf(val, __int_as_float(assumed))));
+                } while (assumed != old);
+                return __int_as_float(old);
+            #else
+                (void) address;
+                (void) val;
+                return 0.0f;
+            #endif
+            }
+            static __device__ __forceinline__ double atomicMin(double* address, double val)
+            {
+            #if __CUDA_ARCH__ >= 130
+                unsigned long long int* address_as_ull = (unsigned long long int*) address;
+                unsigned long long int old = *address_as_ull, assumed;
+                do {
+                    assumed = old;
+                    old = ::atomicCAS(address_as_ull, assumed,
+                        __double_as_longlong(::fmin(val, __longlong_as_double(assumed))));
+                } while (assumed != old);
+                return __longlong_as_double(old);
+            #else
+                (void) address;
+                (void) val;
+                return 0.0;
+            #endif
+            }
+
+            static __device__ __forceinline__ int atomicMax(int* address, int val)
+            {
+                return ::atomicMax(address, val);
+            }
+            static __device__ __forceinline__ float atomicMax(float* address, float val)
+            {
+            #if __CUDA_ARCH__ >= 120
+                int* address_as_i = (int*) address;
+                int old = *address_as_i, assumed;
+                do {
+                    assumed = old;
+                    old = ::atomicCAS(address_as_i, assumed,
+                        __float_as_int(::fmaxf(val, __int_as_float(assumed))));
+                } while (assumed != old);
+                return __int_as_float(old);
+            #else
+                (void) address;
+                (void) val;
+                return 0.0f;
+            #endif
+            }
+            static __device__ __forceinline__ double atomicMax(double* address, double val)
+            {
+            #if __CUDA_ARCH__ >= 130
+                unsigned long long int* address_as_ull = (unsigned long long int*) address;
+                unsigned long long int old = *address_as_ull, assumed;
+                do {
+                    assumed = old;
+                    old = ::atomicCAS(address_as_ull, assumed,
+                        __double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
+                } while (assumed != old);
+                return __longlong_as_double(old);
+            #else
+                (void) address;
+                (void) val;
+                return 0.0;
+            #endif
+            }
        };
-    };
+    }; //struct Emulation
 }}} // namespace cv { namespace gpu { namespace cudev

 #endif /* OPENCV_GPU_EMULATION_HPP_ */
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -3,7 +3,7 @@ if(ANDROID OR IOS)
 endif()

 set(the_description "GPU-accelerated Computer Vision")
-ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy)
+ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy opencv_gpuarithm)

 ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda")

@@ -58,10 +58,6 @@ if(HAVE_CUDA)
    CUDA_ADD_CUFFT_TO_TARGET(${the_module})
  endif()

-  if(HAVE_CUBLAS)
-    CUDA_ADD_CUBLAS_TO_TARGET(${the_module})
-  endif()
-
  install(FILES src/nvidia/NPP_staging/NPP_staging.hpp  src/nvidia/core/NCV.hpp
    DESTINATION ${OPENCV_INCLUDE_INSTALL_PATH}/opencv2/${name}
    COMPONENT main)

--- a/modules/gpu/doc/gpu.rst
+++ b/modules/gpu/doc/gpu.rst
@@ -8,10 +8,7 @@ gpu. GPU-accelerated Computer Vision
    introduction
    initalization_and_information
    data_structures
-    operations_on_matrices
-    per_element_operations
    image_processing
-    matrix_reductions
    object_detection
    feature_detection_and_description
    image_filtering

--- a/modules/gpu/doc/image_processing.rst
+++ b/modules/gpu/doc/image_processing.rst
@@ -414,28 +414,6 @@ The methods support arbitrary permutations of the original channels, including r



-gpu::threshold
------------------
-Applies a fixed-level threshold to each array element.
-
-.. ocv:function:: double gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null())
-
-    :param src: Source array (single-channel).
-
-    :param dst: Destination array with the same size and type as  ``src`` .
-
-    :param thresh: Threshold value.
-
-    :param maxval: Maximum value to use with  ``THRESH_BINARY`` and  ``THRESH_BINARY_INV`` threshold types.
-
-    :param type: Threshold type. For details, see  :ocv:func:`threshold` . The ``THRESH_OTSU`` threshold type is not supported.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`threshold`
-
-
-
 gpu::resize
 ---------------
 Resizes an image.

--- a/modules/gpu/include/opencv2/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu.hpp
@@ -50,6 +50,7 @@
 #endif

 #include "opencv2/core/gpumat.hpp"
+#include "opencv2/gpuarithm.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/objdetect.hpp"
 #include "opencv2/features2d.hpp"
@@ -269,182 +270,8 @@ CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat&
 //! supports only ksize = 1 and ksize = 3
 CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());

+////////////////////////////// Image processing //////////////////////////////

-////////////////////////////// Arithmetics ///////////////////////////////////
-
-//! implements generalized matrix product algorithm GEMM from BLAS
-CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha,
-    const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null());
-
-//! transposes the matrix
-//! supports matrix with element size = 1, 4 and 8 bytes (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc)
-CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! reverses the order of the rows, columns or both in a matrix
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U, CV_32S or CV_32F depth
-CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode, Stream& stream = Stream::Null());
-
-//! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
-//! destination array will have the depth type as lut and the same channels number as source
-//! supports CV_8UC1, CV_8UC3 types
-CV_EXPORTS void LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! makes multi-channel array out of several single-channel arrays
-CV_EXPORTS void merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! makes multi-channel array out of several single-channel arrays
-CV_EXPORTS void merge(const std::vector<GpuMat>& src, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! copies each plane of a multi-channel array to a dedicated array
-CV_EXPORTS void split(const GpuMat& src, GpuMat* dst, Stream& stream = Stream::Null());
-
-//! copies each plane of a multi-channel array to a dedicated array
-CV_EXPORTS void split(const GpuMat& src, std::vector<GpuMat>& dst, Stream& stream = Stream::Null());
-
-//! computes magnitude of complex (x(i).re, x(i).im) vector
-//! supports only CV_32FC2 type
-CV_EXPORTS void magnitude(const GpuMat& xy, GpuMat& magnitude, Stream& stream = Stream::Null());
-
-//! computes squared magnitude of complex (x(i).re, x(i).im) vector
-//! supports only CV_32FC2 type
-CV_EXPORTS void magnitudeSqr(const GpuMat& xy, GpuMat& magnitude, Stream& stream = Stream::Null());
-
-//! computes magnitude of each (x(i), y(i)) vector
-//! supports only floating-point source
-CV_EXPORTS void magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null());
-
-//! computes squared magnitude of each (x(i), y(i)) vector
-//! supports only floating-point source
-CV_EXPORTS void magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null());
-
-//! computes angle (angle(i)) of each (x(i), y(i)) vector
-//! supports only floating-point source
-CV_EXPORTS void phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null());
-
-//! converts Cartesian coordinates to polar
-//! supports only floating-point source
-CV_EXPORTS void cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null());
-
-//! converts polar coordinates to Cartesian
-//! supports only floating-point source
-CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees = false, Stream& stream = Stream::Null());
-
-//! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values
-CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0,
-                          int norm_type = NORM_L2, int dtype = -1, const GpuMat& mask = GpuMat());
-CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double a, double b,
-                          int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf);
-
-
-//////////////////////////// Per-element operations ////////////////////////////////////
-
-//! adds one matrix to another (c = a + b)
-CV_EXPORTS void add(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
-//! adds scalar to a matrix (c = a + s)
-CV_EXPORTS void add(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
-
-//! subtracts one matrix from another (c = a - b)
-CV_EXPORTS void subtract(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
-//! subtracts scalar from a matrix (c = a - s)
-CV_EXPORTS void subtract(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
-
-//! computes element-wise weighted product of the two arrays (c = scale * a * b)
-CV_EXPORTS void multiply(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
-//! weighted multiplies matrix to a scalar (c = scale * a * s)
-CV_EXPORTS void multiply(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
-
-//! computes element-wise weighted quotient of the two arrays (c = a / b)
-CV_EXPORTS void divide(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
-//! computes element-wise weighted quotient of matrix and scalar (c = a / s)
-CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
-//! computes element-wise weighted reciprocal of an array (dst = scale/src2)
-CV_EXPORTS void divide(double scale, const GpuMat& b, GpuMat& c, int dtype = -1, Stream& stream = Stream::Null());
-
-//! computes the weighted sum of two arrays (dst = alpha*src1 + beta*src2 + gamma)
-CV_EXPORTS void addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst,
-                            int dtype = -1, Stream& stream = Stream::Null());
-
-//! adds scaled array to another one (dst = alpha*src1 + src2)
-static inline void scaleAdd(const GpuMat& src1, double alpha, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null())
-{
-    addWeighted(src1, alpha, src2, 1.0, 0.0, dst, -1, stream);
-}
-
-//! computes element-wise absolute difference of two arrays (c = abs(a - b))
-CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null());
-//! computes element-wise absolute difference of array and scalar (c = abs(a - s))
-CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream = Stream::Null());
-
-//! computes absolute value of each matrix element
-//! supports CV_16S and CV_32F depth
-CV_EXPORTS void abs(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes square of each pixel in an image
-//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
-CV_EXPORTS void sqr(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes square root of each pixel in an image
-//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
-CV_EXPORTS void sqrt(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes exponent of each matrix element (b = e**a)
-//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
-CV_EXPORTS void exp(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
-
-//! computes natural logarithm of absolute value of each matrix element: b = log(abs(a))
-//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
-CV_EXPORTS void log(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
-
-//! computes power of each matrix element:
-//    (dst(i,j) = pow(     src(i,j) , power), if src.type() is integer
-//    (dst(i,j) = pow(fabs(src(i,j)), power), otherwise
-//! supports all, except depth == CV_64F
-CV_EXPORTS void pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! compares elements of two arrays (c = a <cmpop> b)
-CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
-CV_EXPORTS void compare(const GpuMat& a, Scalar sc, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
-
-//! performs per-elements bit-wise inversion
-CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
-
-//! calculates per-element bit-wise disjunction of two arrays
-CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
-//! calculates per-element bit-wise disjunction of array and scalar
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
-CV_EXPORTS void bitwise_or(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! calculates per-element bit-wise conjunction of two arrays
-CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
-//! calculates per-element bit-wise conjunction of array and scalar
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
-CV_EXPORTS void bitwise_and(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! calculates per-element bit-wise "exclusive or" operation
-CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
-//! calculates per-element bit-wise "exclusive or" of array and scalar
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
-CV_EXPORTS void bitwise_xor(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! pixel by pixel right shift of an image by a constant value
-//! supports 1, 3 and 4 channels images with integers elements
-CV_EXPORTS void rshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! pixel by pixel left shift of an image by a constant value
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
-CV_EXPORTS void lshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element minimum of two arrays (dst = min(src1, src2))
-CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element minimum of array and scalar (dst = min(src1, src2))
-CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element maximum of two arrays (dst = max(src1, src2))
-CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element maximum of array and scalar (dst = max(src1, src2))
-CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());

 enum { ALPHA_OVER, ALPHA_IN, ALPHA_OUT, ALPHA_ATOP, ALPHA_XOR, ALPHA_PLUS, ALPHA_OVER_PREMUL, ALPHA_IN_PREMUL, ALPHA_OUT_PREMUL,
       ALPHA_ATOP_PREMUL, ALPHA_XOR_PREMUL, ALPHA_PLUS_PREMUL, ALPHA_PREMUL};
@@ -453,9 +280,6 @@ enum { ALPHA_OVER, ALPHA_IN, ALPHA_OUT, ALPHA_ATOP, ALPHA_XOR, ALPHA_PLUS, ALPHA
 //! Supports CV_8UC4, CV_16UC4, CV_32SC4 and CV_32FC4 types
 CV_EXPORTS void alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int alpha_op, Stream& stream = Stream::Null());

-
-////////////////////////////// Image processing //////////////////////////////
-
 //! DST[x,y] = SRC[xmap[x,y],ymap[x,y]]
 //! supports only CV_32FC1 map type
 CV_EXPORTS void remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap,
@@ -521,9 +345,6 @@ CV_EXPORTS void swapChannels(GpuMat& image, const int dstOrder[4], Stream& strea
 //! Routines for correcting image color gamma
 CV_EXPORTS void gammaCorrection(const GpuMat& src, GpuMat& dst, bool forward = true, Stream& stream = Stream::Null());

-//! applies fixed threshold to the image
-CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());
-
 //! resizes the image
 //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA
 CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
@@ -794,62 +615,6 @@ private:
    CannyBuf cannyBuf_;
 };

-////////////////////////////// Matrix reductions //////////////////////////////
-
-//! computes mean value and standard deviation of all or selected array elements
-//! supports only CV_8UC1 type
-CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev);
-//! buffered version
-CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev, GpuMat& buf);
-
-//! computes norm of array
-//! supports NORM_INF, NORM_L1, NORM_L2
-//! supports all matrices except 64F
-CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2);
-CV_EXPORTS double norm(const GpuMat& src1, int normType, GpuMat& buf);
-CV_EXPORTS double norm(const GpuMat& src1, int normType, const GpuMat& mask, GpuMat& buf);
-
-//! computes norm of the difference between two arrays
-//! supports NORM_INF, NORM_L1, NORM_L2
-//! supports only CV_8UC1 type
-CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2);
-
-//! computes sum of array elements
-//! supports only single channel images
-CV_EXPORTS Scalar sum(const GpuMat& src);
-CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);
-CV_EXPORTS Scalar sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
-
-//! computes sum of array elements absolute values
-//! supports only single channel images
-CV_EXPORTS Scalar absSum(const GpuMat& src);
-CV_EXPORTS Scalar absSum(const GpuMat& src, GpuMat& buf);
-CV_EXPORTS Scalar absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
-
-//! computes squared sum of array elements
-//! supports only single channel images
-CV_EXPORTS Scalar sqrSum(const GpuMat& src);
-CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);
-CV_EXPORTS Scalar sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
-
-//! finds global minimum and maximum array elements and returns their values
-CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());
-CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf);
-
-//! finds global minimum and maximum array elements and returns their values with locations
-CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,
-                          const GpuMat& mask=GpuMat());
-CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
-                          const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf);
-
-//! counts non-zero array elements
-CV_EXPORTS int countNonZero(const GpuMat& src);
-CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);
-
-//! reduces a matrix to a vector
-CV_EXPORTS void reduce(const GpuMat& mtx, GpuMat& vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null());
-
-
 ///////////////////////////// Calibration 3D //////////////////////////////////

 CV_EXPORTS void transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,

--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
--- a/modules/gpu/src/cuda/safe_call.hpp
+++ b/modules/gpu/src/cuda/safe_call.hpp
@@ -45,24 +45,20 @@

 #include <cuda_runtime_api.h>
 #include <cufft.h>
-#include <cublas.h>
 #include "NCV.hpp"

 #if defined(__GNUC__)
    #define ncvSafeCall(expr)  ___ncvSafeCall(expr, __FILE__, __LINE__, __func__)
    #define cufftSafeCall(expr)  ___cufftSafeCall(expr, __FILE__, __LINE__, __func__)
-    #define cublasSafeCall(expr)  ___cublasSafeCall(expr, __FILE__, __LINE__, __func__)
 #else /* defined(__CUDACC__) || defined(__MSVC__) */
    #define ncvSafeCall(expr)  ___ncvSafeCall(expr, __FILE__, __LINE__)
    #define cufftSafeCall(expr)  ___cufftSafeCall(expr, __FILE__, __LINE__)
-    #define cublasSafeCall(expr)  ___cublasSafeCall(expr, __FILE__, __LINE__)
 #endif

 namespace cv { namespace gpu
 {
    void ncvError(int err, const char *file, const int line, const char *func = "");
    void cufftError(int err, const char *file, const int line, const char *func = "");
-    void cublasError(int err, const char *file, const int line, const char *func = "");
 }}

 static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
@@ -77,10 +73,4 @@ static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const i
        cv::gpu::cufftError(err, file, line, func);
 }

-static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")
-{
-    if (CUBLAS_STATUS_SUCCESS != err)
-        cv::gpu::cublasError(err, file, line, func);
-}
-
 #endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */
--- a/modules/gpu/src/error.cpp
+++ b/modules/gpu/src/error.cpp
@@ -142,23 +142,6 @@ namespace
    };

    const int cufft_error_num = sizeof(cufft_errors) / sizeof(cufft_errors[0]);
-
-    //////////////////////////////////////////////////////////////////////////
-    // CUBLAS errors
-
-    const ErrorEntry cublas_errors[] =
-    {
-        error_entry( CUBLAS_STATUS_SUCCESS ),
-        error_entry( CUBLAS_STATUS_NOT_INITIALIZED ),
-        error_entry( CUBLAS_STATUS_ALLOC_FAILED ),
-        error_entry( CUBLAS_STATUS_INVALID_VALUE ),
-        error_entry( CUBLAS_STATUS_ARCH_MISMATCH ),
-        error_entry( CUBLAS_STATUS_MAPPING_ERROR ),
-        error_entry( CUBLAS_STATUS_EXECUTION_FAILED ),
-        error_entry( CUBLAS_STATUS_INTERNAL_ERROR )
-    };
-
-    const int cublas_error_num = sizeof(cublas_errors) / sizeof(cublas_errors[0]);
 }

 namespace cv
@@ -176,12 +159,6 @@ namespace cv
            String msg = getErrorString(code, cufft_errors, cufft_error_num);
            cv::error(cv::Error::GpuApiCallError, msg, func, file, line);
        }
-
-        void cublasError(int code, const char* file, const int line, const char* func)
-        {
-            String msg = getErrorString(code, cublas_errors, cublas_error_num);
-            cv::error(cv::Error::GpuApiCallError, msg, func, file, line);
-        }
    }
 }


--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -92,6 +92,7 @@ void cv::gpu::Canny(const GpuMat&, const GpuMat&, CannyBuf&, GpuMat&, double, do
 void cv::gpu::CannyBuf::create(const Size&, int) { throw_no_cuda(); }
 void cv::gpu::CannyBuf::release() { throw_no_cuda(); }
 cv::Ptr<cv::gpu::CLAHE> cv::gpu::createCLAHE(double, cv::Size) { throw_no_cuda(); return cv::Ptr<cv::gpu::CLAHE>(); }
+void cv::gpu::alphaComp(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }

 #else /* !defined (HAVE_CUDA) */

@@ -1672,4 +1673,77 @@ cv::Ptr<cv::gpu::CLAHE> cv::gpu::createCLAHE(double clipLimit, cv::Size tileGrid
    return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
 }

+////////////////////////////////////////////////////////////////////////
+// alphaComp
+
+namespace
+{
+    template <int DEPTH> struct NppAlphaCompFunc
+    {
+        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+
+        typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pSrc2, int nSrc2Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, NppiAlphaOp eAlphaOp);
+    };
+
+    template <int DEPTH, typename NppAlphaCompFunc<DEPTH>::func_t func> struct NppAlphaComp
+    {
+        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+
+        static void call(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, NppiAlphaOp eAlphaOp, cudaStream_t stream)
+        {
+            NppStreamHandler h(stream);
+
+            NppiSize oSizeROI;
+            oSizeROI.width = img1.cols;
+            oSizeROI.height = img2.rows;
+
+            nppSafeCall( func(img1.ptr<npp_t>(), static_cast<int>(img1.step), img2.ptr<npp_t>(), static_cast<int>(img2.step),
+                              dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI, eAlphaOp) );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+}
+
+void cv::gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int alpha_op, Stream& stream)
+{
+    static const NppiAlphaOp npp_alpha_ops[] = {
+        NPPI_OP_ALPHA_OVER,
+        NPPI_OP_ALPHA_IN,
+        NPPI_OP_ALPHA_OUT,
+        NPPI_OP_ALPHA_ATOP,
+        NPPI_OP_ALPHA_XOR,
+        NPPI_OP_ALPHA_PLUS,
+        NPPI_OP_ALPHA_OVER_PREMUL,
+        NPPI_OP_ALPHA_IN_PREMUL,
+        NPPI_OP_ALPHA_OUT_PREMUL,
+        NPPI_OP_ALPHA_ATOP_PREMUL,
+        NPPI_OP_ALPHA_XOR_PREMUL,
+        NPPI_OP_ALPHA_PLUS_PREMUL,
+        NPPI_OP_ALPHA_PREMUL
+    };
+
+    typedef void (*func_t)(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, NppiAlphaOp eAlphaOp, cudaStream_t stream);
+
+    static const func_t funcs[] =
+    {
+        NppAlphaComp<CV_8U, nppiAlphaComp_8u_AC4R>::call,
+        0,
+        NppAlphaComp<CV_16U, nppiAlphaComp_16u_AC4R>::call,
+        0,
+        NppAlphaComp<CV_32S, nppiAlphaComp_32s_AC4R>::call,
+        NppAlphaComp<CV_32F, nppiAlphaComp_32f_AC4R>::call
+    };
+
+    CV_Assert( img1.type() == CV_8UC4 || img1.type() == CV_16UC4 || img1.type() == CV_32SC4 || img1.type() == CV_32FC4 );
+    CV_Assert( img1.size() == img2.size() && img1.type() == img2.type() );
+
+    dst.create(img1.size(), img1.type());
+
+    const func_t func = funcs[img1.depth()];
+
+    func(img1, img2, dst, npp_alpha_ops[alpha_op], StreamAccessor::getStream(stream));
+}
+
 #endif /* !defined (HAVE_CUDA) */
--- a/modules/gpu/src/precomp.hpp
+++ b/modules/gpu/src/precomp.hpp
@@ -76,10 +76,6 @@
        #include <cufft.h>
    #endif

-    #ifdef HAVE_CUBLAS
-        #include <cublas.h>
-    #endif
-
    #include "internal_shared.hpp"
    #include "opencv2/core/stream_accessor.hpp"


--- a/modules/gpuarithm/CMakeLists.txt
+++ b/modules/gpuarithm/CMakeLists.txt
+if(ANDROID OR IOS)
+  ocv_module_disable(gpuarithm)
+endif()
+
+set(the_description "GPU-accelerated Operations on Matrices")
+
+ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations)
+
+ocv_define_module(gpuarithm opencv_core)
+
+if(HAVE_CUBLAS)
+  CUDA_ADD_CUBLAS_TO_TARGET(${the_module})
+endif()
--- a/modules/gpuarithm/doc/gpuarithm.rst
+++ b/modules/gpuarithm/doc/gpuarithm.rst
+*******************************************
+gpu. GPU-accelerated Operations on Matrices
+*******************************************
+
+.. toctree::
+    :maxdepth: 1
+
+    operations_on_matrices
+    per_element_operations
+    matrix_reductions
--- a/modules/gpu/doc/matrix_reductions.rst
+++ b/modules/gpu/doc/matrix_reductions.rst
--- a/modules/gpu/doc/operations_on_matrices.rst
+++ b/modules/gpu/doc/operations_on_matrices.rst
--- a/modules/gpu/doc/per_element_operations.rst
+++ b/modules/gpu/doc/per_element_operations.rst
@@ -443,3 +443,25 @@ Computes the per-element maximum of two matrices (or a matrix and a scalar).
    :param stream: Stream for the asynchronous version.

 .. seealso:: :ocv:func:`max`
+
+
+
+gpu::threshold
+------------------
+Applies a fixed-level threshold to each array element.
+
+.. ocv:function:: double gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null())
+
+    :param src: Source array (single-channel).
+
+    :param dst: Destination array with the same size and type as  ``src`` .
+
+    :param thresh: Threshold value.
+
+    :param maxval: Maximum value to use with  ``THRESH_BINARY`` and  ``THRESH_BINARY_INV`` threshold types.
+
+    :param type: Threshold type. For details, see  :ocv:func:`threshold` . The ``THRESH_OTSU`` threshold type is not supported.
+
+    :param stream: Stream for the asynchronous version.
+
+.. seealso:: :ocv:func:`threshold`
--- a/modules/gpuarithm/include/opencv2/gpuarithm.hpp
+++ b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPUARITHM_HPP__
+#define __OPENCV_GPUARITHM_HPP__
+
+#include "opencv2/core/gpumat.hpp"
+
+namespace cv { namespace gpu {
+
+//! adds one matrix to another (c = a + b)
+CV_EXPORTS void add(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
+//! adds scalar to a matrix (c = a + s)
+CV_EXPORTS void add(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
+
+//! subtracts one matrix from another (c = a - b)
+CV_EXPORTS void subtract(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
+//! subtracts scalar from a matrix (c = a - s)
+CV_EXPORTS void subtract(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
+
+//! computes element-wise weighted product of the two arrays (c = scale * a * b)
+CV_EXPORTS void multiply(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
+//! weighted multiplies matrix to a scalar (c = scale * a * s)
+CV_EXPORTS void multiply(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
+
+//! computes element-wise weighted quotient of the two arrays (c = a / b)
+CV_EXPORTS void divide(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
+//! computes element-wise weighted quotient of matrix and scalar (c = a / s)
+CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
+//! computes element-wise weighted reciprocal of an array (dst = scale/src2)
+CV_EXPORTS void divide(double scale, const GpuMat& b, GpuMat& c, int dtype = -1, Stream& stream = Stream::Null());
+
+//! computes the weighted sum of two arrays (dst = alpha*src1 + beta*src2 + gamma)
+CV_EXPORTS void addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst,
+                            int dtype = -1, Stream& stream = Stream::Null());
+
+//! adds scaled array to another one (dst = alpha*src1 + src2)
+static inline void scaleAdd(const GpuMat& src1, double alpha, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null())
+{
+    addWeighted(src1, alpha, src2, 1.0, 0.0, dst, -1, stream);
+}
+
+//! computes element-wise absolute difference of two arrays (c = abs(a - b))
+CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null());
+//! computes element-wise absolute difference of array and scalar (c = abs(a - s))
+CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream = Stream::Null());
+
+//! computes absolute value of each matrix element
+//! supports CV_16S and CV_32F depth
+CV_EXPORTS void abs(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! computes square of each pixel in an image
+//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
+CV_EXPORTS void sqr(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! computes square root of each pixel in an image
+//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
+CV_EXPORTS void sqrt(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! computes exponent of each matrix element (b = e**a)
+//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
+CV_EXPORTS void exp(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
+
+//! computes natural logarithm of absolute value of each matrix element: b = log(abs(a))
+//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
+CV_EXPORTS void log(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
+
+//! computes power of each matrix element:
+//    (dst(i,j) = pow(     src(i,j) , power), if src.type() is integer
+//    (dst(i,j) = pow(fabs(src(i,j)), power), otherwise
+//! supports all, except depth == CV_64F
+CV_EXPORTS void pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! compares elements of two arrays (c = a <cmpop> b)
+CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
+CV_EXPORTS void compare(const GpuMat& a, Scalar sc, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
+
+//! performs per-elements bit-wise inversion
+CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
+
+//! calculates per-element bit-wise disjunction of two arrays
+CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
+//! calculates per-element bit-wise disjunction of array and scalar
+//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
+CV_EXPORTS void bitwise_or(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! calculates per-element bit-wise conjunction of two arrays
+CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
+//! calculates per-element bit-wise conjunction of array and scalar
+//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
+CV_EXPORTS void bitwise_and(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! calculates per-element bit-wise "exclusive or" operation
+CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
+//! calculates per-element bit-wise "exclusive or" of array and scalar
+//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
+CV_EXPORTS void bitwise_xor(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! pixel by pixel right shift of an image by a constant value
+//! supports 1, 3 and 4 channels images with integers elements
+CV_EXPORTS void rshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! pixel by pixel left shift of an image by a constant value
+//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
+CV_EXPORTS void lshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! computes per-element minimum of two arrays (dst = min(src1, src2))
+CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! computes per-element minimum of array and scalar (dst = min(src1, src2))
+CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! computes per-element maximum of two arrays (dst = max(src1, src2))
+CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! computes per-element maximum of array and scalar (dst = max(src1, src2))
+CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! implements generalized matrix product algorithm GEMM from BLAS
+CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha,
+    const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null());
+
+//! transposes the matrix
+//! supports matrix with element size = 1, 4 and 8 bytes (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc)
+CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! reverses the order of the rows, columns or both in a matrix
+//! supports 1, 3 and 4 channels images with CV_8U, CV_16U, CV_32S or CV_32F depth
+CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode, Stream& stream = Stream::Null());
+
+//! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
+//! destination array will have the depth type as lut and the same channels number as source
+//! supports CV_8UC1, CV_8UC3 types
+CV_EXPORTS void LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! makes multi-channel array out of several single-channel arrays
+CV_EXPORTS void merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! makes multi-channel array out of several single-channel arrays
+CV_EXPORTS void merge(const std::vector<GpuMat>& src, GpuMat& dst, Stream& stream = Stream::Null());
+
+//! copies each plane of a multi-channel array to a dedicated array
+CV_EXPORTS void split(const GpuMat& src, GpuMat* dst, Stream& stream = Stream::Null());
+
+//! copies each plane of a multi-channel array to a dedicated array
+CV_EXPORTS void split(const GpuMat& src, std::vector<GpuMat>& dst, Stream& stream = Stream::Null());
+
+//! computes magnitude of complex (x(i).re, x(i).im) vector
+//! supports only CV_32FC2 type
+CV_EXPORTS void magnitude(const GpuMat& xy, GpuMat& magnitude, Stream& stream = Stream::Null());
+
+//! computes squared magnitude of complex (x(i).re, x(i).im) vector
+//! supports only CV_32FC2 type
+CV_EXPORTS void magnitudeSqr(const GpuMat& xy, GpuMat& magnitude, Stream& stream = Stream::Null());
+
+//! computes magnitude of each (x(i), y(i)) vector
+//! supports only floating-point source
+CV_EXPORTS void magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null());
+
+//! computes squared magnitude of each (x(i), y(i)) vector
+//! supports only floating-point source
+CV_EXPORTS void magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null());
+
+//! computes angle (angle(i)) of each (x(i), y(i)) vector
+//! supports only floating-point source
+CV_EXPORTS void phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null());
+
+//! converts Cartesian coordinates to polar
+//! supports only floating-point source
+CV_EXPORTS void cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null());
+
+//! converts polar coordinates to Cartesian
+//! supports only floating-point source
+CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees = false, Stream& stream = Stream::Null());
+
+//! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values
+CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0,
+                          int norm_type = NORM_L2, int dtype = -1, const GpuMat& mask = GpuMat());
+CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double a, double b,
+                          int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf);
+
+//! computes mean value and standard deviation of all or selected array elements
+//! supports only CV_8UC1 type
+CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev);
+//! buffered version
+CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev, GpuMat& buf);
+
+//! computes norm of array
+//! supports NORM_INF, NORM_L1, NORM_L2
+//! supports all matrices except 64F
+CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2);
+CV_EXPORTS double norm(const GpuMat& src1, int normType, GpuMat& buf);
+CV_EXPORTS double norm(const GpuMat& src1, int normType, const GpuMat& mask, GpuMat& buf);
+
+//! computes norm of the difference between two arrays
+//! supports NORM_INF, NORM_L1, NORM_L2
+//! supports only CV_8UC1 type
+CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2);
+
+//! computes sum of array elements
+//! supports only single channel images
+CV_EXPORTS Scalar sum(const GpuMat& src);
+CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);
+CV_EXPORTS Scalar sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
+
+//! computes sum of array elements absolute values
+//! supports only single channel images
+CV_EXPORTS Scalar absSum(const GpuMat& src);
+CV_EXPORTS Scalar absSum(const GpuMat& src, GpuMat& buf);
+CV_EXPORTS Scalar absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
+
+//! computes squared sum of array elements
+//! supports only single channel images
+CV_EXPORTS Scalar sqrSum(const GpuMat& src);
+CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);
+CV_EXPORTS Scalar sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
+
+//! finds global minimum and maximum array elements and returns their values
+CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());
+CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf);
+
+//! finds global minimum and maximum array elements and returns their values with locations
+CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,
+                          const GpuMat& mask=GpuMat());
+CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
+                          const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf);
+
+//! counts non-zero array elements
+CV_EXPORTS int countNonZero(const GpuMat& src);
+CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);
+
+//! reduces a matrix to a vector
+CV_EXPORTS void reduce(const GpuMat& mtx, GpuMat& vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null());
+
+//! applies fixed threshold to the image
+CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());
+
+}} // namespace cv { namespace gpu {
+
+#endif /* __OPENCV_GPUARITHM_HPP__ */
--- a/modules/gpu/perf/perf_core.cpp
+++ b/modules/gpu/perf/perf_core.cpp
--- a/modules/gpuarithm/perf/perf_main.cpp
+++ b/modules/gpuarithm/perf/perf_main.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+
+using namespace perf;
+
+CV_PERF_TEST_MAIN(gpuarithm, printCudaInfo())
--- a/modules/gpuarithm/perf/perf_precomp.cpp
+++ b/modules/gpuarithm/perf/perf_precomp.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
--- a/modules/gpuarithm/perf/perf_precomp.hpp
+++ b/modules/gpuarithm/perf/perf_precomp.hpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifdef __GNUC__
+#  pragma GCC diagnostic ignored "-Wmissing-declarations"
+#  if defined __clang__ || defined __APPLE__
+#    pragma GCC diagnostic ignored "-Wmissing-prototypes"
+#    pragma GCC diagnostic ignored "-Wextra"
+#  endif
+#endif
+
+#ifndef __OPENCV_PERF_PRECOMP_HPP__
+#define __OPENCV_PERF_PRECOMP_HPP__
+
+#include "opencv2/ts.hpp"
+#include "opencv2/ts/gpu_perf.hpp"
+
+#include "opencv2/core.hpp"
+#include "opencv2/gpuarithm.hpp"
+
+#ifdef GTEST_CREATE_SHARED_LIBRARY
+#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined
+#endif
+
+#endif
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -66,6 +66,61 @@ void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const
 ////////////////////////////////////////////////////////////////////////
 // gemm

+#ifdef HAVE_CUBLAS
+
+namespace
+{
+    #define error_entry(entry)  { entry, #entry }
+
+    struct ErrorEntry
+    {
+        int code;
+        const char* str;
+    };
+
+    struct ErrorEntryComparer
+    {
+        int code;
+        ErrorEntryComparer(int code_) : code(code_) {}
+        bool operator()(const ErrorEntry& e) const { return e.code == code; }
+    };
+
+    const ErrorEntry cublas_errors[] =
+    {
+        error_entry( CUBLAS_STATUS_SUCCESS ),
+        error_entry( CUBLAS_STATUS_NOT_INITIALIZED ),
+        error_entry( CUBLAS_STATUS_ALLOC_FAILED ),
+        error_entry( CUBLAS_STATUS_INVALID_VALUE ),
+        error_entry( CUBLAS_STATUS_ARCH_MISMATCH ),
+        error_entry( CUBLAS_STATUS_MAPPING_ERROR ),
+        error_entry( CUBLAS_STATUS_EXECUTION_FAILED ),
+        error_entry( CUBLAS_STATUS_INTERNAL_ERROR )
+    };
+
+    const size_t cublas_error_num = sizeof(cublas_errors) / sizeof(cublas_errors[0]);
+
+    static inline void ___cublasSafeCall(cublasStatus_t err, const char* file, const int line, const char* func)
+    {
+        if (CUBLAS_STATUS_SUCCESS != err)
+        {
+            size_t idx = std::find_if(cublas_errors, cublas_errors + cublas_error_num, ErrorEntryComparer(err)) - cublas_errors;
+
+            const char* msg = (idx != cublas_error_num) ? cublas_errors[idx].str : "Unknown error code";
+            String str = cv::format("%s [Code = %d]", msg, err);
+
+            cv::error(cv::Error::GpuApiCallError, str, func, file, line);
+        }
+    }
+}
+
+#if defined(__GNUC__)
+    #define cublasSafeCall(expr)  ___cublasSafeCall(expr, __FILE__, __LINE__, __func__)
+#else /* defined(__CUDACC__) || defined(__MSVC__) */
+    #define cublasSafeCall(expr)  ___cublasSafeCall(expr, __FILE__, __LINE__, "")
+#endif
+
+#endif
+
 void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
 {
 #ifndef HAVE_CUBLAS
@@ -200,9 +255,14 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
 ////////////////////////////////////////////////////////////////////////
 // transpose

+namespace arithm
+{
+    template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream);
+}
+
 void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
 {
-    CV_Assert(src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8);
+    CV_Assert( src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8 );

    dst.create( src.cols, src.rows, src.type() );

@@ -218,35 +278,21 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)

        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }
    else if (src.elemSize() == 4)
    {
-        NppStStreamHandler h(stream);
-
-        NcvSize32u sz;
-        sz.width  = src.cols;
-        sz.height = src.rows;
-
-        ncvSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), static_cast<int>(src.step),
-            dst.ptr<Ncv32u>(), static_cast<int>(dst.step), sz) );
+        arithm::transpose<int>(src, dst, stream);
    }
    else // if (src.elemSize() == 8)
    {
        if (!deviceSupports(NATIVE_DOUBLE))
            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");

-        NppStStreamHandler h(stream);
-
-        NcvSize32u sz;
-        sz.width  = src.cols;
-        sz.height = src.rows;
-
-        ncvSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), static_cast<int>(src.step),
-            dst.ptr<Ncv64u>(), static_cast<int>(dst.step), sz) );
+        arithm::transpose<double>(src, dst, stream);
    }
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
 }

 ////////////////////////////////////////////////////////////////////////

--- a/modules/gpuarithm/src/cuda/absdiff_mat.cu
+++ b/modules/gpuarithm/src/cuda/absdiff_mat.cu
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/functional.hpp"
+#include "opencv2/core/cuda/transform.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/core/cuda/simd_functions.hpp"
+
+#include "arithm_func_traits.hpp"
+
+using namespace cv::gpu;
+using namespace cv::gpu::cudev;
+
+namespace arithm
+{
+    struct VAbsDiff4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vabsdiff4(a, b);
+        }
+
+        __device__ __forceinline__ VAbsDiff4() {}
+        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
+    };
+
+    struct VAbsDiff2 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vabsdiff2(a, b);
+        }
+
+        __device__ __forceinline__ VAbsDiff2() {}
+        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
+    };
+
+    __device__ __forceinline__ int _abs(int a)
+    {
+        return ::abs(a);
+    }
+    __device__ __forceinline__ float _abs(float a)
+    {
+        return ::fabsf(a);
+    }
+    __device__ __forceinline__ double _abs(double a)
+    {
+        return ::fabs(a);
+    }
+
+    template <typename T> struct AbsDiffMat : binary_function<T, T, T>
+    {
+        __device__ __forceinline__ T operator ()(T a, T b) const
+        {
+            return saturate_cast<T>(_abs(a - b));
+        }
+
+        __device__ __forceinline__ AbsDiffMat() {}
+        __device__ __forceinline__ AbsDiffMat(const AbsDiffMat& other) {}
+    };
+}
+
+namespace cv { namespace gpu { namespace cudev
+{
+    template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+
+    template <> struct TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+
+    template <typename T> struct TransformFunctorTraits< arithm::AbsDiffMat<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
+    {
+    };
+}}}
+
+namespace arithm
+{
+    void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        cudev::transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
+    }
+
+    void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        cudev::transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
+    }
+
+    template <typename T>
+    void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    {
+        cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, AbsDiffMat<T>(), WithOutMask(), stream);
+    }
+
+    template void absDiffMat<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffMat<schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffMat<short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffMat<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffMat<float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+}
+
+#endif // CUDA_DISABLER
--- a/modules/gpuarithm/src/cuda/absdiff_scalar.cu
+++ b/modules/gpuarithm/src/cuda/absdiff_scalar.cu
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/functional.hpp"
+#include "opencv2/core/cuda/transform.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/core/cuda/simd_functions.hpp"
+
+#include "arithm_func_traits.hpp"
+
+using namespace cv::gpu;
+using namespace cv::gpu::cudev;
+
+namespace arithm
+{
+    template <typename T, typename S> struct AbsDiffScalar : unary_function<T, T>
+    {
+        S val;
+
+        explicit AbsDiffScalar(S val_) : val(val_) {}
+
+        __device__ __forceinline__ T operator ()(T a) const
+        {
+            abs_func<S> f;
+            return saturate_cast<T>(f(a - val));
+        }
+    };
+}
+
+namespace cv { namespace gpu { namespace cudev
+{
+    template <typename T, typename S> struct TransformFunctorTraits< arithm::AbsDiffScalar<T, S> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
+    {
+    };
+}}}
+
+namespace arithm
+{
+    template <typename T, typename S>
+    void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
+    {
+        AbsDiffScalar<T, S> op(static_cast<S>(val));
+
+        cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, op, WithOutMask(), stream);
+    }
+
+    template void absDiffScalar<uchar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffScalar<schar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffScalar<ushort, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffScalar<short, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffScalar<int, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffScalar<float, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffScalar<double, double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+}
+
+#endif // CUDA_DISABLER
--- a/modules/gpuarithm/src/cuda/add_mat.cu
+++ b/modules/gpuarithm/src/cuda/add_mat.cu
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/functional.hpp"
+#include "opencv2/core/cuda/transform.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/core/cuda/simd_functions.hpp"
+
+#include "arithm_func_traits.hpp"
+
+using namespace cv::gpu;
+using namespace cv::gpu::cudev;
+
+namespace arithm
+{
+    struct VAdd4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vadd4(a, b);
+        }
+
+        __device__ __forceinline__ VAdd4() {}
+        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
+    };
+
+    struct VAdd2 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vadd2(a, b);
+        }
+
+        __device__ __forceinline__ VAdd2() {}
+        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
+    };
+
+    template <typename T, typename D> struct AddMat : binary_function<T, T, D>
+    {
+        __device__ __forceinline__ D operator ()(T a, T b) const
+        {
+            return saturate_cast<D>(a + b);
+        }
+
+        __device__ __forceinline__ AddMat() {}
+        __device__ __forceinline__ AddMat(const AddMat& other) {}
+    };
+}
+
+namespace cv { namespace gpu { namespace cudev
+{
+    template <> struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+
+    template <> struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+
+    template <typename T, typename D> struct TransformFunctorTraits< arithm::AddMat<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    {
+    };
+}}}
+
+namespace arithm
+{
+    void addMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        cudev::transform(src1, src2, dst, VAdd4(), WithOutMask(), stream);
+    }
+
+    void addMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        cudev::transform(src1, src2, dst, VAdd2(), WithOutMask(), stream);
+    }
+
+    template <typename T, typename D>
+    void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+    {
+        if (mask.data)
+            cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), mask, stream);
+        else
+            cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), WithOutMask(), stream);
+    }
+
+    template void addMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<uchar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<uchar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<uchar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    template void addMat<schar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<schar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<schar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<schar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<schar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<schar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<schar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void addMat<ushort, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<ushort, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<ushort, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<ushort, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<ushort, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<ushort, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<ushort, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void addMat<short, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<short, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<short, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void addMat<int, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<int, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<int, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void addMat<float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void addMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+}
+
+#endif // CUDA_DISABLER
--- a/modules/gpuarithm/src/cuda/add_scalar.cu
+++ b/modules/gpuarithm/src/cuda/add_scalar.cu
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/functional.hpp"
+#include "opencv2/core/cuda/transform.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+#include "opencv2/core/cuda/simd_functions.hpp"
+
+#include "arithm_func_traits.hpp"
+
+using namespace cv::gpu;
+using namespace cv::gpu::cudev;
+
+namespace arithm
+{
+    template <typename T, typename S, typename D> struct AddScalar : unary_function<T, D>
+    {
+        S val;
+
+        explicit AddScalar(S val_) : val(val_) {}
+
+        __device__ __forceinline__ D operator ()(T a) const
+        {
+            return saturate_cast<D>(a + val);
+        }
+    };
+}
+
+namespace cv { namespace gpu { namespace cudev
+{
+    template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::AddScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    {
+    };
+}}}
+
+namespace arithm
+{
+    template <typename T, typename S, typename D>
+    void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+    {
+        AddScalar<T, S, D> op(static_cast<S>(val));
+
+        if (mask.data)
+            cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
+        else
+            cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+    }
+
+    template void addScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    template void addScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void addScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void addScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void addScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void addScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void addScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+}
+
+#endif // CUDA_DISABLER
--- a/modules/gpuarithm/src/cuda/add_weighted.cu
+++ b/modules/gpuarithm/src/cuda/add_weighted.cu
--- a/modules/gpuarithm/src/cuda/arithm_func_traits.hpp
+++ b/modules/gpuarithm/src/cuda/arithm_func_traits.hpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __ARITHM_FUNC_TRAITS_HPP__
+#define __ARITHM_FUNC_TRAITS_HPP__
+
+#include <cstddef>
+
+namespace arithm
+{
+    template <size_t src_size, size_t dst_size> struct ArithmFuncTraits
+    {
+        enum { simple_block_dim_x = 32 };
+        enum { simple_block_dim_y = 8 };
+
+        enum { smart_block_dim_x = 32 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 1 };
+    };
+
+    template <> struct ArithmFuncTraits<1, 1>
+    {
+        enum { simple_block_dim_x = 32 };
+        enum { simple_block_dim_y = 8 };
+
+        enum { smart_block_dim_x = 32 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct ArithmFuncTraits<1, 2>
+    {
+        enum { simple_block_dim_x = 32 };
+        enum { simple_block_dim_y = 8 };
+
+        enum { smart_block_dim_x = 32 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct ArithmFuncTraits<1, 4>
+    {
+        enum { simple_block_dim_x = 32 };
+        enum { simple_block_dim_y = 8 };
+
+        enum { smart_block_dim_x = 32 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+
+    template <> struct ArithmFuncTraits<2, 1>
+    {
+        enum { simple_block_dim_x = 32 };
+        enum { simple_block_dim_y = 8 };
+
+        enum { smart_block_dim_x = 32 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct ArithmFuncTraits<2, 2>
+    {
+        enum { simple_block_dim_x = 32 };
+        enum { simple_block_dim_y = 8 };
+
+        enum { smart_block_dim_x = 32 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct ArithmFuncTraits<2, 4>
+    {
+        enum { simple_block_dim_x = 32 };
+        enum { simple_block_dim_y = 8 };
+
+        enum { smart_block_dim_x = 32 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+
+    template <> struct ArithmFuncTraits<4, 1>
+    {
+        enum { simple_block_dim_x = 32 };
+        enum { simple_block_dim_y = 8 };
+
+        enum { smart_block_dim_x = 32 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct ArithmFuncTraits<4, 2>
+    {
+        enum { simple_block_dim_x = 32 };
+        enum { simple_block_dim_y = 8 };
+
+        enum { smart_block_dim_x = 32 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct ArithmFuncTraits<4, 4>
+    {
+        enum { simple_block_dim_x = 32 };
+        enum { simple_block_dim_y = 8 };
+
+        enum { smart_block_dim_x = 32 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+}
+
+#endif // __ARITHM_FUNC_TRAITS_HPP__
--- a/modules/gpuarithm/src/cuda/bitwise_mat.cu
+++ b/modules/gpuarithm/src/cuda/bitwise_mat.cu
--- a/modules/gpuarithm/src/cuda/bitwise_scalar.cu
+++ b/modules/gpuarithm/src/cuda/bitwise_scalar.cu
--- a/modules/gpuarithm/src/cuda/cmp_mat.cu
+++ b/modules/gpuarithm/src/cuda/cmp_mat.cu
--- a/modules/gpuarithm/src/cuda/cmp_scalar.cu
+++ b/modules/gpuarithm/src/cuda/cmp_scalar.cu
--- a/modules/gpuarithm/src/cuda/countnonzero.cu
+++ b/modules/gpuarithm/src/cuda/countnonzero.cu
--- a/modules/gpuarithm/src/cuda/div_inv.cu
+++ b/modules/gpuarithm/src/cuda/div_inv.cu
--- a/modules/gpuarithm/src/cuda/div_mat.cu
+++ b/modules/gpuarithm/src/cuda/div_mat.cu
--- a/modules/gpuarithm/src/cuda/div_scalar.cu
+++ b/modules/gpuarithm/src/cuda/div_scalar.cu
--- a/modules/gpuarithm/src/cuda/math.cu
+++ b/modules/gpuarithm/src/cuda/math.cu
--- a/modules/gpuarithm/src/cuda/minmax.cu
+++ b/modules/gpuarithm/src/cuda/minmax.cu
--- a/modules/gpuarithm/src/cuda/minmax_mat.cu
+++ b/modules/gpuarithm/src/cuda/minmax_mat.cu
--- a/modules/gpuarithm/src/cuda/minmaxloc.cu
+++ b/modules/gpuarithm/src/cuda/minmaxloc.cu
--- a/modules/gpuarithm/src/cuda/mul_mat.cu
+++ b/modules/gpuarithm/src/cuda/mul_mat.cu
--- a/modules/gpuarithm/src/cuda/mul_scalar.cu
+++ b/modules/gpuarithm/src/cuda/mul_scalar.cu
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
--- a/modules/gpuarithm/src/cuda/reduce.cu
+++ b/modules/gpuarithm/src/cuda/reduce.cu
--- a/modules/gpu/src/cuda/split_merge.cu
+++ b/modules/gpu/src/cuda/split_merge.cu
--- a/modules/gpuarithm/src/cuda/sub_mat.cu
+++ b/modules/gpuarithm/src/cuda/sub_mat.cu
--- a/modules/gpuarithm/src/cuda/sub_scalar.cu
+++ b/modules/gpuarithm/src/cuda/sub_scalar.cu
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
--- a/modules/gpuarithm/src/cuda/threshold.cu
+++ b/modules/gpuarithm/src/cuda/threshold.cu
--- a/modules/gpuarithm/src/cuda/transpose.cu
+++ b/modules/gpuarithm/src/cuda/transpose.cu
--- a/modules/gpuarithm/src/cuda/unroll_detail.hpp
+++ b/modules/gpuarithm/src/cuda/unroll_detail.hpp
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
--- a/modules/gpu/src/matrix_reductions.cpp
+++ b/modules/gpu/src/matrix_reductions.cpp
--- a/modules/gpuarithm/src/precomp.cpp
+++ b/modules/gpuarithm/src/precomp.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
--- a/modules/gpuarithm/src/precomp.hpp
+++ b/modules/gpuarithm/src/precomp.hpp
--- a/modules/gpu/src/split_merge.cpp
+++ b/modules/gpu/src/split_merge.cpp
--- a/modules/gpu/test/test_core.cpp
+++ b/modules/gpu/test/test_core.cpp
--- a/modules/gpuarithm/test/test_main.cpp
+++ b/modules/gpuarithm/test/test_main.cpp
--- a/modules/gpuarithm/test/test_precomp.cpp
+++ b/modules/gpuarithm/test/test_precomp.cpp
--- a/modules/gpuarithm/test/test_precomp.hpp
+++ b/modules/gpuarithm/test/test_precomp.hpp
--- a/modules/stitching/CMakeLists.txt
+++ b/modules/stitching/CMakeLists.txt
 set(the_description "Images stitching")
-ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect OPTIONAL opencv_gpu opencv_nonfree)
+ocv_define_module(stitching opencv_imgproc opencv_features2d opencv_calib3d opencv_objdetect OPTIONAL opencv_gpu opencv_gpuarithm opencv_nonfree)

--- a/modules/superres/CMakeLists.txt
+++ b/modules/superres/CMakeLists.txt
@@ -4,4 +4,4 @@ endif()

 set(the_description "Super Resolution")
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 -Wundef)
-ocv_define_module(superres opencv_imgproc opencv_video OPTIONAL opencv_gpu opencv_highgui opencv_gpucodec)
+ocv_define_module(superres opencv_imgproc opencv_video OPTIONAL opencv_highgui opencv_gpu opencv_gpucodec opencv_gpuarithm)
--- a/samples/cpp/CMakeLists.txt
+++ b/samples/cpp/CMakeLists.txt
--- a/samples/gpu/CMakeLists.txt
+++ b/samples/gpu/CMakeLists.txt