gpu device layer code refactoring

3ab2728d · Vladislav Vinogradov · fa0daa48 · 3ab2728d · 3ab2728d · 3ab2728d
28 changed file
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -23,7 +23,9 @@ source_group("Include" FILES ${lib_hdrs})

 #file(GLOB lib_device_hdrs "include/opencv2/${name}/device/*.h*")
 file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*")
+file(GLOB lib_device_hdrs_detail "src/opencv2/gpu/device/detail/*.h*")
 source_group("Device" FILES ${lib_device_hdrs})
+source_group("Device\\Detail" FILES ${lib_device_hdrs_detail})

 if (HAVE_CUDA)
    file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")	
@@ -83,7 +85,7 @@ foreach(d ${DEPS})
 	endif()
 endforeach()

-add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
+add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${lib_device_hdrs_detail} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})

 # For dynamic link numbering convenions
 set_target_properties(${the_target} PROPERTIES

--- a/modules/gpu/src/color.cpp
+++ b/modules/gpu/src/color.cpp
--- a/modules/gpu/src/cuda/brute_force_matcher.cu
+++ b/modules/gpu/src/cuda/brute_force_matcher.cu
@@ -41,7 +41,7 @@
 //M*/

 #include "internal_shared.hpp"
-#include "opencv2/gpu/device/limits_gpu.hpp"
+#include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/datamov_utils.hpp"

 using namespace cv::gpu;
@@ -565,7 +565,7 @@ namespace cv { namespace gpu { namespace bfmatcher
        
        int myBestTrainIdx = -1;
        int myBestImgIdx = -1;
-        typename Dist::ResultType myMin = numeric_limits_gpu<typename Dist::ResultType>::max();
+        typename Dist::ResultType myMin = numeric_limits<typename Dist::ResultType>::max();

        {
            typename Dist::ResultType* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;
@@ -821,7 +821,7 @@ namespace cv { namespace gpu { namespace bfmatcher
        {
            const T* trainDescs = trainDescs_.ptr(trainIdx);

-            typename Dist::ResultType myDist = numeric_limits_gpu<typename Dist::ResultType>::max();
+            typename Dist::ResultType myDist = numeric_limits<typename Dist::ResultType>::max();

            if (mask(queryIdx, trainIdx))
            {
@@ -932,7 +932,7 @@ namespace cv { namespace gpu { namespace bfmatcher
    {
        const int tid = threadIdx.x;
        
-        T myMin = numeric_limits_gpu<T>::max();
+        T myMin = numeric_limits<T>::max();
        int myMinIdx = -1;

        for (int i = tid; i < n; i += BLOCK_SIZE)
@@ -1007,10 +1007,10 @@ namespace cv { namespace gpu { namespace bfmatcher
        if (threadIdx.x == 0)
        {
            float dist = sdist[0];
-            if (dist < numeric_limits_gpu<float>::max())
+            if (dist < numeric_limits<float>::max())
            {
                int bestIdx = strainIdx[0];
-                allDist[bestIdx] = numeric_limits_gpu<float>::max();
+                allDist[bestIdx] = numeric_limits<float>::max();
                trainIdx[i] = bestIdx;
                distance[i] = dist;
            }

--- a/modules/gpu/src/cuda/color.cu
+++ b/modules/gpu/src/cuda/color.cu
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -40,9 +40,10 @@
 //
 //M*/

-#include "opencv2/gpu/device/vecmath.hpp"
+#include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/transform.hpp"
-#include "opencv2/gpu/device/limits_gpu.hpp"
+#include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "internal_shared.hpp"

@@ -354,114 +355,11 @@ namespace cv { namespace gpu { namespace mathfunc

    //////////////////////////////////////////////////////////////////////////
    // min/max
-
-    struct MinOp
-    {        
-        template <typename T>
-        __device__ __forceinline__ T operator()(T a, T b)
-        {
-            return min(a, b);
-        }
-        __device__ __forceinline__ float operator()(float a, float b)
-        {
-            return fmin(a, b);
-        }
-        __device__ __forceinline__ double operator()(double a, double b)
-        {
-            return fmin(a, b);
-        }
-    };
-
-    struct MaxOp
-    {        
-        template <typename T>
-        __device__ __forceinline__ T operator()(T a, T b)
-        {
-            return max(a, b);
-        }
-        __device__ __forceinline__ float operator()(float a, float b)
-        {
-            return fmax(a, b);
-        }
-        __device__ __forceinline__ double operator()(double a, double b)
-        {
-            return fmax(a, b);
-        }
-    };
-    
-    template <typename T> struct ScalarMinOp
-    {
-        T s;
-
-        explicit ScalarMinOp(T s_) : s(s_) {}
-
-        __device__ __forceinline__ T operator()(T a)
-        {
-            return min(a, s);
-        }
-    };
-    template <> struct ScalarMinOp<float>
-    {
-        float s;
-
-        explicit ScalarMinOp(float s_) : s(s_) {}
-
-        __device__ __forceinline__ float operator()(float a)
-        {
-            return fmin(a, s);
-        }
-    };
-    template <> struct ScalarMinOp<double>
-    {
-        double s;
-
-        explicit ScalarMinOp(double s_) : s(s_) {}
-
-        __device__ __forceinline__ double operator()(double a)
-        {
-            return fmin(a, s);
-        }
-    };
-    
-    template <typename T> struct ScalarMaxOp
-    {
-        T s;
-
-        explicit ScalarMaxOp(T s_) : s(s_) {}
-
-        __device__ __forceinline__ T operator()(T a)
-        {
-            return max(a, s);
-        }
-    };
-    template <> struct ScalarMaxOp<float>
-    {
-        float s;
-
-        explicit ScalarMaxOp(float s_) : s(s_) {}
-
-        __device__ __forceinline__ float operator()(float a)
-        {
-            return fmax(a, s);
-        }
-    };
-    template <> struct ScalarMaxOp<double>
-    {
-        double s;
-
-        explicit ScalarMaxOp(double s_) : s(s_) {}
-
-        __device__ __forceinline__ double operator()(double a)
-        {
-            return fmax(a, s);
-        }
-    };
    
    template <typename T>
    void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
-        MinOp op;
-        transform(src1, src2, dst, op, stream);    
+        transform(src1, src2, dst, minimum<T>(), stream);    
    }

    template void min_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
@@ -475,8 +373,7 @@ namespace cv { namespace gpu { namespace mathfunc
    template <typename T>
    void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
-        MaxOp op;
-        transform(src1, src2, dst, op, stream);    
+        transform(src1, src2, dst, maximum<T>(), stream);    
    }
    
    template void max_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
@@ -490,8 +387,7 @@ namespace cv { namespace gpu { namespace mathfunc
    template <typename T>
    void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
-        ScalarMinOp<T> op(src2);
-        transform(src1, dst, op, stream);    
+        transform(src1, dst, device::bind2nd(minimum<T>(), src2), stream);    
    }

    template void min_gpu<uchar >(const DevMem2D& src1, uchar src2, const DevMem2D& dst, cudaStream_t stream);
@@ -501,12 +397,11 @@ namespace cv { namespace gpu { namespace mathfunc
    template void min_gpu<int   >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);
    template void min_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);
    template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
-    
+
    template <typename T>
    void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
-        ScalarMaxOp<T> op(src2);
-        transform(src1, dst, op, stream);    
+        transform(src1, dst, device::bind2nd(maximum<T>(), src2), stream);    
    }

    template void max_gpu<uchar >(const DevMem2D& src1, uchar src2, const DevMem2D& dst, cudaStream_t stream);
@@ -519,100 +414,7 @@ namespace cv { namespace gpu { namespace mathfunc

    
    //////////////////////////////////////////////////////////////////////////
-    // threshold
-
-    template <typename T> struct ThreshBinary
-    {
-        ThreshBinary(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
-
-        __device__ __forceinline__ T operator()(const T& src) const
-        {
-            return src > thresh ? maxVal : 0;
-        }
-
-    private:
-        T thresh;
-        T maxVal;
-    };
-
-    template <typename T> struct ThreshBinaryInv
-    {
-        ThreshBinaryInv(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
-
-        __device__ __forceinline__ T operator()(const T& src) const
-        {
-            return src > thresh ? 0 : maxVal;
-        }
-
-    private:
-        T thresh;
-        T maxVal;
-    };
-
-    template <typename T> struct ThreshTrunc
-    {
-        ThreshTrunc(T thresh_, T) : thresh(thresh_) {}
-
-        __device__ __forceinline__ T operator()(const T& src) const
-        {
-            return min(src, thresh);
-        }
-
-    private:
-        T thresh;
-    };
-    template <> struct  ThreshTrunc<float>
-    {
-        ThreshTrunc(float thresh_, float) : thresh(thresh_) {}
-
-        __device__ __forceinline__ float operator()(const float& src) const
-        {
-            return fmin(src, thresh);
-        }
-
-    private:
-        float thresh;
-    };
-    template <> struct  ThreshTrunc<double>
-    {
-        ThreshTrunc(double thresh_, double) : thresh(thresh_) {}
-
-        __device__ __forceinline__ double operator()(const double& src) const
-        {
-            return fmin(src, thresh);
-        }
-
-    private:
-        double thresh;
-    };
-
-    template <typename T> struct ThreshToZero
-    {
-    public:
-        ThreshToZero(T thresh_, T) : thresh(thresh_) {}
-
-        __device__ __forceinline__ T operator()(const T& src) const
-        {
-            return src > thresh ? src : 0;
-        }
-
-    private:
-        T thresh;
-    };
-
-    template <typename T> struct ThreshToZeroInv
-    {
-    public:
-        ThreshToZeroInv(T thresh_, T) : thresh(thresh_) {}
-
-        __device__ __forceinline__ T operator()(const T& src) const
-        {
-            return src > thresh ? 0 : src;
-        }
-
-    private:
-        T thresh;
-    };
+    // threshold  

    template <template <typename> class Op, typename T>
    void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal, 
@@ -631,11 +433,11 @@ namespace cv { namespace gpu { namespace mathfunc

        static const caller_t callers[] = 
        {
-            threshold_caller<ThreshBinary, T>, 
-            threshold_caller<ThreshBinaryInv, T>, 
-            threshold_caller<ThreshTrunc, T>, 
-            threshold_caller<ThreshToZero, T>, 
-            threshold_caller<ThreshToZeroInv, T>
+            threshold_caller<thresh_binary_func, T>, 
+            threshold_caller<thresh_binary_inv_func, T>, 
+            threshold_caller<thresh_trunc_func, T>, 
+            threshold_caller<thresh_to_zero_func, T>, 
+            threshold_caller<thresh_to_zero_inv_func, T>
        };

        callers[type]((DevMem2D_<T>)src, (DevMem2D_<T>)dst, thresh, maxVal, stream);
@@ -653,20 +455,10 @@ namespace cv { namespace gpu { namespace mathfunc
    //////////////////////////////////////////////////////////////////////////
    // subtract

-    template <typename T>
-    class SubtractOp
-    {
-    public:
-        __device__ __forceinline__ T operator()(const T& l, const T& r) const
-        {
-            return l - r;
-        }
-    };
-
    template <typename T>
    void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream)
    {
-        transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, SubtractOp<T>(), stream);
+        transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, minus<T>(), stream);
    }

    template void subtractCaller<short>(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream);
@@ -675,7 +467,7 @@ namespace cv { namespace gpu { namespace mathfunc
    //////////////////////////////////////////////////////////////////////////
    // pow
    
-    template<typename T, bool Signed = device::numeric_limits_gpu<T>::is_signed>
+    template<typename T, bool Signed = device::numeric_limits<T>::is_signed>
    struct PowOp
    {    
        float power;
@@ -695,7 +487,7 @@ namespace cv { namespace gpu { namespace mathfunc

        __device__ __forceinline__ float operator()(const T& e)
        {
-              T res = saturate_cast<T>(__powf((float)e, power));            
+            T res = saturate_cast<T>(__powf((float)e, power));            
            
            if ( (e < 0) && (1 & (int)power) )
                    res *= -1;            

--- a/modules/gpu/src/cuda/filters.cu
+++ b/modules/gpu/src/cuda/filters.cu
@@ -42,8 +42,8 @@

 #include "opencv2/gpu/devmem2d.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/vecmath.hpp"
-#include "opencv2/gpu/device/limits_gpu.hpp"
+#include "opencv2/gpu/device/vec_math.hpp"
+#include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/border_interpolate.hpp"

 #include "safe_call.hpp"
@@ -76,7 +76,7 @@ namespace filter_krnls
 {
    template <typename T, size_t size> struct SmemType_
    {
-        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t smem_t;
+        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
    };
    template <typename T> struct SmemType_<T, 4>
    {
@@ -111,7 +111,7 @@ namespace filter_krnls

            if (x < src.cols)
            {
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t sum_t;
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
                sum_t sum = VecTraits<sum_t>::all(0);

                sDataRow += threadIdx.x + BLOCK_DIM_X - anchor;
@@ -253,7 +253,7 @@ namespace filter_krnls

            if (y < src.rows)
            {
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t sum_t;
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
                sum_t sum = VecTraits<sum_t>::all(0);

                sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X;
@@ -475,7 +475,7 @@ namespace bf_krnls
                    }
                }

-                float minimum = numeric_limits_gpu<float>::max();
+                float minimum = numeric_limits<float>::max();
                int id = 0;

                if (cost[0] < minimum)

--- a/modules/gpu/src/cuda/hist.cu
+++ b/modules/gpu/src/cuda/hist.cu
@@ -42,6 +42,7 @@
 //M*/

 #include "internal_shared.hpp"
+#include "opencv2/gpu/device/utility.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"

 using namespace cv::gpu;
@@ -50,14 +51,11 @@ using namespace cv::gpu::device;

 #define UINT_BITS 32U

-#define LOG2_WARP_SIZE 5U
-#define WARP_SIZE (1U << LOG2_WARP_SIZE)
-
 //Warps == subhistograms per threadblock
 #define WARP_COUNT 6

 //Threadblock size
-#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * WARP_SIZE)
+#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
 #define HISTOGRAM256_BIN_COUNT 256

 //Shared memory per threadblock
@@ -73,7 +71,7 @@ namespace cv { namespace gpu { namespace histograms
 {
    #if (!USE_SMEM_ATOMICS)

-        #define TAG_MASK ( (1U << (UINT_BITS - LOG2_WARP_SIZE)) - 1U )
+        #define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )

        __forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
        {
@@ -111,7 +109,7 @@ namespace cv { namespace gpu { namespace histograms
    {
        //Per-warp subhistogram storage
        __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
-        uint* s_WarpHist= s_Hist + (threadIdx.x >> LOG2_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
+        uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;

        //Clear shared memory storage for current threadblock before processing
        #pragma unroll
@@ -119,7 +117,7 @@ namespace cv { namespace gpu { namespace histograms
           s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;

        //Cycle through the entire data set, update subhistograms for each warp
-        const uint tag = threadIdx.x << (UINT_BITS - LOG2_WARP_SIZE);
+        const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);

        __syncthreads();
        const uint colsui = d_Data.step / sizeof(uint);

--- a/modules/gpu/src/cuda/match_template.cu
+++ b/modules/gpu/src/cuda/match_template.cu
@@ -41,7 +41,7 @@
 //M*/

 #include "internal_shared.hpp"
-#include "opencv2/gpu/device/vecmath.hpp"
+#include "opencv2/gpu/device/vec_math.hpp"

 using namespace cv::gpu;
 using namespace cv::gpu::device;
@@ -84,8 +84,8 @@ __global__ void matchTemplateNaiveKernel_CCORR(
        int w, int h, const PtrStep image, const PtrStep templ, 
        DevMem2Df result)
 {
-    typedef typename TypeVec<T, cn>::vec_t Type;
-    typedef typename TypeVec<float, cn>::vec_t Typef;
+    typedef typename TypeVec<T, cn>::vec_type Type;
+    typedef typename TypeVec<float, cn>::vec_type Typef;

    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -174,8 +174,8 @@ __global__ void matchTemplateNaiveKernel_SQDIFF(
        int w, int h, const PtrStep image, const PtrStep templ, 
        DevMem2Df result)
 {
-    typedef typename TypeVec<T, cn>::vec_t Type;
-    typedef typename TypeVec<float, cn>::vec_t Typef;
+    typedef typename TypeVec<T, cn>::vec_type Type;
+    typedef typename TypeVec<float, cn>::vec_type Typef;

    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -884,7 +884,7 @@ void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
 template <int cn>
 __global__ void extractFirstChannel_32F(const PtrStep image, DevMem2Df result)
 {
-    typedef typename TypeVec<float, cn>::vec_t Typef;
+    typedef typename TypeVec<float, cn>::vec_type Typef;

    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;

--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -40,9 +40,9 @@
 //
 //M*/

-#include "opencv2/gpu/device/limits_gpu.hpp"
+#include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/vecmath.hpp"
+#include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/transform.hpp"
 #include "internal_shared.hpp"


--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
@@ -40,9 +40,9 @@
 //
 //M*/

-#include "opencv2/gpu/device/limits_gpu.hpp"
+#include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/vecmath.hpp"
+#include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/transform.hpp"
 #include "internal_shared.hpp"

@@ -190,8 +190,8 @@ namespace cv { namespace gpu { namespace mathfunc
        uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
        uint tid = threadIdx.y * blockDim.x + threadIdx.x;

-        T mymin = numeric_limits_gpu<T>::max();
-        T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
+        T mymin = numeric_limits<T>::max();
+        T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min();
        uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
        uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
        for (uint y = y0; y < y_end; y += blockDim.y)
@@ -512,9 +512,9 @@ namespace cv { namespace gpu { namespace mathfunc
        uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
        uint tid = threadIdx.y * blockDim.x + threadIdx.x;

-        T mymin = numeric_limits_gpu<T>::max();
-        T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : 
-                                                     numeric_limits_gpu<T>::min(); 
+        T mymin = numeric_limits<T>::max();
+        T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : 
+                                                     numeric_limits<T>::min(); 
        uint myminloc = 0;
        uint mymaxloc = 0;
        uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
@@ -1094,10 +1094,10 @@ namespace cv { namespace gpu { namespace mathfunc


    template <typename T, typename R, typename Op, int nthreads>
-    __global__ void sumKernel_C2(const DevMem2D src, typename TypeVec<R, 2>::vec_t* result)
+    __global__ void sumKernel_C2(const DevMem2D src, typename TypeVec<R, 2>::vec_type* result)
    {
-        typedef typename TypeVec<T, 2>::vec_t SrcType;
-        typedef typename TypeVec<R, 2>::vec_t DstType;
+        typedef typename TypeVec<T, 2>::vec_type SrcType;
+        typedef typename TypeVec<R, 2>::vec_type DstType;

        __shared__ R smem[nthreads * 2];

@@ -1173,9 +1173,9 @@ namespace cv { namespace gpu { namespace mathfunc


    template <typename T, typename R, int nthreads>
-    __global__ void sumPass2Kernel_C2(typename TypeVec<R, 2>::vec_t* result, int size)
+    __global__ void sumPass2Kernel_C2(typename TypeVec<R, 2>::vec_type* result, int size)
    {
-        typedef typename TypeVec<R, 2>::vec_t DstType;
+        typedef typename TypeVec<R, 2>::vec_type DstType;

        __shared__ R smem[nthreads * 2];

@@ -1199,10 +1199,10 @@ namespace cv { namespace gpu { namespace mathfunc


    template <typename T, typename R, typename Op, int nthreads>
-    __global__ void sumKernel_C3(const DevMem2D src, typename TypeVec<R, 3>::vec_t* result)
+    __global__ void sumKernel_C3(const DevMem2D src, typename TypeVec<R, 3>::vec_type* result)
    {
-        typedef typename TypeVec<T, 3>::vec_t SrcType;
-        typedef typename TypeVec<R, 3>::vec_t DstType;
+        typedef typename TypeVec<T, 3>::vec_type SrcType;
+        typedef typename TypeVec<R, 3>::vec_type DstType;

        __shared__ R smem[nthreads * 3];

@@ -1285,9 +1285,9 @@ namespace cv { namespace gpu { namespace mathfunc


    template <typename T, typename R, int nthreads>
-    __global__ void sumPass2Kernel_C3(typename TypeVec<R, 3>::vec_t* result, int size)
+    __global__ void sumPass2Kernel_C3(typename TypeVec<R, 3>::vec_type* result, int size)
    {
-        typedef typename TypeVec<R, 3>::vec_t DstType;
+        typedef typename TypeVec<R, 3>::vec_type DstType;

        __shared__ R smem[nthreads * 3];

@@ -1313,10 +1313,10 @@ namespace cv { namespace gpu { namespace mathfunc
    }

    template <typename T, typename R, typename Op, int nthreads>
-    __global__ void sumKernel_C4(const DevMem2D src, typename TypeVec<R, 4>::vec_t* result)
+    __global__ void sumKernel_C4(const DevMem2D src, typename TypeVec<R, 4>::vec_type* result)
    {
-        typedef typename TypeVec<T, 4>::vec_t SrcType;
-        typedef typename TypeVec<R, 4>::vec_t DstType;
+        typedef typename TypeVec<T, 4>::vec_type SrcType;
+        typedef typename TypeVec<R, 4>::vec_type DstType;

        __shared__ R smem[nthreads * 4];

@@ -1407,9 +1407,9 @@ namespace cv { namespace gpu { namespace mathfunc


    template <typename T, typename R, int nthreads>
-    __global__ void sumPass2Kernel_C4(typename TypeVec<R, 4>::vec_t* result, int size)
+    __global__ void sumPass2Kernel_C4(typename TypeVec<R, 4>::vec_type* result, int size)
    {
-        typedef typename TypeVec<R, 4>::vec_t DstType;
+        typedef typename TypeVec<R, 4>::vec_type DstType;

        __shared__ R smem[nthreads * 4];

@@ -1454,41 +1454,41 @@ namespace cv { namespace gpu { namespace mathfunc
        {
        case 1:
            sumKernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));
            cudaSafeCall( cudaGetLastError() );

            sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 1>::vec_type*)buf.ptr(0), grid.x * grid.y);
            cudaSafeCall( cudaGetLastError() );

            break;
        case 2:
            sumKernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));
            cudaSafeCall( cudaGetLastError() );

            sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 2>::vec_type*)buf.ptr(0), grid.x * grid.y);
            cudaSafeCall( cudaGetLastError() );

            break;
        case 3:
            sumKernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));
            cudaSafeCall( cudaGetLastError() );

            sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 3>::vec_type*)buf.ptr(0), grid.x * grid.y);
            cudaSafeCall( cudaGetLastError() );

            break;
        case 4:
            sumKernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));
            cudaSafeCall( cudaGetLastError() );

            sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 4>::vec_type*)buf.ptr(0), grid.x * grid.y);
            cudaSafeCall( cudaGetLastError() );

            break;
@@ -1526,19 +1526,19 @@ namespace cv { namespace gpu { namespace mathfunc
        {
        case 1:
            sumKernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));
            break;
        case 2:
            sumKernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));
            break;
        case 3:
            sumKernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));
            break;
        case 4:
            sumKernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));
            break;
        }
        cudaSafeCall( cudaGetLastError() );
@@ -1576,41 +1576,41 @@ namespace cv { namespace gpu { namespace mathfunc
        {
        case 1:
            sumKernel<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));
            cudaSafeCall( cudaGetLastError() );

            sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 1>::vec_type*)buf.ptr(0), grid.x * grid.y);
            cudaSafeCall( cudaGetLastError() );

            break;
        case 2:
            sumKernel_C2<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));
            cudaSafeCall( cudaGetLastError() );

            sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 2>::vec_type*)buf.ptr(0), grid.x * grid.y);
            cudaSafeCall( cudaGetLastError() );

            break;
        case 3:
            sumKernel_C3<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));
            cudaSafeCall( cudaGetLastError() );

            sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 3>::vec_type*)buf.ptr(0), grid.x * grid.y);
            cudaSafeCall( cudaGetLastError() );

            break;
        case 4:
            sumKernel_C4<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));
            cudaSafeCall( cudaGetLastError() );

            sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 4>::vec_type*)buf.ptr(0), grid.x * grid.y);
            cudaSafeCall( cudaGetLastError() );

            break;
@@ -1648,19 +1648,19 @@ namespace cv { namespace gpu { namespace mathfunc
        {
        case 1:
            sumKernel<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));
            break;
        case 2:
            sumKernel_C2<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));
            break;
        case 3:
            sumKernel_C3<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));
            break;
        case 4:
            sumKernel_C4<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));
            break;
        }
        cudaSafeCall( cudaGetLastError() );
@@ -1698,41 +1698,41 @@ namespace cv { namespace gpu { namespace mathfunc
        {
        case 1:
            sumKernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));
            cudaSafeCall( cudaGetLastError() );

            sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 1>::vec_type*)buf.ptr(0), grid.x * grid.y);
            cudaSafeCall( cudaGetLastError() );

            break;
        case 2:
            sumKernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));
            cudaSafeCall( cudaGetLastError() );

            sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 2>::vec_type*)buf.ptr(0), grid.x * grid.y);
            cudaSafeCall( cudaGetLastError() );

            break;
        case 3:
            sumKernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));
            cudaSafeCall( cudaGetLastError() );

            sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 3>::vec_type*)buf.ptr(0), grid.x * grid.y);
            cudaSafeCall( cudaGetLastError() );

            break;
        case 4:
            sumKernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));
            cudaSafeCall( cudaGetLastError() );

            sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 4>::vec_type*)buf.ptr(0), grid.x * grid.y);
            cudaSafeCall( cudaGetLastError() );

            break;
@@ -1770,19 +1770,19 @@ namespace cv { namespace gpu { namespace mathfunc
        {
        case 1:
            sumKernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));
            break;
        case 2:
            sumKernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));
            break;
        case 3:
            sumKernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));
            break;
        case 4:
            sumKernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));
            break;
        }
        cudaSafeCall( cudaGetLastError() );

--- a/modules/gpu/src/cuda/stereobp.cu
+++ b/modules/gpu/src/cuda/stereobp.cu
@@ -42,7 +42,7 @@

 #include "opencv2/gpu/devmem2d.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/limits_gpu.hpp"
+#include "opencv2/gpu/device/limits.hpp"
 #include "safe_call.hpp"

 using namespace cv::gpu;
@@ -381,7 +381,7 @@ namespace cv { namespace gpu { namespace bp
    template <typename T>
    __device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
    {
-        float minimum = numeric_limits_gpu<float>::max();
+        float minimum = numeric_limits<float>::max();

        for(int i = 0; i < cndisp; ++i)
        {
@@ -486,7 +486,7 @@ namespace cv { namespace gpu { namespace bp
            size_t disp_step = disp.rows * u.step;

            int best = 0;
-            float best_val = numeric_limits_gpu<float>::max();
+            float best_val = numeric_limits<float>::max();
            for (int d = 0; d < cndisp; ++d)
            {
                float val  = us[d * disp_step];

--- a/modules/gpu/src/cuda/stereocsbp.cu
+++ b/modules/gpu/src/cuda/stereocsbp.cu
@@ -42,7 +42,7 @@

 #include "opencv2/gpu/devmem2d.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/limits_gpu.hpp"
+#include "opencv2/gpu/device/limits.hpp"
 #include "safe_call.hpp"

 using namespace cv::gpu;
@@ -147,7 +147,7 @@ namespace cv { namespace gpu { namespace csbp

            for(int i = 0; i < nr_plane; i++)
            {
-                T minimum = numeric_limits_gpu<T>::max();
+                T minimum = numeric_limits<T>::max();
                int id = 0;
                for(int d = 0; d < cndisp; d++)
                {
@@ -161,7 +161,7 @@ namespace cv { namespace gpu { namespace csbp

                data_cost_selected[i  * cdisp_step1] = minimum;
                selected_disparity[i  * cdisp_step1] = id;
-                data_cost         [id * cdisp_step1] = numeric_limits_gpu<T>::max();
+                data_cost         [id * cdisp_step1] = numeric_limits<T>::max();
            }
        }
    }
@@ -192,7 +192,7 @@ namespace cv { namespace gpu { namespace csbp
                    data_cost_selected[nr_local_minimum * cdisp_step1] = cur;
                    selected_disparity[nr_local_minimum * cdisp_step1] = d;

-                    data_cost[d * cdisp_step1] = numeric_limits_gpu<T>::max();
+                    data_cost[d * cdisp_step1] = numeric_limits<T>::max();

                    nr_local_minimum++;
                }
@@ -203,7 +203,7 @@ namespace cv { namespace gpu { namespace csbp

            for (int i = nr_local_minimum; i < nr_plane; i++)
            {
-                T minimum = numeric_limits_gpu<T>::max();
+                T minimum = numeric_limits<T>::max();
                int id = 0;

                for (int d = 0; d < cndisp; d++)
@@ -218,7 +218,7 @@ namespace cv { namespace gpu { namespace csbp
                data_cost_selected[i * cdisp_step1] = minimum;
                selected_disparity[i * cdisp_step1] = id;

-                data_cost[id * cdisp_step1] = numeric_limits_gpu<T>::max();
+                data_cost[id * cdisp_step1] = numeric_limits<T>::max();
            }
        }
    }
@@ -610,7 +610,7 @@ namespace cv { namespace gpu { namespace csbp
    {
        for(int i = 0; i < nr_plane; i++)
        {
-            T minimum = numeric_limits_gpu<T>::max();
+            T minimum = numeric_limits<T>::max();
            int id = 0;
            for(int j = 0; j < nr_plane2; j++)
            {
@@ -630,7 +630,7 @@ namespace cv { namespace gpu { namespace csbp
            l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];
            r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];

-            data_cost_new[id * cdisp_step1] = numeric_limits_gpu<T>::max();
+            data_cost_new[id * cdisp_step1] = numeric_limits<T>::max();
        }
    }

@@ -737,7 +737,7 @@ namespace cv { namespace gpu { namespace csbp
    __device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,
                                      const T* dst_disp, const T* src_disp, int nr_plane, T* temp)
    {
-        T minimum = numeric_limits_gpu<T>::max();
+        T minimum = numeric_limits<T>::max();

        for(int d = 0; d < nr_plane; d++)
        {
@@ -850,7 +850,7 @@ namespace cv { namespace gpu { namespace csbp
            const T* r = r_ + (y+0) * cmsg_step1 + (x-1);

            int best = 0;
-            T best_val = numeric_limits_gpu<T>::max();
+            T best_val = numeric_limits<T>::max();
            for (int i = 0; i < nr_plane; ++i)
            {
                int idx = i * cdisp_step1;

--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@@ -46,8 +46,10 @@
 //M*/

 #include "internal_shared.hpp"
-#include "opencv2/gpu/device/limits_gpu.hpp"
+#include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
+#include "opencv2/gpu/device/utility.hpp"
+#include "opencv2/gpu/device/functional.hpp"

 using namespace cv::gpu;
 using namespace cv::gpu::device;
@@ -393,31 +395,10 @@ namespace cv { namespace gpu { namespace surf
            //dss
            H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];

-            float det = H[0][0] * (H[1][1] * H[2][2] - H[1][2] * H[2][1])
-              - H[0][1] * (H[1][0] * H[2][2] - H[1][2] * H[2][0])
-              + H[0][2] * (H[1][0] * H[2][1] - H[1][1] * H[2][0]);
+            __shared__ float x[3];

-            if (det != 0.0f)
+            if (solve3x3(H, dD, x))
            {
-                float invdet = 1.0f / det;
-
-                __shared__ float x[3];
-
-                x[0] = invdet * 
-                    (dD[0] * (H[1][1] * H[2][2] - H[1][2] * H[2][1]) -
-                     H[0][1] * (dD[1] * H[2][2] - H[1][2] * dD[2]) +
-                     H[0][2] * (dD[1] * H[2][1] - H[1][1] * dD[2]));
-
-                x[1] = invdet * 
-                    (H[0][0] * (dD[1] * H[2][2] - H[1][2] * dD[2]) -
-                     dD[0] * (H[1][0] * H[2][2] - H[1][2] * H[2][0]) +
-                     H[0][2] * (H[1][0] * dD[2] - dD[1] * H[2][0]));
-
-                x[2] = invdet * 
-                    (H[0][0] * (H[1][1] * dD[2] - dD[1] * H[2][1]) -
-                     H[0][1] * (H[1][0] * dD[2] - dD[1] * H[2][0]) +
-                     dD[0] * (H[1][0] * H[2][1] - H[1][1] * H[2][0]));
-
                if (fabs(x[0]) <= 1.f && fabs(x[1]) <= 1.f && fabs(x[2]) <= 1.f)
                {
                    // if the step is within the interpolation region, perform it
@@ -500,20 +481,6 @@ namespace cv { namespace gpu { namespace surf
    __constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
    __constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};

-    __device__ void reduceSum32(volatile float* v_sum, float& sum)
-    {
-        v_sum[threadIdx.x] = sum;
-
-        if (threadIdx.x < 16)
-        {
-            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 16];
-            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 8];
-            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 4];
-            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 2];
-            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 1];
-        }
-    }
-
    __global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
    {        
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
@@ -599,8 +566,11 @@ namespace cv { namespace gpu { namespace surf

                float* s_sum_row = s_sum + threadIdx.y * 32;

-                reduceSum32(s_sum_row, sumx);
-                reduceSum32(s_sum_row, sumy);
+                //reduceSum32(s_sum_row, sumx);
+                //reduceSum32(s_sum_row, sumy);
+
+                warpReduce32(s_sum_row, sumx, threadIdx.x, plus<volatile float>());
+                warpReduce32(s_sum_row, sumy, threadIdx.x, plus<volatile float>());

                const float temp_mod = sumx * sumx + sumy * sumy;
                if (temp_mod > best_mod)

--- a/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp
@@ -43,8 +43,8 @@
 #ifndef __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
 #define __OPENCV_GPU_BORDER_INTERPOLATE_HPP__

-#include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/vecmath.hpp"
+#include "saturate_cast.hpp"
+#include "vec_traits.hpp"

 namespace cv { namespace gpu { namespace device
 {
@@ -72,64 +72,53 @@ namespace cv { namespace gpu { namespace device
            return -last <= mini && maxi <= 2 * last;
        }

-    private:
        int last;
    };

-
-    template <typename D>
-    struct BrdRowReflect101: BrdReflect101
+    template <typename D> struct BrdRowReflect101 : BrdReflect101
    {
        explicit BrdRowReflect101(int len): BrdReflect101(len) {}

-        template <typename T>
-        __device__ __forceinline__ D at_low(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const 
        {
            return saturate_cast<D>(data[idx_low(i)]);
        }

-        template <typename T>
-        __device__ __forceinline__ D at_high(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const 
        {
            return saturate_cast<D>(data[idx_high(i)]);
        }
    };

-
-    template <typename D>
-    struct BrdColReflect101: BrdReflect101
+    template <typename D> struct BrdColReflect101 : BrdReflect101
    {
        BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}

-        template <typename T>
-        __device__ __forceinline__ D at_low(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const 
        {
            return saturate_cast<D>(*(const D*)((const char*)data + idx_low(i)*step));
        }

-        template <typename T>
-        __device__ __forceinline__ D at_high(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const 
        {
            return saturate_cast<D>(*(const D*)((const char*)data + idx_high(i)*step));
        }

-    private:
        int step;
    };

-
    struct BrdReplicate
    {
        explicit BrdReplicate(int len): last(len - 1) {}

        __device__ __forceinline__ int idx_low(int i) const
        {
-            return max(i, 0);
+            return ::max(i, 0);
        }

        __device__ __forceinline__ int idx_high(int i) const 
        {
-            return min(i, last);
+            return ::min(i, last);
        }

        __device__ __forceinline__ int idx(int i) const
@@ -142,64 +131,52 @@ namespace cv { namespace gpu { namespace device
            return true;
        }

-    private:
        int last;
    };

-
-    template <typename D>
-    struct BrdRowReplicate: BrdReplicate
+    template <typename D> struct BrdRowReplicate : BrdReplicate
    {
        explicit BrdRowReplicate(int len): BrdReplicate(len) {}

-        template <typename T>
-        __device__ __forceinline__ D at_low(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const 
        {
            return saturate_cast<D>(data[idx_low(i)]);
        }

-        template <typename T>
-        __device__ __forceinline__ D at_high(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const 
        {
            return saturate_cast<D>(data[idx_high(i)]);
        }
    };


-    template <typename D>
-    struct BrdColReplicate: BrdReplicate
+    template <typename D> struct BrdColReplicate : BrdReplicate
    {
        BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}

-        template <typename T>
-        __device__ __forceinline__ D at_low(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const 
        {
            return saturate_cast<D>(*(const D*)((const char*)data + idx_low(i)*step));
        }

-        template <typename T>
-        __device__ __forceinline__ D at_high(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const 
        {
            return saturate_cast<D>(*(const D*)((const char*)data + idx_high(i)*step));
        }

-    private:
        int step;
    };

-    template <typename D>
-    struct BrdRowConstant
+    template <typename D> struct BrdRowConstant
    {
        explicit BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}

-        template <typename T>
-        __device__ __forceinline__ D at_low(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const 
        {
            return i >= 0 ? saturate_cast<D>(data[i]) : val;
        }

-        template <typename T>
-        __device__ __forceinline__ D at_high(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const 
        {
            return i < len ? saturate_cast<D>(data[i]) : val;
        }
@@ -209,24 +186,20 @@ namespace cv { namespace gpu { namespace device
            return true;
        }

-    private:
        int len;
        D val;
    };

-    template <typename D>
-    struct BrdColConstant
+    template <typename D> struct BrdColConstant
    {
        BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}

-        template <typename T>
-        __device__ __forceinline__ D at_low(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const 
        {
            return i >= 0 ? saturate_cast<D>(*(const D*)((const char*)data + i*step)) : val;
        }

-        template <typename T>
-        __device__ __forceinline__ D at_high(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const 
        {
            return i < len ? saturate_cast<D>(*(const D*)((const char*)data + i*step)) : val;
        }
@@ -236,15 +209,12 @@ namespace cv { namespace gpu { namespace device
            return true;
        }

-    private:
        int len;
        int step;
        D val;
    };

-
-    template <typename OutT>
-    struct BrdConstant
+    template <typename OutT> struct BrdConstant
    {
        BrdConstant(int w, int h, const OutT &val = VecTraits<OutT>::all(0)) : w(w), h(h), val(val) {}

@@ -255,11 +225,9 @@ namespace cv { namespace gpu { namespace device
            return val;
        }

-    private:
        int w, h;
        OutT val;
    };
-
 }}}

 #endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
--- a/modules/gpu/src/opencv2/gpu/device/color.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/color.hpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or bpied warranties, including, but not limited to, the bpied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_COLOR_HPP__
+#define __OPENCV_GPU_COLOR_HPP__
+
+#include "detail/color.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+    // All OPENCV_GPU_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macros implements
+    // template <typename T> class ColorSpace1_to_ColorSpace2_traits
+    // {
+    //     typedef ... functor_type;
+    //     static __host__ __device__ functor_type create_functor();
+    // };
+
+    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_bgra, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgba, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_bgr, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgb, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgba, 4, 4, 2)
+    
+#undef OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr555, 3, 0, 5)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr565, 3, 0, 6)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr555, 3, 2, 5)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr565, 3, 2, 6)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr555, 4, 0, 5)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr565, 4, 0, 6)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr555, 4, 2, 5)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr565, 4, 2, 6)
+
+#undef OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgb, 3, 2, 5)
+    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgb, 3, 2, 6)
+    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgr, 3, 0, 5)
+    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgr, 3, 0, 6)
+    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgba, 4, 2, 5)
+    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgba, 4, 2, 6)
+    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgra, 4, 0, 5)
+    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgra, 4, 0, 6)
+
+#undef OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgr, 3)
+    OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgra, 4)
+    
+#undef OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr555, 5)
+    OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr565, 6)
+
+#undef OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr555_to_gray, 5)
+    OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr565_to_gray, 6)
+
+#undef OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgb_to_gray, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgr_to_gray, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgba_to_gray, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgra_to_gray, 4, 0)
+
+#undef OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv, 3, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv4, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv4, 4, 4, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv, 3, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv4, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv4, 4, 4, 2)
+
+#undef OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgb, 3, 3, 0)
+    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgba, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgb, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgba, 4, 4, 0)
+    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgr, 3, 3, 2)
+    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgra, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgr, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgra, 4, 4, 2)
+
+#undef OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb, 3, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb4, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb4, 4, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb, 3, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb4, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb4, 4, 4, 0)
+
+#undef OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgb, 3, 3, 2)
+    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgba, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgb, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgba, 4, 4, 2)
+    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgr, 3, 3, 0)
+    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgra, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgr, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgra, 4, 4, 0)
+
+#undef OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz, 3, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz4, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz4, 4, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz, 3, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz4, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz4, 4, 4, 0)
+
+#undef OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgb, 3, 3, 2)
+    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgb, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgba, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgba, 4, 4, 2)
+    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgr, 3, 3, 0)
+    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgr, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgra, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgra, 4, 4, 0)
+
+#undef OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv, 3, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv4, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv4, 4, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv, 3, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv4, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv4, 4, 4, 0)
+
+#undef OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgb, 3, 3, 2)
+    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgba, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgb, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgba, 4, 4, 2)
+    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgr, 3, 3, 0)
+    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgra, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgr, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgra, 4, 4, 0)
+
+#undef OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls, 3, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls4, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls4, 4, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls, 3, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls4, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls4, 4, 4, 0)
+
+#undef OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgb, 3, 3, 2)
+    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgba, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgb, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgba, 4, 4, 2)
+    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgr, 3, 3, 0)
+    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgra, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgr, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)
+
+#undef OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS
+}}}
+
+#endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
--- a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp
@@ -44,6 +44,7 @@
 #define __OPENCV_GPU_DATAMOV_UTILS_HPP__

 #include "internal_shared.hpp"
+#include "utility.hpp"

 namespace cv { namespace gpu { namespace device
 {
@@ -55,49 +56,40 @@ namespace cv { namespace gpu { namespace device
            __device__ __forceinline__ static void Load(const T* ptr, int offset, T& val)  { val = ptr[offset];  }
        };
            
-    #else // __CUDA_ARCH__ >= 200
-
-        #if defined(_WIN64) || defined(__LP64__)		
-            // 64-bit register modifier for inlined asm
-            #define _OPENCV_ASM_PTR_ "l"
-        #else	
-            // 32-bit register modifier for inlined asm
-            #define _OPENCV_ASM_PTR_ "r"
-        #endif
+    #else // __CUDA_ARCH__ >= 200        

        template<class T> struct ForceGlob;

-        #define DEFINE_FORCE_GLOB(base_type, ptx_type, reg_mod)                                                   \
-        template <> struct ForceGlob<base_type>                                                                   \
-        {                                                                                                         \
-            __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val)         \
-            {                                                                                                     \
-                asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : _OPENCV_ASM_PTR_(ptr + offset));       \
-            }                                                                                                     \
-        };
-        #define DEFINE_FORCE_GLOB_B(base_type, ptx_type)                                                                          \
-        template <> struct ForceGlob<base_type>                                                                                   \
-        {                                                                                                                         \
-            __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val)                         \
-            {                                                                                                                     \
-                asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : _OPENCV_ASM_PTR_(ptr + offset));   \
-            }                                                                                                                     \
-        };
+        #define OPENCV_GPU_DEFINE_FORCE_GLOB(base_type, ptx_type, reg_mod) \
+            template <> struct ForceGlob<base_type> \
+            { \
+                __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
+                { \
+                    asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : OPENCV_GPU_ASM_PTR(ptr + offset)); \
+                } \
+            };
+
+        #define OPENCV_GPU_DEFINE_FORCE_GLOB_B(base_type, ptx_type) \
+            template <> struct ForceGlob<base_type> \
+            { \
+                __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
+                { \
+                    asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : OPENCV_GPU_ASM_PTR(ptr + offset)); \
+                } \
+            };
        
-        DEFINE_FORCE_GLOB_B(uchar,  u8)
-        DEFINE_FORCE_GLOB_B(schar,  s8)
-        DEFINE_FORCE_GLOB_B(char,   b8)
-        DEFINE_FORCE_GLOB  (ushort, u16, h)
-        DEFINE_FORCE_GLOB  (short,  s16, h)
-        DEFINE_FORCE_GLOB  (uint,   u32, r)
-        DEFINE_FORCE_GLOB  (int,    s32, r)	
-        DEFINE_FORCE_GLOB  (float,  f32, f)	
-        DEFINE_FORCE_GLOB  (double, f64, d)	
-            
+        OPENCV_GPU_DEFINE_FORCE_GLOB_B(uchar,  u8)
+        OPENCV_GPU_DEFINE_FORCE_GLOB_B(schar,  s8)
+        OPENCV_GPU_DEFINE_FORCE_GLOB_B(char,   b8)
+        OPENCV_GPU_DEFINE_FORCE_GLOB  (ushort, u16, h)
+        OPENCV_GPU_DEFINE_FORCE_GLOB  (short,  s16, h)
+        OPENCV_GPU_DEFINE_FORCE_GLOB  (uint,   u32, r)
+        OPENCV_GPU_DEFINE_FORCE_GLOB  (int,    s32, r)	
+        OPENCV_GPU_DEFINE_FORCE_GLOB  (float,  f32, f)	
+        OPENCV_GPU_DEFINE_FORCE_GLOB  (double, f64, d)	            

-    #undef DEFINE_FORCE_GLOB
-    #undef DEFINE_FORCE_GLOB_B
-    #undef _OPENCV_ASM_PTR_
+    #undef OPENCV_GPU_DEFINE_FORCE_GLOB
+    #undef OPENCV_GPU_DEFINE_FORCE_GLOB_B
        
    #endif // __CUDA_ARCH__ >= 200
 }}}

--- a/modules/gpu/src/opencv2/gpu/device/detail/color.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/detail/color.hpp
--- a/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp
--- a/modules/gpu/src/opencv2/gpu/device/functional.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/functional.hpp
--- a/modules/gpu/src/opencv2/gpu/device/limits_gpu.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/limits_gpu.hpp
@@ -45,7 +45,7 @@

 namespace cv { namespace gpu { namespace device
 {
-    template<class T> struct numeric_limits_gpu
+    template<class T> struct numeric_limits
    {
        typedef T type;
        __device__ __forceinline__ static type min()  { return type(); };
@@ -59,7 +59,7 @@ namespace cv { namespace gpu { namespace device
        static const bool is_signed;
    };

-    template<> struct numeric_limits_gpu<bool>
+    template<> struct numeric_limits<bool>
    {
        typedef bool type;
        __device__ __forceinline__ static type min() { return false; };
@@ -73,7 +73,7 @@ namespace cv { namespace gpu { namespace device
        static const bool is_signed = false;
    };

-    template<> struct numeric_limits_gpu<char>
+    template<> struct numeric_limits<char>
    {
        typedef char type;
        __device__ __forceinline__ static type min() { return CHAR_MIN; };
@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device
        static const bool is_signed = (char)-1 == -1;
    };

-     template<> struct numeric_limits_gpu<signed char>
+     template<> struct numeric_limits<signed char>
    {
        typedef char type;
        __device__ __forceinline__ static type min() { return CHAR_MIN; };
@@ -101,7 +101,7 @@ namespace cv { namespace gpu { namespace device
        static const bool is_signed = (signed char)-1 == -1;
    };

-    template<> struct numeric_limits_gpu<unsigned char>
+    template<> struct numeric_limits<unsigned char>
    {
        typedef unsigned char type;
        __device__ __forceinline__ static type min() { return 0; };
@@ -115,7 +115,7 @@ namespace cv { namespace gpu { namespace device
        static const bool is_signed = false;
    };

-    template<> struct numeric_limits_gpu<short>
+    template<> struct numeric_limits<short>
    {
        typedef short type;
        __device__ __forceinline__ static type min() { return SHRT_MIN; };
@@ -129,7 +129,7 @@ namespace cv { namespace gpu { namespace device
        static const bool is_signed = true;
    };

-    template<> struct numeric_limits_gpu<unsigned short>
+    template<> struct numeric_limits<unsigned short>
    {
        typedef unsigned short type;
        __device__ __forceinline__ static type min() { return 0; };
@@ -143,7 +143,7 @@ namespace cv { namespace gpu { namespace device
        static const bool is_signed = false;
    };

-    template<> struct numeric_limits_gpu<int>
+    template<> struct numeric_limits<int>
    {
        typedef int type;
        __device__ __forceinline__ static type min() { return INT_MIN; };
@@ -158,7 +158,7 @@ namespace cv { namespace gpu { namespace device
    };


-    template<> struct numeric_limits_gpu<unsigned int>
+    template<> struct numeric_limits<unsigned int>
    {
        typedef unsigned int type;
        __device__ __forceinline__ static type min() { return 0; };
@@ -172,7 +172,7 @@ namespace cv { namespace gpu { namespace device
        static const bool is_signed = false;
    };

-    template<> struct numeric_limits_gpu<long>
+    template<> struct numeric_limits<long>
    {
        typedef long type;
        __device__ __forceinline__ static type min() { return LONG_MIN; };
@@ -186,7 +186,7 @@ namespace cv { namespace gpu { namespace device
        static const bool is_signed = true;
    };

-    template<> struct numeric_limits_gpu<unsigned long>
+    template<> struct numeric_limits<unsigned long>
    {
        typedef unsigned long type;
        __device__ __forceinline__ static type min() { return 0; };
@@ -200,7 +200,7 @@ namespace cv { namespace gpu { namespace device
        static const bool is_signed = false;
    };

-    template<> struct numeric_limits_gpu<float>
+    template<> struct numeric_limits<float>
    {
        typedef float type;
        __device__ __forceinline__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; };
@@ -214,7 +214,7 @@ namespace cv { namespace gpu { namespace device
        static const bool is_signed = true;
    };

-    template<> struct numeric_limits_gpu<double>
+    template<> struct numeric_limits<double>
    {
        typedef double type;
        __device__ __forceinline__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; };

--- a/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp
--- a/modules/gpu/src/opencv2/gpu/device/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp
--- a/modules/gpu/src/opencv2/gpu/device/utility.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/utility.hpp
--- a/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
--- a/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_VEC_TRAITS_HPP__
+#define __OPENCV_GPU_VEC_TRAITS_HPP__
+
+#include "internal_shared.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+    template<typename T, int N> struct TypeVec;
+
+#define OPENCV_GPU_IMPLEMENT_TYPE_VEC(type) \
+    template<> struct TypeVec<type, 1> { typedef type vec_type; }; \
+    template<> struct TypeVec<type ## 1, 1> { typedef type ## 1 vec_type; }; \
+    template<> struct TypeVec<type, 2> { typedef type ## 2 vec_type; }; \
+    template<> struct TypeVec<type ## 2, 2> { typedef type ## 2 vec_type; }; \
+    template<> struct TypeVec<type, 3> { typedef type ## 3 vec_type; }; \
+    template<> struct TypeVec<type ## 3, 3> { typedef type ## 3 vec_type; }; \
+    template<> struct TypeVec<type, 4> { typedef type ## 4 vec_type; }; \
+    template<> struct TypeVec<type ## 4, 4> { typedef type ## 4 vec_type; };
+
+    OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar)
+    OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)
+    OPENCV_GPU_IMPLEMENT_TYPE_VEC(ushort)
+    OPENCV_GPU_IMPLEMENT_TYPE_VEC(short)
+    OPENCV_GPU_IMPLEMENT_TYPE_VEC(int)
+    OPENCV_GPU_IMPLEMENT_TYPE_VEC(uint)
+    OPENCV_GPU_IMPLEMENT_TYPE_VEC(float)
+    OPENCV_GPU_IMPLEMENT_TYPE_VEC(double)
+
+#undef OPENCV_GPU_IMPLEMENT_TYPE_VEC
+
+    template<> struct TypeVec<schar, 1> { typedef schar vec_type; };
+    template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };
+    template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };
+    template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };
+
+    template<> struct TypeVec<bool, 1> { typedef uchar vec_type; };
+    template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };
+    template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };
+    template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };
+
+    template<typename T> struct VecTraits;
+
+#define OPENCV_GPU_IMPLEMENT_VEC_TRAITS(type) \
+    template<> struct VecTraits<type> \
+    { \
+        typedef type elem_type; \
+        enum {cn=1}; \
+        static __device__ __host__ type all(type v) {return v;} \
+        static __device__ __host__ type make(type x) {return x;} \
+    }; \
+    template<> struct VecTraits<type ## 1> \
+    { \
+        typedef type elem_type; \
+        enum {cn=1}; \
+        static __device__ __host__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
+        static __device__ __host__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
+    }; \
+    template<> struct VecTraits<type ## 2> \
+    { \
+        typedef type elem_type; \
+        enum {cn=2}; \
+        static __device__ __host__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
+        static __device__ __host__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
+    }; \
+    template<> struct VecTraits<type ## 3> \
+    { \
+        typedef type elem_type; \
+        enum {cn=3}; \
+        static __device__ __host__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
+        static __device__ __host__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
+    }; \
+    template<> struct VecTraits<type ## 4> \
+    { \
+        typedef type elem_type; \
+        enum {cn=4}; \
+        static __device__ __host__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
+        static __device__ __host__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
+    };
+
+    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)
+    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(char)
+    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(ushort)
+    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(short)
+    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(int)
+    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uint)
+    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(float)
+    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(double)
+
+#undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS
+
+    template<> struct VecTraits<schar> 
+    { 
+        typedef schar elem_type; 
+        enum {cn=1}; 
+        static __device__ __host__ schar all(schar v) {return v;}
+        static __device__ __host__ schar make(schar x) {return x;}
+    };
+}}}
+
+#endif // __OPENCV_GPU_VEC_TRAITS_HPP__
--- a/modules/gpu/src/opencv2/gpu/device/vecmath.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vecmath.hpp
--- a/modules/gpu/src/surf.cpp
+++ b/modules/gpu/src/surf.cpp
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp