diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt
index 84dfb856c39c04aee10a2507214ba6b2ae9874ed..eaa47a0f3f9f52b0699d0c79bff860fb7b40816b 100644
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -23,7 +23,9 @@ source_group("Include" FILES ${lib_hdrs})
 
 #file(GLOB lib_device_hdrs "include/opencv2/${name}/device/*.h*")
 file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*")
+file(GLOB lib_device_hdrs_detail "src/opencv2/gpu/device/detail/*.h*") # headers under device/detail, globbed separately from lib_device_hdrs
 source_group("Device" FILES ${lib_device_hdrs})
+source_group("Device\\Detail" FILES ${lib_device_hdrs_detail}) # "Detail" source group nested under "Device"
 
 if (HAVE_CUDA)
     file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")	
@@ -83,7 +85,7 @@ foreach(d ${DEPS})
 	endif()
 endforeach()
 
-add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
+add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${lib_device_hdrs_detail} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
 
 # For dynamic link numbering convenions
 set_target_properties(${the_target} PROPERTIES
diff --git a/modules/gpu/src/color.cpp b/modules/gpu/src/color.cpp
index 4d9bd50167b4d7e7df1a453a80b8ce64b6ce1984..c539f3a6da1e27e24695f26113f822b9ebf8b066 100644
--- a/modules/gpu/src/color.cpp
+++ b/modules/gpu/src/color.cpp
@@ -53,410 +53,1376 @@ void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(
 
 namespace cv { namespace gpu {  namespace color  
 {
-    void RGB2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream);
-    void RGB2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream);
-    void RGB2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream);
+    #define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
+        void name(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    // _ONE (above) declares a single conversion routine; _ALL declares its _8u, _16u and _32f variants.
+    #define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \
+        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
+        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u) \
+        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
+    // _8U32F declares only the _8u and _32f variants, each additionally in a _full_ flavour (no _16u).
+    #define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \
+        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
+        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f) \
+        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \
+        OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f)
+
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb) // first group: conversions among 3- and 4-channel BGR/RGB layouts
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)
+    // 3/4-channel colour to packed BGR555/BGR565: one routine each, no per-depth suffix.
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)
+    // Packed BGR555/BGR565 back to 3/4-channel colour.
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)
+    // Gray to 3/4-channel BGR.
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra)
+    // Gray to packed BGR555/BGR565.
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)
+    // Packed BGR555/BGR565 to gray.
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)
+    // 3/4-channel colour to gray.
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray)
+    // RGB/BGR(A) to YUV; the "yuv4" names are the 4-channel-destination variants.
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)
+    // YUV to RGB/BGR(A); the "yuv4" names are the 4-channel-source variants.
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)
+    // RGB/BGR(A) to YCrCb; the "YCrCb4" names are the 4-channel-destination variants.
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)
+    // YCrCb to RGB/BGR(A); the "YCrCb4" names are the 4-channel-source variants.
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)
+    // RGB/BGR(A) to XYZ (naming follows the yuv/yuv4 pattern above).
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)
+    // XYZ to RGB/BGR(A).
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)
+    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)
+    // HSV and HLS use the _8U32F macro: _8u/_32f plus _full_8u/_full_32f, no _16u.
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4)
+    // HSV to RGB/BGR(A).
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra)
+    // RGB/BGR(A) to HLS.
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4)
+    // HLS to RGB/BGR(A).
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr)
+    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra)
+    // The helper macros are only needed for the declarations above, so they are undefined here.
+    #undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE
+    #undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL
+    #undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F
+}}}
 
-    void RGB5x52RGB_gpu(const DevMem2D& src, int green_bits, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream);
-    void RGB2RGB5x5_gpu(const DevMem2D& src, int srccn, const DevMem2D& dst, int green_bits, int bidx, cudaStream_t stream);
+namespace
+{
+    typedef void (*gpu_func_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream); // same signature as the routines declared via OPENCV_GPU_DECLARE_CVTCOLOR_ONE
 
-    void Gray2RGB_gpu_8u(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream);
-    void Gray2RGB_gpu_16u(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream);
-    void Gray2RGB_gpu_32f(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream);
-    void Gray2RGB5x5_gpu(const DevMem2D& src, const DevMem2D& dst, int green_bits, cudaStream_t stream);
+    void bgr_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgr_to_rgb_<depth>: 3-channel src, 3-channel dst of the same depth. The unnamed int argument is unused.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[] = {bgr_to_rgb_8u, 0, bgr_to_rgb_16u, 0, 0, bgr_to_rgb_32f};
+        // funcs is indexed by src.depth(); its null slots correspond to depths the assert below rejects.
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3);
 
-    void RGB2Gray_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream);
-    void RGB2Gray_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream);
-    void RGB2Gray_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream);
-    void RGB5x52Gray_gpu(const DevMem2D& src, int green_bits, const DevMem2D& dst, cudaStream_t stream);
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));        
 
-    void RGB2YCrCb_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream);
-    void RGB2YCrCb_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream);
-    void RGB2YCrCb_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream);
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
 
-    void YCrCb2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream);
-    void YCrCb2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream);
-    void YCrCb2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream);
+    void bgr_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgr_to_bgra_<depth>: 3-channel src, 4-channel dst of the same depth. The unnamed int argument is unused.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[] = {bgr_to_bgra_8u, 0, bgr_to_bgra_16u, 0, 0, bgr_to_bgra_32f};
+        // funcs is indexed by src.depth(); its null slots correspond to depths the assert below rejects.
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));        
+
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
 
-    void RGB2XYZ_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream);
-    void RGB2XYZ_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream);
-    void RGB2XYZ_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream);
+    void bgr_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgr_to_rgba_<depth>: 3-channel src, 4-channel dst of the same depth. The unnamed int argument is unused.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[] = {bgr_to_rgba_8u, 0, bgr_to_rgba_16u, 0, 0, bgr_to_rgba_32f};
+        // funcs is indexed by src.depth(); its null slots correspond to depths the assert below rejects.
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3);
 
-    void XYZ2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream);
-    void XYZ2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream);
-    void XYZ2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream);
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));        
 
-    void RGB2HSV_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);
-    void RGB2HSV_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
 
-    void HSV2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);
-    void HSV2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);
+    void bgra_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgra_to_bgr_<depth>: 4-channel src, 3-channel dst of the same depth. The unnamed int argument is unused.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[] = {bgra_to_bgr_8u, 0, bgra_to_bgr_16u, 0, 0, bgra_to_bgr_32f};
+        // funcs is indexed by src.depth(); its null slots correspond to depths the assert below rejects.
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 4);
 
-    void RGB2HLS_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);
-    void RGB2HLS_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));        
 
-    void HLS2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);
-    void HLS2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream);
-}}}
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
 
-namespace
-{
-    #undef R2Y
-    #undef G2Y
-    #undef B2Y
+    void bgra_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgra_to_rgb_<depth>: 4-channel src, 3-channel dst of the same depth. The unnamed int argument is unused.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[] = {bgra_to_rgb_8u, 0, bgra_to_rgb_16u, 0, 0, bgra_to_rgb_32f};
+        // funcs is indexed by src.depth(); its null slots correspond to depths the assert below rejects.
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));        
+
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgra_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgra_to_rgba_<depth>: 4-channel src, 4-channel dst of the same depth. The unnamed int argument is unused.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[] = {bgra_to_rgba_8u, 0, bgra_to_rgba_16u, 0, 0, bgra_to_rgba_32f};
+        // funcs is indexed by src.depth(); its null slots correspond to depths the assert below rejects.
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));        
+
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr_to_bgr555(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgr_to_bgr555: 8-bit 3-channel src, dst created as CV_8UC2. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 3);
+
+        dst.create(src.size(), CV_8UC2);        
+
+        color::bgr_to_bgr555(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgr_to_bgr565: 8-bit 3-channel src, dst created as CV_8UC2. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 3);
+
+        dst.create(src.size(), CV_8UC2);        
+
+        color::bgr_to_bgr565(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void rgb_to_bgr555(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::rgb_to_bgr555: 8-bit 3-channel src, dst created as CV_8UC2. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 3);
+
+        dst.create(src.size(), CV_8UC2);        
+
+        color::rgb_to_bgr555(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void rgb_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::rgb_to_bgr565: 8-bit 3-channel src, dst created as CV_8UC2. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 3);
+
+        dst.create(src.size(), CV_8UC2);        
+
+        color::rgb_to_bgr565(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgra_to_bgr555(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgra_to_bgr555: 8-bit 4-channel src, dst created as CV_8UC2. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 4);
+
+        dst.create(src.size(), CV_8UC2);        
+
+        color::bgra_to_bgr555(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgra_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgra_to_bgr565: 8-bit 4-channel src, dst created as CV_8UC2. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 4);
+
+        dst.create(src.size(), CV_8UC2);        
+
+        color::bgra_to_bgr565(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void rgba_to_bgr555(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::rgba_to_bgr555: 8-bit 4-channel src, dst created as CV_8UC2. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 4);
+
+        dst.create(src.size(), CV_8UC2);        
+
+        color::rgba_to_bgr555(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void rgba_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::rgba_to_bgr565: 8-bit 4-channel src, dst created as CV_8UC2. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 4);
+
+        dst.create(src.size(), CV_8UC2);        
+
+        color::rgba_to_bgr565(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr555_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgr555_to_rgb: 8-bit 2-channel src, dst created as CV_8UC3. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC3);        
+
+        color::bgr555_to_rgb(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr565_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgr565_to_rgb: 8-bit 2-channel src, dst created as CV_8UC3. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC3);        
+
+        color::bgr565_to_rgb(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr555_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgr555_to_bgr: 8-bit 2-channel src, dst created as CV_8UC3. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC3);        
+
+        color::bgr555_to_bgr(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr565_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgr565_to_bgr: 8-bit 2-channel src, dst created as CV_8UC3. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC3);        
+
+        color::bgr565_to_bgr(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr555_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgr555_to_rgba: 8-bit 2-channel src, dst created as CV_8UC4. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC4);        
+
+        color::bgr555_to_rgba(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr565_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgr565_to_rgba: 8-bit 2-channel src, dst created as CV_8UC4. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC4);        
+
+        color::bgr565_to_rgba(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr555_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgr555_to_bgra: 8-bit 2-channel src, dst created as CV_8UC4. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC4);        
+
+        color::bgr555_to_bgra(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr565_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgr565_to_bgra: 8-bit 2-channel src, dst created as CV_8UC4. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC4);        
+
+        color::bgr565_to_bgra(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void gray_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::gray_to_bgr_<depth>: 1-channel src, 3-channel dst of the same depth. The unnamed int argument is unused.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[] = {gray_to_bgr_8u, 0, gray_to_bgr_16u, 0, 0, gray_to_bgr_32f};
+        // funcs is indexed by src.depth(); its null slots correspond to depths the assert below rejects.
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 1);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));        
+
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void gray_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::gray_to_bgra_<depth>: 1-channel src, 4-channel dst of the same depth. The unnamed int argument is unused.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[] = {gray_to_bgra_8u, 0, gray_to_bgra_16u, 0, 0, gray_to_bgra_32f};
+        // funcs is indexed by src.depth(); its null slots correspond to depths the assert below rejects.
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 1);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));        
+
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void gray_to_bgr555(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::gray_to_bgr555: 8-bit 1-channel src, dst created as CV_8UC2. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 1);
+
+        dst.create(src.size(), CV_8UC2);        
+
+        color::gray_to_bgr555(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void gray_to_bgr565(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::gray_to_bgr565: 8-bit 1-channel src, dst created as CV_8UC2. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 1);
+
+        dst.create(src.size(), CV_8UC2);        
+
+        color::gray_to_bgr565(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr555_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgr555_to_gray: 8-bit 2-channel src, dst created as CV_8UC1. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC1);        
+
+        color::bgr555_to_gray(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr565_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgr565_to_gray: 8-bit 2-channel src, dst created as CV_8UC1. The unnamed int argument is unused.
+        CV_Assert(src.depth() == CV_8U);
+        CV_Assert(src.channels() == 2);
+
+        dst.create(src.size(), CV_8UC1);        
+
+        color::bgr565_to_gray(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void rgb_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::rgb_to_gray_<depth>: 3-channel src, 1-channel dst of the same depth. The unnamed int argument is unused.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[] = {rgb_to_gray_8u, 0, rgb_to_gray_16u, 0, 0, rgb_to_gray_32f};
+        // funcs is indexed by src.depth(); its null slots correspond to depths the assert below rejects.
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));        
+
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgr_to_gray_<depth>: 3-channel src, 1-channel dst of the same depth. The unnamed int argument is unused.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[] = {bgr_to_gray_8u, 0, bgr_to_gray_16u, 0, 0, bgr_to_gray_32f};
+        // funcs is indexed by src.depth(); its null slots correspond to depths the assert below rejects.
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));        
+
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void rgba_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::rgba_to_gray_<depth>: 4-channel src, 1-channel dst of the same depth. The unnamed int argument is unused.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[] = {rgba_to_gray_8u, 0, rgba_to_gray_16u, 0, 0, rgba_to_gray_32f};
+        // funcs is indexed by src.depth(); its null slots correspond to depths the assert below rejects.
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));        
+
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgra_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
+    { // Launches color::bgra_to_gray_<depth>: 4-channel src, 1-channel dst of the same depth. The unnamed int argument is unused.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[] = {bgra_to_gray_8u, 0, bgra_to_gray_16u, 0, 0, bgra_to_gray_32f};
+        // funcs is indexed by src.depth(); its null slots correspond to depths the assert below rejects.
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));        
+
+        funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
     
-    enum
-    {
-        yuv_shift  = 14,
-        xyz_shift  = 12,
-        R2Y        = 4899,
-        G2Y        = 9617,
-        B2Y        = 1868,
-        BLOCK_SIZE = 256
-    };
-}
+    void rgb_to_yuv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    { // Launches the color::rgb[a]_to_yuv[4]_<depth> routine matching src.channels() and dcn; dcn <= 0 defaults to 3.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = 
+        {
+            {
+                {rgb_to_yuv_8u, 0, rgb_to_yuv_16u, 0, 0, rgb_to_yuv_32f},
+                {rgba_to_yuv_8u, 0, rgba_to_yuv_16u, 0, 0, rgba_to_yuv_32f}
+            },
+            {
+                {rgb_to_yuv4_8u, 0, rgb_to_yuv4_16u, 0, 0, rgb_to_yuv4_32f},
+                {rgba_to_yuv4_8u, 0, rgba_to_yuv4_16u, 0, 0, rgba_to_yuv4_32f}
+            }
+        };
+        // Indexed as funcs[dcn == 4][src.channels() == 4][src.depth()]; null slots are depths the assert below rejects.
+        if (dcn <= 0) dcn = 3;
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
 
-namespace
-{
-    void cvtColor_caller(const GpuMat& src, GpuMat& dst, int code, int dcn, const cudaStream_t& stream) 
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr_to_yuv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    { // Launches the color::bgr[a]_to_yuv[4]_<depth> routine matching src.channels() and dcn; dcn <= 0 defaults to 3.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = 
+        {
+            {
+                {bgr_to_yuv_8u, 0, bgr_to_yuv_16u, 0, 0, bgr_to_yuv_32f},
+                {bgra_to_yuv_8u, 0, bgra_to_yuv_16u, 0, 0, bgra_to_yuv_32f}
+            },
+            {
+                {bgr_to_yuv4_8u, 0, bgr_to_yuv4_16u, 0, 0, bgr_to_yuv4_32f},
+                {bgra_to_yuv4_8u, 0, bgra_to_yuv4_16u, 0, 0, bgra_to_yuv4_32f}
+            }
+        };
+        // Indexed as funcs[dcn == 4][src.channels() == 4][src.depth()]; null slots are depths the assert below rejects.
+        if (dcn <= 0) dcn = 3;
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void yuv_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    { // Launches the color::yuv[4]_to_rgb[a]_<depth> routine matching src.channels() and dcn; dcn <= 0 defaults to 3.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = 
+        {
+            {
+                {yuv_to_rgb_8u, 0, yuv_to_rgb_16u, 0, 0, yuv_to_rgb_32f},
+                {yuv4_to_rgb_8u, 0, yuv4_to_rgb_16u, 0, 0, yuv4_to_rgb_32f}
+            },
+            {
+                {yuv_to_rgba_8u, 0, yuv_to_rgba_16u, 0, 0, yuv_to_rgba_32f},
+                {yuv4_to_rgba_8u, 0, yuv4_to_rgba_16u, 0, 0, yuv4_to_rgba_32f}
+            }
+        };
+        // Indexed as funcs[dcn == 4][src.channels() == 4][src.depth()]; null slots are depths the assert below rejects.
+        if (dcn <= 0) dcn = 3;
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void yuv_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    { // Launches the color::yuv[4]_to_bgr[a]_<depth> routine matching src.channels() and dcn; dcn <= 0 defaults to 3.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = 
+        {
+            {
+                {yuv_to_bgr_8u, 0, yuv_to_bgr_16u, 0, 0, yuv_to_bgr_32f},
+                {yuv4_to_bgr_8u, 0, yuv4_to_bgr_16u, 0, 0, yuv4_to_bgr_32f}
+            },
+            {
+                {yuv_to_bgra_8u, 0, yuv_to_bgra_16u, 0, 0, yuv_to_bgra_32f},
+                {yuv4_to_bgra_8u, 0, yuv4_to_bgra_16u, 0, 0, yuv4_to_bgra_32f}
+            }
+        };
+        // Indexed as funcs[dcn == 4][src.channels() == 4][src.depth()]; null slots are depths the assert below rejects.
+        if (dcn <= 0) dcn = 3;
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+    
+    void rgb_to_YCrCb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    { // Launches the color::rgb[a]_to_YCrCb[4]_<depth> routine matching src.channels() and dcn; dcn <= 0 defaults to 3.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = 
+        {
+            {
+                {rgb_to_YCrCb_8u, 0, rgb_to_YCrCb_16u, 0, 0, rgb_to_YCrCb_32f},
+                {rgba_to_YCrCb_8u, 0, rgba_to_YCrCb_16u, 0, 0, rgba_to_YCrCb_32f}
+            },
+            {
+                {rgb_to_YCrCb4_8u, 0, rgb_to_YCrCb4_16u, 0, 0, rgb_to_YCrCb4_32f},
+                {rgba_to_YCrCb4_8u, 0, rgba_to_YCrCb4_16u, 0, 0, rgba_to_YCrCb4_32f}
+            }
+        };
+        // Indexed as funcs[dcn == 4][src.channels() == 4][src.depth()]; null slots are depths the assert below rejects.
+        if (dcn <= 0) dcn = 3;
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr_to_YCrCb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    { // Launches the color::bgr[a]_to_YCrCb[4]_<depth> routine matching src.channels() and dcn; dcn <= 0 defaults to 3.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = 
+        {
+            {
+                {bgr_to_YCrCb_8u, 0, bgr_to_YCrCb_16u, 0, 0, bgr_to_YCrCb_32f},
+                {bgra_to_YCrCb_8u, 0, bgra_to_YCrCb_16u, 0, 0, bgra_to_YCrCb_32f}
+            },
+            {
+                {bgr_to_YCrCb4_8u, 0, bgr_to_YCrCb4_16u, 0, 0, bgr_to_YCrCb4_32f},
+                {bgra_to_YCrCb4_8u, 0, bgra_to_YCrCb4_16u, 0, 0, bgra_to_YCrCb4_32f}
+            }
+        };
+        // Indexed as funcs[dcn == 4][src.channels() == 4][src.depth()]; null slots are depths the assert below rejects.
+        if (dcn <= 0) dcn = 3;
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void YCrCb_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // Dispatch for YCrCb/YCrCb4 -> rgb/rgba: runs the table entry chosen by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {YCrCb_to_rgb_8u, 0, YCrCb_to_rgb_16u, 0, 0, YCrCb_to_rgb_32f},  // 3-channel src
+                {YCrCb4_to_rgb_8u, 0, YCrCb4_to_rgb_16u, 0, 0, YCrCb4_to_rgb_32f}  // 4-channel src
+            },
+            {   // dcn == 4
+                {YCrCb_to_rgba_8u, 0, YCrCb_to_rgba_16u, 0, 0, YCrCb_to_rgba_32f},  // 3-channel src
+                {YCrCb4_to_rgba_8u, 0, YCrCb4_to_rgba_16u, 0, 0, YCrCb4_to_rgba_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);  // other depths are rejected before the lookup
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void YCrCb_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // Dispatch for YCrCb/YCrCb4 -> bgr/bgra: runs the table entry chosen by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {YCrCb_to_bgr_8u, 0, YCrCb_to_bgr_16u, 0, 0, YCrCb_to_bgr_32f},  // 3-channel src
+                {YCrCb4_to_bgr_8u, 0, YCrCb4_to_bgr_16u, 0, 0, YCrCb4_to_bgr_32f}  // 4-channel src
+            },
+            {   // dcn == 4
+                {YCrCb_to_bgra_8u, 0, YCrCb_to_bgra_16u, 0, 0, YCrCb_to_bgra_32f},  // 3-channel src
+                {YCrCb4_to_bgra_8u, 0, YCrCb4_to_bgra_16u, 0, 0, YCrCb4_to_bgra_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);  // other depths are rejected before the lookup
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void rgb_to_xyz(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // Dispatch for rgb/rgba -> xyz/xyz4: runs the table entry chosen by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {rgb_to_xyz_8u, 0, rgb_to_xyz_16u, 0, 0, rgb_to_xyz_32f},  // 3-channel src
+                {rgba_to_xyz_8u, 0, rgba_to_xyz_16u, 0, 0, rgba_to_xyz_32f}  // 4-channel src
+            },
+            {   // dcn == 4
+                {rgb_to_xyz4_8u, 0, rgb_to_xyz4_16u, 0, 0, rgb_to_xyz4_32f},  // 3-channel src
+                {rgba_to_xyz4_8u, 0, rgba_to_xyz4_16u, 0, 0, rgba_to_xyz4_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);  // other depths are rejected before the lookup
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr_to_xyz(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // Dispatch for bgr/bgra -> xyz/xyz4: runs the table entry chosen by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {bgr_to_xyz_8u, 0, bgr_to_xyz_16u, 0, 0, bgr_to_xyz_32f},  // 3-channel src
+                {bgra_to_xyz_8u, 0, bgra_to_xyz_16u, 0, 0, bgra_to_xyz_32f}  // 4-channel src
+            },
+            {   // dcn == 4
+                {bgr_to_xyz4_8u, 0, bgr_to_xyz4_16u, 0, 0, bgr_to_xyz4_32f},  // 3-channel src
+                {bgra_to_xyz4_8u, 0, bgra_to_xyz4_16u, 0, 0, bgra_to_xyz4_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);  // other depths are rejected before the lookup
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void xyz_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // Dispatch for xyz/xyz4 -> rgb/rgba: runs the table entry chosen by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {xyz_to_rgb_8u, 0, xyz_to_rgb_16u, 0, 0, xyz_to_rgb_32f},  // 3-channel src
+                {xyz4_to_rgb_8u, 0, xyz4_to_rgb_16u, 0, 0, xyz4_to_rgb_32f}  // 4-channel src
+            },
+            {   // dcn == 4
+                {xyz_to_rgba_8u, 0, xyz_to_rgba_16u, 0, 0, xyz_to_rgba_32f},  // 3-channel src
+                {xyz4_to_rgba_8u, 0, xyz4_to_rgba_16u, 0, 0, xyz4_to_rgba_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);  // other depths are rejected before the lookup
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void xyz_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // Dispatch for xyz/xyz4 -> bgr/bgra: runs the table entry chosen by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {xyz_to_bgr_8u, 0, xyz_to_bgr_16u, 0, 0, xyz_to_bgr_32f},  // 3-channel src
+                {xyz4_to_bgr_8u, 0, xyz4_to_bgr_16u, 0, 0, xyz4_to_bgr_32f}  // 4-channel src
+            },
+            {   // dcn == 4
+                {xyz_to_bgra_8u, 0, xyz_to_bgra_16u, 0, 0, xyz_to_bgra_32f},  // 3-channel src
+                {xyz4_to_bgra_8u, 0, xyz4_to_bgra_16u, 0, 0, xyz4_to_bgra_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);  // other depths are rejected before the lookup
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void rgb_to_hsv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // Dispatch for rgb/rgba -> hsv/hsv4: runs the table entry chosen by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {rgb_to_hsv_8u, 0, 0, 0, 0, rgb_to_hsv_32f},  // 3-channel src
+                {rgba_to_hsv_8u, 0, 0, 0, 0, rgba_to_hsv_32f},  // 4-channel src
+            },
+            {   // dcn == 4
+                {rgb_to_hsv4_8u, 0, 0, 0, 0, rgb_to_hsv4_32f},  // 3-channel src
+                {rgba_to_hsv4_8u, 0, 0, 0, 0, rgba_to_hsv4_32f},  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // CV_16U is not accepted: the table has no _16u entries
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr_to_hsv(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // Dispatch for bgr/bgra -> hsv/hsv4: runs the table entry chosen by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {bgr_to_hsv_8u, 0, 0, 0, 0, bgr_to_hsv_32f},  // 3-channel src
+                {bgra_to_hsv_8u, 0, 0, 0, 0, bgra_to_hsv_32f}  // 4-channel src
+            },
+            {   // dcn == 4
+                {bgr_to_hsv4_8u, 0, 0, 0, 0, bgr_to_hsv4_32f},  // 3-channel src
+                {bgra_to_hsv4_8u, 0, 0, 0, 0, bgra_to_hsv4_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // CV_16U is not accepted: the table has no _16u entries
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void hsv_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // Dispatch for hsv/hsv4 -> rgb/rgba: runs the table entry chosen by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {hsv_to_rgb_8u, 0, 0, 0, 0, hsv_to_rgb_32f},  // 3-channel src
+                {hsv4_to_rgb_8u, 0, 0, 0, 0, hsv4_to_rgb_32f}  // 4-channel src
+            },
+            {   // dcn == 4
+                {hsv_to_rgba_8u, 0, 0, 0, 0, hsv_to_rgba_32f},  // 3-channel src
+                {hsv4_to_rgba_8u, 0, 0, 0, 0, hsv4_to_rgba_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // CV_16U is not accepted: the table has no _16u entries
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void hsv_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // Dispatch for hsv/hsv4 -> bgr/bgra: runs the table entry chosen by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {hsv_to_bgr_8u, 0, 0, 0, 0, hsv_to_bgr_32f},  // 3-channel src
+                {hsv4_to_bgr_8u, 0, 0, 0, 0, hsv4_to_bgr_32f}  // 4-channel src
+            },
+            {   // dcn == 4
+                {hsv_to_bgra_8u, 0, 0, 0, 0, hsv_to_bgra_32f},  // 3-channel src
+                {hsv4_to_bgra_8u, 0, 0, 0, 0, hsv4_to_bgra_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // CV_16U is not accepted: the table has no _16u entries
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }    
+
+    void rgb_to_hls(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // Dispatch for rgb/rgba -> hls/hls4: runs the table entry chosen by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {rgb_to_hls_8u, 0, 0, 0, 0, rgb_to_hls_32f},  // 3-channel src
+                {rgba_to_hls_8u, 0, 0, 0, 0, rgba_to_hls_32f},  // 4-channel src
+            },
+            {   // dcn == 4
+                {rgb_to_hls4_8u, 0, 0, 0, 0, rgb_to_hls4_32f},  // 3-channel src
+                {rgba_to_hls4_8u, 0, 0, 0, 0, rgba_to_hls4_32f},  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // CV_16U is not accepted: the table has no _16u entries
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr_to_hls(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // Dispatch for bgr/bgra -> hls/hls4: runs the table entry chosen by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {bgr_to_hls_8u, 0, 0, 0, 0, bgr_to_hls_32f},  // 3-channel src
+                {bgra_to_hls_8u, 0, 0, 0, 0, bgra_to_hls_32f}  // 4-channel src
+            },
+            {   // dcn == 4
+                {bgr_to_hls4_8u, 0, 0, 0, 0, bgr_to_hls4_32f},  // 3-channel src
+                {bgra_to_hls4_8u, 0, 0, 0, 0, bgra_to_hls4_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // CV_16U is not accepted: the table has no _16u entries
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void hls_to_rgb(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // Dispatch for hls/hls4 -> rgb/rgba: runs the table entry chosen by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {hls_to_rgb_8u, 0, 0, 0, 0, hls_to_rgb_32f},  // 3-channel src
+                {hls4_to_rgb_8u, 0, 0, 0, 0, hls4_to_rgb_32f}  // 4-channel src
+            },
+            {   // dcn == 4
+                {hls_to_rgba_8u, 0, 0, 0, 0, hls_to_rgba_32f},  // 3-channel src
+                {hls4_to_rgba_8u, 0, 0, 0, 0, hls4_to_rgba_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // CV_16U is not accepted: the table has no _16u entries
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void hls_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // Dispatch for hls/hls4 -> bgr/bgra: runs the table entry chosen by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {hls_to_bgr_8u, 0, 0, 0, 0, hls_to_bgr_32f},  // 3-channel src
+                {hls4_to_bgr_8u, 0, 0, 0, 0, hls4_to_bgr_32f}  // 4-channel src
+            },
+            {   // dcn == 4
+                {hls_to_bgra_8u, 0, 0, 0, 0, hls_to_bgra_32f},  // 3-channel src
+                {hls4_to_bgra_8u, 0, 0, 0, 0, hls4_to_bgra_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // CV_16U is not accepted: the table has no _16u entries
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }   
+
+    void rgb_to_hsv_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // Dispatch for rgb/rgba -> hsv/hsv4 (the *_full variants): runs the table entry chosen by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {rgb_to_hsv_full_8u, 0, 0, 0, 0, rgb_to_hsv_full_32f},  // 3-channel src
+                {rgba_to_hsv_full_8u, 0, 0, 0, 0, rgba_to_hsv_full_32f},  // 4-channel src
+            },
+            {   // dcn == 4
+                {rgb_to_hsv4_full_8u, 0, 0, 0, 0, rgb_to_hsv4_full_32f},  // 3-channel src
+                {rgba_to_hsv4_full_8u, 0, 0, 0, 0, rgba_to_hsv4_full_32f},  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // CV_16U is not accepted: the table has no _16u entries
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr_to_hsv_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // Dispatch for bgr/bgra -> hsv/hsv4 (the *_full variants): runs the table entry chosen by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {bgr_to_hsv_full_8u, 0, 0, 0, 0, bgr_to_hsv_full_32f},  // 3-channel src
+                {bgra_to_hsv_full_8u, 0, 0, 0, 0, bgra_to_hsv_full_32f}  // 4-channel src
+            },
+            {   // dcn == 4
+                {bgr_to_hsv4_full_8u, 0, 0, 0, 0, bgr_to_hsv4_full_32f},  // 3-channel src
+                {bgra_to_hsv4_full_8u, 0, 0, 0, 0, bgra_to_hsv4_full_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // CV_16U is not accepted: the table has no _16u entries
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void hsv_to_rgb_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // Dispatch for hsv/hsv4 -> rgb/rgba (the *_full variants): runs the table entry chosen by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {hsv_to_rgb_full_8u, 0, 0, 0, 0, hsv_to_rgb_full_32f},  // 3-channel src
+                {hsv4_to_rgb_full_8u, 0, 0, 0, 0, hsv4_to_rgb_full_32f}  // 4-channel src
+            },
+            {   // dcn == 4
+                {hsv_to_rgba_full_8u, 0, 0, 0, 0, hsv_to_rgba_full_32f},  // 3-channel src
+                {hsv4_to_rgba_full_8u, 0, 0, 0, 0, hsv4_to_rgba_full_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // CV_16U is not accepted: the table has no _16u entries
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void hsv_to_bgr_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {   // Dispatch for hsv/hsv4 -> bgr/bgra (the *_full variants): runs the table entry chosen by dcn, src.channels() and src.depth().
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {hsv_to_bgr_full_8u, 0, 0, 0, 0, hsv_to_bgr_full_32f},  // 3-channel src
+                {hsv4_to_bgr_full_8u, 0, 0, 0, 0, hsv4_to_bgr_full_32f}  // 4-channel src
+            },
+            {   // dcn == 4
+                {hsv_to_bgra_full_8u, 0, 0, 0, 0, hsv_to_bgra_full_32f},  // 3-channel src
+                {hsv4_to_bgra_full_8u, 0, 0, 0, 0, hsv4_to_bgra_full_32f}  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // CV_16U is not accepted: the table has no _16u entries
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }    
+
+    void rgb_to_hls_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
     {
-        Size sz = src.size();
-        int scn = src.channels(), depth = src.depth(), bidx;
+        using namespace cv::gpu::color; // this function dispatches rgb/rgba -> hls/hls4 (the *_full variants) through the table below
+        static const gpu_func_t funcs[2][2][6] = // indexed as [dcn == 4][src.channels() == 4][src.depth()]
+        {
+            {   // dcn == 3
+                {rgb_to_hls_full_8u, 0, 0, 0, 0, rgb_to_hls_full_32f},  // 3-channel src
+                {rgba_to_hls_full_8u, 0, 0, 0, 0, rgba_to_hls_full_32f},  // 4-channel src
+            },
+            {   // dcn == 4
+                {rgb_to_hls4_full_8u, 0, 0, 0, 0, rgb_to_hls4_full_32f},  // 3-channel src
+                {rgba_to_hls4_full_8u, 0, 0, 0, 0, rgba_to_hls4_full_32f},  // 4-channel src
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;  // a non-positive dcn selects the 3-channel destination
         
-        CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_32F);
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);  // CV_16U is not accepted: the table has no _16u entries
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        // dst takes src's size and depth, with dcn channels
 
-        switch (code)
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void bgr_to_hls_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] = 
         {
-            case CV_BGR2BGRA: case CV_RGB2BGRA: case CV_BGRA2BGR:
-            case CV_RGBA2BGR: case CV_RGB2BGR: case CV_BGRA2RGBA:                
-                {
-                    typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream);
-                    static const func_t funcs[] = {color::RGB2RGB_gpu_8u, 0, color::RGB2RGB_gpu_16u, 0, 0, color::RGB2RGB_gpu_32f};
-
-                    CV_Assert(scn == 3 || scn == 4);
-
-                    dcn = code == CV_BGR2BGRA || code == CV_RGB2BGRA || code == CV_BGRA2RGBA ? 4 : 3;
-                    bidx = code == CV_BGR2BGRA || code == CV_BGRA2BGR ? 0 : 2;
-                    
-                    dst.create(sz, CV_MAKETYPE(depth, dcn));
-
-                    funcs[depth](src, scn, dst, dcn, bidx, stream);
-                    break;
-                }
-                
-            case CV_BGR2BGR565: case CV_BGR2BGR555: case CV_RGB2BGR565: case CV_RGB2BGR555:
-            case CV_BGRA2BGR565: case CV_BGRA2BGR555: case CV_RGBA2BGR565: case CV_RGBA2BGR555:
-                {
-                    CV_Assert((scn == 3 || scn == 4) && depth == CV_8U);
-
-                    int green_bits = code == CV_BGR2BGR565 || code == CV_RGB2BGR565 
-                        || code == CV_BGRA2BGR565 || code == CV_RGBA2BGR565 ? 6 : 5;
-                    bidx = code == CV_BGR2BGR565 || code == CV_BGR2BGR555 
-                        || code == CV_BGRA2BGR565 || code == CV_BGRA2BGR555 ? 0 : 2;
-
-                    dst.create(sz, CV_8UC2);
-
-                    color::RGB2RGB5x5_gpu(src, scn, dst, green_bits, bidx, stream);
-                    break;
-                }
-            
-            case CV_BGR5652BGR: case CV_BGR5552BGR: case CV_BGR5652RGB: case CV_BGR5552RGB:
-            case CV_BGR5652BGRA: case CV_BGR5552BGRA: case CV_BGR5652RGBA: case CV_BGR5552RGBA:
-                {
-                    if (dcn <= 0) dcn = 3;
-
-                    CV_Assert((dcn == 3 || dcn == 4) && scn == 2 && depth == CV_8U);
-
-                    int green_bits = code == CV_BGR5652BGR || code == CV_BGR5652RGB 
-                        || code == CV_BGR5652BGRA || code == CV_BGR5652RGBA ? 6 : 5;
-                    bidx = code == CV_BGR5652BGR || code == CV_BGR5552BGR 
-                        || code == CV_BGR5652BGRA || code == CV_BGR5552BGRA ? 0 : 2;
-
-                    dst.create(sz, CV_MAKETYPE(depth, dcn));
-
-                    color::RGB5x52RGB_gpu(src, green_bits, dst, dcn, bidx, stream);
-                    break;
-                }
-                        
-            case CV_BGR2GRAY: case CV_BGRA2GRAY: case CV_RGB2GRAY: case CV_RGBA2GRAY:
-                {
-                    typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream);
-                    static const func_t funcs[] = {color::RGB2Gray_gpu_8u, 0, color::RGB2Gray_gpu_16u, 0, 0, color::RGB2Gray_gpu_32f};
-
-                    CV_Assert(scn == 3 || scn == 4);
-                    
-                    bidx = code == CV_BGR2GRAY || code == CV_BGRA2GRAY ? 0 : 2;
-
-                    dst.create(sz, CV_MAKETYPE(depth, 1));
-
-                    funcs[depth](src, scn, dst, bidx, stream);
-                    break;
-                }
-            
-            case CV_BGR5652GRAY: case CV_BGR5552GRAY:
-                {
-                    CV_Assert(scn == 2 && depth == CV_8U);
-
-                    int green_bits = code == CV_BGR5652GRAY ? 6 : 5;
-
-                    dst.create(sz, CV_8UC1);
-
-                    color::RGB5x52Gray_gpu(src, green_bits, dst, stream);
-                    break;
-                }
-            
-            case CV_GRAY2BGR: case CV_GRAY2BGRA:
-                {
-                    typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream);
-                    static const func_t funcs[] = {color::Gray2RGB_gpu_8u, 0, color::Gray2RGB_gpu_16u, 0, 0, color::Gray2RGB_gpu_32f};
-
-                    if (dcn <= 0) dcn = 3;
-
-                    CV_Assert(scn == 1 && (dcn == 3 || dcn == 4));
-
-                    dst.create(sz, CV_MAKETYPE(depth, dcn));
-
-                    funcs[depth](src, dst, dcn, stream);
-                    break;
-                }
-                
-            case CV_GRAY2BGR565: case CV_GRAY2BGR555:
-                {
-                    CV_Assert(scn == 1 && depth == CV_8U);
-
-                    int green_bits =  code == CV_GRAY2BGR565 ? 6 : 5;
-
-                    dst.create(sz, CV_8UC2);
-                    
-                    color::Gray2RGB5x5_gpu(src, dst, green_bits, stream);
-                    break;
-                }
-
-            case CV_BGR2YCrCb: case CV_RGB2YCrCb:
-            case CV_BGR2YUV: case CV_RGB2YUV:
-                {
-                    typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, 
-                        const void* coeffs, cudaStream_t stream);
-                    static const func_t funcs[] = {color::RGB2YCrCb_gpu_8u, 0, color::RGB2YCrCb_gpu_16u, 0, 0, color::RGB2YCrCb_gpu_32f};
-
-                    if (dcn <= 0) dcn = 3;
-                    CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4));
-
-                    bidx = code == CV_BGR2YCrCb || code == CV_RGB2YUV ? 0 : 2;
-
-                    static const float yuv_f[] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
-                    static const int yuv_i[] = { B2Y, G2Y, R2Y, 8061, 14369 };
-
-                    static const float YCrCb_f[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
-                    static const int YCrCb_i[] = {R2Y, G2Y, B2Y, 11682, 9241};
-
-                    float coeffs_f[5];
-                    int coeffs_i[5];
-                    ::memcpy(coeffs_f, code == CV_BGR2YCrCb || code == CV_RGB2YCrCb ? YCrCb_f : yuv_f, sizeof(yuv_f));
-                    ::memcpy(coeffs_i, code == CV_BGR2YCrCb || code == CV_RGB2YCrCb ? YCrCb_i : yuv_i, sizeof(yuv_i));
-
-                    if (bidx == 0) 
-                    {
-                        std::swap(coeffs_f[0], coeffs_f[2]);
-                        std::swap(coeffs_i[0], coeffs_i[2]);
-                    }
-                        
-                    dst.create(sz, CV_MAKETYPE(depth, dcn));
-
-                    const void* coeffs = depth == CV_32F ? (void*)coeffs_f : (void*)coeffs_i;
-
-                    funcs[depth](src, scn, dst, dcn, bidx, coeffs, stream);
-                    break;
-                }
-                
-            case CV_YCrCb2BGR: case CV_YCrCb2RGB:
-            case CV_YUV2BGR: case CV_YUV2RGB:
-                {
-                    typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, 
-                        const void* coeffs, cudaStream_t stream);
-                    static const func_t funcs[] = {color::YCrCb2RGB_gpu_8u, 0, color::YCrCb2RGB_gpu_16u, 0, 0, color::YCrCb2RGB_gpu_32f};
-
-                    if (dcn <= 0) dcn = 3;
-
-                    CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4));
-
-                    bidx = code == CV_YCrCb2BGR || code == CV_YUV2RGB ? 0 : 2;
-
-                    static const float yuv_f[] = { 2.032f, -0.395f, -0.581f, 1.140f };
-                    static const int yuv_i[] = { 33292, -6472, -9519, 18678 }; 
-
-                    static const float YCrCb_f[] = {1.403f, -0.714f, -0.344f, 1.773f};
-                    static const int YCrCb_i[] = {22987, -11698, -5636, 29049};
-
-                    const float* coeffs_f = code == CV_YCrCb2BGR || code == CV_YCrCb2RGB ? YCrCb_f : yuv_f;
-                    const int* coeffs_i = code == CV_YCrCb2BGR || code == CV_YCrCb2RGB ? YCrCb_i : yuv_i;
-                    
-                    dst.create(sz, CV_MAKETYPE(depth, dcn));
-                    
-                    const void* coeffs = depth == CV_32F ? (void*)coeffs_f : (void*)coeffs_i;
-
-                    funcs[depth](src, scn, dst, dcn, bidx, coeffs, stream);
-                    break;
-                }
-            
-            case CV_BGR2XYZ: case CV_RGB2XYZ:
-                {
-                    typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, 
-                        const void* coeffs, cudaStream_t stream);
-                    static const func_t funcs[] = {color::RGB2XYZ_gpu_8u, 0, color::RGB2XYZ_gpu_16u, 0, 0, color::RGB2XYZ_gpu_32f};
-
-                    if (dcn <= 0) dcn = 3;
-
-                    CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4));
-
-                    bidx = code == CV_BGR2XYZ ? 0 : 2;
-
-                    static const float RGB2XYZ_D65f[] =
-                    {
-                        0.412453f, 0.357580f, 0.180423f,
-                        0.212671f, 0.715160f, 0.072169f,
-                        0.019334f, 0.119193f, 0.950227f
-                    };
-                    static const int RGB2XYZ_D65i[] =
-                    {
-                        1689,    1465,    739,
-                        871,     2929,    296,
-                        79,      488,     3892
-                    };
-
-                    float coeffs_f[9];
-                    int coeffs_i[9];
-                    ::memcpy(coeffs_f, RGB2XYZ_D65f, sizeof(RGB2XYZ_D65f));
-                    ::memcpy(coeffs_i, RGB2XYZ_D65i, sizeof(RGB2XYZ_D65i));
-
-                    if (bidx == 0) 
-                    {
-                        std::swap(coeffs_f[0], coeffs_f[2]);
-                        std::swap(coeffs_f[3], coeffs_f[5]);
-                        std::swap(coeffs_f[6], coeffs_f[8]);
-                        
-                        std::swap(coeffs_i[0], coeffs_i[2]);
-                        std::swap(coeffs_i[3], coeffs_i[5]);
-                        std::swap(coeffs_i[6], coeffs_i[8]);
-                    }
-                        
-                    dst.create(sz, CV_MAKETYPE(depth, dcn));
-                    
-                    const void* coeffs = depth == CV_32F ? (void*)coeffs_f : (void*)coeffs_i;
-                    
-                    funcs[depth](src, scn, dst, dcn, coeffs, stream);
-                    break;
-                }
-            
-            case CV_XYZ2BGR: case CV_XYZ2RGB:
-                {
-                    typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream);
-                    static const func_t funcs[] = {color::XYZ2RGB_gpu_8u, 0, color::XYZ2RGB_gpu_16u, 0, 0, color::XYZ2RGB_gpu_32f};
-
-                    if (dcn <= 0) dcn = 3;
-
-                    CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4));
-
-                    bidx = code == CV_XYZ2BGR ? 0 : 2;
-
-                    static const float XYZ2sRGB_D65f[] =
-                    {
-                        3.240479f, -1.53715f, -0.498535f,
-                        -0.969256f, 1.875991f, 0.041556f,
-                        0.055648f, -0.204043f, 1.057311f
-                    };
-                    static const int XYZ2sRGB_D65i[] =
-                    {
-                        13273,  -6296,  -2042,
-                        -3970,   7684,    170,
-                          228,   -836,   4331
-                    };
-
-                    float coeffs_f[9];
-                    int coeffs_i[9];
-                    ::memcpy(coeffs_f, XYZ2sRGB_D65f, sizeof(XYZ2sRGB_D65f));
-                    ::memcpy(coeffs_i, XYZ2sRGB_D65i, sizeof(XYZ2sRGB_D65i));
-
-                    if (bidx == 0) 
-                    {
-                        std::swap(coeffs_f[0], coeffs_f[6]);
-                        std::swap(coeffs_f[1], coeffs_f[7]);
-                        std::swap(coeffs_f[2], coeffs_f[8]);
-                        
-                        std::swap(coeffs_i[0], coeffs_i[6]);
-                        std::swap(coeffs_i[1], coeffs_i[7]);
-                        std::swap(coeffs_i[2], coeffs_i[8]);
-                    }
-                        
-                    dst.create(sz, CV_MAKETYPE(depth, dcn));
-                    
-                    const void* coeffs = depth == CV_32F ? (void*)coeffs_f : (void*)coeffs_i;
-
-                    funcs[depth](src, scn, dst, dcn, coeffs, stream);
-                    break;
-                }
-
-            case CV_BGR2HSV: case CV_RGB2HSV: case CV_BGR2HSV_FULL: case CV_RGB2HSV_FULL:
-            case CV_BGR2HLS: case CV_RGB2HLS: case CV_BGR2HLS_FULL: case CV_RGB2HLS_FULL:
-                {
-                    typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, 
-                        int hrange, cudaStream_t stream);
-                    static const func_t funcs_hsv[] = {color::RGB2HSV_gpu_8u, 0, 0, 0, 0, color::RGB2HSV_gpu_32f};
-                    static const func_t funcs_hls[] = {color::RGB2HLS_gpu_8u, 0, 0, 0, 0, color::RGB2HLS_gpu_32f};
-
-                    if (dcn <= 0) dcn = 3;
-
-                    CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F));
-
-                    bidx = code == CV_BGR2HSV || code == CV_BGR2HLS ||
-                        code == CV_BGR2HSV_FULL || code == CV_BGR2HLS_FULL ? 0 : 2;
-                    int hrange = depth == CV_32F ? 360 : code == CV_BGR2HSV || code == CV_RGB2HSV ||
-                        code == CV_BGR2HLS || code == CV_RGB2HLS ? 180 : 256;
-                
-                    dst.create(sz, CV_MAKETYPE(depth, dcn));
-
-                    if (code == CV_BGR2HSV || code == CV_RGB2HSV || code == CV_BGR2HSV_FULL || code == CV_RGB2HSV_FULL) 
-                        funcs_hsv[depth](src, scn, dst, dcn, bidx, hrange, stream);
-                    else
-                        funcs_hls[depth](src, scn, dst, dcn, bidx, hrange, stream);
-                    break;
-                }
-
-            case CV_HSV2BGR: case CV_HSV2RGB: case CV_HSV2BGR_FULL: case CV_HSV2RGB_FULL:
-            case CV_HLS2BGR: case CV_HLS2RGB: case CV_HLS2BGR_FULL: case CV_HLS2RGB_FULL:
-                {
-                    typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, 
-                        int hrange, cudaStream_t stream);
-                    static const func_t funcs_hsv[] = {color::HSV2RGB_gpu_8u, 0, 0, 0, 0, color::HSV2RGB_gpu_32f};
-                    static const func_t funcs_hls[] = {color::HLS2RGB_gpu_8u, 0, 0, 0, 0, color::HLS2RGB_gpu_32f};
-
-                    if (dcn <= 0) dcn = 3;
-
-                    CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F));
-
-                    bidx = code == CV_HSV2BGR || code == CV_HLS2BGR ||
-                        code == CV_HSV2BGR_FULL || code == CV_HLS2BGR_FULL ? 0 : 2;
-                    int hrange = depth == CV_32F ? 360 : code == CV_HSV2BGR || code == CV_HSV2RGB ||
-                        code == CV_HLS2BGR || code == CV_HLS2RGB ? 180 : 255;
-                    
-                    dst.create(sz, CV_MAKETYPE(depth, dcn));
-
-                    if (code == CV_HSV2BGR || code == CV_HSV2RGB || code == CV_HSV2BGR_FULL || code == CV_HSV2RGB_FULL)
-                        funcs_hsv[depth](src, scn, dst, dcn, bidx, hrange, stream);
-                    else
-                        funcs_hls[depth](src, scn, dst, dcn, bidx, hrange, stream);
-                    break;
-                }
-
-            default:
-                CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
-        }
+            {
+                {bgr_to_hls_full_8u, 0, 0, 0, 0, bgr_to_hls_full_32f},
+                {bgra_to_hls_full_8u, 0, 0, 0, 0, bgra_to_hls_full_32f}
+            },
+            {
+                {bgr_to_hls4_full_8u, 0, 0, 0, 0, bgr_to_hls4_full_32f},
+                {bgra_to_hls4_full_8u, 0, 0, 0, 0, bgra_to_hls4_full_32f}
+            }
+        };
+
+        if (dcn <= 0) dcn = 3;
+        
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));        
+
+        funcs[dcn == 4][src.channels() == 4][src.depth()](src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void hls_to_rgb_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        // HLS -> RGB/RGBA through the *_full kernels. Lookup order is
+        // [dcn == 4][src.channels() == 4][src.depth()]; a zero slot means no kernel for that depth.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] =
+        {
+            {{hls_to_rgb_full_8u, 0, 0, 0, 0, hls_to_rgb_full_32f}, {hls4_to_rgb_full_8u, 0, 0, 0, 0, hls4_to_rgb_full_32f}},
+            {{hls_to_rgba_full_8u, 0, 0, 0, 0, hls_to_rgba_full_32f}, {hls4_to_rgba_full_8u, 0, 0, 0, 0, hls4_to_rgba_full_32f}}
+        };
+
+        // A non-positive dcn selects the default of three output channels.
+        if (dcn < 1)
+            dcn = 3;
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
+
+        // Pick the kernel once dst is allocated, then run it on the underlying stream.
+        const gpu_func_t convert = funcs[dcn == 4][src.channels() == 4][src.depth()];
+        convert(src, dst, StreamAccessor::getStream(stream));
+    }
+
+    void hls_to_bgr_full(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream)
+    {
+        // HLS -> BGR/BGRA through the *_full kernels. Lookup order is
+        // [dcn == 4][src.channels() == 4][src.depth()]; a zero slot means no kernel for that depth.
+        using namespace cv::gpu::color;
+        static const gpu_func_t funcs[2][2][6] =
+        {
+            {{hls_to_bgr_full_8u, 0, 0, 0, 0, hls_to_bgr_full_32f}, {hls4_to_bgr_full_8u, 0, 0, 0, 0, hls4_to_bgr_full_32f}},
+            {{hls_to_bgra_full_8u, 0, 0, 0, 0, hls_to_bgra_full_32f}, {hls4_to_bgra_full_8u, 0, 0, 0, 0, hls4_to_bgra_full_32f}}
+        };
+
+        // A non-positive dcn selects the default of three output channels.
+        if (dcn < 1)
+            dcn = 3;
+
+        CV_Assert(src.depth() == CV_8U || src.depth() == CV_32F);
+        CV_Assert(src.channels() == 3 || src.channels() == 4);
+        CV_Assert(dcn == 3 || dcn == 4);
+
+        dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
+
+        // Pick the kernel once dst is allocated, then run it on the underlying stream.
+        const gpu_func_t convert = funcs[dcn == 4][src.channels() == 4][src.depth()];
+        convert(src, dst, StreamAccessor::getStream(stream));
     }
 }
 
 void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream& stream)
 {
-    cvtColor_caller(src, dst, code, dcn, StreamAccessor::getStream(stream));
+    typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int dcn, Stream& stream);   // signature shared by every converter
+    static const func_t funcs[] =   // dispatch table indexed by conversion code; 0 = no GPU implementation
+    {
+        bgr_to_bgra,            // CV_BGR2BGRA    =0
+        bgra_to_bgr,            // CV_BGRA2BGR    =1
+        bgr_to_rgba,            // CV_BGR2RGBA    =2
+        bgra_to_rgb,            // CV_RGBA2BGR    =3
+        bgr_to_rgb,             // CV_BGR2RGB     =4
+        bgra_to_rgba,           // CV_BGRA2RGBA   =5
+
+        bgr_to_gray,            // CV_BGR2GRAY    =6
+        rgb_to_gray,            // CV_RGB2GRAY    =7
+        gray_to_bgr,            // CV_GRAY2BGR    =8
+        gray_to_bgra,           // CV_GRAY2BGRA   =9
+        bgra_to_gray,           // CV_BGRA2GRAY   =10
+        rgba_to_gray,           // CV_RGBA2GRAY   =11
+
+        bgr_to_bgr565,          // CV_BGR2BGR565  =12
+        rgb_to_bgr565,          // CV_RGB2BGR565  =13
+        bgr565_to_bgr,          // CV_BGR5652BGR  =14
+        bgr565_to_rgb,          // CV_BGR5652RGB  =15
+        bgra_to_bgr565,         // CV_BGRA2BGR565 =16
+        rgba_to_bgr565,         // CV_RGBA2BGR565 =17
+        bgr565_to_bgra,         // CV_BGR5652BGRA =18
+        bgr565_to_rgba,         // CV_BGR5652RGBA =19
+
+        gray_to_bgr565,         // CV_GRAY2BGR565 =20
+        bgr565_to_gray,         // CV_BGR5652GRAY =21
+
+        bgr_to_bgr555,          // CV_BGR2BGR555  =22
+        rgb_to_bgr555,          // CV_RGB2BGR555  =23
+        bgr555_to_bgr,          // CV_BGR5552BGR  =24
+        bgr555_to_rgb,          // CV_BGR5552RGB  =25
+        bgra_to_bgr555,         // CV_BGRA2BGR555 =26
+        rgba_to_bgr555,         // CV_RGBA2BGR555 =27
+        bgr555_to_bgra,         // CV_BGR5552BGRA =28
+        bgr555_to_rgba,         // CV_BGR5552RGBA =29
+
+        gray_to_bgr555,         // CV_GRAY2BGR555 =30
+        bgr555_to_gray,         // CV_BGR5552GRAY =31
+
+        bgr_to_xyz,             // CV_BGR2XYZ     =32
+        rgb_to_xyz,             // CV_RGB2XYZ     =33
+        xyz_to_bgr,             // CV_XYZ2BGR     =34
+        xyz_to_rgb,             // CV_XYZ2RGB     =35
+
+        bgr_to_YCrCb,           // CV_BGR2YCrCb   =36
+        rgb_to_YCrCb,           // CV_RGB2YCrCb   =37
+        YCrCb_to_bgr,           // CV_YCrCb2BGR   =38
+        YCrCb_to_rgb,           // CV_YCrCb2RGB   =39
+
+        bgr_to_hsv,             // CV_BGR2HSV     =40
+        rgb_to_hsv,             // CV_RGB2HSV     =41
+
+        0,                      //                =42
+        0,                      //                =43
+
+        0,                      // CV_BGR2Lab     =44
+        0,                      // CV_RGB2Lab     =45
+
+        0,                      // CV_BayerBG2BGR =46
+        0,                      // CV_BayerGB2BGR =47
+        0,                      // CV_BayerRG2BGR =48
+        0,                      // CV_BayerGR2BGR =49
+
+        0,                      // CV_BGR2Luv     =50
+        0,                      // CV_RGB2Luv     =51
+
+        bgr_to_hls,             // CV_BGR2HLS     =52
+        rgb_to_hls,             // CV_RGB2HLS     =53
+
+        hsv_to_bgr,             // CV_HSV2BGR     =54
+        hsv_to_rgb,             // CV_HSV2RGB     =55
+
+        0,                      // CV_Lab2BGR     =56
+        0,                      // CV_Lab2RGB     =57
+        0,                      // CV_Luv2BGR     =58
+        0,                      // CV_Luv2RGB     =59
+
+        hls_to_bgr,             // CV_HLS2BGR     =60
+        hls_to_rgb,             // CV_HLS2RGB     =61
+
+        0,                      // CV_BayerBG2BGR_VNG =62
+        0,                      // CV_BayerGB2BGR_VNG =63
+        0,                      // CV_BayerRG2BGR_VNG =64
+        0,                      // CV_BayerGR2BGR_VNG =65
+
+        bgr_to_hsv_full,        // CV_BGR2HSV_FULL = 66
+        rgb_to_hsv_full,        // CV_RGB2HSV_FULL = 67
+        bgr_to_hls_full,        // CV_BGR2HLS_FULL = 68
+        rgb_to_hls_full,        // CV_RGB2HLS_FULL = 69
+
+        hsv_to_bgr_full,        // CV_HSV2BGR_FULL = 70
+        hsv_to_rgb_full,        // CV_HSV2RGB_FULL = 71
+        hls_to_bgr_full,        // CV_HLS2BGR_FULL = 72
+        hls_to_rgb_full,        // CV_HLS2RGB_FULL = 73
+
+        0,                      // CV_LBGR2Lab     = 74
+        0,                      // CV_LRGB2Lab     = 75
+        0,                      // CV_LBGR2Luv     = 76
+        0,                      // CV_LRGB2Luv     = 77
+
+        0,                      // CV_Lab2LBGR     = 78
+        0,                      // CV_Lab2LRGB     = 79
+        0,                      // CV_Luv2LBGR     = 80
+        0,                      // CV_Luv2LRGB     = 81
+
+        bgr_to_yuv,             // CV_BGR2YUV      = 82
+        rgb_to_yuv,             // CV_RGB2YUV      = 83
+        yuv_to_bgr,             // CV_YUV2BGR      = 84
+        yuv_to_rgb,             // CV_YUV2RGB      = 85
+
+        0,                      // CV_BayerBG2GRAY = 86
+        0,                      // CV_BayerGB2GRAY = 87
+        0,                      // CV_BayerRG2GRAY = 88
+        0,                      // CV_BayerGR2GRAY = 89
+
+        0,                      // CV_YUV420i2RGB  = 90
+        0,                      // CV_YUV420i2BGR  = 91
+        0,                      // CV_YUV420sp2RGB = 92
+        0                       // CV_YUV420sp2BGR = 93
+    };
+    // Reject negative and too-large codes so the lookup below never leaves the table.
+    CV_Assert(code >= 0 && code < static_cast<int>(sizeof(funcs) / sizeof(funcs[0])));
+
+    func_t func = funcs[code];
+    // A null slot is a known code that has no GPU converter.
+    if (func == 0)
+        CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
+
+    func(src, dst, dcn, stream);
 }
 
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpu/src/cuda/brute_force_matcher.cu b/modules/gpu/src/cuda/brute_force_matcher.cu
index 27b8e530a4c9fc85aed1048ab2c6c5442f5e3c0b..17c5c802f7bdfffde3f25b18fae3c42dec7be52c 100644
--- a/modules/gpu/src/cuda/brute_force_matcher.cu
+++ b/modules/gpu/src/cuda/brute_force_matcher.cu
@@ -41,7 +41,7 @@
 //M*/
 
 #include "internal_shared.hpp"
-#include "opencv2/gpu/device/limits_gpu.hpp"
+#include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/datamov_utils.hpp"
 
 using namespace cv::gpu;
@@ -565,7 +565,7 @@ namespace cv { namespace gpu { namespace bfmatcher
         
         int myBestTrainIdx = -1;
         int myBestImgIdx = -1;
-        typename Dist::ResultType myMin = numeric_limits_gpu<typename Dist::ResultType>::max();
+        typename Dist::ResultType myMin = numeric_limits<typename Dist::ResultType>::max();
 
         {
             typename Dist::ResultType* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;
@@ -821,7 +821,7 @@ namespace cv { namespace gpu { namespace bfmatcher
         {
             const T* trainDescs = trainDescs_.ptr(trainIdx);
 
-            typename Dist::ResultType myDist = numeric_limits_gpu<typename Dist::ResultType>::max();
+            typename Dist::ResultType myDist = numeric_limits<typename Dist::ResultType>::max();
 
             if (mask(queryIdx, trainIdx))
             {
@@ -932,7 +932,7 @@ namespace cv { namespace gpu { namespace bfmatcher
     {
         const int tid = threadIdx.x;
         
-        T myMin = numeric_limits_gpu<T>::max();
+        T myMin = numeric_limits<T>::max();
         int myMinIdx = -1;
 
         for (int i = tid; i < n; i += BLOCK_SIZE)
@@ -1007,10 +1007,10 @@ namespace cv { namespace gpu { namespace bfmatcher
         if (threadIdx.x == 0)
         {
             float dist = sdist[0];
-            if (dist < numeric_limits_gpu<float>::max())
+            if (dist < numeric_limits<float>::max())
             {
                 int bestIdx = strainIdx[0];
-                allDist[bestIdx] = numeric_limits_gpu<float>::max();
+                allDist[bestIdx] = numeric_limits<float>::max();
                 trainIdx[i] = bestIdx;
                 distance[i] = dist;
             }
diff --git a/modules/gpu/src/cuda/color.cu b/modules/gpu/src/cuda/color.cu
index 9fcd52061926dbd223bcb29456021137f7da2db6..2755fffba8f53e002559c73b010459c2a541ec90 100644
--- a/modules/gpu/src/cuda/color.cu
+++ b/modules/gpu/src/cuda/color.cu
@@ -41,1305 +41,168 @@
 //M*/
 
 #include "internal_shared.hpp"
-#include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/vecmath.hpp"
-#include "opencv2/gpu/device/limits_gpu.hpp"
 #include "opencv2/gpu/device/transform.hpp"
+#include "opencv2/gpu/device/color.hpp"
 
 using namespace cv::gpu;
 using namespace cv::gpu::device;
 
-#ifndef CV_DESCALE
-#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
-#endif
-
 namespace cv { namespace gpu { namespace color
 {
-    template<typename T> struct ColorChannel
-    {
-        typedef float worktype_f;
-        static __device__ __forceinline__ T max() { return numeric_limits_gpu<T>::max(); }
-        static __device__ __forceinline__ T half() { return (T)(max()/2 + 1); }
-    };
-    template<> struct ColorChannel<float>
-    {
-        typedef float worktype_f;
-        static __device__ __forceinline__ float max() { return 1.f; }
-        static __device__ __forceinline__ float half() { return 0.5f; }
-    };
-
-    template <typename T>
-    __device__ __forceinline__ void setAlpha(typename TypeVec<T, 3>::vec_t& vec, T val)
-    {
-    }
-    template <typename T>
-    __device__ __forceinline__ void setAlpha(typename TypeVec<T, 4>::vec_t& vec, T val)
-    {
-        vec.w = val;
-    }
-    template <typename T>
-    __device__ __forceinline__ T getAlpha(const typename TypeVec<T, 3>::vec_t& vec)
-    {
-        return ColorChannel<T>::max();
-    }
-    template <typename T>
-    __device__ __forceinline__ T getAlpha(const typename TypeVec<T, 4>::vec_t& vec)
-    {
-        return vec.w;
-    }
-
-    template <typename Cvt>
-    void callConvert(const DevMem2D& src, const DevMem2D& dst, const Cvt& cvt, cudaStream_t stream)
-    {
-        typedef typename Cvt::src_t src_t;
-        typedef typename Cvt::dst_t dst_t;
-
-        transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, cvt, stream);
-    }
-
-////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
-
-    template <typename T, int SRCCN, int DSTCN>
-    struct RGB2RGB
-    {
-        typedef typename TypeVec<T, SRCCN>::vec_t src_t;
-        typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
-
-        explicit RGB2RGB(int bidx) : bidx(bidx) {}
-
-        __device__ __forceinline__ dst_t operator()(const src_t& src) const
-        {
-            dst_t dst;
-
-            dst.x = (&src.x)[bidx];
-            dst.y = src.y;
-            dst.z = (&src.x)[bidx ^ 2];
-            setAlpha(dst, getAlpha<T>(src));
-
-            return dst;
-        }
-
-    private:
-        int bidx;
-    };
-
-    template <typename T, int SRCCN, int DSTCN>
-    void RGB2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
-    {
-        RGB2RGB<T, SRCCN, DSTCN> cvt(bidx);
-        callConvert(src, dst, cvt, stream);
-    }
-
-    void RGB2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream)
-    {
-        typedef void (*RGB2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
-        static const RGB2RGB_caller_t RGB2RGB_callers[2][2] =
-        {
-            {RGB2RGB_caller<uchar, 3, 3>, RGB2RGB_caller<uchar, 3, 4>},
-            {RGB2RGB_caller<uchar, 4, 3>, RGB2RGB_caller<uchar, 4, 4>}
-        };
-
-        RGB2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
-    }
-
-    void RGB2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream)
-    {
-        typedef void (*RGB2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
-        static const RGB2RGB_caller_t RGB2RGB_callers[2][2] =
-        {
-            {RGB2RGB_caller<ushort, 3, 3>, RGB2RGB_caller<ushort, 3, 4>},
-            {RGB2RGB_caller<ushort, 4, 3>, RGB2RGB_caller<ushort, 4, 4>}
-        };
-
-        RGB2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
-    }
-
-    void RGB2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream)
-    {
-        typedef void (*RGB2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
-        static const RGB2RGB_caller_t RGB2RGB_callers[2][2] =
-        {
-            {RGB2RGB_caller<float, 3, 3>, RGB2RGB_caller<float, 3, 4>},
-            {RGB2RGB_caller<float, 4, 3>, RGB2RGB_caller<float, 4, 4>}
-        };
-
-        RGB2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
-    }
-
-/////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////
-
-    template <int GREEN_BITS> struct RGB5x52RGBConverter;
-    template <> struct RGB5x52RGBConverter<5>
-    {
-        template <typename D>
-        static __device__ __forceinline__ void cvt(uint src, D& dst, int bidx)
-        {
-            (&dst.x)[bidx] = (uchar)(src << 3);
-            dst.y = (uchar)((src >> 2) & ~7);
-            (&dst.x)[bidx ^ 2] = (uchar)((src >> 7) & ~7);
-            setAlpha(dst, (uchar)(src & 0x8000 ? 255 : 0));
-        }
-    };
-    template <> struct RGB5x52RGBConverter<6>
-    {
-        template <typename D>
-        static __device__ __forceinline__ void cvt(uint src, D& dst, int bidx)
-        {
-            (&dst.x)[bidx] = (uchar)(src << 3);
-            dst.y = (uchar)((src >> 3) & ~3);
-            (&dst.x)[bidx ^ 2] = (uchar)((src >> 8) & ~7);
-            setAlpha(dst, (uchar)(255));
-        }
-    };
-
-    template <int GREEN_BITS, int DSTCN> struct RGB5x52RGB
-    {
-        typedef ushort src_t;
-        typedef typename TypeVec<uchar, DSTCN>::vec_t dst_t;
-
-        explicit RGB5x52RGB(int bidx) : bidx(bidx) {}
-
-        __device__ __forceinline__ dst_t operator()(ushort src) const
-        {
-            dst_t dst;
-            RGB5x52RGBConverter<GREEN_BITS>::cvt((uint)src, dst, bidx);
-            return dst;
-        }
-
-    private:
-        int bidx;
-    };
-
-    template <int GREEN_BITS> struct RGB2RGB5x5Converter;
-    template<> struct RGB2RGB5x5Converter<6>
-    {
-        template <typename T>
-        static __device__ __forceinline__ ushort cvt(const T& src, int bidx)
-        {
-            return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~3) << 3) | (((&src.x)[bidx^2] & ~7) << 8));
-        }
-    };
-    template<> struct RGB2RGB5x5Converter<5>
-    {
-        static __device__ __forceinline__ ushort cvt(const uchar3& src, int bidx)
-        {
-            return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7));
-        }
-        static __device__ __forceinline__ ushort cvt(const uchar4& src, int bidx)
-        {
-            return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7) | (src.w ? 0x8000 : 0));
-        }
-    };
-
-    template<int SRCCN, int GREEN_BITS> struct RGB2RGB5x5
-    {
-        typedef typename TypeVec<uchar, SRCCN>::vec_t src_t;
-        typedef ushort dst_t;
-
-        explicit RGB2RGB5x5(int bidx) : bidx(bidx) {}
-
-        __device__ __forceinline__ ushort operator()(const src_t& src)
-        {
-            return RGB2RGB5x5Converter<GREEN_BITS>::cvt(src, bidx);
-        }
-
-    private:
-        int bidx;
-    };
-
-    template <int GREEN_BITS, int DSTCN>
-    void RGB5x52RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
-    {
-        RGB5x52RGB<GREEN_BITS, DSTCN> cvt(bidx);
-        callConvert(src, dst, cvt, stream);
-    }
-
-    void RGB5x52RGB_gpu(const DevMem2D& src, int green_bits, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream)
-    {
-        typedef void (*RGB5x52RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
-        static const RGB5x52RGB_caller_t RGB5x52RGB_callers[2][2] =
-        {
-            {RGB5x52RGB_caller<5, 3>, RGB5x52RGB_caller<5, 4>},
-            {RGB5x52RGB_caller<6, 3>, RGB5x52RGB_caller<6, 4>}
-        };
-
-        RGB5x52RGB_callers[green_bits - 5][dstcn - 3](src, dst, bidx, stream);
-    }
-
-    template <int SRCCN, int GREEN_BITS>
-    void RGB2RGB5x5_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
-    {
-        RGB2RGB5x5<SRCCN, GREEN_BITS> cvt(bidx);
-        callConvert(src, dst, cvt, stream);
-    }
-
-    void RGB2RGB5x5_gpu(const DevMem2D& src, int srccn, const DevMem2D& dst, int green_bits, int bidx, cudaStream_t stream)
-    {
-        typedef void (*RGB2RGB5x5_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
-        static const RGB2RGB5x5_caller_t RGB2RGB5x5_callers[2][2] =
-        {
-            {RGB2RGB5x5_caller<3, 5>, RGB2RGB5x5_caller<3, 6>},
-            {RGB2RGB5x5_caller<4, 5>, RGB2RGB5x5_caller<4, 6>}
-        };
-
-        RGB2RGB5x5_callers[srccn - 3][green_bits - 5](src, dst, bidx, stream);
-    }
-
-///////////////////////////////// Grayscale to Color ////////////////////////////////
-
-    template <int DSTCN, typename T> struct Gray2RGB
-    {
-        typedef T src_t;
-        typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
-
-        __device__ __forceinline__ dst_t operator()(const T& src) const
-        {
-            dst_t dst;
-
-            dst.z = dst.y = dst.x = src;
-            setAlpha(dst, ColorChannel<T>::max());
-
-            return dst;
-        }
-    };
-
-    template <int GREEN_BITS> struct Gray2RGB5x5Converter;
-    template<> struct Gray2RGB5x5Converter<6>
-    {
-        static __device__ __forceinline__ ushort cvt(uint t)
-        {
-            return (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
-        }
-    };
-    template<> struct Gray2RGB5x5Converter<5>
-    {
-        static __device__ __forceinline__ ushort cvt(uint t)
-        {
-            t >>= 3;
-            return (ushort)(t | (t << 5) | (t << 10));
-        }
-    };
-
-    template<int GREEN_BITS> struct Gray2RGB5x5
-    {
-        typedef uchar src_t;
-        typedef ushort dst_t;
-
-        __device__ __forceinline__ ushort operator()(uchar src) const
-        {
-            return Gray2RGB5x5Converter<GREEN_BITS>::cvt((uint)src);
-        }
-    };
-
-    template <typename T, int DSTCN>
-    void Gray2RGB_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
-    {
-        Gray2RGB<DSTCN, T> cvt;
-        callConvert(src, dst, cvt, stream);
-    }
-
-    void Gray2RGB_gpu_8u(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream)
-    {
-        typedef void (*Gray2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
-        static const Gray2RGB_caller_t Gray2RGB_callers[] = {Gray2RGB_caller<uchar, 3>, Gray2RGB_caller<uchar, 4>};
-
-        Gray2RGB_callers[dstcn - 3](src, dst, stream);
-    }
-
-    void Gray2RGB_gpu_16u(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream)
-    {
-        typedef void (*Gray2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
-        static const Gray2RGB_caller_t Gray2RGB_callers[] = {Gray2RGB_caller<ushort, 3>, Gray2RGB_caller<ushort, 4>};
-
-        Gray2RGB_callers[dstcn - 3](src, dst, stream);
-    }
-
-    void Gray2RGB_gpu_32f(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream)
-    {
-        typedef void (*Gray2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
-        static const Gray2RGB_caller_t Gray2RGB_callers[] = {Gray2RGB_caller<float, 3>, Gray2RGB_caller<float, 4>};
-
-        Gray2RGB_callers[dstcn - 3](src, dst, stream);
-    }
-
-    template <int GREEN_BITS>
-    void Gray2RGB5x5_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
-    {
-        Gray2RGB5x5<GREEN_BITS> cvt;
-        callConvert(src, dst, cvt, stream);
-    }
-
-    void Gray2RGB5x5_gpu(const DevMem2D& src, const DevMem2D& dst, int green_bits, cudaStream_t stream)
-    {
-        typedef void (*Gray2RGB5x5_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
-        static const Gray2RGB5x5_caller_t Gray2RGB5x5_callers[2] =
-        {
-            Gray2RGB5x5_caller<5>, Gray2RGB5x5_caller<6>
-        };
-
-        Gray2RGB5x5_callers[green_bits - 5](src, dst, stream);
-    }
-
-///////////////////////////////// Color to Grayscale ////////////////////////////////
-
-    #undef R2Y
-    #undef G2Y
-    #undef B2Y
-
-    enum
-    {
-        yuv_shift  = 14,
-        xyz_shift  = 12,
-        R2Y        = 4899,
-        G2Y        = 9617,
-        B2Y        = 1868,
-        BLOCK_SIZE = 256
-    };
-
-    template <int GREEN_BITS> struct RGB5x52GrayConverter;
-    template<> struct RGB5x52GrayConverter<6>
-    {
-        static __device__ __forceinline__ uchar cvt(uint t)
-        {
-            return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 3) & 0xfc) * G2Y + ((t >> 8) & 0xf8) * R2Y, yuv_shift);
-        }
-    };
-    template<> struct RGB5x52GrayConverter<5>
-    {
-        static __device__ __forceinline__ uchar cvt(uint t)
-        {
-            return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 2) & 0xf8) * G2Y + ((t >> 7) & 0xf8) * R2Y, yuv_shift);
-        }
-    };
-
-    template<int GREEN_BITS> struct RGB5x52Gray
-    {
-        typedef ushort src_t;
-        typedef uchar dst_t;
-
-        __device__ __forceinline__ uchar operator()(ushort src) const
-        {
-            return RGB5x52GrayConverter<GREEN_BITS>::cvt((uint)src);
-        }
-    };
-
-    template <typename T>
-    __device__ __forceinline__ T RGB2GrayConvert(const T* src, int bidx)
-    {
-        return (T)CV_DESCALE((unsigned)(src[bidx] * B2Y + src[1] * G2Y + src[bidx^2] * R2Y), yuv_shift);
-    }
-     __device__ __forceinline__ float RGB2GrayConvert(const float* src, int bidx)
-    {
-        const float cr = 0.299f;
-        const float cg = 0.587f;
-        const float cb = 0.114f;
-
-        return src[bidx] * cb + src[1] * cg + src[bidx^2] * cr;
-    }
-
-    template <int SRCCN, typename T> struct RGB2Gray
-    {
-        typedef typename TypeVec<T, SRCCN>::vec_t src_t;
-        typedef T dst_t;
-
-        explicit RGB2Gray(int bidx) : bidx(bidx) {}
-
-        __device__ __forceinline__ T operator()(const src_t& src)
-        {
-            return RGB2GrayConvert(&src.x, bidx);
-        }
-
-    private:
-        int bidx;
-    };
-
-    template <typename T, int SRCCN>
-    void RGB2Gray_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
-    {
-        RGB2Gray<SRCCN, T> cvt(bidx);
-        callConvert(src, dst, cvt, stream);
-    }
-
-    void RGB2Gray_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream)
-    {
-        typedef void (*RGB2Gray_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
-        RGB2Gray_caller_t RGB2Gray_callers[] = {RGB2Gray_caller<uchar, 3>, RGB2Gray_caller<uchar, 4>};
-
-        RGB2Gray_callers[srccn - 3](src, dst, bidx, stream);
-    }
-
-    void RGB2Gray_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream)
-    {
-        typedef void (*RGB2Gray_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
-        RGB2Gray_caller_t RGB2Gray_callers[] = {RGB2Gray_caller<ushort, 3>, RGB2Gray_caller<ushort, 4>};
-
-        RGB2Gray_callers[srccn - 3](src, dst, bidx, stream);
-    }
-
-    void RGB2Gray_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream)
-    {
-        typedef void (*RGB2Gray_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
-        RGB2Gray_caller_t RGB2Gray_callers[] = {RGB2Gray_caller<float, 3>, RGB2Gray_caller<float, 4>};
-
-        RGB2Gray_callers[srccn - 3](src, dst, bidx, stream);
-    }
-
-    template <int GREEN_BITS>
-    void RGB5x52Gray_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
-    {
-        RGB5x52Gray<GREEN_BITS> cvt;
-        callConvert(src, dst, cvt, stream);
-    }
-
-    void RGB5x52Gray_gpu(const DevMem2D& src, int green_bits, const DevMem2D& dst, cudaStream_t stream)
-    {
-        typedef void (*RGB5x52Gray_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
-        static const RGB5x52Gray_caller_t RGB5x52Gray_callers[2] =
-        {
-            RGB5x52Gray_caller<5>, RGB5x52Gray_caller<6>
-        };
-
-        RGB5x52Gray_callers[green_bits - 5](src, dst, stream);
-    }
-
-///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
-
-    __constant__ int cYCrCbCoeffs_i[5];
-    __constant__ float cYCrCbCoeffs_f[5];
-
-    template <typename T, typename D>
-    __device__ __forceinline__ void RGB2YCrCbConvert(const T* src, D& dst, int bidx)
-    {
-        const int delta = ColorChannel<T>::half() * (1 << yuv_shift);
-
-        const int Y = CV_DESCALE(src[0] * cYCrCbCoeffs_i[0] + src[1] * cYCrCbCoeffs_i[1] + src[2] * cYCrCbCoeffs_i[2], yuv_shift);
-        const int Cr = CV_DESCALE((src[bidx^2] - Y) * cYCrCbCoeffs_i[3] + delta, yuv_shift);
-        const int Cb = CV_DESCALE((src[bidx] - Y) * cYCrCbCoeffs_i[4] + delta, yuv_shift);
-
-        dst.x = saturate_cast<T>(Y);
-        dst.y = saturate_cast<T>(Cr);
-        dst.z = saturate_cast<T>(Cb);
-    }
-    template <typename D>
-    static __device__ __forceinline__ void RGB2YCrCbConvert(const float* src, D& dst, int bidx)
-    {
-        dst.x = src[0] * cYCrCbCoeffs_f[0] + src[1] * cYCrCbCoeffs_f[1] + src[2] * cYCrCbCoeffs_f[2];
-        dst.y = (src[bidx^2] - dst.x) * cYCrCbCoeffs_f[3] + ColorChannel<float>::half();
-        dst.z = (src[bidx] - dst.x) * cYCrCbCoeffs_f[4] + ColorChannel<float>::half();
-    }
-
-    template<typename T> struct RGB2YCrCbBase
-    {
-        typedef int coeff_t;
-
-        explicit RGB2YCrCbBase(const coeff_t coeffs[5])
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) );
-        }
-    };
-    template<> struct RGB2YCrCbBase<float>
-    {
-        typedef float coeff_t;
-
-        explicit RGB2YCrCbBase(const coeff_t coeffs[5])
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) );
-        }
-    };
-    template <int SRCCN, int DSTCN, typename T> struct RGB2YCrCb : RGB2YCrCbBase<T>
-    {
-        typedef typename RGB2YCrCbBase<T>::coeff_t coeff_t;
-        typedef typename TypeVec<T, SRCCN>::vec_t src_t;
-        typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
-
-        RGB2YCrCb(int bidx, const coeff_t coeffs[5]) : RGB2YCrCbBase<T>(coeffs), bidx(bidx) {}
-
-        __device__ __forceinline__ dst_t operator()(const src_t& src) const
-        {
-            dst_t dst;
-            RGB2YCrCbConvert(&src.x, dst, bidx);
-            return dst;
-        }
-
-    private:
-        int bidx;
-    };
-
-    template <typename T, typename D>
-    __device__ __forceinline__ void YCrCb2RGBConvert(const T& src, D* dst, int bidx)
-    {
-        const int b = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * cYCrCbCoeffs_i[3], yuv_shift);
-        const int g = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * cYCrCbCoeffs_i[2] + (src.y - ColorChannel<D>::half()) * cYCrCbCoeffs_i[1], yuv_shift);
-        const int r = src.x + CV_DESCALE((src.y - ColorChannel<D>::half()) * cYCrCbCoeffs_i[0], yuv_shift);
-
-        dst[bidx] = saturate_cast<D>(b);
-        dst[1] = saturate_cast<D>(g);
-        dst[bidx^2] = saturate_cast<D>(r);
-    }
-    template <typename T>
-    __device__ __forceinline__ void YCrCb2RGBConvert(const T& src, float* dst, int bidx)
-    {
-        dst[bidx] = src.x + (src.z - ColorChannel<float>::half()) * cYCrCbCoeffs_f[3];
-        dst[1] = src.x + (src.z - ColorChannel<float>::half()) * cYCrCbCoeffs_f[2] + (src.y - ColorChannel<float>::half()) * cYCrCbCoeffs_f[1];
-        dst[bidx^2] = src.x + (src.y - ColorChannel<float>::half()) * cYCrCbCoeffs_f[0];
-    }
-
-    template<typename T> struct YCrCb2RGBBase
-    {
-        typedef int coeff_t;
-
-        explicit YCrCb2RGBBase(const coeff_t coeffs[4])
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) );
-        }
-    };
-    template<> struct YCrCb2RGBBase<float>
-    {
-        typedef float coeff_t;
-
-        explicit YCrCb2RGBBase(const coeff_t coeffs[4])
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) );
-        }
-    };
-    template <int SRCCN, int DSTCN, typename T> struct YCrCb2RGB : YCrCb2RGBBase<T>
-    {
-        typedef typename YCrCb2RGBBase<T>::coeff_t coeff_t;
-        typedef typename TypeVec<T, SRCCN>::vec_t src_t;
-        typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
-
-        YCrCb2RGB(int bidx, const coeff_t coeffs[4]) : YCrCb2RGBBase<T>(coeffs), bidx(bidx) {}
-
-        __device__ __forceinline__ dst_t operator()(const src_t& src) const
-        {
-            dst_t dst;
-
-            YCrCb2RGBConvert(src, &dst.x, bidx);
-            setAlpha(dst, ColorChannel<T>::max());
-
-            return dst;
-        }
-
-    private:
-        int bidx;
-    };
-
-    template <typename T, int SRCCN, int DSTCN>
-    void RGB2YCrCb_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream)
-    {
-        typedef typename RGB2YCrCb<SRCCN, DSTCN, T>::coeff_t coeff_t;
-        RGB2YCrCb<SRCCN, DSTCN, T> cvt(bidx, (const coeff_t*)coeffs);
-        callConvert(src, dst, cvt, stream);
-    }
-
-    void RGB2YCrCb_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)
-    {
-        typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream);
-        static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2][2] =
-        {
-            {RGB2YCrCb_caller<uchar, 3, 3>, RGB2YCrCb_caller<uchar, 3, 4>},
-            {RGB2YCrCb_caller<uchar, 4, 3>, RGB2YCrCb_caller<uchar, 4, 4>}
-        };
-
-        RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream);
-    }
-
-    void RGB2YCrCb_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)
-    {
-        typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream);
-        static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2][2] =
-        {
-            {RGB2YCrCb_caller<ushort, 3, 3>, RGB2YCrCb_caller<ushort, 3, 4>},
-            {RGB2YCrCb_caller<ushort, 4, 3>, RGB2YCrCb_caller<ushort, 4, 4>}
-        };
-
-        RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream);
-    }
-
-    void RGB2YCrCb_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)
-    {
-        typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream);
-        static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2][2] =
-        {
-            {RGB2YCrCb_caller<float, 3, 3>, RGB2YCrCb_caller<float, 3, 4>},
-            {RGB2YCrCb_caller<float, 4, 3>, RGB2YCrCb_caller<float, 4, 4>}
-        };
-
-        RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream);
-    }
-
-    template <typename T, int SRCCN, int DSTCN>
-    void YCrCb2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream)
-    {
-        typedef typename YCrCb2RGB<SRCCN, DSTCN, T>::coeff_t coeff_t;
-        YCrCb2RGB<SRCCN, DSTCN, T> cvt(bidx, (const coeff_t*)coeffs);
-        callConvert(src, dst, cvt, stream);
-    }
-
-    void YCrCb2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)
-    {
-        typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream);
-        static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2][2] =
-        {
-            {YCrCb2RGB_caller<uchar, 3, 3>, YCrCb2RGB_caller<uchar, 3, 4>},
-            {YCrCb2RGB_caller<uchar, 4, 3>, YCrCb2RGB_caller<uchar, 4, 4>}
-        };
-
-        YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream);
-    }
-
-    void YCrCb2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)
-    {
-        typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream);
-        static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2][2] =
-        {
-            {YCrCb2RGB_caller<ushort, 3, 3>, YCrCb2RGB_caller<ushort, 3, 4>},
-            {YCrCb2RGB_caller<ushort, 4, 3>, YCrCb2RGB_caller<ushort, 4, 4>}
-        };
-
-        YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream);
-    }
-
-    void YCrCb2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)
-    {
-        typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream);
-        static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2][2] =
-        {
-            {YCrCb2RGB_caller<float, 3, 3>, YCrCb2RGB_caller<float, 3, 4>},
-            {YCrCb2RGB_caller<float, 4, 3>, YCrCb2RGB_caller<float, 4, 4>}
-        };
-
-        YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream);
-    }
-
-////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
-
-    __constant__ int cXYZ_D65i[9];
-    __constant__ float cXYZ_D65f[9];
-
-    template <typename T, typename D>
-    __device__ __forceinline__ void RGB2XYZConvert(const T* src, D& dst)
-    {
-        dst.x = saturate_cast<T>(CV_DESCALE(src[0] * cXYZ_D65i[0] + src[1] * cXYZ_D65i[1] + src[2] * cXYZ_D65i[2], xyz_shift));
-        dst.y = saturate_cast<T>(CV_DESCALE(src[0] * cXYZ_D65i[3] + src[1] * cXYZ_D65i[4] + src[2] * cXYZ_D65i[5], xyz_shift));
-        dst.z = saturate_cast<T>(CV_DESCALE(src[0] * cXYZ_D65i[6] + src[1] * cXYZ_D65i[7] + src[2] * cXYZ_D65i[8], xyz_shift));
-    }
-    template <typename D>
-    __device__ __forceinline__ void RGB2XYZConvert(const float* src, D& dst)
-    {
-        dst.x = src[0] * cXYZ_D65f[0] + src[1] * cXYZ_D65f[1] + src[2] * cXYZ_D65f[2];
-        dst.y = src[0] * cXYZ_D65f[3] + src[1] * cXYZ_D65f[4] + src[2] * cXYZ_D65f[5];
-        dst.z = src[0] * cXYZ_D65f[6] + src[1] * cXYZ_D65f[7] + src[2] * cXYZ_D65f[8];
-    }
-
-    template <typename T> struct RGB2XYZBase
-    {
-        typedef int coeff_t;
-
-        explicit RGB2XYZBase(const coeff_t coeffs[9])
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
-        }
-    };
-    template <> struct RGB2XYZBase<float>
-    {
-        typedef float coeff_t;
-
-        explicit RGB2XYZBase(const coeff_t coeffs[9])
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) );
-        }
-    };
-    template <int SRCCN, int DSTCN, typename T> struct RGB2XYZ : RGB2XYZBase<T>
-    {
-        typedef typename RGB2XYZBase<T>::coeff_t coeff_t;
-        typedef typename TypeVec<T, SRCCN>::vec_t src_t;
-        typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
-
-        explicit RGB2XYZ(const coeff_t coeffs[9]) : RGB2XYZBase<T>(coeffs) {}
-
-        __device__ __forceinline__ dst_t operator()(const src_t& src) const
-        {
-            dst_t dst;
-            RGB2XYZConvert(&src.x, dst);
-            return dst;
-        }
-    };
-
-    template <typename T, typename D>
-    __device__ __forceinline__ void XYZ2RGBConvert(const T& src, D* dst)
-    {
-        dst[0] = saturate_cast<D>(CV_DESCALE(src.x * cXYZ_D65i[0] + src.y * cXYZ_D65i[1] + src.z * cXYZ_D65i[2], xyz_shift));
-            dst[1] = saturate_cast<D>(CV_DESCALE(src.x * cXYZ_D65i[3] + src.y * cXYZ_D65i[4] + src.z * cXYZ_D65i[5], xyz_shift));
-            dst[2] = saturate_cast<D>(CV_DESCALE(src.x * cXYZ_D65i[6] + src.y * cXYZ_D65i[7] + src.z * cXYZ_D65i[8], xyz_shift));
-    }
-    template <typename T>
-    __device__ __forceinline__ void XYZ2RGBConvert(const T& src, float* dst)
-    {
-        dst[0] = src.x * cXYZ_D65f[0] + src.y * cXYZ_D65f[1] + src.z * cXYZ_D65f[2];
-            dst[1] = src.x * cXYZ_D65f[3] + src.y * cXYZ_D65f[4] + src.z * cXYZ_D65f[5];
-            dst[2] = src.x * cXYZ_D65f[6] + src.y * cXYZ_D65f[7] + src.z * cXYZ_D65f[8];
-    }
-
-    template <typename T> struct XYZ2RGBBase
-    {
-        typedef int coeff_t;
-
-        explicit XYZ2RGBBase(const coeff_t coeffs[9])
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
-        }
-    };
-    template <> struct XYZ2RGBBase<float>
-    {
-        typedef float coeff_t;
-
-        explicit XYZ2RGBBase(const coeff_t coeffs[9])
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) );
-        }
-    };
-    template <int SRCCN, int DSTCN, typename T> struct XYZ2RGB : XYZ2RGBBase<T>
-    {
-        typedef typename RGB2XYZBase<T>::coeff_t coeff_t;
-        typedef typename TypeVec<T, SRCCN>::vec_t src_t;
-        typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
-
-        explicit XYZ2RGB(const coeff_t coeffs[9]) : XYZ2RGBBase<T>(coeffs) {}
-
-        __device__ __forceinline__ dst_t operator()(const src_t& src) const
-        {
-            dst_t dst;
-            XYZ2RGBConvert(src, &dst.x);
-            setAlpha(dst, ColorChannel<T>::max());
-            return dst;
-        }
-    };
-
-    template <typename T, int SRCCN, int DSTCN>
-    void RGB2XYZ_caller(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream)
-    {
-        typedef typename RGB2XYZ<SRCCN, DSTCN, T>::coeff_t coeff_t;
-        RGB2XYZ<SRCCN, DSTCN, T> cvt((const coeff_t*)coeffs);
-        callConvert(src, dst, cvt, stream);
-    }
-
-    void RGB2XYZ_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)
-    {
-        typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream);
-        static const RGB2XYZ_caller_t RGB2XYZ_callers[2][2] =
-        {
-            {RGB2XYZ_caller<uchar, 3, 3>, RGB2XYZ_caller<uchar, 3, 4>},
-            {RGB2XYZ_caller<uchar, 4, 3>, RGB2XYZ_caller<uchar, 4, 4>}
-        };
-
-        RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, coeffs, stream);
-    }
-
-    void RGB2XYZ_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)
-    {
-        typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream);
-        static const RGB2XYZ_caller_t RGB2XYZ_callers[2][2] =
-        {
-            {RGB2XYZ_caller<ushort, 3, 3>, RGB2XYZ_caller<ushort, 3, 4>},
-            {RGB2XYZ_caller<ushort, 4, 3>, RGB2XYZ_caller<ushort, 4, 4>}
-        };
-
-        RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, coeffs, stream);
-    }
-
-    void RGB2XYZ_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)
-    {
-        typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream);
-        static const RGB2XYZ_caller_t RGB2XYZ_callers[2][2] =
-        {
-            {RGB2XYZ_caller<float, 3, 3>, RGB2XYZ_caller<float, 3, 4>},
-            {RGB2XYZ_caller<float, 4, 3>, RGB2XYZ_caller<float, 4, 4>}
-        };
-
-        RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, coeffs, stream);
-    }
-
-    template <typename T, int SRCCN, int DSTCN>
-    void XYZ2RGB_caller(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream)
-    {
-        typedef typename XYZ2RGB<SRCCN, DSTCN, T>::coeff_t coeff_t;
-        XYZ2RGB<SRCCN, DSTCN, T> cvt((const coeff_t*)coeffs);
-        callConvert(src, dst, cvt, stream);
-    }
-
-    void XYZ2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)
-    {
-        typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream);
-        static const XYZ2RGB_caller_t XYZ2RGB_callers[2][2] =
-        {
-            {XYZ2RGB_caller<uchar, 3, 3>, XYZ2RGB_caller<uchar, 3, 4>},
-            {XYZ2RGB_caller<uchar, 4, 3>, XYZ2RGB_caller<uchar, 4, 4>}
-        };
-
-        XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, coeffs, stream);
-    }
-
-    void XYZ2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)
-    {
-        typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream);
-        static const XYZ2RGB_caller_t XYZ2RGB_callers[2][2] =
-        {
-            {XYZ2RGB_caller<ushort, 3, 3>, XYZ2RGB_caller<ushort, 3, 4>},
-            {XYZ2RGB_caller<ushort, 4, 3>, XYZ2RGB_caller<ushort, 4, 4>}
-        };
-
-        XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, coeffs, stream);
-    }
-
-    void XYZ2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)
-    {
-        typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream);
-        static const XYZ2RGB_caller_t XYZ2RGB_callers[2][2] =
-        {
-            {XYZ2RGB_caller<float, 3, 3>, XYZ2RGB_caller<float, 3, 4>},
-            {XYZ2RGB_caller<float, 4, 3>, XYZ2RGB_caller<float, 4, 4>}
-        };
-
-        XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, coeffs, stream);
-    }
-
-////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
-
-    __constant__ int cHsvDivTable   [256] = {0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211, 130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632, 65280, 61440, 58027, 54973, 52224, 49737, 47476, 45412, 43520, 41779, 40172, 38684, 37303, 36017, 34816, 33693, 32640, 31651, 30720, 29842, 29013, 28229, 27486, 26782, 26112, 25475, 24869, 24290, 23738, 23211, 22706, 22223, 21760, 21316, 20890, 20480, 20086, 19707, 19342, 18991, 18651, 18324, 18008, 17703, 17408, 17123, 16846, 16579, 16320, 16069, 15825, 15589, 15360, 15137, 14921, 14711, 14507, 14308, 14115, 13926, 13743, 13565, 13391, 13221, 13056, 12895, 12738, 12584, 12434, 12288, 12145, 12006, 11869, 11736, 11605, 11478, 11353, 11231, 11111, 10995, 10880, 10768, 10658, 10550, 10445, 10341, 10240, 10141, 10043, 9947, 9854, 9761, 9671, 9582, 9495, 9410, 9326, 9243, 9162, 9082, 9004, 8927, 8852, 8777, 8704, 8632, 8561, 8492, 8423, 8356, 8290, 8224, 8160, 8097, 8034, 7973, 7913, 7853, 7795, 7737, 7680, 7624, 7569, 7514, 7461, 7408, 7355, 7304, 7253, 7203, 7154, 7105, 7057, 7010, 6963, 6917, 6872, 6827, 6782, 6739, 6695, 6653, 6611, 6569, 6528, 6487, 6447, 6408, 6369, 6330, 6292, 6254, 6217, 6180, 6144, 6108, 6073, 6037, 6003, 5968, 5935, 5901, 5868, 5835, 5803, 5771, 5739, 5708, 5677, 5646, 5615, 5585, 5556, 5526, 5497, 5468, 5440, 5412, 5384, 5356, 5329, 5302, 5275, 5249, 5222, 5196, 5171, 5145, 5120, 5095, 5070, 5046, 5022, 4998, 4974, 4950, 4927, 4904, 4881, 4858, 4836, 4813, 4791, 4769, 4748, 4726, 4705, 4684, 4663, 4642, 4622, 4601, 4581, 4561, 4541, 4522, 4502, 4483, 4464, 4445, 4426, 4407, 4389, 4370, 4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229, 4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096};
-    __constant__ int cHsvDivTable180[256] = {0, 122880, 61440, 40960, 30720, 24576, 20480, 17554, 15360, 13653, 12288, 11171, 10240, 9452, 8777, 8192, 7680, 7228, 6827, 6467, 6144, 5851, 5585, 5343, 5120, 4915, 4726, 4551, 4389, 4237, 4096, 3964, 3840, 3724, 3614, 3511, 3413, 3321, 3234, 3151, 3072, 2997, 2926, 2858, 2793, 2731, 2671, 2614, 2560, 2508, 2458, 2409, 2363, 2318, 2276, 2234, 2194, 2156, 2119, 2083, 2048, 2014, 1982, 1950, 1920, 1890, 1862, 1834, 1807, 1781, 1755, 1731, 1707, 1683, 1661, 1638, 1617, 1596, 1575, 1555, 1536, 1517, 1499, 1480, 1463, 1446, 1429, 1412, 1396, 1381, 1365, 1350, 1336, 1321, 1307, 1293, 1280, 1267, 1254, 1241, 1229, 1217, 1205, 1193, 1182, 1170, 1159, 1148, 1138, 1127, 1117, 1107, 1097, 1087, 1078, 1069, 1059, 1050, 1041, 1033, 1024, 1016, 1007, 999, 991, 983, 975, 968, 960, 953, 945, 938, 931, 924, 917, 910, 904, 897, 890, 884, 878, 871, 865, 859, 853, 847, 842, 836, 830, 825, 819, 814, 808, 803, 798, 793, 788, 783, 778, 773, 768, 763, 759, 754, 749, 745, 740, 736, 731, 727, 723, 719, 714, 710, 706, 702, 698, 694, 690, 686, 683, 679, 675, 671, 668, 664, 661, 657, 654, 650, 647, 643, 640, 637, 633, 630, 627, 624, 621, 617, 614, 611, 608, 605, 602, 599, 597, 594, 591, 588, 585, 582, 580, 577, 574, 572, 569, 566, 564, 561, 559, 556, 554, 551, 549, 546, 544, 541, 539, 537, 534, 532, 530, 527, 525, 523, 521, 518, 516, 514, 512, 510, 508, 506, 504, 502, 500, 497, 495, 493, 492, 490, 488, 486, 484, 482};
-    __constant__ int cHsvDivTable256[256] = {0, 174763, 87381, 58254, 43691, 34953, 29127, 24966, 21845, 19418, 17476, 15888, 14564, 13443, 12483, 11651, 10923, 10280, 9709, 9198, 8738, 8322, 7944, 7598, 7282, 6991, 6722, 6473, 6242, 6026, 5825, 5638, 5461, 5296, 5140, 4993, 4855, 4723, 4599, 4481, 4369, 4263, 4161, 4064, 3972, 3884, 3799, 3718, 3641, 3567, 3495, 3427, 3361, 3297, 3236, 3178, 3121, 3066, 3013, 2962, 2913, 2865, 2819, 2774, 2731, 2689, 2648, 2608, 2570, 2533, 2497, 2461, 2427, 2394, 2362, 2330, 2300, 2270, 2241, 2212, 2185, 2158, 2131, 2106, 2081, 2056, 2032, 2009, 1986, 1964, 1942, 1920, 1900, 1879, 1859, 1840, 1820, 1802, 1783, 1765, 1748, 1730, 1713, 1697, 1680, 1664, 1649, 1633, 1618, 1603, 1589, 1574, 1560, 1547, 1533, 1520, 1507, 1494, 1481, 1469, 1456, 1444, 1432, 1421, 1409, 1398, 1387, 1376, 1365, 1355, 1344, 1334, 1324, 1314, 1304, 1295, 1285, 1276, 1266, 1257, 1248, 1239, 1231, 1222, 1214, 1205, 1197, 1189, 1181, 1173, 1165, 1157, 1150, 1142, 1135, 1128, 1120, 1113, 1106, 1099, 1092, 1085, 1079, 1072, 1066, 1059, 1053, 1046, 1040, 1034, 1028, 1022, 1016, 1010, 1004, 999, 993, 987, 982, 976, 971, 966, 960, 955, 950, 945, 940, 935, 930, 925, 920, 915, 910, 906, 901, 896, 892, 887, 883, 878, 874, 869, 865, 861, 857, 853, 848, 844, 840, 836, 832, 828, 824, 820, 817, 813, 809, 805, 802, 798, 794, 791, 787, 784, 780, 777, 773, 770, 767, 763, 760, 757, 753, 750, 747, 744, 741, 737, 734, 731, 728, 725, 722, 719, 716, 713, 710, 708, 705, 702, 699, 696, 694, 691, 688, 685};
-
-    template <typename D> __device__ void RGB2HSVConvert(const uchar* src, D& dst, int bidx, int hr)
-    {
-        const int hsv_shift = 12;
-        const int* hdiv_table = hr == 180 ? cHsvDivTable180 : cHsvDivTable256;
-
-        int b = src[bidx], g = src[1], r = src[bidx^2];
-        int h, s, v = b;
-        int vmin = b, diff;
-        int vr, vg;
-
-        v = max(v, g);
-        v = max(v, r);
-        vmin = min(vmin, g);
-        vmin = min(vmin, r);
-
-        diff = v - vmin;
-        vr = v == r ? -1 : 0;
-        vg = v == g ? -1 : 0;
-
-        s = (diff * cHsvDivTable[v] + (1 << (hsv_shift-1))) >> hsv_shift;
-        h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
-        h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
-        h += h < 0 ? hr : 0;
-
-        dst.x = saturate_cast<uchar>(h);
-        dst.y = (uchar)s;
-        dst.z = (uchar)v;
-    }
-    template<typename D> __device__ void RGB2HSVConvert(const float* src, D& dst, int bidx, int hr)
-    {
-        const float hscale = hr * (1.f / 360.f);
-
-        float b = src[bidx], g = src[1], r = src[bidx^2];
-        float h, s, v;
-
-        float vmin, diff;
-
-        v = vmin = r;
-        v = fmax(v, g);
-        v = fmax(v, b);
-        vmin = fmin(vmin, g);
-        vmin = fmin(vmin, b);
-
-        diff = v - vmin;
-        s = diff / (float)(fabs(v) + numeric_limits_gpu<float>::epsilon());
-        diff = (float)(60. / (diff + numeric_limits_gpu<float>::epsilon()));
-
-        if (v == r)
-            h = (g - b) * diff;
-        else if (v == g)
-            h = (b - r) * diff + 120.f;
-        else
-            h = (r - g) * diff + 240.f;
-
-        if (h < 0) h += 360.f;
-
-        dst.x = h * hscale;
-        dst.y = s;
-        dst.z = v;
-    }
-
-    template <int SRCCN, int DSTCN, typename T> struct RGB2HSV
-    {
-        typedef typename TypeVec<T, SRCCN>::vec_t src_t;
-        typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
-
-        RGB2HSV(int bidx, int hr) : bidx(bidx), hr(hr) {}
-
-        __device__ __forceinline__ dst_t operator()(const src_t& src) const
-        {
-            dst_t dst;
-            RGB2HSVConvert(&src.x, dst, bidx, hr);
-            return dst;
-        }
-
-    private:
-        int bidx;
-        int hr;
-    };
-
-    __constant__ int cHsvSectorData[6][3] =
-    {
-        {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}
-    };
-
-    template <typename T> __device__ void HSV2RGBConvert(const T& src, float* dst, int bidx, int hr)
-    {
-        const float hscale = 6.f / hr;
-
-        float h = src.x, s = src.y, v = src.z;
-        float b, g, r;
-
-        if( s == 0 )
-            b = g = r = v;
-        else
-        {
-            float tab[4];
-            int sector;
-            h *= hscale;
-            if( h < 0 )
-                do h += 6; while( h < 0 );
-            else if( h >= 6 )
-                do h -= 6; while( h >= 6 );
-            sector = __float2int_rd(h);
-            h -= sector;
-
-            tab[0] = v;
-            tab[1] = v*(1.f - s);
-            tab[2] = v*(1.f - s*h);
-            tab[3] = v*(1.f - s*(1.f - h));
-
-            b = tab[cHsvSectorData[sector][0]];
-            g = tab[cHsvSectorData[sector][1]];
-            r = tab[cHsvSectorData[sector][2]];
-        }
-
-        dst[bidx] = b;
-        dst[1] = g;
-        dst[bidx^2] = r;
-    }
-    template <typename T> __device__ void HSV2RGBConvert(const T& src, uchar* dst, int bidx, int hr)
-    {
-        float3 buf;
-
-        buf.x = src.x;
-        buf.y = src.y * (1.f/255.f);
-        buf.z = src.z * (1.f/255.f);
-
-        HSV2RGBConvert(buf, &buf.x, bidx, hr);
-
-        dst[0] = saturate_cast<uchar>(buf.x * 255.f);
-        dst[1] = saturate_cast<uchar>(buf.y * 255.f);
-        dst[2] = saturate_cast<uchar>(buf.z * 255.f);
-    }
-
-    template <int SRCCN, int DSTCN, typename T> struct HSV2RGB
-    {
-        typedef typename TypeVec<T, SRCCN>::vec_t src_t;
-        typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
-
-        HSV2RGB(int bidx, int hr) : bidx(bidx), hr(hr) {}
-
-        __device__ __forceinline__ dst_t operator()(const src_t& src) const
-        {
-            dst_t dst;
-            HSV2RGBConvert(src, &dst.x, bidx, hr);
-            setAlpha(dst, ColorChannel<T>::max());
-            return dst;
-        }
-
-    private:
-        int bidx;
-        int hr;
-    };
-
-    template <typename T, int SRCCN, int DSTCN>
-    void RGB2HSV_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)
-    {
-        RGB2HSV<SRCCN, DSTCN, T> cvt(bidx, hrange);
-        callConvert(src, dst, cvt, stream);
-    }
-
-    void RGB2HSV_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)
-    {
-        typedef void (*RGB2HSV_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream);
-        static const RGB2HSV_caller_t RGB2HSV_callers[2][2] =
-        {
-            {RGB2HSV_caller<uchar, 3, 3>, RGB2HSV_caller<uchar, 3, 4>},
-            {RGB2HSV_caller<uchar, 4, 3>, RGB2HSV_caller<uchar, 4, 4>}
-        };
-
-        RGB2HSV_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
-    }
-
-    void RGB2HSV_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)
-    {
-        typedef void (*RGB2HSV_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream);
-        static const RGB2HSV_caller_t RGB2HSV_callers[2][2] =
-        {
-            {RGB2HSV_caller<float, 3, 3>, RGB2HSV_caller<float, 3, 4>},
-            {RGB2HSV_caller<float, 4, 3>, RGB2HSV_caller<float, 4, 4>}
-        };
-
-        RGB2HSV_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
-    }
-
-    template <typename T, int SRCCN, int DSTCN>
-    void HSV2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)
-    {
-        HSV2RGB<SRCCN, DSTCN, T> cvt(bidx, hrange);
-        callConvert(src, dst, cvt, stream);
-    }
-
-    void HSV2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)
-    {
-        typedef void (*HSV2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream);
-        static const HSV2RGB_caller_t HSV2RGB_callers[2][2] =
-        {
-            {HSV2RGB_caller<uchar, 3, 3>, HSV2RGB_caller<uchar, 3, 4>},
-            {HSV2RGB_caller<uchar, 4, 3>, HSV2RGB_caller<uchar, 4, 4>}
-        };
-
-        HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
-    }
-
-    void HSV2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)
-    {
-        typedef void (*HSV2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream);
-        static const HSV2RGB_caller_t HSV2RGB_callers[2][2] =
-        {
-            {HSV2RGB_caller<float, 3, 3>, HSV2RGB_caller<float, 3, 4>},
-            {HSV2RGB_caller<float, 4, 3>, HSV2RGB_caller<float, 4, 4>}
-        };
-
-        HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
-    }
-
-/////////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
-
-    template <typename D> __device__ void RGB2HLSConvert(const float* src, D& dst, int bidx, int hr)
-    {
-        const float hscale = hr * (1.f / 360.f);
-
-        float b = src[bidx], g = src[1], r = src[bidx^2];
-        float h = 0.f, s = 0.f, l;
-        float vmin, vmax, diff;
-
-        vmax = vmin = r;
-        vmax = fmax(vmax, g);
-        vmax = fmax(vmax, b);
-        vmin = fmin(vmin, g);
-        vmin = fmin(vmin, b);
-
-        diff = vmax - vmin;
-        l = (vmax + vmin) * 0.5f;
-
-        if (diff > numeric_limits_gpu<float>::epsilon())
-        {
-            s = l < 0.5f ? diff / (vmax + vmin) : diff / (2.0f - vmax - vmin);
-            diff = 60.f / diff;
-
-            if (vmax == r)
-                h = (g - b)*diff;
-            else if (vmax == g)
-                h = (b - r)*diff + 120.f;
-            else
-                h = (r - g)*diff + 240.f;
-
-            if (h < 0.f) h += 360.f;
-        }
-
-        dst.x = h * hscale;
-        dst.y = l;
-        dst.z = s;
-    }
-    template <typename D> __device__ void RGB2HLSConvert(const uchar* src, D& dst, int bidx, int hr)
-    {
-        float3 buf;
-
-        buf.x = src[0]*(1.f/255.f);
-        buf.y = src[1]*(1.f/255.f);
-        buf.z = src[2]*(1.f/255.f);
-
-        RGB2HLSConvert(&buf.x, buf, bidx, hr);
-
-        dst.x = saturate_cast<uchar>(buf.x);
-        dst.y = saturate_cast<uchar>(buf.y*255.f);
-        dst.z = saturate_cast<uchar>(buf.z*255.f);
-    }
-
-    template <int SRCCN, int DSTCN, typename T> struct RGB2HLS
-    {
-        typedef typename TypeVec<T, SRCCN>::vec_t src_t;
-        typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
-
-        RGB2HLS(int bidx, int hr) : bidx(bidx), hr(hr) {}
-
-        __device__ __forceinline__ dst_t operator()(const src_t& src) const
-        {
-            dst_t dst;
-            RGB2HLSConvert(&src.x, dst, bidx, hr);
-            return dst;
-        }
-
-    private:
-        int bidx;
-        int hr;
-    };
-
-    __constant__ int cHlsSectorData[6][3] =
-    {
-        {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}
-    };
-
-    template <typename T> __device__ void HLS2RGBConvert(const T& src, float* dst, int bidx, int hr)
-    {
-        const float hscale = 6.0f / hr;
-
-        float h = src.x, l = src.y, s = src.z;
-        float b, g, r;
-
-        if (s == 0)
-            b = g = r = l;
-        else
-        {
-            float tab[4];
-            int sector;
-
-            float p2 = l <= 0.5f ? l * (1 + s) : l + s - l * s;
-            float p1 = 2 * l - p2;
-
-            h *= hscale;
-
-            if( h < 0 )
-                do h += 6; while( h < 0 );
-            else if( h >= 6 )
-                do h -= 6; while( h >= 6 );
-
-            sector = __float2int_rd(h);
-            h -= sector;
-
-            tab[0] = p2;
-            tab[1] = p1;
-            tab[2] = p1 + (p2 - p1) * (1 - h);
-            tab[3] = p1 + (p2 - p1) * h;
-
-            b = tab[cHlsSectorData[sector][0]];
-            g = tab[cHlsSectorData[sector][1]];
-            r = tab[cHlsSectorData[sector][2]];
-        }
-
-        dst[bidx] = b;
-        dst[1] = g;
-        dst[bidx^2] = r;
-    }
-    template <typename T> __device__ void HLS2RGBConvert(const T& src, uchar* dst, int bidx, int hr)
-    {
-        float3 buf;
-
-        buf.x = src.x;
-        buf.y = src.y*(1.f/255.f);
-        buf.z = src.z*(1.f/255.f);
-
-        HLS2RGBConvert(buf, &buf.x, bidx, hr);
-
-        dst[0] = saturate_cast<uchar>(buf.x*255.f);
-        dst[1] = saturate_cast<uchar>(buf.y*255.f);
-        dst[2] = saturate_cast<uchar>(buf.z*255.f);
-    }
-
-    template <int SRCCN, int DSTCN, typename T> struct HLS2RGB
-    {
-        typedef typename TypeVec<T, SRCCN>::vec_t src_t;
-        typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
-
-        HLS2RGB(int bidx, int hr) : bidx(bidx), hr(hr) {}
-
-        __device__ __forceinline__ dst_t operator()(const src_t& src) const
-        {
-            dst_t dst;
-            HLS2RGBConvert(src, &dst.x, bidx, hr);
-            setAlpha(dst, ColorChannel<T>::max());
-            return dst;
-        }
-
-    private:
-        int bidx;
-        int hr;
-    };
-
-    template <typename T, int SRCCN, int DSTCN>
-    void RGB2HLS_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)
-    {
-        RGB2HLS<SRCCN, DSTCN, T> cvt(bidx, hrange);
-        callConvert(src, dst, cvt, stream);
-    }
-
-    void RGB2HLS_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)
-    {
-        typedef void (*RGB2HLS_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream);
-        static const RGB2HLS_caller_t RGB2HLS_callers[2][2] =
-        {
-            {RGB2HLS_caller<uchar, 3, 3>, RGB2HLS_caller<uchar, 3, 4>},
-            {RGB2HLS_caller<uchar, 4, 3>, RGB2HLS_caller<uchar, 4, 4>}
-        };
-
-        RGB2HLS_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
-    }
-
-    void RGB2HLS_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)
-    {
-        typedef void (*RGB2HLS_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream);
-        static const RGB2HLS_caller_t RGB2HLS_callers[2][2] =
-        {
-            {RGB2HLS_caller<float, 3, 3>, RGB2HLS_caller<float, 3, 4>},
-            {RGB2HLS_caller<float, 4, 3>, RGB2HLS_caller<float, 4, 4>}
-        };
-
-        RGB2HLS_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
-    }
-
-
-    template <typename T, int SRCCN, int DSTCN>
-    void HLS2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)
-    {
-        HLS2RGB<SRCCN, DSTCN, T> cvt(bidx, hrange);
-        callConvert(src, dst, cvt, stream);
-    }
-
-    void HLS2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)
-    {
-        typedef void (*HLS2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream);
-        static const HLS2RGB_caller_t HLS2RGB_callers[2][2] =
-        {
-            {HLS2RGB_caller<uchar, 3, 3>, HLS2RGB_caller<uchar, 3, 4>},
-            {HLS2RGB_caller<uchar, 4, 3>, HLS2RGB_caller<uchar, 4, 4>}
-        };
-
-        HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
-    }
-
-    void HLS2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)
-    {
-        typedef void (*HLS2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream);
-        static const HLS2RGB_caller_t HLS2RGB_callers[2][2] =
-        {
-            {HLS2RGB_caller<float, 3, 3>, HLS2RGB_caller<float, 3, 4>},
-            {HLS2RGB_caller<float, 4, 3>, HLS2RGB_caller<float, 4, 4>}
-        };
-
-        HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
-    }
+    #define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) /* defines function `name`; `traits` must be a concrete type providing functor_type and create_functor() */ \
+        void name(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream) \
+        { \
+            traits::functor_type functor = traits::create_functor(); /* conversion functor supplied by the traits */ \
+            typedef traits::functor_type::argument_type src_t; /* no `typename`: this function is not a template, */ \
+            typedef traits::functor_type::result_type   dst_t; /* and pre-C++11 forbids `typename` outside templates */ \
+            transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, functor, stream); /* cast src/dst to element-typed DevMem2D_ and apply the functor */ \
+        }
+
+    #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) /* one entry point called `name`, configured by the type `name_traits` */ \
+        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)
+
+    #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) /* three entry points: name_8u, name_16u, name_32f */ \
+        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) /* name_traits instantiated for uchar */ \
+        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) /* name_traits instantiated for ushort */ \
+        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) /* name_traits instantiated for float */
+
+    #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) /* four entry points: name_8u, name_32f, name_full_8u, name_full_32f (no ushort variant) */ \
+        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) /* name_traits instantiated for uchar */ \
+        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) /* name_traits instantiated for float */ \
+        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) /* uses the separate name_full_traits type */ \
+        OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>) /* uses the separate name_full_traits type */
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb) // _ALL: generates the _8u, _16u and _32f entry points for the given name
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555) // _ONE: generates a single entry point with exactly this name
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv) // _8U32F: generates the _8u, _32f, _full_8u and _full_32f entry points
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4)
+
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra)
+
+    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR // the generator macros are dropped once every entry point above has been emitted
+    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
+    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
+    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
 }}}
diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu
index 84345547ed62f1d406eb38f823b1a820c134dba3..69020dd22fe05e0ecc3e3448fed126e4e0667667 100644
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -40,9 +40,10 @@
 //
 //M*/
 
-#include "opencv2/gpu/device/vecmath.hpp"
+#include "opencv2/gpu/device/functional.hpp"
+#include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/transform.hpp"
-#include "opencv2/gpu/device/limits_gpu.hpp"
+#include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "internal_shared.hpp"
 
@@ -354,114 +355,11 @@ namespace cv { namespace gpu { namespace mathfunc
 
     //////////////////////////////////////////////////////////////////////////
     // min/max
-
-    struct MinOp
-    {        
-        template <typename T>
-        __device__ __forceinline__ T operator()(T a, T b)
-        {
-            return min(a, b);
-        }
-        __device__ __forceinline__ float operator()(float a, float b)
-        {
-            return fmin(a, b);
-        }
-        __device__ __forceinline__ double operator()(double a, double b)
-        {
-            return fmin(a, b);
-        }
-    };
-
-    struct MaxOp
-    {        
-        template <typename T>
-        __device__ __forceinline__ T operator()(T a, T b)
-        {
-            return max(a, b);
-        }
-        __device__ __forceinline__ float operator()(float a, float b)
-        {
-            return fmax(a, b);
-        }
-        __device__ __forceinline__ double operator()(double a, double b)
-        {
-            return fmax(a, b);
-        }
-    };
-    
-    template <typename T> struct ScalarMinOp
-    {
-        T s;
-
-        explicit ScalarMinOp(T s_) : s(s_) {}
-
-        __device__ __forceinline__ T operator()(T a)
-        {
-            return min(a, s);
-        }
-    };
-    template <> struct ScalarMinOp<float>
-    {
-        float s;
-
-        explicit ScalarMinOp(float s_) : s(s_) {}
-
-        __device__ __forceinline__ float operator()(float a)
-        {
-            return fmin(a, s);
-        }
-    };
-    template <> struct ScalarMinOp<double>
-    {
-        double s;
-
-        explicit ScalarMinOp(double s_) : s(s_) {}
-
-        __device__ __forceinline__ double operator()(double a)
-        {
-            return fmin(a, s);
-        }
-    };
-    
-    template <typename T> struct ScalarMaxOp
-    {
-        T s;
-
-        explicit ScalarMaxOp(T s_) : s(s_) {}
-
-        __device__ __forceinline__ T operator()(T a)
-        {
-            return max(a, s);
-        }
-    };
-    template <> struct ScalarMaxOp<float>
-    {
-        float s;
-
-        explicit ScalarMaxOp(float s_) : s(s_) {}
-
-        __device__ __forceinline__ float operator()(float a)
-        {
-            return fmax(a, s);
-        }
-    };
-    template <> struct ScalarMaxOp<double>
-    {
-        double s;
-
-        explicit ScalarMaxOp(double s_) : s(s_) {}
-
-        __device__ __forceinline__ double operator()(double a)
-        {
-            return fmax(a, s);
-        }
-    };
     
     template <typename T>
     void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
     {
-        MinOp op;
-        transform(src1, src2, dst, op, stream);    
+        transform(src1, src2, dst, minimum<T>(), stream);    
     }
 
     template void min_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
@@ -475,8 +373,7 @@ namespace cv { namespace gpu { namespace mathfunc
     template <typename T>
     void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
     {
-        MaxOp op;
-        transform(src1, src2, dst, op, stream);    
+        transform(src1, src2, dst, maximum<T>(), stream);    
     }
     
     template void max_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
@@ -490,8 +387,7 @@ namespace cv { namespace gpu { namespace mathfunc
     template <typename T>
     void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
     {
-        ScalarMinOp<T> op(src2);
-        transform(src1, dst, op, stream);    
+        transform(src1, dst, device::bind2nd(minimum<T>(), src2), stream);    
     }
 
     template void min_gpu<uchar >(const DevMem2D& src1, uchar src2, const DevMem2D& dst, cudaStream_t stream);
@@ -501,12 +397,11 @@ namespace cv { namespace gpu { namespace mathfunc
     template void min_gpu<int   >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);
     template void min_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);
     template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
-    
+
     template <typename T>
     void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
     {
-        ScalarMaxOp<T> op(src2);
-        transform(src1, dst, op, stream);    
+        transform(src1, dst, device::bind2nd(maximum<T>(), src2), stream);    
     }
 
     template void max_gpu<uchar >(const DevMem2D& src1, uchar src2, const DevMem2D& dst, cudaStream_t stream);
@@ -519,100 +414,7 @@ namespace cv { namespace gpu { namespace mathfunc
 
     
     //////////////////////////////////////////////////////////////////////////
-    // threshold
-
-    template <typename T> struct ThreshBinary
-    {
-        ThreshBinary(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
-
-        __device__ __forceinline__ T operator()(const T& src) const
-        {
-            return src > thresh ? maxVal : 0;
-        }
-
-    private:
-        T thresh;
-        T maxVal;
-    };
-
-    template <typename T> struct ThreshBinaryInv
-    {
-        ThreshBinaryInv(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
-
-        __device__ __forceinline__ T operator()(const T& src) const
-        {
-            return src > thresh ? 0 : maxVal;
-        }
-
-    private:
-        T thresh;
-        T maxVal;
-    };
-
-    template <typename T> struct ThreshTrunc
-    {
-        ThreshTrunc(T thresh_, T) : thresh(thresh_) {}
-
-        __device__ __forceinline__ T operator()(const T& src) const
-        {
-            return min(src, thresh);
-        }
-
-    private:
-        T thresh;
-    };
-    template <> struct  ThreshTrunc<float>
-    {
-        ThreshTrunc(float thresh_, float) : thresh(thresh_) {}
-
-        __device__ __forceinline__ float operator()(const float& src) const
-        {
-            return fmin(src, thresh);
-        }
-
-    private:
-        float thresh;
-    };
-    template <> struct  ThreshTrunc<double>
-    {
-        ThreshTrunc(double thresh_, double) : thresh(thresh_) {}
-
-        __device__ __forceinline__ double operator()(const double& src) const
-        {
-            return fmin(src, thresh);
-        }
-
-    private:
-        double thresh;
-    };
-
-    template <typename T> struct ThreshToZero
-    {
-    public:
-        ThreshToZero(T thresh_, T) : thresh(thresh_) {}
-
-        __device__ __forceinline__ T operator()(const T& src) const
-        {
-            return src > thresh ? src : 0;
-        }
-
-    private:
-        T thresh;
-    };
-
-    template <typename T> struct ThreshToZeroInv
-    {
-    public:
-        ThreshToZeroInv(T thresh_, T) : thresh(thresh_) {}
-
-        __device__ __forceinline__ T operator()(const T& src) const
-        {
-            return src > thresh ? 0 : src;
-        }
-
-    private:
-        T thresh;
-    };
+    // threshold  
 
     template <template <typename> class Op, typename T>
     void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal, 
@@ -631,11 +433,11 @@ namespace cv { namespace gpu { namespace mathfunc
 
         static const caller_t callers[] = 
         {
-            threshold_caller<ThreshBinary, T>, 
-            threshold_caller<ThreshBinaryInv, T>, 
-            threshold_caller<ThreshTrunc, T>, 
-            threshold_caller<ThreshToZero, T>, 
-            threshold_caller<ThreshToZeroInv, T>
+            threshold_caller<thresh_binary_func, T>, 
+            threshold_caller<thresh_binary_inv_func, T>, 
+            threshold_caller<thresh_trunc_func, T>, 
+            threshold_caller<thresh_to_zero_func, T>, 
+            threshold_caller<thresh_to_zero_inv_func, T>
         };
 
         callers[type]((DevMem2D_<T>)src, (DevMem2D_<T>)dst, thresh, maxVal, stream);
@@ -653,20 +455,10 @@ namespace cv { namespace gpu { namespace mathfunc
     //////////////////////////////////////////////////////////////////////////
     // subtract
 
-    template <typename T>
-    class SubtractOp
-    {
-    public:
-        __device__ __forceinline__ T operator()(const T& l, const T& r) const
-        {
-            return l - r;
-        }
-    };
-
     template <typename T>
     void subtractCaller(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream)
     {
-        transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, SubtractOp<T>(), stream);
+        transform((DevMem2D_<T>)src1, (DevMem2D_<T>)src2, (DevMem2D_<T>)dst, minus<T>(), stream);
     }
 
     template void subtractCaller<short>(const DevMem2D src1, const DevMem2D src2, DevMem2D dst, cudaStream_t stream);
@@ -675,7 +467,7 @@ namespace cv { namespace gpu { namespace mathfunc
     //////////////////////////////////////////////////////////////////////////
     // pow
     
-    template<typename T, bool Signed = device::numeric_limits_gpu<T>::is_signed>
+    template<typename T, bool Signed = device::numeric_limits<T>::is_signed>
     struct PowOp
     {    
         float power;
@@ -695,7 +487,7 @@ namespace cv { namespace gpu { namespace mathfunc
 
         __device__ __forceinline__ float operator()(const T& e)
         {
-              T res = saturate_cast<T>(__powf((float)e, power));            
+            T res = saturate_cast<T>(__powf((float)e, power));            
             
             if ( (e < 0) && (1 & (int)power) )
                     res *= -1;            
diff --git a/modules/gpu/src/cuda/filters.cu b/modules/gpu/src/cuda/filters.cu
index 779da8725d8ed3f2f2ef41355e3dc6f58fc00ff1..9ca2825a5b71eb069af75262b6fd754ee3e54530 100644
--- a/modules/gpu/src/cuda/filters.cu
+++ b/modules/gpu/src/cuda/filters.cu
@@ -42,8 +42,8 @@
 
 #include "opencv2/gpu/devmem2d.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/vecmath.hpp"
-#include "opencv2/gpu/device/limits_gpu.hpp"
+#include "opencv2/gpu/device/vec_math.hpp"
+#include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/border_interpolate.hpp"
 
 #include "safe_call.hpp"
@@ -76,7 +76,7 @@ namespace filter_krnls
 {
     template <typename T, size_t size> struct SmemType_
     {
-        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t smem_t;
+        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
     };
     template <typename T> struct SmemType_<T, 4>
     {
@@ -111,7 +111,7 @@ namespace filter_krnls
 
             if (x < src.cols)
             {
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t sum_t;
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
                 sum_t sum = VecTraits<sum_t>::all(0);
 
                 sDataRow += threadIdx.x + BLOCK_DIM_X - anchor;
@@ -253,7 +253,7 @@ namespace filter_krnls
 
             if (y < src.rows)
             {
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_t sum_t;
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
                 sum_t sum = VecTraits<sum_t>::all(0);
 
                 sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X;
@@ -475,7 +475,7 @@ namespace bf_krnls
                     }
                 }
 
-                float minimum = numeric_limits_gpu<float>::max();
+                float minimum = numeric_limits<float>::max();
                 int id = 0;
 
                 if (cost[0] < minimum)
diff --git a/modules/gpu/src/cuda/hist.cu b/modules/gpu/src/cuda/hist.cu
index fa5b40b48edf7f1920799a26e1ebc7f11a875565..cfa6427f0a49f42fb8813f2da596b2f09b62d130 100644
--- a/modules/gpu/src/cuda/hist.cu
+++ b/modules/gpu/src/cuda/hist.cu
@@ -42,6 +42,7 @@
 //M*/
 
 #include "internal_shared.hpp"
+#include "opencv2/gpu/device/utility.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
 
 using namespace cv::gpu;
@@ -50,14 +51,11 @@ using namespace cv::gpu::device;
 
 #define UINT_BITS 32U
 
-#define LOG2_WARP_SIZE 5U
-#define WARP_SIZE (1U << LOG2_WARP_SIZE)
-
 //Warps == subhistograms per threadblock
 #define WARP_COUNT 6
 
 //Threadblock size
-#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * WARP_SIZE)
+#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
 #define HISTOGRAM256_BIN_COUNT 256
 
 //Shared memory per threadblock
@@ -73,7 +71,7 @@ namespace cv { namespace gpu { namespace histograms
 {
     #if (!USE_SMEM_ATOMICS)
 
-        #define TAG_MASK ( (1U << (UINT_BITS - LOG2_WARP_SIZE)) - 1U )
+        #define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
 
         __forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
         {
@@ -111,7 +109,7 @@ namespace cv { namespace gpu { namespace histograms
     {
         //Per-warp subhistogram storage
         __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
-        uint* s_WarpHist= s_Hist + (threadIdx.x >> LOG2_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
+        uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
 
         //Clear shared memory storage for current threadblock before processing
         #pragma unroll
@@ -119,7 +117,7 @@ namespace cv { namespace gpu { namespace histograms
            s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
 
         //Cycle through the entire data set, update subhistograms for each warp
-        const uint tag = threadIdx.x << (UINT_BITS - LOG2_WARP_SIZE);
+        const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);
 
         __syncthreads();
         const uint colsui = d_Data.step / sizeof(uint);
diff --git a/modules/gpu/src/cuda/match_template.cu b/modules/gpu/src/cuda/match_template.cu
index a46ddc31f193be33a252a5985f4d0267d8463d51..301a0b65592a1bf9988e8b930c9bfa2635d08371 100644
--- a/modules/gpu/src/cuda/match_template.cu
+++ b/modules/gpu/src/cuda/match_template.cu
@@ -41,7 +41,7 @@
 //M*/
 
 #include "internal_shared.hpp"
-#include "opencv2/gpu/device/vecmath.hpp"
+#include "opencv2/gpu/device/vec_math.hpp"
 
 using namespace cv::gpu;
 using namespace cv::gpu::device;
@@ -84,8 +84,8 @@ __global__ void matchTemplateNaiveKernel_CCORR(
         int w, int h, const PtrStep image, const PtrStep templ, 
         DevMem2Df result)
 {
-    typedef typename TypeVec<T, cn>::vec_t Type;
-    typedef typename TypeVec<float, cn>::vec_t Typef;
+    typedef typename TypeVec<T, cn>::vec_type Type;
+    typedef typename TypeVec<float, cn>::vec_type Typef;
 
     int x = blockDim.x * blockIdx.x + threadIdx.x;
     int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -174,8 +174,8 @@ __global__ void matchTemplateNaiveKernel_SQDIFF(
         int w, int h, const PtrStep image, const PtrStep templ, 
         DevMem2Df result)
 {
-    typedef typename TypeVec<T, cn>::vec_t Type;
-    typedef typename TypeVec<float, cn>::vec_t Typef;
+    typedef typename TypeVec<T, cn>::vec_type Type;
+    typedef typename TypeVec<float, cn>::vec_type Typef;
 
     int x = blockDim.x * blockIdx.x + threadIdx.x;
     int y = blockDim.y * blockIdx.y + threadIdx.y;
@@ -884,7 +884,7 @@ void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
 template <int cn>
 __global__ void extractFirstChannel_32F(const PtrStep image, DevMem2Df result)
 {
-    typedef typename TypeVec<float, cn>::vec_t Typef;
+    typedef typename TypeVec<float, cn>::vec_type Typef;
 
     int x = blockDim.x * blockIdx.x + threadIdx.x;
     int y = blockDim.y * blockIdx.y + threadIdx.y;
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index f68562587d3482b366db3fa20be43c8dd484d30f..b5ee97303835843875f9f7dea738bb2c413f95e8 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -40,9 +40,9 @@
 //
 //M*/
 
-#include "opencv2/gpu/device/limits_gpu.hpp"
+#include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/vecmath.hpp"
+#include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/transform.hpp"
 #include "internal_shared.hpp"
 
diff --git a/modules/gpu/src/cuda/matrix_reductions.cu b/modules/gpu/src/cuda/matrix_reductions.cu
index 27224c6f7846ea96c66dd7c86472356c6b839723..60e6c886e88fd705bbe2e2fcf5629052ff35f694 100644
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
@@ -40,9 +40,9 @@
 //
 //M*/
 
-#include "opencv2/gpu/device/limits_gpu.hpp"
+#include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/vecmath.hpp"
+#include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/transform.hpp"
 #include "internal_shared.hpp"
 
@@ -190,8 +190,8 @@ namespace cv { namespace gpu { namespace mathfunc
         uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
         uint tid = threadIdx.y * blockDim.x + threadIdx.x;
 
-        T mymin = numeric_limits_gpu<T>::max();
-        T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
+        T mymin = numeric_limits<T>::max();
+        T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min();
         uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
         uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
         for (uint y = y0; y < y_end; y += blockDim.y)
@@ -512,9 +512,9 @@ namespace cv { namespace gpu { namespace mathfunc
         uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
         uint tid = threadIdx.y * blockDim.x + threadIdx.x;
 
-        T mymin = numeric_limits_gpu<T>::max();
-        T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : 
-                                                     numeric_limits_gpu<T>::min(); 
+        T mymin = numeric_limits<T>::max();
+        T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : 
+                                                     numeric_limits<T>::min(); 
         uint myminloc = 0;
         uint mymaxloc = 0;
         uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
@@ -1094,10 +1094,10 @@ namespace cv { namespace gpu { namespace mathfunc
 
 
     template <typename T, typename R, typename Op, int nthreads>
-    __global__ void sumKernel_C2(const DevMem2D src, typename TypeVec<R, 2>::vec_t* result)
+    __global__ void sumKernel_C2(const DevMem2D src, typename TypeVec<R, 2>::vec_type* result)
     {
-        typedef typename TypeVec<T, 2>::vec_t SrcType;
-        typedef typename TypeVec<R, 2>::vec_t DstType;
+        typedef typename TypeVec<T, 2>::vec_type SrcType;
+        typedef typename TypeVec<R, 2>::vec_type DstType;
 
         __shared__ R smem[nthreads * 2];
 
@@ -1173,9 +1173,9 @@ namespace cv { namespace gpu { namespace mathfunc
 
 
     template <typename T, typename R, int nthreads>
-    __global__ void sumPass2Kernel_C2(typename TypeVec<R, 2>::vec_t* result, int size)
+    __global__ void sumPass2Kernel_C2(typename TypeVec<R, 2>::vec_type* result, int size)
     {
-        typedef typename TypeVec<R, 2>::vec_t DstType;
+        typedef typename TypeVec<R, 2>::vec_type DstType;
 
         __shared__ R smem[nthreads * 2];
 
@@ -1199,10 +1199,10 @@ namespace cv { namespace gpu { namespace mathfunc
 
 
     template <typename T, typename R, typename Op, int nthreads>
-    __global__ void sumKernel_C3(const DevMem2D src, typename TypeVec<R, 3>::vec_t* result)
+    __global__ void sumKernel_C3(const DevMem2D src, typename TypeVec<R, 3>::vec_type* result)
     {
-        typedef typename TypeVec<T, 3>::vec_t SrcType;
-        typedef typename TypeVec<R, 3>::vec_t DstType;
+        typedef typename TypeVec<T, 3>::vec_type SrcType;
+        typedef typename TypeVec<R, 3>::vec_type DstType;
 
         __shared__ R smem[nthreads * 3];
 
@@ -1285,9 +1285,9 @@ namespace cv { namespace gpu { namespace mathfunc
 
 
     template <typename T, typename R, int nthreads>
-    __global__ void sumPass2Kernel_C3(typename TypeVec<R, 3>::vec_t* result, int size)
+    __global__ void sumPass2Kernel_C3(typename TypeVec<R, 3>::vec_type* result, int size)
     {
-        typedef typename TypeVec<R, 3>::vec_t DstType;
+        typedef typename TypeVec<R, 3>::vec_type DstType;
 
         __shared__ R smem[nthreads * 3];
 
@@ -1313,10 +1313,10 @@ namespace cv { namespace gpu { namespace mathfunc
     }
 
     template <typename T, typename R, typename Op, int nthreads>
-    __global__ void sumKernel_C4(const DevMem2D src, typename TypeVec<R, 4>::vec_t* result)
+    __global__ void sumKernel_C4(const DevMem2D src, typename TypeVec<R, 4>::vec_type* result)
     {
-        typedef typename TypeVec<T, 4>::vec_t SrcType;
-        typedef typename TypeVec<R, 4>::vec_t DstType;
+        typedef typename TypeVec<T, 4>::vec_type SrcType;
+        typedef typename TypeVec<R, 4>::vec_type DstType;
 
         __shared__ R smem[nthreads * 4];
 
@@ -1407,9 +1407,9 @@ namespace cv { namespace gpu { namespace mathfunc
 
 
     template <typename T, typename R, int nthreads>
-    __global__ void sumPass2Kernel_C4(typename TypeVec<R, 4>::vec_t* result, int size)
+    __global__ void sumPass2Kernel_C4(typename TypeVec<R, 4>::vec_type* result, int size)
     {
-        typedef typename TypeVec<R, 4>::vec_t DstType;
+        typedef typename TypeVec<R, 4>::vec_type DstType;
 
         __shared__ R smem[nthreads * 4];
 
@@ -1454,41 +1454,41 @@ namespace cv { namespace gpu { namespace mathfunc
         {
         case 1:
             sumKernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));
             cudaSafeCall( cudaGetLastError() );
 
             sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 1>::vec_type*)buf.ptr(0), grid.x * grid.y);
             cudaSafeCall( cudaGetLastError() );
 
             break;
         case 2:
             sumKernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));
             cudaSafeCall( cudaGetLastError() );
 
             sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 2>::vec_type*)buf.ptr(0), grid.x * grid.y);
             cudaSafeCall( cudaGetLastError() );
 
             break;
         case 3:
             sumKernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));
             cudaSafeCall( cudaGetLastError() );
 
             sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 3>::vec_type*)buf.ptr(0), grid.x * grid.y);
             cudaSafeCall( cudaGetLastError() );
 
             break;
         case 4:
             sumKernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));
             cudaSafeCall( cudaGetLastError() );
 
             sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 4>::vec_type*)buf.ptr(0), grid.x * grid.y);
             cudaSafeCall( cudaGetLastError() );
 
             break;
@@ -1526,19 +1526,19 @@ namespace cv { namespace gpu { namespace mathfunc
         {
         case 1:
             sumKernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));
             break;
         case 2:
             sumKernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));
             break;
         case 3:
             sumKernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));
             break;
         case 4:
             sumKernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));
             break;
         }
         cudaSafeCall( cudaGetLastError() );
@@ -1576,41 +1576,41 @@ namespace cv { namespace gpu { namespace mathfunc
         {
         case 1:
             sumKernel<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));
             cudaSafeCall( cudaGetLastError() );
 
             sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 1>::vec_type*)buf.ptr(0), grid.x * grid.y);
             cudaSafeCall( cudaGetLastError() );
 
             break;
         case 2:
             sumKernel_C2<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));
             cudaSafeCall( cudaGetLastError() );
 
             sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 2>::vec_type*)buf.ptr(0), grid.x * grid.y);
             cudaSafeCall( cudaGetLastError() );
 
             break;
         case 3:
             sumKernel_C3<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));
             cudaSafeCall( cudaGetLastError() );
 
             sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 3>::vec_type*)buf.ptr(0), grid.x * grid.y);
             cudaSafeCall( cudaGetLastError() );
 
             break;
         case 4:
             sumKernel_C4<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));
             cudaSafeCall( cudaGetLastError() );
 
             sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 4>::vec_type*)buf.ptr(0), grid.x * grid.y);
             cudaSafeCall( cudaGetLastError() );
 
             break;
@@ -1648,19 +1648,19 @@ namespace cv { namespace gpu { namespace mathfunc
         {
         case 1:
             sumKernel<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));
             break;
         case 2:
             sumKernel_C2<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));
             break;
         case 3:
             sumKernel_C3<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));
             break;
         case 4:
             sumKernel_C4<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));
             break;
         }
         cudaSafeCall( cudaGetLastError() );
@@ -1698,41 +1698,41 @@ namespace cv { namespace gpu { namespace mathfunc
         {
         case 1:
             sumKernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));
             cudaSafeCall( cudaGetLastError() );
 
             sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 1>::vec_type*)buf.ptr(0), grid.x * grid.y);
             cudaSafeCall( cudaGetLastError() );
 
             break;
         case 2:
             sumKernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));
             cudaSafeCall( cudaGetLastError() );
 
             sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 2>::vec_type*)buf.ptr(0), grid.x * grid.y);
             cudaSafeCall( cudaGetLastError() );
 
             break;
         case 3:
             sumKernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));
             cudaSafeCall( cudaGetLastError() );
 
             sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 3>::vec_type*)buf.ptr(0), grid.x * grid.y);
             cudaSafeCall( cudaGetLastError() );
 
             break;
         case 4:
             sumKernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));
             cudaSafeCall( cudaGetLastError() );
 
             sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
-                    (typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
+                    (typename TypeVec<R, 4>::vec_type*)buf.ptr(0), grid.x * grid.y);
             cudaSafeCall( cudaGetLastError() );
 
             break;
@@ -1770,19 +1770,19 @@ namespace cv { namespace gpu { namespace mathfunc
         {
         case 1:
             sumKernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 1>::vec_type*)buf.ptr(0));
             break;
         case 2:
             sumKernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 2>::vec_type*)buf.ptr(0));
             break;
         case 3:
             sumKernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 3>::vec_type*)buf.ptr(0));
             break;
         case 4:
             sumKernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
-                    src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
+                    src, (typename TypeVec<R, 4>::vec_type*)buf.ptr(0));
             break;
         }
         cudaSafeCall( cudaGetLastError() );
diff --git a/modules/gpu/src/cuda/stereobp.cu b/modules/gpu/src/cuda/stereobp.cu
index b025914b332251022205c2480d0d79b7c2f8520e..6d1a5f4bed8e31c3bba4dce5ff6d6132ba5db711 100644
--- a/modules/gpu/src/cuda/stereobp.cu
+++ b/modules/gpu/src/cuda/stereobp.cu
@@ -42,7 +42,7 @@
 
 #include "opencv2/gpu/devmem2d.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/limits_gpu.hpp"
+#include "opencv2/gpu/device/limits.hpp"
 #include "safe_call.hpp"
 
 using namespace cv::gpu;
@@ -381,7 +381,7 @@ namespace cv { namespace gpu { namespace bp
     template <typename T>
     __device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
     {
-        float minimum = numeric_limits_gpu<float>::max();
+        float minimum = numeric_limits<float>::max();
 
         for(int i = 0; i < cndisp; ++i)
         {
@@ -486,7 +486,7 @@ namespace cv { namespace gpu { namespace bp
             size_t disp_step = disp.rows * u.step;
 
             int best = 0;
-            float best_val = numeric_limits_gpu<float>::max();
+            float best_val = numeric_limits<float>::max();
             for (int d = 0; d < cndisp; ++d)
             {
                 float val  = us[d * disp_step];
diff --git a/modules/gpu/src/cuda/stereocsbp.cu b/modules/gpu/src/cuda/stereocsbp.cu
index bce1f0769e9dda77ed3aaa3b7d3aee51cb73a490..5ed8938e039b8bf5c2fa822b2b338fde2ab94b69 100644
--- a/modules/gpu/src/cuda/stereocsbp.cu
+++ b/modules/gpu/src/cuda/stereocsbp.cu
@@ -42,7 +42,7 @@
 
 #include "opencv2/gpu/devmem2d.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/limits_gpu.hpp"
+#include "opencv2/gpu/device/limits.hpp"
 #include "safe_call.hpp"
 
 using namespace cv::gpu;
@@ -147,7 +147,7 @@ namespace cv { namespace gpu { namespace csbp
 
             for(int i = 0; i < nr_plane; i++)
             {
-                T minimum = numeric_limits_gpu<T>::max();
+                T minimum = numeric_limits<T>::max();
                 int id = 0;
                 for(int d = 0; d < cndisp; d++)
                 {
@@ -161,7 +161,7 @@ namespace cv { namespace gpu { namespace csbp
 
                 data_cost_selected[i  * cdisp_step1] = minimum;
                 selected_disparity[i  * cdisp_step1] = id;
-                data_cost         [id * cdisp_step1] = numeric_limits_gpu<T>::max();
+                data_cost         [id * cdisp_step1] = numeric_limits<T>::max();
             }
         }
     }
@@ -192,7 +192,7 @@ namespace cv { namespace gpu { namespace csbp
                     data_cost_selected[nr_local_minimum * cdisp_step1] = cur;
                     selected_disparity[nr_local_minimum * cdisp_step1] = d;
 
-                    data_cost[d * cdisp_step1] = numeric_limits_gpu<T>::max();
+                    data_cost[d * cdisp_step1] = numeric_limits<T>::max();
 
                     nr_local_minimum++;
                 }
@@ -203,7 +203,7 @@ namespace cv { namespace gpu { namespace csbp
 
             for (int i = nr_local_minimum; i < nr_plane; i++)
             {
-                T minimum = numeric_limits_gpu<T>::max();
+                T minimum = numeric_limits<T>::max();
                 int id = 0;
 
                 for (int d = 0; d < cndisp; d++)
@@ -218,7 +218,7 @@ namespace cv { namespace gpu { namespace csbp
                 data_cost_selected[i * cdisp_step1] = minimum;
                 selected_disparity[i * cdisp_step1] = id;
 
-                data_cost[id * cdisp_step1] = numeric_limits_gpu<T>::max();
+                data_cost[id * cdisp_step1] = numeric_limits<T>::max();
             }
         }
     }
@@ -610,7 +610,7 @@ namespace cv { namespace gpu { namespace csbp
     {
         for(int i = 0; i < nr_plane; i++)
         {
-            T minimum = numeric_limits_gpu<T>::max();
+            T minimum = numeric_limits<T>::max();
             int id = 0;
             for(int j = 0; j < nr_plane2; j++)
             {
@@ -630,7 +630,7 @@ namespace cv { namespace gpu { namespace csbp
             l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];
             r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];
 
-            data_cost_new[id * cdisp_step1] = numeric_limits_gpu<T>::max();
+            data_cost_new[id * cdisp_step1] = numeric_limits<T>::max();
         }
     }
 
@@ -737,7 +737,7 @@ namespace cv { namespace gpu { namespace csbp
     __device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,
                                       const T* dst_disp, const T* src_disp, int nr_plane, T* temp)
     {
-        T minimum = numeric_limits_gpu<T>::max();
+        T minimum = numeric_limits<T>::max();
 
         for(int d = 0; d < nr_plane; d++)
         {
@@ -850,7 +850,7 @@ namespace cv { namespace gpu { namespace csbp
             const T* r = r_ + (y+0) * cmsg_step1 + (x-1);
 
             int best = 0;
-            T best_val = numeric_limits_gpu<T>::max();
+            T best_val = numeric_limits<T>::max();
             for (int i = 0; i < nr_plane; ++i)
             {
                 int idx = i * cdisp_step1;
diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu
index 3d0a74701afbfb00c26940c740f53162d44de4ef..363e2815e49e2cf745160f42b3e668222efc25b8 100644
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@@ -46,8 +46,10 @@
 //M*/
 
 #include "internal_shared.hpp"
-#include "opencv2/gpu/device/limits_gpu.hpp"
+#include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
+#include "opencv2/gpu/device/utility.hpp"
+#include "opencv2/gpu/device/functional.hpp"
 
 using namespace cv::gpu;
 using namespace cv::gpu::device;
@@ -393,31 +395,10 @@ namespace cv { namespace gpu { namespace surf
             //dss
             H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
 
-            float det = H[0][0] * (H[1][1] * H[2][2] - H[1][2] * H[2][1])
-              - H[0][1] * (H[1][0] * H[2][2] - H[1][2] * H[2][0])
-              + H[0][2] * (H[1][0] * H[2][1] - H[1][1] * H[2][0]);
+            __shared__ float x[3];
 
-            if (det != 0.0f)
+            if (solve3x3(H, dD, x))
             {
-                float invdet = 1.0f / det;
-
-                __shared__ float x[3];
-
-                x[0] = invdet * 
-                    (dD[0] * (H[1][1] * H[2][2] - H[1][2] * H[2][1]) -
-                     H[0][1] * (dD[1] * H[2][2] - H[1][2] * dD[2]) +
-                     H[0][2] * (dD[1] * H[2][1] - H[1][1] * dD[2]));
-
-                x[1] = invdet * 
-                    (H[0][0] * (dD[1] * H[2][2] - H[1][2] * dD[2]) -
-                     dD[0] * (H[1][0] * H[2][2] - H[1][2] * H[2][0]) +
-                     H[0][2] * (H[1][0] * dD[2] - dD[1] * H[2][0]));
-
-                x[2] = invdet * 
-                    (H[0][0] * (H[1][1] * dD[2] - dD[1] * H[2][1]) -
-                     H[0][1] * (H[1][0] * dD[2] - dD[1] * H[2][0]) +
-                     dD[0] * (H[1][0] * H[2][1] - H[1][1] * H[2][0]));
-
                 if (fabs(x[0]) <= 1.f && fabs(x[1]) <= 1.f && fabs(x[2]) <= 1.f)
                 {
                     // if the step is within the interpolation region, perform it
@@ -500,20 +481,6 @@ namespace cv { namespace gpu { namespace surf
     __constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
     __constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
 
-    __device__ void reduceSum32(volatile float* v_sum, float& sum)
-    {
-        v_sum[threadIdx.x] = sum;
-
-        if (threadIdx.x < 16)
-        {
-            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 16];
-            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 8];
-            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 4];
-            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 2];
-            v_sum[threadIdx.x] = sum += v_sum[threadIdx.x + 1];
-        }
-    }
-
     __global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
     {        
         #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
@@ -599,8 +566,11 @@ namespace cv { namespace gpu { namespace surf
 
                 float* s_sum_row = s_sum + threadIdx.y * 32;
 
-                reduceSum32(s_sum_row, sumx);
-                reduceSum32(s_sum_row, sumy);
+                //reduceSum32(s_sum_row, sumx);
+                //reduceSum32(s_sum_row, sumy);
+
+                warpReduce32(s_sum_row, sumx, threadIdx.x, plus<volatile float>());
+                warpReduce32(s_sum_row, sumy, threadIdx.x, plus<volatile float>());
 
                 const float temp_mod = sumx * sumx + sumy * sumy;
                 if (temp_mod > best_mod)
diff --git a/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp b/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp
index 346ff847c741f9c87ab01d3acf523d07df88cc81..7539e8bfb2096bffac88f481d93fbb9d5f6f5144 100644
--- a/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp
@@ -43,8 +43,8 @@
 #ifndef __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
 #define __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
 
-#include "opencv2/gpu/device/saturate_cast.hpp"
-#include "opencv2/gpu/device/vecmath.hpp"
+#include "saturate_cast.hpp"
+#include "vec_traits.hpp"
 
 namespace cv { namespace gpu { namespace device
 {
@@ -72,64 +72,53 @@ namespace cv { namespace gpu { namespace device
             return -last <= mini && maxi <= 2 * last;
         }
 
-    private:
         int last;
     };
 
-
-    template <typename D>
-    struct BrdRowReflect101: BrdReflect101
+    template <typename D> struct BrdRowReflect101 : BrdReflect101
     {
         explicit BrdRowReflect101(int len): BrdReflect101(len) {}
 
-        template <typename T>
-        __device__ __forceinline__ D at_low(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const 
         {
             return saturate_cast<D>(data[idx_low(i)]);
         }
 
-        template <typename T>
-        __device__ __forceinline__ D at_high(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const 
         {
             return saturate_cast<D>(data[idx_high(i)]);
         }
     };
 
-
-    template <typename D>
-    struct BrdColReflect101: BrdReflect101
+    template <typename D> struct BrdColReflect101 : BrdReflect101
     {
         BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}
 
-        template <typename T>
-        __device__ __forceinline__ D at_low(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const 
         {
             return saturate_cast<D>(*(const D*)((const char*)data + idx_low(i)*step));
         }
 
-        template <typename T>
-        __device__ __forceinline__ D at_high(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const 
         {
             return saturate_cast<D>(*(const D*)((const char*)data + idx_high(i)*step));
         }
 
-    private:
         int step;
     };
 
-
     struct BrdReplicate
     {
         explicit BrdReplicate(int len): last(len - 1) {}
 
         __device__ __forceinline__ int idx_low(int i) const
         {
-            return max(i, 0);
+            return ::max(i, 0);
         }
 
         __device__ __forceinline__ int idx_high(int i) const 
         {
-            return min(i, last);
+            return ::min(i, last);
         }
 
         __device__ __forceinline__ int idx(int i) const
@@ -142,64 +131,52 @@ namespace cv { namespace gpu { namespace device
             return true;
         }
 
-    private:
         int last;
     };
 
-
-    template <typename D>
-    struct BrdRowReplicate: BrdReplicate
+    template <typename D> struct BrdRowReplicate : BrdReplicate
     {
         explicit BrdRowReplicate(int len): BrdReplicate(len) {}
 
-        template <typename T>
-        __device__ __forceinline__ D at_low(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const 
         {
             return saturate_cast<D>(data[idx_low(i)]);
         }
 
-        template <typename T>
-        __device__ __forceinline__ D at_high(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const 
         {
             return saturate_cast<D>(data[idx_high(i)]);
         }
     };
 
 
-    template <typename D>
-    struct BrdColReplicate: BrdReplicate
+    template <typename D> struct BrdColReplicate : BrdReplicate
     {
         BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}
 
-        template <typename T>
-        __device__ __forceinline__ D at_low(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const 
         {
             return saturate_cast<D>(*(const D*)((const char*)data + idx_low(i)*step));
         }
 
-        template <typename T>
-        __device__ __forceinline__ D at_high(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const 
         {
             return saturate_cast<D>(*(const D*)((const char*)data + idx_high(i)*step));
         }
 
-    private:
         int step;
     };
 
-    template <typename D>
-    struct BrdRowConstant
+    template <typename D> struct BrdRowConstant
     {
         explicit BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}
 
-        template <typename T>
-        __device__ __forceinline__ D at_low(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const 
         {
             return i >= 0 ? saturate_cast<D>(data[i]) : val;
         }
 
-        template <typename T>
-        __device__ __forceinline__ D at_high(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const 
         {
             return i < len ? saturate_cast<D>(data[i]) : val;
         }
@@ -209,24 +186,20 @@ namespace cv { namespace gpu { namespace device
             return true;
         }
 
-    private:
         int len;
         D val;
     };
 
-    template <typename D>
-    struct BrdColConstant
+    template <typename D> struct BrdColConstant
     {
         BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}
 
-        template <typename T>
-        __device__ __forceinline__ D at_low(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const 
         {
             return i >= 0 ? saturate_cast<D>(*(const D*)((const char*)data + i*step)) : val;
         }
 
-        template <typename T>
-        __device__ __forceinline__ D at_high(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const 
         {
             return i < len ? saturate_cast<D>(*(const D*)((const char*)data + i*step)) : val;
         }
@@ -236,15 +209,12 @@ namespace cv { namespace gpu { namespace device
             return true;
         }
 
-    private:
         int len;
         int step;
         D val;
     };
 
-
-    template <typename OutT>
-    struct BrdConstant
+    template <typename OutT> struct BrdConstant
     {
         BrdConstant(int w, int h, const OutT &val = VecTraits<OutT>::all(0)) : w(w), h(h), val(val) {}
 
@@ -255,11 +225,9 @@ namespace cv { namespace gpu { namespace device
             return val;
         }
 
-    private:
         int w, h;
         OutT val;
     };
-
 }}}
 
 #endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
diff --git a/modules/gpu/src/opencv2/gpu/device/color.hpp b/modules/gpu/src/opencv2/gpu/device/color.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d620ead48165090fc8a9a747b0f1ef57153d198b
--- /dev/null
+++ b/modules/gpu/src/opencv2/gpu/device/color.hpp
@@ -0,0 +1,221 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_COLOR_HPP__
+#define __OPENCV_GPU_COLOR_HPP__
+
+#include "detail/color.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+    // Every OPENCV_GPU_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macro defines
+    // [template <typename T>] struct ColorSpace1_to_ColorSpace2_traits   (no template for the 555/565 families)
+    // {
+    //     typedef ... functor_type;
+    //     static __host__ __device__ functor_type create_functor();
+    // };
+
+    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2) // args: (name, scn, dcn, bidx)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_bgra, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgba, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_bgr, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgb, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgba, 4, 4, 2)
+    
+#undef OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr555, 3, 0, 5) // args: (name, scn, bidx, green_bits); source is uchar3/uchar4
+    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr565, 3, 0, 6)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr555, 3, 2, 5)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr565, 3, 2, 6)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr555, 4, 0, 5)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr565, 4, 0, 6)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr555, 4, 2, 5)
+    OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr565, 4, 2, 6)
+
+#undef OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgb, 3, 2, 5) // args: (name, dcn, bidx, green_bits); destination is uchar3/uchar4
+    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgb, 3, 2, 6)
+    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgr, 3, 0, 5)
+    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgr, 3, 0, 6)
+    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgba, 4, 2, 5)
+    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgba, 4, 2, 6)
+    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgra, 4, 0, 5)
+    OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgra, 4, 0, 6)
+
+#undef OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgr, 3) // args: (name, dcn)
+    OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgra, 4)
+    
+#undef OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr555, 5) // args: (name, green_bits)
+    OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr565, 6)
+
+#undef OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr555_to_gray, 5) // args: (name, green_bits)
+    OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr565_to_gray, 6)
+
+#undef OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgb_to_gray, 3, 2) // args: (name, scn, bidx)
+    OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgr_to_gray, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgba_to_gray, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgra_to_gray, 4, 0)
+
+#undef OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv, 3, 3, 0) // args: (name, scn, dcn, bidx); note rgb_* uses bidx 0 here, unlike the RGB2GRAY list above
+    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv4, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv4, 4, 4, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv, 3, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv4, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv4, 4, 4, 2)
+
+#undef OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgb, 3, 3, 0) // presumably (name, scn, dcn, bidx) - confirm against the macro in detail/color.hpp
+    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgba, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgb, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgba, 4, 4, 0)
+    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgr, 3, 3, 2)
+    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgra, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgr, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgra, 4, 4, 2)
+
+#undef OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb, 3, 3, 2) // presumably (name, scn, dcn, bidx) - confirm against the macro in detail/color.hpp
+    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb4, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb4, 4, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb, 3, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb4, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb4, 4, 4, 0)
+
+#undef OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgb, 3, 3, 2) // presumably (name, scn, dcn, bidx) - confirm against the macro in detail/color.hpp
+    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgba, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgb, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgba, 4, 4, 2)
+    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgr, 3, 3, 0)
+    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgra, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgr, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgra, 4, 4, 0)
+
+#undef OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz, 3, 3, 2) // presumably (name, scn, dcn, bidx) - confirm against the macro in detail/color.hpp
+    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz4, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz4, 4, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz, 3, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz4, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz4, 4, 4, 0)
+
+#undef OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgb, 3, 3, 2) // presumably (name, scn, dcn, bidx) - confirm against the macro in detail/color.hpp
+    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgb, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgba, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgba, 4, 4, 2)
+    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgr, 3, 3, 0)
+    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgr, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgra, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgra, 4, 4, 0)
+
+#undef OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv, 3, 3, 2) // presumably (name, scn, dcn, bidx) - confirm against the macro in detail/color.hpp
+    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv4, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv4, 4, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv, 3, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv4, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv4, 4, 4, 0)
+
+#undef OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgb, 3, 3, 2) // presumably (name, scn, dcn, bidx) - confirm against the macro in detail/color.hpp
+    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgba, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgb, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgba, 4, 4, 2)
+    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgr, 3, 3, 0)
+    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgra, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgr, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgra, 4, 4, 0)
+
+#undef OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls, 3, 3, 2) // presumably (name, scn, dcn, bidx) - confirm against the macro in detail/color.hpp
+    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls4, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls4, 4, 4, 2)
+    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls, 3, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls4, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls4, 4, 4, 0)
+
+#undef OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS
+
+    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgb, 3, 3, 2) // presumably (name, scn, dcn, bidx) - confirm against the macro in detail/color.hpp
+    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgba, 3, 4, 2)
+    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgb, 4, 3, 2)
+    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgba, 4, 4, 2)
+    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgr, 3, 3, 0)
+    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgra, 3, 4, 0)
+    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgr, 4, 3, 0)
+    OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)
+
+#undef OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS
+}}}
+
+#endif // __OPENCV_GPU_COLOR_HPP__
diff --git a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp
index 2602991c850385cff7ea64fbcc3cdd99e541743e..407aea2f934bce156bcbc40e771be1d1799ba6a7 100644
--- a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp
@@ -44,6 +44,7 @@
 #define __OPENCV_GPU_DATAMOV_UTILS_HPP__
 
 #include "internal_shared.hpp"
+#include "utility.hpp"
 
 namespace cv { namespace gpu { namespace device
 {
@@ -55,49 +56,40 @@ namespace cv { namespace gpu { namespace device
             __device__ __forceinline__ static void Load(const T* ptr, int offset, T& val)  { val = ptr[offset];  }
         };
             
-    #else // __CUDA_ARCH__ >= 200
-
-        #if defined(_WIN64) || defined(__LP64__)		
-            // 64-bit register modifier for inlined asm
-            #define _OPENCV_ASM_PTR_ "l"
-        #else	
-            // 32-bit register modifier for inlined asm
-            #define _OPENCV_ASM_PTR_ "r"
-        #endif
+    #else // __CUDA_ARCH__ >= 200        
 
         template<class T> struct ForceGlob;
 
-        #define DEFINE_FORCE_GLOB(base_type, ptx_type, reg_mod)                                                   \
-        template <> struct ForceGlob<base_type>                                                                   \
-        {                                                                                                         \
-            __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val)         \
-            {                                                                                                     \
-                asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : _OPENCV_ASM_PTR_(ptr + offset));       \
-            }                                                                                                     \
-        };
-        #define DEFINE_FORCE_GLOB_B(base_type, ptx_type)                                                                          \
-        template <> struct ForceGlob<base_type>                                                                                   \
-        {                                                                                                                         \
-            __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val)                         \
-            {                                                                                                                     \
-                asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : _OPENCV_ASM_PTR_(ptr + offset));   \
-            }                                                                                                                     \
-        };
+        #define OPENCV_GPU_DEFINE_FORCE_GLOB(base_type, ptx_type, reg_mod) /* specializes ForceGlob<base_type>; Load reads the element at ptr + offset into val with an explicit ld.global.<ptx_type>; reg_mod is the asm output constraint letter */ \
+            template <> struct ForceGlob<base_type> \
+            { \
+                __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
+                { \
+                    asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : OPENCV_GPU_ASM_PTR(ptr + offset)); \
+                } \
+            };
+
+        #define OPENCV_GPU_DEFINE_FORCE_GLOB_B(base_type, ptx_type) /* like OPENCV_GPU_DEFINE_FORCE_GLOB, but the output is always an "r" operand bound to val through a uint* cast. NOTE(review): it is instantiated for 1-byte types, so the uint alias is wider than val - confirm this is intended */ \
+            template <> struct ForceGlob<base_type> \
+            { \
+                __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
+                { \
+                    asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : OPENCV_GPU_ASM_PTR(ptr + offset)); \
+                } \
+            };
         
-        DEFINE_FORCE_GLOB_B(uchar,  u8)
-        DEFINE_FORCE_GLOB_B(schar,  s8)
-        DEFINE_FORCE_GLOB_B(char,   b8)
-        DEFINE_FORCE_GLOB  (ushort, u16, h)
-        DEFINE_FORCE_GLOB  (short,  s16, h)
-        DEFINE_FORCE_GLOB  (uint,   u32, r)
-        DEFINE_FORCE_GLOB  (int,    s32, r)	
-        DEFINE_FORCE_GLOB  (float,  f32, f)	
-        DEFINE_FORCE_GLOB  (double, f64, d)	
-            
+        OPENCV_GPU_DEFINE_FORCE_GLOB_B(uchar,  u8) // the three char types use the _B variant ("r" output through a uint alias)
+        OPENCV_GPU_DEFINE_FORCE_GLOB_B(schar,  s8)
+        OPENCV_GPU_DEFINE_FORCE_GLOB_B(char,   b8)
+        OPENCV_GPU_DEFINE_FORCE_GLOB  (ushort, u16, h) // the remaining types pass their own asm constraint letter (h, r, f, d)
+        OPENCV_GPU_DEFINE_FORCE_GLOB  (short,  s16, h)
+        OPENCV_GPU_DEFINE_FORCE_GLOB  (uint,   u32, r)
+        OPENCV_GPU_DEFINE_FORCE_GLOB  (int,    s32, r)	
+        OPENCV_GPU_DEFINE_FORCE_GLOB  (float,  f32, f)	
+        OPENCV_GPU_DEFINE_FORCE_GLOB  (double, f64, d)	            
 
-    #undef DEFINE_FORCE_GLOB
-    #undef DEFINE_FORCE_GLOB_B
-    #undef _OPENCV_ASM_PTR_
+    #undef OPENCV_GPU_DEFINE_FORCE_GLOB
+    #undef OPENCV_GPU_DEFINE_FORCE_GLOB_B
         
     #endif // __CUDA_ARCH__ >= 200
 }}}
diff --git a/modules/gpu/src/opencv2/gpu/device/detail/color.hpp b/modules/gpu/src/opencv2/gpu/device/detail/color.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..30f8e5050fd1a04b10b31568d611dbfeeb7140de
--- /dev/null
+++ b/modules/gpu/src/opencv2/gpu/device/detail/color.hpp
@@ -0,0 +1,1037 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_COLOR_DETAIL_HPP__
+#define __OPENCV_GPU_COLOR_DETAIL_HPP__
+
+#include "vec_traits.hpp"
+#include "saturate_cast.hpp"
+#include "limits.hpp"
+#include "functional.hpp"
+
+#ifndef CV_DESCALE
+    #define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n)) // right shift by n with rounding: 2^(n-1) is added before the shift
+#endif
+
+namespace cv { namespace gpu { namespace device
+{
+    namespace detail
+    {
+        template<typename T> struct ColorChannel // per-channel constants for element type T
+        {
+            typedef float worktype_f;
+            static __device__ __forceinline__ T max() { return numeric_limits<T>::max(); } // taken from numeric_limits<T>
+            static __device__ __forceinline__ T half() { return (T)(max()/2 + 1); } // max()/2 + 1, cast back to T
+        };
+        template<> struct ColorChannel<float> // float specialization: max() is 1.0 and half() is 0.5
+        {
+            typedef float worktype_f;
+            static __device__ __forceinline__ float max() { return 1.f; }
+            static __device__ __forceinline__ float half() { return 0.5f; }
+        };
+
+        template <typename T> static __device__ __forceinline__ void setAlpha(typename TypeVec<T, 3>::vec_type& vec, T val) // TypeVec<T, 3> overload: intentionally writes nothing
+        {
+        }
+        template <typename T> static __device__ __forceinline__ void setAlpha(typename TypeVec<T, 4>::vec_type& vec, T val) // TypeVec<T, 4> overload: stores val in .w
+        {
+            vec.w = val;
+        }
+        template <typename T> static __device__ __forceinline__ T getAlpha(const typename TypeVec<T, 3>::vec_type& vec) // TypeVec<T, 3> overload: reports ColorChannel<T>::max(); T must be given explicitly
+        {
+            return ColorChannel<T>::max();
+        }
+        template <typename T> static __device__ __forceinline__ T getAlpha(const typename TypeVec<T, 4>::vec_type& vec) // TypeVec<T, 4> overload: returns .w
+        {
+            return vec.w;
+        }
+
+        enum
+        {
+            yuv_shift  = 14, // shift handed to CV_DESCALE by the integer gray/YUV conversions
+            xyz_shift  = 12,
+            R2Y        = 4899, // integer gray weights; R2Y + G2Y + B2Y == 1 << yuv_shift
+            G2Y        = 9617,
+            B2Y        = 1868,
+            BLOCK_SIZE = 256
+        };
+    }
+
+////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
+
+    namespace detail
+    {
+        template <typename T, typename D, int bidx> struct RGB2RGB : public unary_function<T, D> // copies y, takes x/z from source channels bidx and bidx^2, forwards alpha
+        {
+            __device__ D operator()(const T& src) const
+            {
+                D res;
+                res.z = (&src.x)[bidx ^ 2];
+                res.x = (&src.x)[bidx];
+                res.y = src.y;
+                setAlpha(res, getAlpha<typename VecTraits<T>::elem_type>(src)); // no-op when D is the 3-channel vector type
+                return res;
+            }
+        };
+    }
+
+#define OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(name, scn, dcn, bidx) /* defines template struct name_traits<T>: functor_type is detail::RGB2RGB from TypeVec<T, scn> to TypeVec<T, dcn>; create_functor() default-constructs it */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef detail::RGB2RGB<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return detail::RGB2RGB<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type, bidx>(); \
+        } \
+    };
+
+/////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////
+
+    namespace detail
+    {
+        template <int GREEN_BITS, int bidx> struct RGB2RGB5x5Converter;
+        template<int bidx> struct RGB2RGB5x5Converter<6, bidx> 
+        {
+            template <typename T> static __device__ __forceinline__ ushort cvt(const T& src)
+            {
+                return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~3) << 3) | (((&src.x)[bidx^2] & ~7) << 8)); // channel bidx -> bits 0-4, y -> bits 5-10, channel bidx^2 -> bits 11-15
+            }
+        };
+        template<int bidx> struct RGB2RGB5x5Converter<5, bidx> 
+        {
+            static __device__ __forceinline__ ushort cvt(const uchar3& src)
+            {
+                return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7)); // channel bidx -> bits 0-4, y -> bits 5-9, channel bidx^2 -> bits 10-14
+            }
+            static __device__ __forceinline__ ushort cvt(const uchar4& src)
+            {
+                return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7) | (src.w ? 0x8000 : 0)); // same layout, plus bit 15 set when w is non-zero
+            }
+        };
+
+        template<typename T, int bidx, int GREEN_BITS> struct RGB2RGB5x5 : public unary_function<T, ushort> // functor wrapper: dispatches to the converter selected by GREEN_BITS
+        {
+            __device__ __forceinline__ ushort operator()(const T& src) const
+            {
+                return RGB2RGB5x5Converter<GREEN_BITS, bidx>::cvt(src);
+            }
+        };
+    }
+
+#define OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(name, scn, bidx, green_bits) /* defines non-template struct name_traits: functor_type is detail::RGB2RGB5x5 over uchar<scn> (token-pasted, e.g. uchar3) */ \
+    struct name ## _traits \
+    { \
+        typedef detail::RGB2RGB5x5<uchar ## scn, bidx, green_bits> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return detail::RGB2RGB5x5<uchar ## scn, bidx, green_bits>(); \
+        } \
+    };
+
+    namespace detail
+    {
+        template <int GREEN_BITS, int bidx> struct RGB5x52RGBConverter;    
+        template <int bidx> struct RGB5x52RGBConverter<5, bidx>
+        {
+            template <typename D> static __device__ void cvt(uint src, D& dst)
+            {            
+                (&dst.x)[bidx] = (uchar)(src << 3); // bits 0-4 of src land in the top 5 bits of the byte; the uchar cast drops the rest
+                dst.y = (uchar)((src >> 2) & ~7); // bits 5-9
+                (&dst.x)[bidx ^ 2] = (uchar)((src >> 7) & ~7); // bits 10-14
+                setAlpha(dst, (uchar)(src & 0x8000 ? 255 : 0)); // bit 15 selects alpha 255 or 0
+            }
+        };
+        template <int bidx> struct RGB5x52RGBConverter<6, bidx>
+        {
+            template <typename D>
+            static __device__ void cvt(uint src, D& dst)
+            {            
+                (&dst.x)[bidx] = (uchar)(src << 3); // bits 0-4 of src land in the top 5 bits of the byte
+                dst.y = (uchar)((src >> 3) & ~3); // bits 5-10 (6-bit field)
+                (&dst.x)[bidx ^ 2] = (uchar)((src >> 8) & ~7); // bits 11-15
+                setAlpha(dst, (uchar)(255)); // no alpha bit in this layout: always 255
+            }
+        };
+
+        template <typename D, int bidx, int GREEN_BITS> struct RGB5x52RGB : public unary_function<ushort, D> // functor wrapper around the converter selected by GREEN_BITS
+        {
+            __device__ __forceinline__ D operator()(ushort src) const
+            {
+                D dst;
+                RGB5x52RGBConverter<GREEN_BITS, bidx>::cvt((uint)src, dst);
+                return dst;
+            }
+        };
+    }
+
+#define OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(name, dcn, bidx, green_bits) /* defines non-template struct name_traits: functor_type is detail::RGB5x52RGB producing uchar<dcn> (token-pasted, e.g. uchar4) */ \
+    struct name ## _traits \
+    { \
+        typedef detail::RGB5x52RGB<uchar ## dcn, bidx, green_bits> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return detail::RGB5x52RGB<uchar ## dcn, bidx, green_bits>(); \
+        } \
+    };
+
+///////////////////////////////// Grayscale to Color ////////////////////////////////
+
+    namespace detail
+    {
+        template <typename T, typename D> struct Gray2RGB : public unary_function<T, D> // replicates one gray value into x, y and z
+        {
+            __device__ __forceinline__ D operator()(const T& src) const
+            {
+                D res;
+                res.x = src; res.y = res.x; res.z = res.y;
+                setAlpha(res, ColorChannel<typename VecTraits<T>::elem_type>::max()); // alpha, when D has one, is the channel maximum
+                return res;
+            }
+        };
+    }
+
+#define OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(name, dcn) /* defines template struct name_traits<T>: functor_type is detail::Gray2RGB from T to TypeVec<T, dcn> */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef detail::Gray2RGB<T, typename TypeVec<T, dcn>::vec_type> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return detail::Gray2RGB<T, typename TypeVec<T, dcn>::vec_type>(); \
+        } \
+    };
+
+    namespace detail
+    {
+        template <int GREEN_BITS> struct Gray2RGB5x5Converter;
+        template<> struct Gray2RGB5x5Converter<6> 
+        {
+            static __device__ __forceinline__ ushort cvt(uint gray)
+            {
+                return (ushort)(((gray & ~7) << 8) | ((gray & ~3) << 3) | (gray >> 3)); // 5-6-5 fields, all filled from the same gray value
+            }
+        };
+        template<> struct Gray2RGB5x5Converter<5> 
+        {
+            static __device__ __forceinline__ ushort cvt(uint gray)
+            {
+                const uint level = gray >> 3; // drop the low 3 bits
+                return (ushort)((level << 10) | (level << 5) | level); // same value in all three 5-bit fields
+            }
+        };
+
+        template<int GREEN_BITS> struct Gray2RGB5x5 : public unary_function<uchar, ushort> // functor wrapper around the converter selected by GREEN_BITS
+        {
+            __device__ __forceinline__ ushort operator()(uchar src) const
+            {
+                return Gray2RGB5x5Converter<GREEN_BITS>::cvt(static_cast<uint>(src));
+            }
+        };
+    }
+
+#define OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(name, green_bits) /* defines non-template struct name_traits: functor_type is detail::Gray2RGB5x5<green_bits> */ \
+    struct name ## _traits \
+    { \
+        typedef detail::Gray2RGB5x5<green_bits> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return detail::Gray2RGB5x5<green_bits>(); \
+        } \
+    };
+
+///////////////////////////////// Color to Grayscale ////////////////////////////////
+
+    namespace detail
+    {
+        template <int GREEN_BITS> struct RGB5x52GrayConverter;
+        template <> struct RGB5x52GrayConverter<6> 
+        {
+            static __device__ __forceinline__ uchar cvt(uint t)
+            {
+                return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 3) & 0xfc) * G2Y + ((t >> 8) & 0xf8) * R2Y, yuv_shift); // bits 0-4, 5-10 and 11-15 are widened to 8 bits and weighted by B2Y, G2Y, R2Y
+            }
+        };
+        template <> struct RGB5x52GrayConverter<5> 
+        {
+            static __device__ __forceinline__ uchar cvt(uint t)
+            {
+                return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 2) & 0xf8) * G2Y + ((t >> 7) & 0xf8) * R2Y, yuv_shift); // bits 0-4, 5-9 and 10-14 are widened to 8 bits and weighted by B2Y, G2Y, R2Y
+            }
+        };   
+
+        template<int GREEN_BITS> struct RGB5x52Gray : public unary_function<ushort, uchar> // functor wrapper around the converter selected by GREEN_BITS
+        {
+            __device__ __forceinline__ uchar operator()(ushort src) const
+            {
+                return RGB5x52GrayConverter<GREEN_BITS>::cvt((uint)src);
+            }
+        };
+    }
+
+#define OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(name, green_bits) /* defines non-template struct name_traits: functor_type is detail::RGB5x52Gray<green_bits> */ \
+    struct name ## _traits \
+    { \
+        typedef detail::RGB5x52Gray<green_bits> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return detail::RGB5x52Gray<green_bits>(); \
+        } \
+    };
+
+    namespace detail
+    {
+        template <int bidx, typename T> static __device__ __forceinline__ T RGB2GrayConvert(const T* src) // integer path: fixed-point weighted sum, rounded by CV_DESCALE
+        {
+            return (T)CV_DESCALE((unsigned)(src[bidx] * B2Y + src[1] * G2Y + src[bidx^2] * R2Y), yuv_shift); // src[bidx] gets the B weight, src[bidx^2] the R weight
+        }
+        template <int bidx> static __device__ __forceinline__ float RGB2GrayConvert(const float* src) // float overload: same weighting with floating-point coefficients
+        {
+            return src[bidx] * 0.114f + src[1] * 0.587f + src[bidx^2] * 0.299f;
+        }
+
+        template <typename T, typename D, int bidx> struct RGB2Gray : public unary_function<T, D> // functor: passes the address of src.x so the helpers can index the channels
+        {
+            __device__ __forceinline__ D operator()(const T& src) const
+            {
+                return RGB2GrayConvert<bidx>(&src.x);
+            }
+        };
+    }
+
+#define OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(name, scn, bidx) /* defines template struct name_traits<T>: functor_type is detail::RGB2Gray from TypeVec<T, scn> to T */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef detail::RGB2Gray<typename TypeVec<T, scn>::vec_type, T, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return detail::RGB2Gray<typename TypeVec<T, scn>::vec_type, T, bidx>(); \
+        } \
+    };
+
+///////////////////////////////////// RGB <-> YUV //////////////////////////////////////
+
+    namespace detail
+    {
+        __constant__ float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f }; // [0..2]: weights of the three channels in the first output; [3], [4]: scales of the two difference terms
+        __constant__ int   c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 }; // integer counterpart, used with CV_DESCALE(..., yuv_shift)
+
+        template <int bidx, typename T, typename D> static __device__ void RGB2YUVConvert(const T* src, D& dst) // integer path
+        {
+            const int delta = ColorChannel<T>::half() * (1 << yuv_shift); // half(), pre-scaled so it survives the descale below
+
+            const int Y = CV_DESCALE(src[0] * c_RGB2YUVCoeffs_i[bidx^2] + src[1] * c_RGB2YUVCoeffs_i[1] + src[2] * c_RGB2YUVCoeffs_i[bidx], yuv_shift); // note: here bidx indexes the coefficient table, not src
+            const int Cr = CV_DESCALE((src[bidx^2] - Y) * c_RGB2YUVCoeffs_i[3] + delta, yuv_shift); // (channel bidx^2 - Y) scaled by coefficient [3], offset by half()
+            const int Cb = CV_DESCALE((src[bidx] - Y) * c_RGB2YUVCoeffs_i[4] + delta, yuv_shift); // (channel bidx - Y) scaled by coefficient [4], offset by half()
+
+            dst.x = saturate_cast<T>(Y);
+            dst.y = saturate_cast<T>(Cr);
+            dst.z = saturate_cast<T>(Cb);
+        }
+        template <int bidx, typename D> static __device__ __forceinline__ void RGB2YUVConvert(const float* src, D& dst) // float overload: same formulas without fixed-point scaling
+        {
+            dst.x = src[0] * c_RGB2YUVCoeffs_f[bidx^2] + src[1] * c_RGB2YUVCoeffs_f[1] + src[2] * c_RGB2YUVCoeffs_f[bidx];
+            dst.y = (src[bidx^2] - dst.x) * c_RGB2YUVCoeffs_f[3] + ColorChannel<float>::half();
+            dst.z = (src[bidx] - dst.x) * c_RGB2YUVCoeffs_f[4] + ColorChannel<float>::half();
+        }
+
+        template <typename T, typename D, int bidx> struct RGB2YUV : public unary_function<T, D> // functor: only x, y, z of dst are written by the helpers
+        {
+            __device__ __forceinline__ D operator ()(const T& src) const
+            {
+                D dst;
+                RGB2YUVConvert<bidx>(&src.x, dst);
+                return dst;
+            }
+        };
+    }
+
+#define OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(name, scn, dcn, bidx) /* emits <name>_traits<T>: functor maps an scn-channel vector of T to a dcn-channel vector of T */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef detail::RGB2YUV<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); /* default-constructed instance of the typedef above */ \
+        } \
+    };
+
+    namespace detail
+    {
+        __constant__ float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f }; // float path; declared with 5 elements but only 4 initialisers - element [4] is zero and is not read in this block
+        __constant__ int   c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 }; // integer path, same layout; products are descaled by yuv_shift below
+
+        template <int bidx, typename T, typename D> static __device__ void YUV2RGBConvert(const T& src, D* dst) // fixed-point conversion of src.x/.y/.z into dst[0..2]; a float* destination binds to the float overload below
+        {
+            const int b = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[3], yuv_shift); // the half() offset is removed from src.y/src.z before scaling
+            const int g = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[2] + (src.y - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[1], yuv_shift);
+            const int r = src.x + CV_DESCALE((src.y - ColorChannel<D>::half()) * c_YUV2RGBCoeffs_i[0], yuv_shift);
+
+            dst[bidx] = saturate_cast<D>(b); // bidx picks the output slot for b; r goes to the opposite end (bidx^2)
+            dst[1] = saturate_cast<D>(g);
+            dst[bidx^2] = saturate_cast<D>(r);
+        }
+        template <int bidx, typename T> static __device__ __forceinline__ void YUV2RGBConvert(const T& src, float* dst) // float overload: same formula, no fixed-point scaling and no saturation
+        {
+            dst[bidx] = src.x + (src.z - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[3];
+            dst[1] = src.x + (src.z - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[2] + (src.y - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[1];
+            dst[bidx^2] = src.x + (src.y - ColorChannel<float>::half()) * c_YUV2RGBCoeffs_f[0];
+        }
+
+        template <typename T, typename D, int bidx> struct YUV2RGB : public unary_function<T, D> // per-pixel functor: T is the source vector type, D the destination vector type
+        {
+            __device__ __forceinline__ D operator ()(const T& src) const
+            {
+                D dst;
+
+                YUV2RGBConvert<bidx>(src, &dst.x); // fills the first three components of dst
+                setAlpha(dst, ColorChannel<typename VecTraits<T>::elem_type>::max()); // presumably writes the 4th component when D has one - TODO confirm setAlpha semantics
+
+                return dst;
+            }
+        };
+    }
+
+#define OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(name, scn, dcn, bidx) /* emits <name>_traits<T>: functor maps an scn-channel vector of T to a dcn-channel vector of T */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef detail::YUV2RGB<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); /* default-constructed instance of the typedef above */ \
+        } \
+    };
+
+///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
+    
+    namespace detail
+    {
+        __constant__ float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f}; // float path: [0..2] weight the three input channels for Y, [3] scales the Cr difference, [4] the Cb difference
+        __constant__ int   c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241}; // integer path, same layout; products are descaled by yuv_shift below
+        
+        template <int bidx, typename T, typename D> static __device__ void RGB2YCrCbConvert(const T* src, D& dst) // fixed-point conversion of src[0..2] into dst.x/.y/.z; a const float* argument binds to the float overload below
+        {
+            const int delta = ColorChannel<T>::half() * (1 << yuv_shift); // offset added to both chroma terms, pre-scaled so it survives the descale
+
+            const int Y = CV_DESCALE(src[0] * c_RGB2YCrCbCoeffs_i[bidx^2] + src[1] * c_RGB2YCrCbCoeffs_i[1] + src[2] * c_RGB2YCrCbCoeffs_i[bidx], yuv_shift); // for bidx in {0, 2}, bidx^2 is the opposite end, so bidx decides which of coefficients [0]/[2] pairs with src[0]
+            const int Cr = CV_DESCALE((src[bidx^2] - Y) * c_RGB2YCrCbCoeffs_i[3] + delta, yuv_shift);
+            const int Cb = CV_DESCALE((src[bidx] - Y) * c_RGB2YCrCbCoeffs_i[4] + delta, yuv_shift);
+
+            dst.x = saturate_cast<T>(Y); // each result is saturated to the input element type T
+            dst.y = saturate_cast<T>(Cr);
+            dst.z = saturate_cast<T>(Cb);
+        }
+        template <int bidx, typename D> static __device__ __forceinline__ void RGB2YCrCbConvert(const float* src, D& dst) // float overload: same formula, no fixed-point scaling and no saturation
+        {
+            dst.x = src[0] * c_RGB2YCrCbCoeffs_f[bidx^2] + src[1] * c_RGB2YCrCbCoeffs_f[1] + src[2] * c_RGB2YCrCbCoeffs_f[bidx];
+            dst.y = (src[bidx^2] - dst.x) * c_RGB2YCrCbCoeffs_f[3] + ColorChannel<float>::half(); // uses the Y value just stored in dst.x
+            dst.z = (src[bidx] - dst.x) * c_RGB2YCrCbCoeffs_f[4] + ColorChannel<float>::half();
+        }
+
+        template <typename T, typename D, int bidx> struct RGB2YCrCb : public unary_function<T, D> // per-pixel functor: T is the source vector type, D the destination vector type
+        {
+            __device__ __forceinline__ D operator ()(const T& src) const
+            {
+                D dst;
+                RGB2YCrCbConvert<bidx>(&src.x, dst); // passes the pixel as a pointer to its first component; the element type picks the overload
+                return dst;
+            }
+        };
+    }
+
+#define OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(name, scn, dcn, bidx) /* emits <name>_traits<T>: functor maps an scn-channel vector of T to a dcn-channel vector of T */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef detail::RGB2YCrCb<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); /* default-constructed instance of the typedef above */ \
+        } \
+    };
+
+    namespace detail
+    {
+        __constant__ float c_YCrCb2RGBCoeffs_f[5] = {1.403f, -0.714f, -0.344f, 1.773f}; // float path; declared with 5 elements but only 4 initialisers - element [4] is zero and is not read in this block
+        __constant__ int   c_YCrCb2RGBCoeffs_i[5] = {22987, -11698, -5636, 29049}; // integer path, same layout; products are descaled by yuv_shift below
+
+        template <int bidx, typename T, typename D> static __device__ void YCrCb2RGBConvert(const T& src, D* dst) // fixed-point conversion of src.x/.y/.z into dst[0..2]; a float* destination binds to the float overload below
+        {
+            const int b = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[3], yuv_shift); // the half() offset is removed from src.y/src.z before scaling
+            const int g = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[2] + (src.y - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[1], yuv_shift);
+            const int r = src.x + CV_DESCALE((src.y - ColorChannel<D>::half()) * c_YCrCb2RGBCoeffs_i[0], yuv_shift);
+
+            dst[bidx] = saturate_cast<D>(b); // bidx picks the output slot for b; r goes to the opposite end (bidx^2)
+            dst[1] = saturate_cast<D>(g);
+            dst[bidx^2] = saturate_cast<D>(r);
+        }
+        template <int bidx, typename T> static __device__ __forceinline__ void YCrCb2RGBConvert(const T& src, float* dst) // float overload: same formula, no scaling/saturation; 'static' added to match every other converter overload in this header
+        {
+            dst[bidx] = src.x + (src.z - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[3];
+            dst[1] = src.x + (src.z - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[2] + (src.y - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[1];
+            dst[bidx^2] = src.x + (src.y - ColorChannel<float>::half()) * c_YCrCb2RGBCoeffs_f[0];
+        }
+
+        template <typename T, typename D, int bidx> struct YCrCb2RGB : public unary_function<T, D> // per-pixel functor: T is the source vector type, D the destination vector type
+        {
+            __device__ __forceinline__ D operator ()(const T& src) const
+            {
+                D dst;
+
+                YCrCb2RGBConvert<bidx>(src, &dst.x); // fills the first three components of dst
+                setAlpha(dst, ColorChannel<typename VecTraits<T>::elem_type>::max()); // presumably writes the 4th component when D has one - TODO confirm setAlpha semantics
+
+                return dst;
+            }
+        };
+    }
+
+#define OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(name, scn, dcn, bidx) /* emits <name>_traits<T>: functor maps an scn-channel vector of T to a dcn-channel vector of T */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef detail::YCrCb2RGB<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); /* default-constructed instance of the typedef above */ \
+        } \
+    };
+
+////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
+
+    namespace detail
+    {
+        __constant__ float c_RGB2XYZ_D65f[9] = { 0.412453f, 0.357580f, 0.180423f, 0.212671f, 0.715160f, 0.072169f, 0.019334f, 0.119193f, 0.950227f }; // 3x3 matrix, row-major: row k produces output component k
+        __constant__ int   c_RGB2XYZ_D65i[9] = { 1689, 1465, 739, 871, 2929, 296, 79, 488, 3892 }; // fixed-point version of the same matrix; products are descaled by xyz_shift below
+
+        template <int bidx, typename T, typename D> static __device__ __forceinline__ void RGB2XYZConvert(const T* src, D& dst) // fixed-point matrix multiply; a const float* argument binds to the float overload below
+        {
+            dst.x = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[0] + src[1] * c_RGB2XYZ_D65i[1] + src[bidx] * c_RGB2XYZ_D65i[2], xyz_shift)); // matrix columns multiply src[bidx^2], src[1], src[bidx] in that order
+            dst.y = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[3] + src[1] * c_RGB2XYZ_D65i[4] + src[bidx] * c_RGB2XYZ_D65i[5], xyz_shift));
+            dst.z = saturate_cast<T>(CV_DESCALE(src[bidx^2] * c_RGB2XYZ_D65i[6] + src[1] * c_RGB2XYZ_D65i[7] + src[bidx] * c_RGB2XYZ_D65i[8], xyz_shift));
+        }
+        template <int bidx, typename D> static __device__ __forceinline__ void RGB2XYZConvert(const float* src, D& dst) // float overload: same matrix multiply, no descale and no saturation
+        {
+            dst.x = src[bidx^2] * c_RGB2XYZ_D65f[0] + src[1] * c_RGB2XYZ_D65f[1] + src[bidx] * c_RGB2XYZ_D65f[2];
+            dst.y = src[bidx^2] * c_RGB2XYZ_D65f[3] + src[1] * c_RGB2XYZ_D65f[4] + src[bidx] * c_RGB2XYZ_D65f[5];
+            dst.z = src[bidx^2] * c_RGB2XYZ_D65f[6] + src[1] * c_RGB2XYZ_D65f[7] + src[bidx] * c_RGB2XYZ_D65f[8];
+        }
+
+        template <typename T, typename D, int bidx> struct RGB2XYZ : public unary_function<T, D> // per-pixel functor: T is the source vector type, D the destination vector type
+        {
+            __device__ __forceinline__ D operator()(const T& src) const
+            {
+                D dst;
+                RGB2XYZConvert<bidx>(&src.x, dst); // passes the pixel as a pointer to its first component; the element type picks the overload
+                return dst;
+            }
+        };
+    }
+
+#define OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(name, scn, dcn, bidx) /* emits <name>_traits<T>: functor maps an scn-channel vector of T to a dcn-channel vector of T */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef detail::RGB2XYZ<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); /* default-constructed instance of the typedef above */ \
+        } \
+    };
+
+    namespace detail
+    {
+        __constant__ float c_XYZ2sRGB_D65f[9] = { 3.240479f, -1.53715f, -0.498535f, -0.969256f, 1.875991f, 0.041556f, 0.055648f, -0.204043f, 1.057311f }; // 3x3 matrix, row-major: rows feed dst[bidx^2], dst[1], dst[bidx] in that order
+        __constant__ int   c_XYZ2sRGB_D65i[9] = { 13273, -6296, -2042, -3970, 7684, 170, 228, -836, 4331 }; // fixed-point version of the same matrix; products are descaled by xyz_shift below
+
+        template <int bidx, typename T, typename D> static __device__ __forceinline__ void XYZ2RGBConvert(const T& src, D* dst) // fixed-point matrix multiply of src.x/.y/.z; a float* destination binds to the float overload below
+        {
+            dst[bidx^2] = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[0] + src.y * c_XYZ2sRGB_D65i[1] + src.z * c_XYZ2sRGB_D65i[2], xyz_shift)); // each result is saturated to the output element type D
+	        dst[1]      = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[3] + src.y * c_XYZ2sRGB_D65i[4] + src.z * c_XYZ2sRGB_D65i[5], xyz_shift));
+	        dst[bidx]   = saturate_cast<D>(CV_DESCALE(src.x * c_XYZ2sRGB_D65i[6] + src.y * c_XYZ2sRGB_D65i[7] + src.z * c_XYZ2sRGB_D65i[8], xyz_shift));
+        }
+        template <int bidx, typename T> static __device__ __forceinline__ void XYZ2RGBConvert(const T& src, float* dst) // float overload: same matrix multiply, no descale and no saturation
+        {
+            dst[bidx^2] = src.x * c_XYZ2sRGB_D65f[0] + src.y * c_XYZ2sRGB_D65f[1] + src.z * c_XYZ2sRGB_D65f[2];
+	        dst[1]      = src.x * c_XYZ2sRGB_D65f[3] + src.y * c_XYZ2sRGB_D65f[4] + src.z * c_XYZ2sRGB_D65f[5];
+	        dst[bidx]   = src.x * c_XYZ2sRGB_D65f[6] + src.y * c_XYZ2sRGB_D65f[7] + src.z * c_XYZ2sRGB_D65f[8];
+        }
+
+        template <typename T, typename D, int bidx> struct XYZ2RGB : public unary_function<T, D> // per-pixel functor: T is the source vector type, D the destination vector type
+        {
+            __device__ __forceinline__ D operator()(const T& src) const
+            {
+                D dst;
+                XYZ2RGBConvert<bidx>(src, &dst.x); // fills the first three components of dst
+                setAlpha(dst, ColorChannel<typename VecTraits<T>::elem_type>::max()); // presumably writes the 4th component when D has one - TODO confirm setAlpha semantics
+                return dst;
+            }
+        };
+    }
+
+#define OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(name, scn, dcn, bidx) /* emits <name>_traits<T>: functor maps an scn-channel vector of T to a dcn-channel vector of T */ \
+    template <typename T> struct name ## _traits \
+    { \
+        typedef detail::XYZ2RGB<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type, bidx> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); /* default-constructed instance of the typedef above */ \
+        } \
+    };
+
+////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
+
+    namespace detail
+    {
+        __constant__ int c_HsvDivTable   [256] = {0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211, 130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632, 65280, 61440, 58027, 54973, 52224, 49737, 47476, 45412, 43520, 41779, 40172, 38684, 37303, 36017, 34816, 33693, 32640, 31651, 30720, 29842, 29013, 28229, 27486, 26782, 26112, 25475, 24869, 24290, 23738, 23211, 22706, 22223, 21760, 21316, 20890, 20480, 20086, 19707, 19342, 18991, 18651, 18324, 18008, 17703, 17408, 17123, 16846, 16579, 16320, 16069, 15825, 15589, 15360, 15137, 14921, 14711, 14507, 14308, 14115, 13926, 13743, 13565, 13391, 13221, 13056, 12895, 12738, 12584, 12434, 12288, 12145, 12006, 11869, 11736, 11605, 11478, 11353, 11231, 11111, 10995, 10880, 10768, 10658, 10550, 10445, 10341, 10240, 10141, 10043, 9947, 9854, 9761, 9671, 9582, 9495, 9410, 9326, 9243, 9162, 9082, 9004, 8927, 8852, 8777, 8704, 8632, 8561, 8492, 8423, 8356, 8290, 8224, 8160, 8097, 8034, 7973, 7913, 7853, 7795, 7737, 7680, 7624, 7569, 7514, 7461, 7408, 7355, 7304, 7253, 7203, 7154, 7105, 7057, 7010, 6963, 6917, 6872, 6827, 6782, 6739, 6695, 6653, 6611, 6569, 6528, 6487, 6447, 6408, 6369, 6330, 6292, 6254, 6217, 6180, 6144, 6108, 6073, 6037, 6003, 5968, 5935, 5901, 5868, 5835, 5803, 5771, 5739, 5708, 5677, 5646, 5615, 5585, 5556, 5526, 5497, 5468, 5440, 5412, 5384, 5356, 5329, 5302, 5275, 5249, 5222, 5196, 5171, 5145, 5120, 5095, 5070, 5046, 5022, 4998, 4974, 4950, 4927, 4904, 4881, 4858, 4836, 4813, 4791, 4769, 4748, 4726, 4705, 4684, 4663, 4642, 4622, 4601, 4581, 4561, 4541, 4522, 4502, 4483, 4464, 4445, 4426, 4407, 4389, 4370, 4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229, 4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096};
+        __constant__ int c_HsvDivTable180[256] = {0, 122880, 61440, 40960, 30720, 24576, 20480, 17554, 15360, 13653, 12288, 11171, 10240, 9452, 8777, 8192, 7680, 7228, 6827, 6467, 6144, 5851, 5585, 5343, 5120, 4915, 4726, 4551, 4389, 4237, 4096, 3964, 3840, 3724, 3614, 3511, 3413, 3321, 3234, 3151, 3072, 2997, 2926, 2858, 2793, 2731, 2671, 2614, 2560, 2508, 2458, 2409, 2363, 2318, 2276, 2234, 2194, 2156, 2119, 2083, 2048, 2014, 1982, 1950, 1920, 1890, 1862, 1834, 1807, 1781, 1755, 1731, 1707, 1683, 1661, 1638, 1617, 1596, 1575, 1555, 1536, 1517, 1499, 1480, 1463, 1446, 1429, 1412, 1396, 1381, 1365, 1350, 1336, 1321, 1307, 1293, 1280, 1267, 1254, 1241, 1229, 1217, 1205, 1193, 1182, 1170, 1159, 1148, 1138, 1127, 1117, 1107, 1097, 1087, 1078, 1069, 1059, 1050, 1041, 1033, 1024, 1016, 1007, 999, 991, 983, 975, 968, 960, 953, 945, 938, 931, 924, 917, 910, 904, 897, 890, 884, 878, 871, 865, 859, 853, 847, 842, 836, 830, 825, 819, 814, 808, 803, 798, 793, 788, 783, 778, 773, 768, 763, 759, 754, 749, 745, 740, 736, 731, 727, 723, 719, 714, 710, 706, 702, 698, 694, 690, 686, 683, 679, 675, 671, 668, 664, 661, 657, 654, 650, 647, 643, 640, 637, 633, 630, 627, 624, 621, 617, 614, 611, 608, 605, 602, 599, 597, 594, 591, 588, 585, 582, 580, 577, 574, 572, 569, 566, 564, 561, 559, 556, 554, 551, 549, 546, 544, 541, 539, 537, 534, 532, 530, 527, 525, 523, 521, 518, 516, 514, 512, 510, 508, 506, 504, 502, 500, 497, 495, 493, 492, 490, 488, 486, 484, 482};
+        __constant__ int c_HsvDivTable256[256] = {0, 174763, 87381, 58254, 43691, 34953, 29127, 24966, 21845, 19418, 17476, 15888, 14564, 13443, 12483, 11651, 10923, 10280, 9709, 9198, 8738, 8322, 7944, 7598, 7282, 6991, 6722, 6473, 6242, 6026, 5825, 5638, 5461, 5296, 5140, 4993, 4855, 4723, 4599, 4481, 4369, 4263, 4161, 4064, 3972, 3884, 3799, 3718, 3641, 3567, 3495, 3427, 3361, 3297, 3236, 3178, 3121, 3066, 3013, 2962, 2913, 2865, 2819, 2774, 2731, 2689, 2648, 2608, 2570, 2533, 2497, 2461, 2427, 2394, 2362, 2330, 2300, 2270, 2241, 2212, 2185, 2158, 2131, 2106, 2081, 2056, 2032, 2009, 1986, 1964, 1942, 1920, 1900, 1879, 1859, 1840, 1820, 1802, 1783, 1765, 1748, 1730, 1713, 1697, 1680, 1664, 1649, 1633, 1618, 1603, 1589, 1574, 1560, 1547, 1533, 1520, 1507, 1494, 1481, 1469, 1456, 1444, 1432, 1421, 1409, 1398, 1387, 1376, 1365, 1355, 1344, 1334, 1324, 1314, 1304, 1295, 1285, 1276, 1266, 1257, 1248, 1239, 1231, 1222, 1214, 1205, 1197, 1189, 1181, 1173, 1165, 1157, 1150, 1142, 1135, 1128, 1120, 1113, 1106, 1099, 1092, 1085, 1079, 1072, 1066, 1059, 1053, 1046, 1040, 1034, 1028, 1022, 1016, 1010, 1004, 999, 993, 987, 982, 976, 971, 966, 960, 955, 950, 945, 940, 935, 930, 925, 920, 915, 910, 906, 901, 896, 892, 887, 883, 878, 874, 869, 865, 861, 857, 853, 848, 844, 840, 836, 832, 828, 824, 820, 817, 813, 809, 805, 802, 798, 794, 791, 787, 784, 780, 777, 773, 770, 767, 763, 760, 757, 753, 750, 747, 744, 741, 737, 734, 731, 728, 725, 722, 719, 716, 713, 710, 708, 705, 702, 699, 696, 694, 691, 688, 685};
+        
+        template <int bidx, int hr, typename D> static __device__ void RGB2HSVConvert(const uchar* src, D& dst) // 8-bit path: integer, table-driven conversion of src[0..2] into dst.x/.y/.z = h/s/v; hr is the hue range
+        {
+            const int hsv_shift = 12; // number of fractional bits carried by the divide-table products
+            const int* hdiv_table = hr == 180 ? c_HsvDivTable180 : c_HsvDivTable256; // any hr other than 180 falls back to the 256 table
+
+            int b = src[bidx], g = src[1], r = src[bidx^2]; // bidx is the position of blue; red sits at the opposite end
+            int h, s, v = b;
+            int vmin = b, diff;
+            int vr, vg;
+
+            v = ::max(v, g); // v = max(b, g, r)
+            v = ::max(v, r);
+            vmin = ::min(vmin, g); // vmin = min(b, g, r)
+            vmin = ::min(vmin, r);
+
+            diff = v - vmin;
+            vr = v == r ? -1 : 0; // all-ones mask when red is the maximum, else zero
+            vg = v == g ? -1 : 0; // all-ones mask when green is the maximum, else zero
+
+            s = (diff * c_HsvDivTable[v] + (1 << (hsv_shift-1))) >> hsv_shift; // table multiply stands in for a division by v (entries look like round(255 * 2^12 / i) - TODO confirm); entry 0 is 0, so v == 0 yields s == 0
+            h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff)))); // branch-free select: red max -> g-b; else green max -> b-r+2*diff; else r-g+4*diff
+            h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift; // scale by the table entry for diff, with rounding; entry 0 is 0, so diff == 0 yields h == 0
+            h += h < 0 ? hr : 0; // wrap a negative hue into [0, hr)
+
+            dst.x = saturate_cast<uchar>(h);
+            dst.y = (uchar)s;
+            dst.z = (uchar)v;
+        }
+        template <int bidx, int hr, typename D> static __device__ void RGB2HSVConvert(const float* src, D& dst) // float path: src[0..2] -> dst.x/.y/.z = h/s/v, with h scaled so a full turn equals hr
+        {
+            const float hscale = hr * (1.f / 360.f); // converts degrees to the requested hue range
+
+            float b = src[bidx], g = src[1], r = src[bidx^2]; // bidx is the position of blue; red sits at the opposite end
+            float h, s, v;
+
+            float vmin, diff;
+
+            v = vmin = r;
+            v = fmax(v, g); // v = max(r, g, b)
+            v = fmax(v, b);
+            vmin = fmin(vmin, g); // vmin = min(r, g, b)
+            vmin = fmin(vmin, b);
+
+            diff = v - vmin;
+            s = diff / (float)(::fabs(v) + numeric_limits<float>::epsilon()); // epsilon keeps the division defined when v == 0
+            diff = (float)(60. / (diff + numeric_limits<float>::epsilon())); // diff is reused as the hue slope, 60 / (max - min); the 60. literal is a double
+
+            if (v == r)
+                h = (g - b) * diff; // red is the maximum
+            else if (v == g)
+                h = (b - r) * diff + 120.f; // green is the maximum
+            else
+                h = (r - g) * diff + 240.f; // blue is the maximum
+
+            if (h < 0) h += 360.f; // wrap a negative hue by one full turn
+
+            dst.x = h * hscale;
+            dst.y = s;
+            dst.z = v;
+        }
+
+        template <typename T, typename D, int bidx, int hr> struct RGB2HSV : public unary_function<T, D> // per-pixel functor: T is the source vector type, D the destination vector type, hr the hue range
+        {
+            __device__ __forceinline__ D operator()(const T& src) const
+            {
+                D dst;
+                RGB2HSVConvert<bidx, hr>(&src.x, dst); // the element type of src (uchar or float) picks the overload
+                return dst;
+            }
+        };
+    }
+
+#define OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(name, scn, dcn, bidx) /* emits <name>_traits and <name>_full_traits, each with a float specialisation; they differ only in the hue range handed to detail::RGB2HSV */ \
+    template <typename T> struct name ## _traits /* generic T: hue range 180 */ \
+    { \
+        typedef detail::RGB2HSV<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type, bidx, 180> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <typename T> struct name ## _full_traits /* generic T: hue range 256 */ \
+    { \
+        typedef detail::RGB2HSV<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type, bidx, 256> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _traits<float> /* float: hue range 360 */ \
+    { \
+        typedef detail::RGB2HSV<typename TypeVec<float, scn>::vec_type, typename TypeVec<float, dcn>::vec_type, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _full_traits<float> /* float: hue range 360, same as the non-full float variant */ \
+    { \
+        typedef detail::RGB2HSV<typename TypeVec<float, scn>::vec_type, typename TypeVec<float, dcn>::vec_type, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace detail
+    {
+        __constant__ int c_HsvSectorData[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} }; // one row per hue sector: indices into tab[] for b, g, r (in that order)
+
+        template <int bidx, int HR, typename T> static __device__ void HSV2RGBConvert(const T& src, float* dst) // float path: src.x/.y/.z = h/s/v -> three floats in dst; HR is the hue value of one full turn
+        {
+            const float hscale = 6.f / HR; // rescales hue so that each of the six sectors is one unit wide
+            
+            float h = src.x, s = src.y, v = src.z; // inputs are copied to locals before any write, so dst may alias src (the uchar overload relies on this)
+            float b, g, r;
+
+            if( s == 0 )
+                b = g = r = v; // zero saturation: grey, hue is ignored
+            else
+            {
+                float tab[4];
+                int sector;
+                h *= hscale;
+                if( h < 0 )
+                    do h += 6; while( h < 0 ); // wrap negative hue upwards
+                else if( h >= 6 )
+                    do h -= 6; while( h >= 6 ); // wrap large hue downwards
+                sector = __float2int_rd(h); // round toward -infinity, i.e. floor
+                h -= sector; // keep only the fractional position inside the sector
+                if( static_cast<unsigned>(sector) >= 6u ) { sector = 0; h = 0.f; } // a tiny negative hue rounds to exactly 6 after the +6 wrap; fold it to sector 0 so c_HsvSectorData is never read out of range
+                tab[0] = v;
+                tab[1] = v*(1.f - s);
+                tab[2] = v*(1.f - s*h);
+                tab[3] = v*(1.f - s*(1.f - h));
+
+                b = tab[c_HsvSectorData[sector][0]];
+                g = tab[c_HsvSectorData[sector][1]];
+                r = tab[c_HsvSectorData[sector][2]];
+            }
+
+            dst[bidx] = b; // bidx is the output position of blue; red goes to the opposite end
+            dst[1] = g;
+            dst[bidx^2] = r;
+        }
+        template <int bidx, int HR, typename T> static __device__ void HSV2RGBConvert(const T& src, uchar* dst) // 8-bit path: converts through the float overload above
+        {
+            float3 buf;
+
+            buf.x = src.x; // hue is passed through unscaled; HR accounts for its range
+            buf.y = src.y * (1.f/255.f); // s and v are brought to [0, 1]
+            buf.z = src.z * (1.f/255.f);
+
+            HSV2RGBConvert<bidx, HR>(buf, &buf.x); // in-place: buf is both input and output, already in final channel order
+
+            dst[0] = saturate_cast<uchar>(buf.x * 255.f); // scale back to 8 bits with saturation
+            dst[1] = saturate_cast<uchar>(buf.y * 255.f);
+            dst[2] = saturate_cast<uchar>(buf.z * 255.f);
+        }
+
+        template <typename T, typename D, int bidx, int HR> struct HSV2RGB : public unary_function<T, D> // per-pixel functor: T is the source vector type, D the destination vector type
+        {
+            __device__ __forceinline__ D operator()(const T& src) const
+            {
+                D dst;
+                HSV2RGBConvert<bidx, HR>(src, &dst.x); // the element type of dst (uchar or float) picks the overload
+                setAlpha(dst, ColorChannel<typename VecTraits<T>::elem_type>::max()); // presumably writes the 4th component when D has one - TODO confirm setAlpha semantics
+                return dst;
+            }
+        };
+    }
+
+#define OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(name, scn, dcn, bidx) /* emits <name>_traits and <name>_full_traits, each with a float specialisation; they differ only in the hue range handed to detail::HSV2RGB */ \
+    template <typename T> struct name ## _traits /* generic T: hue range 180 */ \
+    { \
+        typedef detail::HSV2RGB<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type, bidx, 180> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <typename T> struct name ## _full_traits /* generic T: hue range 255. NOTE(review): the RGB2HSV full traits use 256 - confirm the asymmetry is intended */ \
+    { \
+        typedef detail::HSV2RGB<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type, bidx, 255> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _traits<float> /* float: hue range 360 */ \
+    { \
+        typedef detail::HSV2RGB<typename TypeVec<float, scn>::vec_type, typename TypeVec<float, dcn>::vec_type, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _full_traits<float> /* float: hue range 360, same as the non-full float variant */ \
+    { \
+        typedef detail::HSV2RGB<typename TypeVec<float, scn>::vec_type, typename TypeVec<float, dcn>::vec_type, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+/////////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
+
+    namespace detail
+    {
+        template <int bidx, int hr, typename D> static __device__ void RGB2HLSConvert(const float* src, D& dst) // float path: src[0..2] -> dst.x/.y/.z = h/l/s, with h scaled so a full turn equals hr
+        {
+            const float hscale = hr * (1.f / 360.f); // converts degrees to the requested hue range
+
+            float b = src[bidx], g = src[1], r = src[bidx^2]; // inputs are copied to locals before any write, so dst may alias src (the uchar overload relies on this)
+            float h = 0.f, s = 0.f, l; // h and s stay 0 when the pixel is (nearly) grey
+            float vmin, vmax, diff;
+
+            vmax = vmin = r;
+            vmax = fmax(vmax, g); // vmax = max(r, g, b)
+            vmax = fmax(vmax, b);
+            vmin = fmin(vmin, g); // vmin = min(r, g, b)
+            vmin = fmin(vmin, b);
+
+            diff = vmax - vmin;
+            l = (vmax + vmin) * 0.5f;
+
+            if (diff > numeric_limits<float>::epsilon()) // hue and saturation are only computed when the channels differ
+            {
+                s = l < 0.5f ? diff / (vmax + vmin) : diff / (2.0f - vmax - vmin);
+                diff = 60.f / diff; // diff is reused as the hue slope
+
+                if (vmax == r)
+                    h = (g - b)*diff; // red is the maximum
+                else if (vmax == g)
+                    h = (b - r)*diff + 120.f; // green is the maximum
+                else
+                    h = (r - g)*diff + 240.f; // blue is the maximum
+
+                if (h < 0.f) h += 360.f; // wrap a negative hue by one full turn
+            }
+
+            dst.x = h * hscale;
+            dst.y = l;
+            dst.z = s;
+        }
+        template <int bidx, int hr, typename D> static __device__ void RGB2HLSConvert(const uchar* src, D& dst) // 8-bit path: converts through the float overload above
+        {
+            float3 buf;
+
+            buf.x = src[0]*(1.f/255.f); // bring all three channels to [0, 1], keeping their order
+            buf.y = src[1]*(1.f/255.f);
+            buf.z = src[2]*(1.f/255.f);
+
+            RGB2HLSConvert<bidx, hr>(&buf.x, buf); // in-place: buf is read as a 3-float array and overwritten with h/l/s
+
+            dst.x = saturate_cast<uchar>(buf.x); // hue is already in hr units, so it is not rescaled
+            dst.y = saturate_cast<uchar>(buf.y*255.f);
+            dst.z = saturate_cast<uchar>(buf.z*255.f);
+        }
+
+        template <typename T, typename D, int bidx, int hr> struct RGB2HLS : public unary_function<T, D> // per-pixel functor: T is the source vector type, D the destination vector type
+        {
+            __device__ __forceinline__ D operator()(const T& src) const
+            {
+                D dst;
+                RGB2HLSConvert<bidx, hr>(&src.x, dst); // the element type of src (uchar or float) picks the overload
+                return dst;
+            }
+        };
+    }
+
+#define OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(name, scn, dcn, bidx) /* emits <name>_traits and <name>_full_traits, each with a float specialisation; they differ only in the hue range handed to detail::RGB2HLS */ \
+    template <typename T> struct name ## _traits /* generic T: hue range 180 */ \
+    { \
+        typedef detail::RGB2HLS<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type, bidx, 180> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <typename T> struct name ## _full_traits /* generic T: hue range 256 */ \
+    { \
+        typedef detail::RGB2HLS<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type, bidx, 256> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _traits<float> /* float: hue range 360 */ \
+    { \
+        typedef detail::RGB2HLS<typename TypeVec<float, scn>::vec_type, typename TypeVec<float, dcn>::vec_type, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _full_traits<float> /* float: hue range 360, same as the non-full float variant */ \
+    { \
+        typedef detail::RGB2HLS<typename TypeVec<float, scn>::vec_type, typename TypeVec<float, dcn>::vec_type, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+
+    namespace detail
+    {
+        __constant__ int c_HlsSectorData[6][3] = { {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0} }; // one row per hue sector: indices into tab[] for b, g, r (in that order)
+
+        template <int bidx, int HR, typename T> static __device__ void HLS2RGBConvert(const T& src, float* dst) // float path: src.x/.y/.z = h/l/s -> three floats in dst; HR is the hue value of one full turn
+        {
+            const float hscale = 6.0f / HR; // rescales hue so that each of the six sectors is one unit wide
+
+            float h = src.x, l = src.y, s = src.z; // inputs are copied to locals before any write, so dst may alias src (the uchar overload relies on this)
+            float b, g, r;
+
+            if (s == 0)
+                b = g = r = l; // zero saturation: grey, hue is ignored
+            else
+            {
+                float tab[4];
+                int sector;
+
+                float p2 = l <= 0.5f ? l * (1 + s) : l + s - l * s; // upper bound of the channel values
+                float p1 = 2 * l - p2; // lower bound, chosen so that (p1 + p2) / 2 == l
+
+                h *= hscale;
+
+                if( h < 0 )
+                    do h += 6; while( h < 0 ); // wrap negative hue upwards
+                else if( h >= 6 )
+                    do h -= 6; while( h >= 6 ); // wrap large hue downwards
+
+                sector = __float2int_rd(h); // round toward -infinity, i.e. floor
+                h -= sector; // keep only the fractional position inside the sector
+                if( static_cast<unsigned>(sector) >= 6u ) { sector = 0; h = 0.f; } // a tiny negative hue rounds to exactly 6 after the +6 wrap; fold it to sector 0 so c_HlsSectorData is never read out of range
+                tab[0] = p2;
+                tab[1] = p1;
+                tab[2] = p1 + (p2 - p1) * (1 - h); // falling ramp across the sector
+                tab[3] = p1 + (p2 - p1) * h; // rising ramp across the sector
+
+                b = tab[c_HlsSectorData[sector][0]];
+                g = tab[c_HlsSectorData[sector][1]];
+                r = tab[c_HlsSectorData[sector][2]];
+            }
+
+            dst[bidx] = b; // bidx is the output position of blue; red goes to the opposite end
+            dst[1] = g;
+            dst[bidx^2] = r;
+        }
+        template <int bidx, int HR, typename T> static __device__ void HLS2RGBConvert(const T& src, uchar* dst) // 8-bit path: converts through the float overload above
+        {
+            float3 buf;
+
+            buf.x = src.x; // hue is passed through unscaled; HR accounts for its range
+            buf.y = src.y*(1.f/255.f); // l and s are brought to [0, 1]
+            buf.z = src.z*(1.f/255.f);
+
+            HLS2RGBConvert<bidx, HR>(buf, &buf.x); // in-place: buf is both input and output, already in final channel order
+
+            dst[0] = saturate_cast<uchar>(buf.x*255.f); // scale back to 8 bits with saturation
+            dst[1] = saturate_cast<uchar>(buf.y*255.f);
+            dst[2] = saturate_cast<uchar>(buf.z*255.f);
+        }
+
+        template <typename T, typename D, int bidx, int HR> struct HLS2RGB : public unary_function<T, D> // per-pixel functor: T is the source vector type, D the destination vector type
+        {
+            __device__ __forceinline__ D operator()(const T& src) const
+            {
+                D dst;
+                HLS2RGBConvert<bidx, HR>(src, &dst.x); // the element type of dst (uchar or float) picks the overload
+                setAlpha(dst, ColorChannel<typename VecTraits<T>::elem_type>::max()); // presumably writes the 4th component when D has one - TODO confirm setAlpha semantics
+                return dst;
+            }
+        };
+    }
+
+#define OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(name, scn, dcn, bidx) /* emits <name>_traits and <name>_full_traits, each with a float specialisation; they differ only in the hue range handed to detail::HLS2RGB */ \
+    template <typename T> struct name ## _traits /* generic T: hue range 180 */ \
+    { \
+        typedef detail::HLS2RGB<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type, bidx, 180> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <typename T> struct name ## _full_traits /* generic T: hue range 255. NOTE(review): the RGB2HLS full traits use 256 - confirm the asymmetry is intended */ \
+    { \
+        typedef detail::HLS2RGB<typename TypeVec<T, scn>::vec_type, typename TypeVec<T, dcn>::vec_type, bidx, 255> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _traits<float> /* float: hue range 360 */ \
+    { \
+        typedef detail::HLS2RGB<typename TypeVec<float, scn>::vec_type, typename TypeVec<float, dcn>::vec_type, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    }; \
+    template <> struct name ## _full_traits<float> /* float: hue range 360, same as the non-full float variant */ \
+    { \
+        typedef detail::HLS2RGB<typename TypeVec<float, scn>::vec_type, typename TypeVec<float, dcn>::vec_type, bidx, 360> functor_type; \
+        static __host__ __device__ __forceinline__ functor_type create_functor() \
+        { \
+            return functor_type(); \
+        } \
+    };
+}}}
+
+#endif // __OPENCV_GPU_COLOR_DETAIL_HPP__
diff --git a/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp b/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9fb653ddaf5cba64b1ad7b93eaafa339c9159a2e
--- /dev/null
+++ b/modules/gpu/src/opencv2/gpu/device/detail/transform.hpp
@@ -0,0 +1,429 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
+#define __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
+
+#include "internal_shared.hpp"
+#include "vec_traits.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+    namespace detail
+    {
+        //! Mask accessor: a predicate over (row, column) backed by a mask image.
+        class MaskReader
+        {
+        public:
+            //! Stores a copy of the mask descriptor supplied by the caller.
+            explicit MaskReader(const PtrStep& mask_) : mask(mask_) {}
+
+            //! Returns the mask element at row y, column x, converted to bool.
+            __device__ __forceinline__ bool operator()(int y, int x) const { return static_cast<bool>(mask.ptr(y)[x]); }
+        private:
+            PtrStep mask;
+        };
+
+        //! Mask predicate that accepts every position; both coordinates are ignored.
+        struct NoMask
+        {
+            __device__ __forceinline__ bool operator()(int /*y*/, int /*x*/) const { return true; }
+        };
+        //! Read Write Traits
+        //! 'shift' is the number of consecutive elements grouped into one vector type.
+        //! It is chosen from the destination element size alone: 1-byte destinations are
+        //! grouped by 4, 2-byte destinations by 2, and every other size is left ungrouped.
+        template <size_t src_elem_size, size_t dst_elem_size> struct UnReadWriteTraits_
+        {
+            enum { shift = 1 };
+        };
+        template <size_t src_elem_size> struct UnReadWriteTraits_<src_elem_size, 2>
+        {
+            enum { shift = 2 };
+        };
+        template <size_t src_elem_size> struct UnReadWriteTraits_<src_elem_size, 1>
+        {
+            enum { shift = 4 };
+        };
+
+        //! Unary-operation traits: the grouping factor plus the matching vector types for T and D.
+        template <typename T, typename D> struct UnReadWriteTraits
+        {
+            enum { shift = UnReadWriteTraits_<sizeof(T), sizeof(D)>::shift };
+            typedef typename TypeVec<D, shift>::vec_type write_type;
+            typedef typename TypeVec<T, shift>::vec_type read_type;
+        };
+
+        //! Grouping factor for binary operations. Only the destination element size matters:
+        //! 1 byte -> 4 elements per vector, 2 bytes -> 2, anything else -> 1 (source sizes are ignored).
+        template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinReadWriteTraits_
+        {
+            enum { shift = 1 };
+        };
+        template <size_t src_elem_size1, size_t src_elem_size2> struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 2>
+        {
+            enum { shift = 2 };
+        };
+        template <size_t src_elem_size1, size_t src_elem_size2> struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 1>
+        {
+            enum { shift = 4 };
+        };
+
+        //! Binary-operation traits: grouping factor and vector types for two sources (T1, T2) and one destination (D).
+        template <typename T1, typename T2, typename D> struct BinReadWriteTraits
+        {
+            enum { shift = BinReadWriteTraits_<sizeof(T1), sizeof(T2), sizeof(D)>::shift };
+            typedef typename TypeVec<D,  shift>::vec_type write_type;
+            typedef typename TypeVec<T1, shift>::vec_type read_type1;
+            typedef typename TypeVec<T2, shift>::vec_type read_type2;
+        };
+
+        //! Transform kernels
+
+        //! Applies an operator lane by lane; a lane is written only when the mask accepts its column.
+        template <int shift> struct OpUnroller;
+        template <> struct OpUnroller<1>
+        {
+            //! Unary form: lane x corresponds to column x_shifted of row y.
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted)) { dst.x = op(src.x); }
+            }
+            //! Binary form: same lane layout, the operator receives one lane from each source.
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted)) { dst.x = op(src1.x, src2.x); }
+            }
+        };
+        //! Two-lane specialization. Each lane is written only when the mask accepts its column;
+        //! a rejected lane leaves the matching component of dst untouched.
+        template <> struct OpUnroller<2>
+        {
+            //! Unary form: lanes x, y correspond to columns x_shifted, x_shifted + 1 of row y.
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))     { dst.x = op(src.x); }
+                if (mask(y, x_shifted + 1)) { dst.y = op(src.y); }
+            }
+
+            //! Binary form: same lane-to-column mapping, one lane taken from each source.
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))     { dst.x = op(src1.x, src2.x); }
+                if (mask(y, x_shifted + 1)) { dst.y = op(src1.y, src2.y); }
+            }
+        };
+        //! Three-lane specialization. Each lane is written only when the mask accepts its column;
+        //! a rejected lane leaves the matching component of dst untouched.
+        template <> struct OpUnroller<3>
+        {
+            //! Unary form: lanes x, y, z correspond to columns x_shifted, x_shifted + 1 and
+            //! x_shifted + 2 of row y. Mask tests and operator calls alternate lane by lane.
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))     { dst.x = op(src.x); }
+                if (mask(y, x_shifted + 1)) { dst.y = op(src.y); }
+                if (mask(y, x_shifted + 2)) { dst.z = op(src.z); }
+            }
+
+            //! Binary form: same lane-to-column mapping; the operator receives the matching
+            //! lane of each source vector.
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))     { dst.x = op(src1.x, src2.x); }
+                if (mask(y, x_shifted + 1)) { dst.y = op(src1.y, src2.y); }
+                if (mask(y, x_shifted + 2)) { dst.z = op(src1.z, src2.z); }
+            }
+        };
+        //! Four-lane specialization.
+        //! Each lane is written only when the mask accepts its column; a rejected lane leaves
+        //! the matching component of dst untouched, so the value dst holds on entry is what
+        //! the caller ends up with for that lane.
+        template <> struct OpUnroller<4>
+        {
+            //! Unary form: lanes x, y, z, w correspond to columns x_shifted .. x_shifted + 3
+            //! of row y. Mask tests and operator calls alternate lane by lane.
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))     { dst.x = op(src.x); }
+                if (mask(y, x_shifted + 1)) { dst.y = op(src.y); }
+                if (mask(y, x_shifted + 2)) { dst.z = op(src.z); }
+                if (mask(y, x_shifted + 3)) { dst.w = op(src.w); }
+            }
+
+            //! Binary form: same lane-to-column mapping; the operator receives the matching
+            //! lane of each source vector.
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))     { dst.x = op(src1.x, src2.x); }
+                if (mask(y, x_shifted + 1)) { dst.y = op(src1.y, src2.y); }
+                if (mask(y, x_shifted + 2)) { dst.z = op(src1.z, src2.z); }
+                if (mask(y, x_shifted + 3)) { dst.w = op(src1.w, src2.w); }
+            }
+        };
+
+        //! Vectorized unary transform: each thread handles 'shift' consecutive elements of one row,
+        //! loading and storing them as one vector; a partial group at the row end is done per element.
+        template <typename T, typename D, typename UnOp, typename Mask>
+        __global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep_<D> dst_, const Mask mask, UnOp op)
+        {
+            typedef typename UnReadWriteTraits<T, D>::read_type read_type;
+            typedef typename UnReadWriteTraits<T, D>::write_type write_type;
+            const int shift = UnReadWriteTraits<T, D>::shift;
+
+            const int x = threadIdx.x + blockIdx.x * blockDim.x;
+            const int y = threadIdx.y + blockIdx.y * blockDim.y;
+            const int x_shifted = x * shift; // first column handled by this thread
+
+            if (y < src_.rows)
+            {
+                const T* src = src_.ptr(y);
+                D* dst = dst_.ptr(y);
+
+                if (x_shifted + shift - 1 < src_.cols)
+                {
+                    const read_type src_n_el = ((const read_type*)src)[x];
+                    // Seed with the current destination: lanes rejected by the mask must be stored
+                    // back unchanged rather than overwritten with indeterminate values.
+                    write_type dst_n_el = ((const write_type*)dst)[x];
+                    OpUnroller<shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
+                    ((write_type*)dst)[x] = dst_n_el;
+                }
+                else
+                {
+                    // Fewer than 'shift' columns remain in this row: process them one at a time.
+                    for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
+                        if (mask(y, real_x)) dst[real_x] = op(src[real_x]);
+                }
+            }
+        }
+
+        //! One thread per element: dst(y, x) = op(src(y, x)) for in-range positions accepted by the mask.
+        template <typename T, typename D, typename UnOp, typename Mask>
+        static __global__ void transformSimple(const DevMem2D_<T> src, PtrStep_<D> dst, const Mask mask, UnOp op)
+        {
+            const int col = blockDim.x * blockIdx.x + threadIdx.x;
+            const int row = blockDim.y * blockIdx.y + threadIdx.y;
+
+            // Threads outside the image, or on a position the mask rejects, have nothing to do.
+            if (col >= src.cols || row >= src.rows || !mask(row, col)) return;
+            dst.ptr(row)[col] = op(src.ptr(row)[col]);
+        }
+
+        //! Vectorized binary transform: each thread handles 'shift' consecutive elements of one row,
+        //! loading both sources and storing the result as vectors; a partial group at the row end is done per element.
+        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+        __global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep_<T2> src2_, PtrStep_<D> dst_, 
+            const Mask mask, BinOp op)
+        {
+            typedef typename BinReadWriteTraits<T1, T2, D>::read_type1 read_type1;
+            typedef typename BinReadWriteTraits<T1, T2, D>::read_type2 read_type2;
+            typedef typename BinReadWriteTraits<T1, T2, D>::write_type write_type;
+            const int shift = BinReadWriteTraits<T1, T2, D>::shift;
+
+            const int x = threadIdx.x + blockIdx.x * blockDim.x;
+            const int y = threadIdx.y + blockIdx.y * blockDim.y;
+            const int x_shifted = x * shift; // first column handled by this thread
+
+            if (y < src1_.rows)
+            {
+                const T1* src1 = src1_.ptr(y);
+                const T2* src2 = src2_.ptr(y);
+                D* dst = dst_.ptr(y);
+
+                if (x_shifted + shift - 1 < src1_.cols)
+                {
+                    const read_type1 src1_n_el = ((const read_type1*)src1)[x];
+                    const read_type2 src2_n_el = ((const read_type2*)src2)[x];
+                    // Seed with the current destination: lanes rejected by the mask must be stored
+                    // back unchanged rather than overwritten with indeterminate values.
+                    write_type dst_n_el = ((const write_type*)dst)[x];
+                    OpUnroller<shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
+                    ((write_type*)dst)[x] = dst_n_el;
+                }
+                else
+                {
+                    // Fewer than 'shift' columns remain in this row: process them one at a time.
+                    for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
+                        if (mask(y, real_x)) dst[real_x] = op(src1[real_x], src2[real_x]);
+                }
+            }
+        }
+
+        //! One thread per element: dst(y, x) = op(src1(y, x), src2(y, x)) for in-range positions accepted by the mask.
+        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+        static __global__ void transformSimple(const DevMem2D_<T1> src1, const PtrStep_<T2> src2, PtrStep_<D> dst, 
+            const Mask mask, BinOp op)
+        {
+            const int col = blockDim.x * blockIdx.x + threadIdx.x;
+            const int row = blockDim.y * blockIdx.y + threadIdx.y;
+
+            // Threads outside the image, or on a position the mask rejects, have nothing to do.
+            if (col >= src1.cols || row >= src1.rows || !mask(row, col)) return;
+            T1 first_operand = src1.ptr(row)[col];
+            T2 second_operand = src2.ptr(row)[col];
+            dst.ptr(row)[col] = op(first_operand, second_operand);
+        }
+
+        //! Host-side kernel launcher selected at compile time:
+        //! <false> launches the per-element kernels, <true> the vectorized ones.
+        template <bool UseSmart> struct TransformDispatcher;
+        template<> struct TransformDispatcher<false>
+        {
+            //! Unary launch: 16x16 thread blocks, one thread per element of src.
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask, cudaStream_t stream)
+            {
+                const dim3 block(16, 16, 1);
+                const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y), 1);
+
+                transformSimple<T, D><<<grid, block, 0, stream>>>(src, dst, mask, op);
+                cudaSafeCall( cudaGetLastError() );
+
+                // Only a launch on stream 0 is followed by a device-wide synchronization.
+                if (stream == 0)
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+
+            //! Binary launch: same geometry, sized from src1.
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, BinOp op, const Mask& mask, cudaStream_t stream)
+            {
+                const dim3 block(16, 16, 1);
+                const dim3 grid(divUp(src1.cols, block.x), divUp(src1.rows, block.y), 1);
+
+                transformSimple<T1, T2, D><<<grid, block, 0, stream>>>(src1, src2, dst, mask, op);
+                cudaSafeCall( cudaGetLastError() );
+
+                // Only a launch on stream 0 is followed by a device-wide synchronization.
+                if (stream == 0)
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };
+        //! Launcher for the vectorized kernels: every thread covers 'shift' columns, so the grid is narrower in x.
+        //! NOTE(review): the kernels reinterpret each row pointer as a pointer to a 'shift'-element vector.
+        //! This presumably requires the data pointer and row step of every image to be suitably aligned -
+        //! confirm that callers guarantee it (e.g. for sub-matrix views).
+        template<> struct TransformDispatcher<true>
+        {
+            //! Unary launch: 16x16 thread blocks, grid width computed from cols / (16 * shift), rounded up.
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask, cudaStream_t stream)
+            {
+                const int shift = UnReadWriteTraits<T, D>::shift;
+                const dim3 block(16, 16, 1);
+                const dim3 grid(divUp(src.cols, block.x * shift), divUp(src.rows, block.y), 1);
+
+                transformSmart<T, D><<<grid, block, 0, stream>>>(src, dst, mask, op);
+                cudaSafeCall( cudaGetLastError() );
+
+                // Only a launch on stream 0 is followed by a device-wide synchronization.
+                if (stream == 0)
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+
+            //! Binary launch: same geometry, sized from src1.
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, BinOp op, const Mask& mask, cudaStream_t stream)
+            {
+                const int shift = BinReadWriteTraits<T1, T2, D>::shift;
+                const dim3 block(16, 16, 1);
+                const dim3 grid(divUp(src1.cols, block.x * shift), divUp(src1.rows, block.y), 1);
+
+                transformSmart<T1, T2, D><<<grid, block, 0, stream>>>(src1, src2, dst, mask, op);
+                cudaSafeCall( cudaGetLastError() );
+
+                // Only a launch on stream 0 is followed by a device-wide synchronization.
+                if (stream == 0)
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };
+
+        //! Compile-time switch for unary transforms: the vectorized path is chosen only when
+        //! VecTraits<>::cn is 1 for both T and D and the grouping factor differs from 1.
+        template <typename T, typename D, int scn, int dcn> struct UseSmartUn_ { static const bool value = false; };
+        template <typename T, typename D> struct UseSmartUn_<T, D, 1, 1>
+        {
+            static const bool value = !(UnReadWriteTraits<T, D>::shift == 1);
+        };
+
+        template <typename T, typename D> struct UseSmartUn
+        {
+            static const bool value = UseSmartUn_<T, D, VecTraits<T>::cn, VecTraits<D>::cn>::value;
+        };
+
+        //! Compile-time switch for binary transforms: the vectorized path is chosen only when
+        //! VecTraits<>::cn is 1 for T1, T2 and D and the grouping factor differs from 1.
+        template <typename T1, typename T2, typename D, int src1cn, int src2cn, int dstcn> struct UseSmartBin_ { static const bool value = false; };
+        template <typename T1, typename T2, typename D> struct UseSmartBin_<T1, T2, D, 1, 1, 1>
+        {
+            static const bool value = !(BinReadWriteTraits<T1, T2, D>::shift == 1);
+        };
+
+        template <typename T1, typename T2, typename D> struct UseSmartBin
+        {
+            static const bool value = UseSmartBin_<T1, T2, D, VecTraits<T1>::cn, VecTraits<T2>::cn, VecTraits<D>::cn>::value;
+        };
+
+        template <typename T, typename D, typename UnOp, typename Mask>
+        static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask, cudaStream_t stream)
+        {   // Unary entry point: UseSmartUn<T, D>::value selects the dispatcher at compile time.
+            TransformDispatcher< UseSmartUn<T, D>::value >::call(src, dst, op, mask, stream);
+        }
+
+        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+        static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, BinOp op, const Mask& mask, cudaStream_t stream)
+        {   // Binary entry point: UseSmartBin<T1, T2, D>::value selects the dispatcher at compile time.
+            TransformDispatcher< UseSmartBin<T1, T2, D>::value >::call(src1, src2, dst, op, mask, stream);
+        }
+    }
+}}}
+
+#endif // __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
diff --git a/modules/gpu/src/opencv2/gpu/device/functional.hpp b/modules/gpu/src/opencv2/gpu/device/functional.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..28889945e6bfb279fcddffcecdaed1bde50b58ba
--- /dev/null
+++ b/modules/gpu/src/opencv2/gpu/device/functional.hpp
@@ -0,0 +1,338 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_FUNCTIONAL_HPP__
+#define __OPENCV_GPU_FUNCTIONAL_HPP__
+
+#include <thrust/functional.h>
+#include "internal_shared.hpp"
+#include "saturate_cast.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+    using thrust::unary_function;
+    using thrust::binary_function;
+
+    using thrust::plus;
+    using thrust::minus;
+    using thrust::multiplies;
+    using thrust::divides;
+    using thrust::modulus;
+    using thrust::negate;
+    
+    using thrust::equal_to;
+    using thrust::not_equal_to;
+    using thrust::greater;
+    using thrust::less;
+    using thrust::greater_equal;
+    using thrust::less_equal;
+    
+    using thrust::logical_and;
+    using thrust::logical_or;
+    using thrust::logical_not;
+
+    using thrust::bit_and;
+    using thrust::bit_or;
+    using thrust::bit_xor;
+    //! Bitwise complement functor: returns ~v converted back to T.
+    template <typename T> struct bit_not : unary_function<T, T>
+    {
+        __forceinline__ __device__ T operator ()(const T& v) const { return ~v; }
+    };
+    using thrust::identity;
+
+    //! maximum / minimum functors: the generic templates compare with operator<, the specializations forward to the listed functions.
+#define OPENCV_GPU_IMPLEMENT_MINMAX(functor, elem_type, device_fn) \
+    template <> struct functor<elem_type> : public binary_function<elem_type, elem_type, elem_type> \
+    { \
+        __forceinline__ __device__ elem_type operator()(elem_type lhs, elem_type rhs) const { return device_fn(lhs, rhs); } \
+    };
+
+    template <typename T> struct maximum : public binary_function<T, T, T>
+    {
+        __forceinline__ __device__ T operator()(const T& lhs, const T& rhs) const { if (lhs < rhs) return rhs; return lhs; }
+    };
+    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, max)
+    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, schar, max)
+    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, char, max)
+    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, ushort, max)
+    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, short, max)
+    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, int, max)
+    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uint, max)
+    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, float, fmax)
+    OPENCV_GPU_IMPLEMENT_MINMAX(maximum, double, fmax)
+
+    template <typename T> struct minimum : public binary_function<T, T, T>
+    {
+        __forceinline__ __device__ T operator()(const T &lhs, const T &rhs) const { if (lhs < rhs) return lhs; return rhs; }
+    };
+    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, min)
+    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, schar, min)
+    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, char, min)
+    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, ushort, min)
+    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, short, min)
+    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, int, min)
+    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uint, min)
+    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, float, fmin)
+    OPENCV_GPU_IMPLEMENT_MINMAX(minimum, double, fmin)
+#undef OPENCV_GPU_IMPLEMENT_MINMAX
+    
+    using thrust::project1st;
+    using thrust::project2nd;
+
+    using thrust::unary_negate;
+    using thrust::not1;
+
+    using thrust::binary_negate;
+    using thrust::not2;
+
+    //! Functor generators for math functions.
+    //! For a function <func>, the generic <func>_func<T> calls the single-precision <func>f and
+    //! returns float, while the <func>_func<double> specialization calls <func> and returns double.
+    //! operator() is const-qualified, like the other functors in this header, so the generated
+    //! functors can also be invoked through const objects and const references.
+#define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(func) \
+    template <typename T> struct func ## _func : public unary_function<T, float> \
+    { \
+        __forceinline__ __device__ float operator ()(const T& v) const { return func ## f(v); } \
+    }; \
+    template <> struct func ## _func<double> : public unary_function<double, double> \
+    { \
+        __forceinline__ __device__ double operator ()(double v) const { return func(v); } \
+    };
+
+    //! Two-argument counterpart of the generator above; both operands share the type T.
+#define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(func) \
+    template <typename T> struct func ## _func : public binary_function<T, T, float> \
+    { \
+        __forceinline__ __device__ float operator ()(const T& v1, const T& v2) const { return func ## f(v1, v2); } \
+    }; \
+    template <> struct func ## _func<double> : public binary_function<double, double, double> \
+    { \
+        __forceinline__ __device__ double operator ()(double v1, double v2) const { return func(v1, v2); } \
+    };
+
+    // absolute value and square root
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs)
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt)
+    // exponentials and logarithms
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp)
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2)
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp10)
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log)
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log2)
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log10)
+    // trigonometric functions and their inverses
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sin)
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cos)
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tan)
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asin)
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acos)
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atan)
+    // hyperbolic functions and their inverses
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sinh)
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cosh)
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tanh)
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asinh)
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acosh)
+    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atanh)
+
+    // two-argument functions
+    OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(hypot)
+    OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(atan2)
+    OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(pow)
+
+#undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR
+#undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR
+
+    //! Sum of the squares of the two operands (a * a + b * b), returned as T.
+    //! NOTE(review): the base class names float as result type while operator() returns T - confirm which is intended.
+    template<typename T> struct hypot_sqr_func : public binary_function<T, T, float>
+    {
+        __forceinline__ __device__ T operator ()(T a, T b) const
+            { return a * a + b * b; }
+    };
+
+    //! Unary functor that converts a value of type T to D through saturate_cast<D>.
+    //! operator() is const-qualified, like the other functors in this header, so the functor
+    //! can also be invoked through const objects and const references.
+    template <typename T, typename D> struct saturate_cast_func : public unary_function<T, D>
+    {
+        __forceinline__ __device__ D operator ()(const T& v) const { return saturate_cast<D>(v); }
+    };
+
+    //! Binary threshold: yields maxVal where src > thresh and 0 everywhere else.
+    template <typename T> struct thresh_binary_func : public unary_function<T, T>
+    {
+        __forceinline__ __host__ __device__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
+        __forceinline__ __device__ T operator()(const T& src) const
+        {
+            if (src > thresh) return maxVal;
+            return 0;
+        }
+        T thresh;
+        T maxVal;
+    };
+    //! Inverted binary threshold: yields 0 where src > thresh and maxVal everywhere else.
+    template <typename T> struct thresh_binary_inv_func : public unary_function<T, T>
+    {
+        __forceinline__ __host__ __device__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
+        __forceinline__ __device__ T operator()(const T& src) const
+        {
+            if (src > thresh) return 0;
+            return maxVal;
+        }
+        T thresh;
+        T maxVal;
+    };
+    //! Truncating threshold: yields minimum<T>(src, thresh). The second constructor argument is accepted but not used.
+    template <typename T> struct thresh_trunc_func : public unary_function<T, T>
+    {
+        explicit __forceinline__ __host__ __device__ thresh_trunc_func(T thresh_, T /*maxVal_*/ = 0) : thresh(thresh_) {}
+        __forceinline__ __device__ T operator()(const T& src) const
+        {
+            minimum<T> smaller_of;
+            return smaller_of(src, thresh);
+        }
+        T thresh;
+    };
+    //! To-zero threshold: keeps src where src > thresh, yields 0 elsewhere. The second constructor argument is not used.
+    template <typename T> struct thresh_to_zero_func : public unary_function<T, T>
+    {
+        explicit __forceinline__ __host__ __device__ thresh_to_zero_func(T thresh_, T /*maxVal_*/ = 0) : thresh(thresh_) {}
+
+        __forceinline__ __device__ T operator()(const T& src) const
+        {
+            if (src > thresh) return src;
+            return 0;
+        }
+        T thresh;
+    };
+    //! Inverted to-zero threshold: yields 0 where src > thresh, keeps src elsewhere. The second constructor argument is not used.
+    template <typename T> struct thresh_to_zero_inv_func : public unary_function<T, T>
+    {
+        explicit __forceinline__ __host__ __device__ thresh_to_zero_inv_func(T thresh_, T /*maxVal_*/ = 0) : thresh(thresh_) {}
+
+        __forceinline__ __device__ T operator()(const T& src) const
+        {
+            if (src > thresh) return 0;
+            return src;
+        }
+        T thresh;
+    };
+
+    //! Adapts a binary functor into a unary one by fixing its first argument to a stored value.
+    template <typename Op> struct binder1st : public unary_function<typename Op::second_argument_type, typename Op::result_type>
+    {
+        __forceinline__ __host__ __device__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}
+
+        __forceinline__ __device__ typename Op::result_type operator ()(const typename Op::second_argument_type& a) { return op(arg1, a); }
+
+        Op op;
+        typename Op::first_argument_type arg1;
+    };
+    //! Builds a binder1st<Op>, converting x to Op's first argument type.
+    template <typename Op, typename T> static __forceinline__ __host__ __device__ binder1st<Op> bind1st(const Op& op, const T& x)
+    {
+        typedef typename Op::first_argument_type bound_type;
+        return binder1st<Op>(op, bound_type(x));
+    }
+    //! Adapts a binary functor into a unary one by fixing its second argument to a stored value.
+    template <typename Op> struct binder2nd : public unary_function<typename Op::first_argument_type, typename Op::result_type>
+    {
+        __forceinline__ __host__ __device__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}
+
+        __forceinline__ __device__ typename Op::result_type operator ()(const typename Op::first_argument_type& a) { return op(a, arg2); }
+
+        Op op;
+        typename Op::second_argument_type arg2;
+    };
+    //! Builds a binder2nd<Op>, converting x to Op's second argument type.
+    template <typename Op, typename T> static __forceinline__ __host__ __device__ binder2nd<Op> bind2nd(const Op& op, const T& x)
+    {
+        typedef typename Op::second_argument_type bound_type;
+        return binder2nd<Op>(op, bound_type(x));
+    }
+
+    //! Maps a pair of types (T1, T2) to a single argument_type
+    //! (presumably the type both operands of a binary operation are converted to - confirm against callers):
+    //!     T1 and T2 identical             -> that type
+    //!     double paired with any type     -> double
+    //!     float paired with a non-double  -> float
+    //!     every other pair                -> int
+    //! Pairs that match more than one partial specialization (double/double, float/float,
+    //! double/float, float/double) are settled by explicit specializations.
+    template <typename T1, typename T2>
+    struct BinOpTraits { typedef int argument_type; };
+
+    template <typename T>
+    struct BinOpTraits<T, T> { typedef T argument_type; };
+
+    // double on either side selects double
+    template <typename T>
+    struct BinOpTraits<T, double> { typedef double argument_type; };
+
+    template <typename T>
+    struct BinOpTraits<double, T> { typedef double argument_type; };
+
+    // float on either side selects float
+    template <typename T>
+    struct BinOpTraits<T, float> { typedef float argument_type; };
+
+    template <typename T>
+    struct BinOpTraits<float, T> { typedef float argument_type; };
+
+    // explicit specializations for the pairs that would otherwise be ambiguous
+    template <>
+    struct BinOpTraits<double, double> { typedef double argument_type; };
+
+    template <>
+    struct BinOpTraits<float, float> { typedef float argument_type; };
+
+    template <>
+    struct BinOpTraits<double, float> { typedef double argument_type; };
+
+    template <>
+    struct BinOpTraits<float, double> { typedef double argument_type; };
+}}}
+
+#endif // __OPENCV_GPU_FUNCTIONAL_HPP__
diff --git a/modules/gpu/src/opencv2/gpu/device/limits_gpu.hpp b/modules/gpu/src/opencv2/gpu/device/limits.hpp
similarity index 92%
rename from modules/gpu/src/opencv2/gpu/device/limits_gpu.hpp
rename to modules/gpu/src/opencv2/gpu/device/limits.hpp
index 4baa2f9307e33f8f36b30b5c663209219241aa20..b0b73f2e3590399b728739a6677f6e43420db9b3 100644
--- a/modules/gpu/src/opencv2/gpu/device/limits_gpu.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/limits.hpp
@@ -45,7 +45,7 @@
 
 namespace cv { namespace gpu { namespace device
 {
-    template<class T> struct numeric_limits_gpu
+    template<class T> struct numeric_limits
     {
         typedef T type;
         __device__ __forceinline__ static type min()  { return type(); };
@@ -59,7 +59,7 @@ namespace cv { namespace gpu { namespace device
         static const bool is_signed;
     };
 
-    template<> struct numeric_limits_gpu<bool>
+    template<> struct numeric_limits<bool>
     {
         typedef bool type;
         __device__ __forceinline__ static type min() { return false; };
@@ -73,7 +73,7 @@ namespace cv { namespace gpu { namespace device
         static const bool is_signed = false;
     };
 
-    template<> struct numeric_limits_gpu<char>
+    template<> struct numeric_limits<char>
     {
         typedef char type;
         __device__ __forceinline__ static type min() { return CHAR_MIN; };
@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device
         static const bool is_signed = (char)-1 == -1;
     };
 
-     template<> struct numeric_limits_gpu<signed char>
+     template<> struct numeric_limits<signed char>
     {
         typedef char type;
         __device__ __forceinline__ static type min() { return CHAR_MIN; };
@@ -101,7 +101,7 @@ namespace cv { namespace gpu { namespace device
         static const bool is_signed = (signed char)-1 == -1;
     };
 
-    template<> struct numeric_limits_gpu<unsigned char>
+    template<> struct numeric_limits<unsigned char>
     {
         typedef unsigned char type;
         __device__ __forceinline__ static type min() { return 0; };
@@ -115,7 +115,7 @@ namespace cv { namespace gpu { namespace device
         static const bool is_signed = false;
     };
 
-    template<> struct numeric_limits_gpu<short>
+    template<> struct numeric_limits<short>
     {
         typedef short type;
         __device__ __forceinline__ static type min() { return SHRT_MIN; };
@@ -129,7 +129,7 @@ namespace cv { namespace gpu { namespace device
         static const bool is_signed = true;
     };
 
-    template<> struct numeric_limits_gpu<unsigned short>
+    template<> struct numeric_limits<unsigned short>
     {
         typedef unsigned short type;
         __device__ __forceinline__ static type min() { return 0; };
@@ -143,7 +143,7 @@ namespace cv { namespace gpu { namespace device
         static const bool is_signed = false;
     };
 
-    template<> struct numeric_limits_gpu<int>
+    template<> struct numeric_limits<int>
     {
         typedef int type;
         __device__ __forceinline__ static type min() { return INT_MIN; };
@@ -158,7 +158,7 @@ namespace cv { namespace gpu { namespace device
     };
 
 
-    template<> struct numeric_limits_gpu<unsigned int>
+    template<> struct numeric_limits<unsigned int>
     {
         typedef unsigned int type;
         __device__ __forceinline__ static type min() { return 0; };
@@ -172,7 +172,7 @@ namespace cv { namespace gpu { namespace device
         static const bool is_signed = false;
     };
 
-    template<> struct numeric_limits_gpu<long>
+    template<> struct numeric_limits<long>
     {
         typedef long type;
         __device__ __forceinline__ static type min() { return LONG_MIN; };
@@ -186,7 +186,7 @@ namespace cv { namespace gpu { namespace device
         static const bool is_signed = true;
     };
 
-    template<> struct numeric_limits_gpu<unsigned long>
+    template<> struct numeric_limits<unsigned long>
     {
         typedef unsigned long type;
         __device__ __forceinline__ static type min() { return 0; };
@@ -200,7 +200,7 @@ namespace cv { namespace gpu { namespace device
         static const bool is_signed = false;
     };
 
-    template<> struct numeric_limits_gpu<float>
+    template<> struct numeric_limits<float>
     {
         typedef float type;
         __device__ __forceinline__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; };
@@ -214,7 +214,7 @@ namespace cv { namespace gpu { namespace device
         static const bool is_signed = true;
     };
 
-    template<> struct numeric_limits_gpu<double>
+    template<> struct numeric_limits<double>
     {
         typedef double type;
         __device__ __forceinline__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; };
diff --git a/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp b/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp
index 34265242418d4c497e7659148c0e2e4312bc16ca..55c9cb99b832039e0ce30b60d40f3d97ad847e30 100644
--- a/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp
@@ -45,128 +45,122 @@
 
 #include "internal_shared.hpp"
 
-namespace cv
+namespace cv { namespace gpu { namespace device
 {
-    namespace gpu
-    {
-        namespace device
-        {
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); } // Generic fallbacks, one overload per source type: a plain _Tp(v) conversion with no clamping.
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }
+    template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); } // Destination/source pairs that clamp are provided as explicit specializations of these templates.
 
-            template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
-            { return (uchar)max((int)v, 0); }
-            template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
-            { return (uchar)min((uint)v, (uint)UCHAR_MAX); }
-            template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
-            { return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
-            template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
-            { return (uchar)min(v, (uint)UCHAR_MAX); }
-            template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
-            { return saturate_cast<uchar>((uint)v); }
+    template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
+    { return (uchar)max((int)v, 0); } // negative inputs clamp to 0
+    template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
+    { return (uchar)min((uint)v, (uint)UCHAR_MAX); } // inputs above UCHAR_MAX clamp to UCHAR_MAX
+    template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
+    { return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); } // one unsigned compare accepts 0..UCHAR_MAX; otherwise the sign of v picks the bound
+    template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
+    { return (uchar)min(v, (uint)UCHAR_MAX); } // inputs above UCHAR_MAX clamp to UCHAR_MAX
+    template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
+    { return saturate_cast<uchar>((int)v); } // widen as signed and reuse the int specialization: a uint cast would wrap negative inputs to huge values that clamp to UCHAR_MAX instead of 0
 
-            template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
-            { int iv = __float2int_rn(v); return saturate_cast<uchar>(iv); }
-            template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
-            {
-            #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
-                int iv = __double2int_rn(v); return saturate_cast<uchar>(iv);
-            #else
-                return saturate_cast<uchar>((float)v);
-            #endif
-            }
+    template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
+    { int iv = __float2int_rn(v); return saturate_cast<uchar>(iv); } // convert to int with the __float2int_rn intrinsic, then clamp through the int specialization
+    template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
+    {
+    #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
+        int iv = __double2int_rn(v); return saturate_cast<uchar>(iv); // __CUDA_ARCH__ >= 130: convert the double directly (presumably targets with native double support)
+    #else
+        return saturate_cast<uchar>((float)v); // otherwise narrow to float and reuse the float specialization
+    #endif
+    }
 
-            template<> static __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
-            { return (schar)min((int)v, SCHAR_MAX); }
-            template<> static __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
-            { return (schar)min((uint)v, (uint)SCHAR_MAX); }
-            template<> static __device__ __forceinline__ schar saturate_cast<schar>(int v)
-            {
-                return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ?
-                            v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
-            }
-            template<> static __device__ __forceinline__ schar saturate_cast<schar>(short v)
-            { return saturate_cast<schar>((int)v); }
-            template<> static __device__ __forceinline__ schar saturate_cast<schar>(uint v)
-            { return (schar)min(v, (uint)SCHAR_MAX); }
+    template<> static __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
+    { return (schar)min((int)v, SCHAR_MAX); } // inputs above SCHAR_MAX clamp to SCHAR_MAX
+    template<> static __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
+    { return (schar)min((uint)v, (uint)SCHAR_MAX); } // inputs above SCHAR_MAX clamp to SCHAR_MAX
+    template<> static __device__ __forceinline__ schar saturate_cast<schar>(int v)
+    {
+        return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? // subtracting SCHAR_MIN shifts the valid range onto 0..UCHAR_MAX, so one unsigned compare checks both bounds
+                    v : v > 0 ? SCHAR_MAX : SCHAR_MIN); // out of range: the sign of v picks the bound
+    }
+    template<> static __device__ __forceinline__ schar saturate_cast<schar>(short v)
+    { return saturate_cast<schar>((int)v); } // widen to int and reuse the int specialization
+    template<> static __device__ __forceinline__ schar saturate_cast<schar>(uint v)
+    { return (schar)min(v, (uint)SCHAR_MAX); } // inputs above SCHAR_MAX clamp to SCHAR_MAX
 
-            template<> static __device__ __forceinline__ schar saturate_cast<schar>(float v)
-            { int iv = __float2int_rn(v); return saturate_cast<schar>(iv); }
-            template<> static __device__ __forceinline__ schar saturate_cast<schar>(double v)
-            {             
-            #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
-                int iv = __double2int_rn(v); return saturate_cast<schar>(iv);
-            #else
-                return saturate_cast<schar>((float)v);
-            #endif
-            }
+    template<> static __device__ __forceinline__ schar saturate_cast<schar>(float v)
+    { int iv = __float2int_rn(v); return saturate_cast<schar>(iv); } // convert to int with the __float2int_rn intrinsic, then clamp through the int specialization
+    template<> static __device__ __forceinline__ schar saturate_cast<schar>(double v)
+    {             
+    #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
+        int iv = __double2int_rn(v); return saturate_cast<schar>(iv); // __CUDA_ARCH__ >= 130: convert the double directly (presumably targets with native double support)
+    #else
+        return saturate_cast<schar>((float)v); // otherwise narrow to float and reuse the float specialization
+    #endif
+    }
 
-            template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
-            { return (ushort)max((int)v, 0); }
-            template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
-            { return (ushort)max((int)v, 0); }
-            template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
-            { return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
-            template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
-            { return (ushort)min(v, (uint)USHRT_MAX); }
-            template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
-            { int iv = __float2int_rn(v); return saturate_cast<ushort>(iv); }
-            template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
-            {             
-            #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
-                int iv = __double2int_rn(v); return saturate_cast<ushort>(iv);
-            #else
-                return saturate_cast<ushort>((float)v);
-            #endif
-            }
+    template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
+    { return (ushort)max((int)v, 0); } // negative inputs clamp to 0
+    template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
+    { return (ushort)max((int)v, 0); } // negative inputs clamp to 0
+    template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
+    { return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); } // one unsigned compare accepts 0..USHRT_MAX; otherwise the sign of v picks the bound
+    template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
+    { return (ushort)min(v, (uint)USHRT_MAX); } // inputs above USHRT_MAX clamp to USHRT_MAX
+    template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
+    { int iv = __float2int_rn(v); return saturate_cast<ushort>(iv); } // convert to int with the __float2int_rn intrinsic, then clamp through the int specialization
+    template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
+    {             
+    #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
+        int iv = __double2int_rn(v); return saturate_cast<ushort>(iv); // __CUDA_ARCH__ >= 130: convert the double directly (presumably targets with native double support)
+    #else
+        return saturate_cast<ushort>((float)v); // otherwise narrow to float and reuse the float specialization
+    #endif
+    }
 
-            template<> static __device__ __forceinline__ short saturate_cast<short>(ushort v)
-            { return (short)min((int)v, SHRT_MAX); }
-            template<> static __device__ __forceinline__ short saturate_cast<short>(int v)
-            {
-                return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ?
-                        v : v > 0 ? SHRT_MAX : SHRT_MIN);
-            }
-            template<> static __device__ __forceinline__ short saturate_cast<short>(uint v)
-            { return (short)min(v, (uint)SHRT_MAX); }
-            template<> static __device__ __forceinline__ short saturate_cast<short>(float v)
-            { int iv = __float2int_rn(v); return saturate_cast<short>(iv); }
-            template<> static __device__ __forceinline__ short saturate_cast<short>(double v)
-            {            
-            #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
-                int iv = __double2int_rn(v); return saturate_cast<short>(iv);
-            #else
-                return saturate_cast<short>((float)v);
-            #endif
-            }
+    template<> static __device__ __forceinline__ short saturate_cast<short>(ushort v)
+    { return (short)min((int)v, SHRT_MAX); } // inputs above SHRT_MAX clamp to SHRT_MAX
+    template<> static __device__ __forceinline__ short saturate_cast<short>(int v)
+    {
+        return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? // subtracting SHRT_MIN shifts the valid range onto 0..USHRT_MAX, so one unsigned compare checks both bounds
+                v : v > 0 ? SHRT_MAX : SHRT_MIN); // out of range: the sign of v picks the bound
+    }
+    template<> static __device__ __forceinline__ short saturate_cast<short>(uint v)
+    { return (short)min(v, (uint)SHRT_MAX); } // inputs above SHRT_MAX clamp to SHRT_MAX
+    template<> static __device__ __forceinline__ short saturate_cast<short>(float v)
+    { int iv = __float2int_rn(v); return saturate_cast<short>(iv); } // convert to int with the __float2int_rn intrinsic, then clamp through the int specialization
+    template<> static __device__ __forceinline__ short saturate_cast<short>(double v)
+    {            
+    #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
+        int iv = __double2int_rn(v); return saturate_cast<short>(iv); // __CUDA_ARCH__ >= 130: convert the double directly (presumably targets with native double support)
+    #else
+        return saturate_cast<short>((float)v); // otherwise narrow to float and reuse the float specialization
+    #endif
+    }
 
-            template<> static __device__ __forceinline__ int saturate_cast<int>(float v) { return __float2int_rn(v); }
-            template<> static __device__ __forceinline__ int saturate_cast<int>(double v) 
-            {
-            #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 
-                return __double2int_rn(v);
-            #else
-                return saturate_cast<int>((float)v);
-            #endif
-            }
+    template<> static __device__ __forceinline__ int saturate_cast<int>(float v) { return __float2int_rn(v); } // delegates entirely to the __float2int_rn intrinsic; no extra clamping is done here
+    template<> static __device__ __forceinline__ int saturate_cast<int>(double v) 
+    {
+    #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130 
+        return __double2int_rn(v); // __CUDA_ARCH__ >= 130: convert the double directly (presumably targets with native double support)
+    #else
+        return saturate_cast<int>((float)v); // otherwise narrow to float and reuse the float specialization
+    #endif
+    }
 
-            template<> static __device__ __forceinline__ uint saturate_cast<uint>(float v){ return __float2uint_rn(v); }
-            template<> static __device__ __forceinline__ uint saturate_cast<uint>(double v) 
-            {            
-            #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
-                return __double2uint_rn(v);
-            #else
-                return saturate_cast<uint>((float)v);
-            #endif
-            }
-        }
+    template<> static __device__ __forceinline__ uint saturate_cast<uint>(float v){ return __float2uint_rn(v); } // delegates entirely to the __float2uint_rn intrinsic; no extra clamping is done here
+    template<> static __device__ __forceinline__ uint saturate_cast<uint>(double v) 
+    {            
+    #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
+        return __double2uint_rn(v); // __CUDA_ARCH__ >= 130: convert the double directly (presumably targets with native double support)
+    #else
+        return saturate_cast<uint>((float)v); // otherwise narrow to float and reuse the float specialization
+    #endif
     }
-}
+}}}
 
 #endif /* __OPENCV_GPU_SATURATE_CAST_HPP__ */
\ No newline at end of file
diff --git a/modules/gpu/src/opencv2/gpu/device/transform.hpp b/modules/gpu/src/opencv2/gpu/device/transform.hpp
index 65d4ad9bc9408df88e4f46d63209be4e9b10cc4f..f2e447269e0fd1e9c02394795ef54d5dabc78c85 100644
--- a/modules/gpu/src/opencv2/gpu/device/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp
@@ -43,421 +43,34 @@
 #ifndef __OPENCV_GPU_TRANSFORM_HPP__
 #define __OPENCV_GPU_TRANSFORM_HPP__
 
-#include "internal_shared.hpp"
-#include "vecmath.hpp"
+#include "detail/transform.hpp"
 
 namespace cv { namespace gpu { namespace device
 {
-    //! Mask accessor
-
-    class MaskReader
-    {
-    public:
-        explicit MaskReader(const PtrStep& mask_): mask(mask_) {}
-
-        __device__ __forceinline__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; }
-
-    private:
-        PtrStep mask;
-    };
-
-    struct NoMask 
-    {
-        __device__ __forceinline__ bool operator()(int y, int x) const { return true; } 
-    };
-
-    //! Read Write Traits
-
-    template <size_t src_elem_size, size_t dst_elem_size>
-    struct UnReadWriteTraits_
-    {
-        enum { shift = 1 };
-    };
-    template <size_t src_elem_size>
-    struct UnReadWriteTraits_<src_elem_size, 1>
-    {
-        enum { shift = 4 };
-    };
-    template <size_t src_elem_size>
-    struct UnReadWriteTraits_<src_elem_size, 2>
-    {
-        enum { shift = 2 };
-    };
-    template <typename T, typename D> struct UnReadWriteTraits
-    {
-        enum { shift = UnReadWriteTraits_<sizeof(T), sizeof(D)>::shift };
-        
-        typedef typename TypeVec<T, shift>::vec_t read_type;
-        typedef typename TypeVec<D, shift>::vec_t write_type;
-    };
-
-    template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size>
-    struct BinReadWriteTraits_
-    {
-        enum { shift = 1 };
-    };
-    template <size_t src_elem_size1, size_t src_elem_size2>
-    struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 1>
-    {
-        enum { shift = 4 };
-    };
-    template <size_t src_elem_size1, size_t src_elem_size2>
-    struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 2>
-    {
-        enum { shift = 2 };
-    };
-    template <typename T1, typename T2, typename D> struct BinReadWriteTraits
-    {
-        enum {shift = BinReadWriteTraits_<sizeof(T1), sizeof(T2), sizeof(D)>::shift};
-
-        typedef typename TypeVec<T1, shift>::vec_t read_type1;
-        typedef typename TypeVec<T2, shift>::vec_t read_type2;
-        typedef typename TypeVec<D , shift>::vec_t write_type;
-    };
-
-    //! Transform kernels
-
-    template <int shift> struct OpUnroller;
-    template <> struct OpUnroller<1>
-    {
-        template <typename T, typename D, typename UnOp, typename Mask>
-        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
-        {
-            if (mask(y, x_shifted))
-                dst.x = op(src.x);
-        }
-
-        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
-        {
-            if (mask(y, x_shifted))
-                dst.x = op(src1.x, src2.x);
-        }
-    };
-    template <> struct OpUnroller<2>
-    {
-        template <typename T, typename D, typename UnOp, typename Mask>
-        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
-        {
-            if (mask(y, x_shifted))
-                dst.x = op(src.x);
-            if (mask(y, x_shifted + 1))
-                dst.y = op(src.y);
-        }
-
-        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
-        {
-            if (mask(y, x_shifted))
-                dst.x = op(src1.x, src2.x);
-            if (mask(y, x_shifted + 1))
-                dst.y = op(src1.y, src2.y);
-        }
-    };
-    template <> struct OpUnroller<3>
-    {
-        template <typename T, typename D, typename UnOp, typename Mask>
-        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
-        {
-            if (mask(y, x_shifted))
-                dst.x = op(src.x);
-            if (mask(y, x_shifted + 1))
-                dst.y = op(src.y);
-            if (mask(y, x_shifted + 2))
-                dst.z = op(src.z);
-        }
-
-        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
-        {
-            if (mask(y, x_shifted))
-                dst.x = op(src1.x, src2.x);
-            if (mask(y, x_shifted + 1))
-                dst.y = op(src1.y, src2.y);
-            if (mask(y, x_shifted + 2))
-                dst.z = op(src1.z, src2.z);
-        }
-    };
-    template <> struct OpUnroller<4>
+    template <typename T, typename D, typename UnOp>
+    static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, cudaStream_t stream = 0)
     {
-        template <typename T, typename D, typename UnOp, typename Mask>
-        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
-        {
-            if (mask(y, x_shifted))
-                dst.x = op(src.x);
-            if (mask(y, x_shifted + 1))
-                dst.y = op(src.y);
-            if (mask(y, x_shifted + 2))
-                dst.z = op(src.z);
-            if (mask(y, x_shifted + 3))
-                dst.w = op(src.w);
-        }
-
-        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
-        {
-            if (mask(y, x_shifted))
-                dst.x = op(src1.x, src2.x);
-            if (mask(y, x_shifted + 1))
-                dst.y = op(src1.y, src2.y);
-            if (mask(y, x_shifted + 2))
-                dst.z = op(src1.z, src2.z);
-            if (mask(y, x_shifted + 3))
-                dst.w = op(src1.w, src2.w);
-        }
-    };
-
-    template <typename T, typename D, typename UnOp, typename Mask>
-    __global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep_<D> dst_, const Mask mask, UnOp op)
-    {
-        typedef typename UnReadWriteTraits<T, D>::read_type read_type;
-        typedef typename UnReadWriteTraits<T, D>::write_type write_type;
-        const int shift = UnReadWriteTraits<T, D>::shift;
-
-        const int x = threadIdx.x + blockIdx.x * blockDim.x;
-        const int y = threadIdx.y + blockIdx.y * blockDim.y;
-        const int x_shifted = x * shift;
-
-        if (y < src_.rows)
-        {
-            const T* src = src_.ptr(y);
-            D* dst = dst_.ptr(y);
-
-            if (x_shifted + shift - 1 < src_.cols)
-            {
-                read_type src_n_el = ((const read_type*)src)[x];
-                write_type dst_n_el;
-
-                OpUnroller<shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
-
-                ((write_type*)dst)[x] = dst_n_el;
-            }
-            else
-            {
-                for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
-                {
-                    if (mask(y, real_x))
-                        dst[real_x] = op(src[real_x]);
-                }
-            }
-        }
+        detail::transform_caller(src, dst, op, detail::NoMask(), stream);
     }
-
-    template <typename T, typename D, typename UnOp, typename Mask>
-    static __global__ void transformSimple(const DevMem2D_<T> src, PtrStep_<D> dst, const Mask mask, UnOp op)
+    template <typename T, typename D, typename UnOp>
+    static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const PtrStep& mask, UnOp op, 
+        cudaStream_t stream = 0)
     {
-		const int x = blockDim.x * blockIdx.x + threadIdx.x;
-		const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-        if (x < src.cols && y < src.rows && mask(y, x))
-        {
-            dst.ptr(y)[x] = op(src.ptr(y)[x]);
-        }
+        detail::transform_caller(src, dst, op, detail::MaskReader(mask), stream);
     }
 
-    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-    __global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep_<T2> src2_, PtrStep_<D> dst_, 
-        const Mask mask, BinOp op)
+    template <typename T1, typename T2, typename D, typename BinOp>
+    static void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, 
+        BinOp op, cudaStream_t stream = 0)
     {
-        typedef typename BinReadWriteTraits<T1, T2, D>::read_type1 read_type1;
-        typedef typename BinReadWriteTraits<T1, T2, D>::read_type2 read_type2;
-        typedef typename BinReadWriteTraits<T1, T2, D>::write_type write_type;
-        const int shift = BinReadWriteTraits<T1, T2, D>::shift;
-
-        const int x = threadIdx.x + blockIdx.x * blockDim.x;
-        const int y = threadIdx.y + blockIdx.y * blockDim.y;
-        const int x_shifted = x * shift;
-
-        if (y < src1_.rows)
-        {
-            const T1* src1 = src1_.ptr(y);
-            const T2* src2 = src2_.ptr(y);
-            D* dst = dst_.ptr(y);
-
-            if (x_shifted + shift - 1 < src1_.cols)
-            {
-                read_type1 src1_n_el = ((const read_type1*)src1)[x];
-                read_type2 src2_n_el = ((const read_type2*)src2)[x];
-                write_type dst_n_el;
-                
-                OpUnroller<shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
-
-                ((write_type*)dst)[x] = dst_n_el;
-            }
-            else
-            {
-                for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
-                {
-                    if (mask(y, real_x))
-                        dst[real_x] = op(src1[real_x], src2[real_x]);
-                }
-            }
-        }
+        detail::transform_caller(src1, src2, dst, op, detail::NoMask(), stream);
     }
-
-    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-    static __global__ void transformSimple(const DevMem2D_<T1> src1, const PtrStep_<T2> src2, PtrStep_<D> dst, 
-        const Mask mask, BinOp op)
+    template <typename T1, typename T2, typename D, typename BinOp>
+    static void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, 
+        const PtrStep& mask, BinOp op, cudaStream_t stream = 0)
     {
-		const int x = blockDim.x * blockIdx.x + threadIdx.x;
-		const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-        if (x < src1.cols && y < src1.rows && mask(y, x))
-        {
-            T1 src1_data = src1.ptr(y)[x];
-            T2 src2_data = src2.ptr(y)[x];
-            dst.ptr(y)[x] = op(src1_data, src2_data);
-        }
-    }  
-}}}
-
-namespace cv 
-{ 
-    namespace gpu 
-    {
-        template <bool UseSmart> struct TransformDispatcher;
-        template<> struct TransformDispatcher<false>
-        {
-            template <typename T, typename D, typename UnOp, typename Mask>
-            static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask, 
-                             cudaStream_t stream = 0)
-            {
-                dim3 threads(16, 16, 1);
-                dim3 grid(1, 1, 1);
-
-                grid.x = divUp(src.cols, threads.x);
-                grid.y = divUp(src.rows, threads.y);        
-
-                device::transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
-                cudaSafeCall( cudaGetLastError() );
-
-                if (stream == 0)
-                    cudaSafeCall( cudaDeviceSynchronize() ); 
-            }
-
-            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-            static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, 
-                             BinOp op, const Mask& mask, cudaStream_t stream = 0)
-            {
-                dim3 threads(16, 16, 1);
-                dim3 grid(1, 1, 1);
-
-                grid.x = divUp(src1.cols, threads.x);
-                grid.y = divUp(src1.rows, threads.y);        
-
-                device::transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
-                cudaSafeCall( cudaGetLastError() );
-
-                if (stream == 0)
-                    cudaSafeCall( cudaDeviceSynchronize() );            
-            }
-        };
-        template<> struct TransformDispatcher<true>
-        {
-            template <typename T, typename D, typename UnOp, typename Mask>
-            static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask, 
-                             cudaStream_t stream = 0)
-            {
-                const int shift = device::UnReadWriteTraits<T, D>::shift;
-
-                dim3 threads(16, 16, 1);
-                dim3 grid(1, 1, 1);            
-
-                grid.x = divUp(src.cols, threads.x * shift);
-                grid.y = divUp(src.rows, threads.y);        
-
-                device::transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
-                cudaSafeCall( cudaGetLastError() );
-
-                if (stream == 0)
-                    cudaSafeCall( cudaDeviceSynchronize() );
-            }
-
-            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-            static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, 
-                             BinOp op, const Mask& mask, cudaStream_t stream = 0)
-            {
-                const int shift = device::BinReadWriteTraits<T1, T2, D>::shift;
-
-                dim3 threads(16, 16, 1);
-                dim3 grid(1, 1, 1);
-
-                grid.x = divUp(src1.cols, threads.x * shift);
-                grid.y = divUp(src1.rows, threads.y);        
-
-                device::transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
-                cudaSafeCall( cudaGetLastError() );
-
-                if (stream == 0)
-                    cudaSafeCall( cudaDeviceSynchronize() );            
-            }
-        };
-
-        template <typename T, typename D, int scn, int dcn> struct UseSmartUn_
-        {
-            static const bool value = false;
-        };
-        template <typename T, typename D> struct UseSmartUn_<T, D, 1, 1>
-        {
-            static const bool value = device::UnReadWriteTraits<T, D>::shift != 1;
-        };
-        template <typename T, typename D> struct UseSmartUn
-        {
-            static const bool value = UseSmartUn_<T, D, device::VecTraits<T>::cn, device::VecTraits<D>::cn>::value;
-        };
-
-        template <typename T1, typename T2, typename D, int src1cn, int src2cn, int dstcn> struct UseSmartBin_
-        {
-            static const bool value = false;
-        };
-        template <typename T1, typename T2, typename D> struct UseSmartBin_<T1, T2, D, 1, 1, 1>
-        {
-            static const bool value = device::BinReadWriteTraits<T1, T2, D>::shift != 1;
-        };
-        template <typename T1, typename T2, typename D> struct UseSmartBin
-        {
-            static const bool value = UseSmartBin_<T1, T2, D, device::VecTraits<T1>::cn, device::VecTraits<T2>::cn, device::VecTraits<D>::cn>::value;
-        };
-
-        template <typename T, typename D, typename UnOp, typename Mask>
-        static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask, 
-            cudaStream_t stream = 0)
-        {
-            TransformDispatcher< UseSmartUn<T, D>::value >::call(src, dst, op, mask, stream);
-        }
-
-        template <typename T, typename D, typename UnOp>
-        static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, cudaStream_t stream = 0)
-        {
-            transform_caller(src, dst, op, device::NoMask(), stream);
-        }
-        template <typename T, typename D, typename UnOp>
-        static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const PtrStep& mask, UnOp op, 
-            cudaStream_t stream = 0)
-        {
-            transform_caller(src, dst, op, device::MaskReader(mask), stream);
-        }
-
-        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
-        static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, 
-            BinOp op, const Mask& mask, cudaStream_t stream = 0)
-        {
-            TransformDispatcher< UseSmartBin<T1, T2, D>::value >::call(src1, src2, dst, op, mask, stream);
-        }
-
-        template <typename T1, typename T2, typename D, typename BinOp>
-        static void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, 
-            BinOp op, cudaStream_t stream = 0)
-        {
-            transform_caller(src1, src2, dst, op, device::NoMask(), stream);
-        }
-        template <typename T1, typename T2, typename D, typename BinOp>
-        static void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, 
-            const PtrStep& mask, BinOp op, cudaStream_t stream = 0)
-        {
-            transform_caller(src1, src2, dst, op, device::MaskReader(mask), stream);
-        }
+        detail::transform_caller(src1, src2, dst, op, detail::MaskReader(mask), stream);
     }
-}
+}}}
 
 #endif // __OPENCV_GPU_TRANSFORM_HPP__
diff --git a/modules/gpu/src/opencv2/gpu/device/utility.hpp b/modules/gpu/src/opencv2/gpu/device/utility.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..edfbae01acd816d15f22149f2427fd7a7bc85bfd
--- /dev/null
+++ b/modules/gpu/src/opencv2/gpu/device/utility.hpp
@@ -0,0 +1,206 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_UTILITY_HPP__
+#define __OPENCV_GPU_UTILITY_HPP__
+
+#include "internal_shared.hpp"
+#include "saturate_cast.hpp"
+
+#ifndef __CUDA_ARCH__
+	#define __CUDA_ARCH__ 0
+#endif
+
+#define OPENCV_GPU_LOG_WARP_SIZE	    (5)
+#define OPENCV_GPU_WARP_SIZE	        (1 << OPENCV_GPU_LOG_WARP_SIZE)
+#define OPENCV_GPU_LOG_MEM_BANKS        ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
+#define OPENCV_GPU_MEM_BANKS            (1 << OPENCV_GPU_LOG_MEM_BANKS)
+
+#if defined(_WIN64) || defined(__LP64__)		
+    // 64-bit register modifier for inlined asm
+    #define OPENCV_GPU_ASM_PTR "l"
+#else	
+    // 32-bit register modifier for inlined asm
+    #define OPENCV_GPU_ASM_PTR "r"
+#endif
+
+namespace cv {  namespace gpu { namespace device
+{
+    template <typename T> void __host__ __device__ __forceinline__ swap(T &a, T &b) // exchanges the values of a and b through one temporary copy of T
+    {
+        T temp = a;
+        a = b;
+        b = temp;
+    }
+
+    // warp-synchronous 32 elements reduction: each thread stores partial_reduction in data[tid], then threads with tid < 16 fold in data[tid + 16], + 8, + 4, + 2 and + 1 through op, with no barrier between steps (reads reach data[31])
+    template <typename T, typename Op> __device__ __forceinline__ void warpReduce32(volatile T* data, volatile T& partial_reduction, int tid, Op op)
+    {
+        data[tid] = partial_reduction;
+
+        if (tid < 16)
+        {
+            data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]); // every step updates both data[tid] and the caller's partial_reduction
+            data[tid] = partial_reduction = op(partial_reduction, data[tid + 8 ]);
+            data[tid] = partial_reduction = op(partial_reduction, data[tid + 4 ]);
+            data[tid] = partial_reduction = op(partial_reduction, data[tid + 2 ]);
+            data[tid] = partial_reduction = op(partial_reduction, data[tid + 1 ]); // presumably tid 0 now holds the full result, assuming the threads run these steps in lockstep - confirm
+        }
+    }
+
+    // warp-synchronous 16 elements reduction: each thread stores partial_reduction in data[tid], then threads with tid < 8 fold in data[tid + 8], + 4, + 2 and + 1 through op, with no barrier between steps (reads reach data[15])
+    template <typename T, typename Op> __device__ __forceinline__ void warpReduce16(volatile T* data, volatile T& partial_reduction, int tid, Op op)
+    {
+        data[tid] = partial_reduction;
+
+        if (tid < 8)
+        {
+            data[tid] = partial_reduction = op(partial_reduction, (T)data[tid + 8 ]); // the (T) cast hands op a non-volatile copy of the element
+            data[tid] = partial_reduction = op(partial_reduction, (T)data[tid + 4 ]);
+            data[tid] = partial_reduction = op(partial_reduction, (T)data[tid + 2 ]);
+            data[tid] = partial_reduction = op(partial_reduction, (T)data[tid + 1 ]);
+        }
+    }
+
+    // warp-synchronous reduction of n elements (n is a template constant): threads with tid < n store partial_reduction in data[tid], then one of the n > 16 / n > 8 / n > 4 / n > 2 branches folds the elements through op. NOTE(review): no branch handles n == 2, so only the initial store happens in that case - confirm this is intended
+    template <int n, typename T, typename Op> __device__ __forceinline__ void warpReduce(volatile T* data, volatile T& partial_reduction, int tid, Op op)
+    {
+        if (tid < n)
+            data[tid] = partial_reduction;
+
+        if (n > 16) // n is a template parameter, so this chain's conditions are constant for each instantiation
+        {
+            if (tid < n - 16) // only threads whose partner index tid + 16 is below n fold it in
+                data[tid] = partial_reduction = op(partial_reduction, data[tid + 16]);
+            if (tid < 8)
+            {
+                data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);
+                data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);
+                data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
+                data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
+            }
+        }
+        else if (n > 8)
+        {
+            if (tid < n - 8) // partner index tid + 8 is below n
+                data[tid] = partial_reduction = op(partial_reduction, data[tid +  8]);
+            if (tid < 4)
+            {
+                data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);
+                data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
+                data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
+            }
+        }
+        else if (n > 4)
+        {
+            if (tid < n - 4) // partner index tid + 4 is below n
+                data[tid] = partial_reduction = op(partial_reduction, data[tid +  4]);
+            if (tid < 2)
+            {
+                data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
+                data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
+            }
+        }   
+        else if (n > 2)
+        {
+            if (tid < n - 2) // partner index tid + 2 is below n
+                data[tid] = partial_reduction = op(partial_reduction, data[tid +  2]);
+            if (tid < 2) // NOTE(review): the other branches halve this bound (8, 4, 2), which would give tid < 1 here; tid 1 also reads data[2] - confirm
+            {
+                data[tid] = partial_reduction = op(partial_reduction, data[tid +  1]);
+            }
+        }      
+    }
+
+    // solve 2x2 linear system Ax=b by Cramer's rule; returns true and writes x when det(A) != 0, otherwise returns false and leaves x untouched
+    template <typename T> __device__ __forceinline__ bool solve2x2(const T A[2][2], const T b[2], T x[2])
+    {
+        T det = A[0][0] * A[1][1] - A[1][0] * A[0][1];
+
+        if (det != 0)
+        {
+            double invdet = 1.0 / det; // reciprocal is taken in double regardless of T; results return to T through saturate_cast
+
+            x[0] = saturate_cast<T>(invdet * (b[0] * A[1][1] - b[1] * A[0][1])); // determinant of A with column 0 replaced by b
+
+            x[1] = saturate_cast<T>(invdet * (A[0][0] * b[1] - A[1][0] * b[0])); // determinant of A with column 1 replaced by b
+
+            return true;
+        }
+
+        return false;
+    }
+
+    // solve 3x3 linear system Ax=b by Cramer's rule; returns true and writes x when det(A) != 0, otherwise returns false and leaves x untouched
+    template <typename T> __device__ __forceinline__ bool solve3x3(const T A[3][3], const T b[3], T x[3])
+    {
+        T det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) // cofactor expansion along the first row
+              - A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0])
+              + A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]);
+
+        if (det != 0)
+        {
+            double invdet = 1.0 / det; // reciprocal is taken in double regardless of T; results return to T through saturate_cast
+
+            x[0] = saturate_cast<T>(invdet * 
+                (b[0]    * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) - // determinant of A with column 0 replaced by b
+                 A[0][1] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) +
+                 A[0][2] * (b[1]    * A[2][1] - A[1][1] * b[2]   )));
+
+            x[1] = saturate_cast<T>(invdet * 
+                (A[0][0] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) - // determinant of A with column 1 replaced by b
+                 b[0]    * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
+                 A[0][2] * (A[1][0] * b[2]    - b[1]    * A[2][0])));
+
+            x[2] = saturate_cast<T>(invdet * 
+                (A[0][0] * (A[1][1] * b[2]    - b[1]    * A[2][1]) - // determinant of A with column 2 replaced by b
+                 A[0][1] * (A[1][0] * b[2]    - b[1]    * A[2][0]) +
+                 b[0]    * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));
+
+            return true;
+        }
+
+        return false;
+    }
+}}}
+
+#endif // __OPENCV_GPU_UTILITY_HPP__
diff --git a/modules/gpu/src/opencv2/gpu/device/vec_math.hpp b/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..238f5c771e83a4133d36eae947944af57e450039
--- /dev/null
+++ b/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
@@ -0,0 +1,287 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_VECMATH_HPP__
+#define __OPENCV_GPU_VECMATH_HPP__
+
+#include "internal_shared.hpp"
+#include "saturate_cast.hpp"
+#include "vec_traits.hpp"
+#include "functional.hpp"
+
+namespace cv {  namespace gpu { namespace device
+{
+    namespace detail // implementation helpers used by the vector saturate_cast overloads that follow this namespace
+    {
+        template <int cn, typename VecD> struct SatCastHelper; // primary template is only declared; the specializations for cn = 1..4 provide cast()
+        template <typename VecD> struct SatCastHelper<1, VecD>
+        {
+            template <typename VecS> static __device__ VecD cast(const VecS& v)
+            {
+                typedef typename VecTraits<VecD>::elem_type D; // element type of the destination vector
+                return VecTraits<VecD>::make(saturate_cast<D>(v.x));
+            }
+        };
+        template <typename VecD> struct SatCastHelper<2, VecD>
+        {
+            template <typename VecS> static __device__ VecD cast(const VecS& v)
+            {
+                typedef typename VecTraits<VecD>::elem_type D;
+                return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));
+            }
+        };
+        template <typename VecD> struct SatCastHelper<3, VecD>
+        {
+            template <typename VecS> static __device__ VecD cast(const VecS& v)
+            {
+                typedef typename VecTraits<VecD>::elem_type D;
+                return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));
+            }
+        };
+        template <typename VecD> struct SatCastHelper<4, VecD>
+        {
+            template <typename VecS> static __device__ VecD cast(const VecS& v)
+            {
+                typedef typename VecTraits<VecD>::elem_type D;
+                return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));
+            }
+        };
+
+        template <typename VecD, typename VecS> static __device__ VecD saturate_cast_caller(const VecS& v) // converts v to VecD component by component
+        {
+            return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v); // VecTraits<VecD>::cn selects which SatCastHelper specialization is used
+        }
+    }
+    // saturate_cast overloads for 1- to 4-component vectors of uchar, char, ushort, short, uint, int, float and double; each forwards to detail::saturate_cast_caller with _Tp as the destination vector type
+    template<typename _Tp> static __device__ _Tp saturate_cast(const uchar1& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const char1& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const ushort1& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const short1& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const uint1& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const int1& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const float1& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const double1& v) {return detail::saturate_cast_caller<_Tp>(v);}
+
+    template<typename _Tp> static __device__ _Tp saturate_cast(const uchar2& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const char2& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const ushort2& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const short2& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const uint2& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const int2& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const float2& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const double2& v) {return detail::saturate_cast_caller<_Tp>(v);}
+
+    template<typename _Tp> static __device__ _Tp saturate_cast(const uchar3& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const char3& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const ushort3& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const short3& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const uint3& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const int3& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const float3& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const double3& v) {return detail::saturate_cast_caller<_Tp>(v);}
+
+    template<typename _Tp> static __device__ _Tp saturate_cast(const uchar4& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const char4& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const ushort4& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const short4& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const uint4& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const int4& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const float4& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    template<typename _Tp> static __device__ _Tp saturate_cast(const double4& v) {return detail::saturate_cast_caller<_Tp>(v);}
+    // OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, op, func): defines op for type1..type4; each overload applies functor func<type> to every component and packs the results into a vector of func<type>::result_type
+#define OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, op, func) \
+    static __device__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a) \
+    { \
+        func<type> f; \
+        return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x)); \
+    } \
+    static __device__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a) \
+    { \
+        func<type> f; \
+        return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x), f(a.y)); \
+    } \
+    static __device__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a) \
+    { \
+        func<type> f; \
+        return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x), f(a.y), f(a.z)); \
+    } \
+    static __device__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a) \
+    { \
+        func<type> f; \
+        return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x), f(a.y), f(a.z), f(a.w)); \
+    }
+    // OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func): defines component-wise op for type1..type4 in three forms each - vector/vector (functor func<type>), vector/scalar and scalar/vector (templates on the scalar type T whose functor argument type is BinOpTraits<type, T>::argument_type)
+#define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \
+    static __device__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \
+    { \
+        func<type> f; \
+        return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x, b.x)); \
+    } \
+    template <typename T> \
+    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
+    { \
+        func<typename BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
+    } \
+    template <typename T> \
+    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
+    { \
+        func<typename BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
+    } \
+    static __device__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \
+    { \
+        func<type> f; \
+        return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x, b.x), f(a.y, b.y)); \
+    } \
+    template <typename T> \
+    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
+    { \
+        func<typename BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
+    } \
+    template <typename T> \
+    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
+    { \
+        func<typename BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
+    } \
+    static __device__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a, const type ## 3 & b) \
+    { \
+        func<type> f; \
+        return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z)); \
+    } \
+    template <typename T> \
+    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
+    { \
+        func<typename BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
+    } \
+    template <typename T> \
+    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
+    { \
+        func<typename BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
+    } \
+    static __device__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a, const type ## 4 & b) \
+    { \
+        func<type> f; \
+        return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z), f(a.w, b.w)); \
+    } \
+    template <typename T> \
+    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
+    { \
+        func<typename BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
+    } \
+    template <typename T> \
+    static __device__ typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
+    { \
+        func<typename BinOpTraits<type, T>::argument_type> f; \
+        return VecTraits<typename TypeVec<typename func<typename BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
+    }
+    // OPENCV_GPU_IMPLEMENT_VEC_OP(type): expands OPENCV_GPU_IMPLEMENT_VEC_BINOP / OPENCV_GPU_IMPLEMENT_VEC_UNOP for one element type to define the arithmetic, comparison, logical, max/min and math-function overloads listed here
+#define OPENCV_GPU_IMPLEMENT_VEC_OP(type) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator +, plus) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator -, minus) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator *, multiplies) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator /, divides) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator -, negate) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ==, equal_to) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator !=, not_equal_to) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator > , greater) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator < , less) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator >=, greater_equal) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator <=, less_equal) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator &&, logical_and) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ||, logical_or) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! , logical_not) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, fabs, fabs_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp10, exp10_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, log, log_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, log2, log2_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, log10, log10_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sin, sin_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, cos, cos_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, tan, tan_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, asin, asin_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, acos, acos_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, atan, atan_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sinh, sinh_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, cosh, cosh_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, tanh, tanh_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, asinh, asinh_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, acosh, acosh_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, atanh, atanh_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, hypot, hypot_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, atan2, atan2_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, pow, pow_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, hypot_sqr, hypot_sqr_func)
+    // OPENCV_GPU_IMPLEMENT_VEC_INT_OP(type): everything OPENCV_GPU_IMPLEMENT_VEC_OP(type) defines, plus the bitwise operators &, |, ^ and ~
+#define OPENCV_GPU_IMPLEMENT_VEC_INT_OP(type) \
+    OPENCV_GPU_IMPLEMENT_VEC_OP(type) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator &, bit_and) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator |, bit_or) \
+    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ^, bit_xor) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ~, bit_not)
+    // define the overloads per element type: the six integer types also get the bitwise operators, float and double get only the OPENCV_GPU_IMPLEMENT_VEC_OP set
+    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uchar)
+    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(char)
+    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(ushort)
+    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(short)
+    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(int)
+    OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uint)
+    OPENCV_GPU_IMPLEMENT_VEC_OP(float)
+    OPENCV_GPU_IMPLEMENT_VEC_OP(double)
+
+#undef OPENCV_GPU_IMPLEMENT_VEC_UNOP
+#undef OPENCV_GPU_IMPLEMENT_VEC_BINOP
+#undef OPENCV_GPU_IMPLEMENT_VEC_OP
+#undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP
+}}}
+        
+#endif // __OPENCV_GPU_VECMATH_HPP__
\ No newline at end of file
diff --git a/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp b/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..62d3710620d5bdfc56c2b6817e8312a87c3a1115
--- /dev/null
+++ b/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
@@ -0,0 +1,142 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_GPU_VEC_TRAITS_HPP__
+#define __OPENCV_GPU_VEC_TRAITS_HPP__
+
+#include "internal_shared.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+    template<typename T, int N> struct TypeVec; // primary template is declared only; every usable (T, N) pair is an explicit specialization exposing vec_type
+// Specializes TypeVec for one base type: TypeVec<type, N> and TypeVec<typeN, N> both expose typeN as vec_type, except TypeVec<type, 1>, which exposes the plain type.
+#define OPENCV_GPU_IMPLEMENT_TYPE_VEC(type) \
+    template<> struct TypeVec<type, 1> { typedef type vec_type; }; \
+    template<> struct TypeVec<type ## 1, 1> { typedef type ## 1 vec_type; }; \
+    template<> struct TypeVec<type, 2> { typedef type ## 2 vec_type; }; \
+    template<> struct TypeVec<type ## 2, 2> { typedef type ## 2 vec_type; }; \
+    template<> struct TypeVec<type, 3> { typedef type ## 3 vec_type; }; \
+    template<> struct TypeVec<type ## 3, 3> { typedef type ## 3 vec_type; }; \
+    template<> struct TypeVec<type, 4> { typedef type ## 4 vec_type; }; \
+    template<> struct TypeVec<type ## 4, 4> { typedef type ## 4 vec_type; };
+
+    OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar) // one expansion per base type; each yields the eight specializations listed in the macro
+    OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)
+    OPENCV_GPU_IMPLEMENT_TYPE_VEC(ushort)
+    OPENCV_GPU_IMPLEMENT_TYPE_VEC(short)
+    OPENCV_GPU_IMPLEMENT_TYPE_VEC(int)
+    OPENCV_GPU_IMPLEMENT_TYPE_VEC(uint)
+    OPENCV_GPU_IMPLEMENT_TYPE_VEC(float)
+    OPENCV_GPU_IMPLEMENT_TYPE_VEC(double)
+
+#undef OPENCV_GPU_IMPLEMENT_TYPE_VEC
+
+    template<> struct TypeVec<schar, 1> { typedef schar vec_type; }; // schar: the 1-channel form stays schar, the multi-channel forms reuse char2/char3/char4
+    template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };
+    template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };
+    template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };
+
+    template<> struct TypeVec<bool, 1> { typedef uchar vec_type; }; // bool maps onto the uchar family: uchar, uchar2, uchar3, uchar4
+    template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };
+    template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };
+    template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };
+
+    template<typename T> struct VecTraits; // declared only; specializations provide elem_type, cn (channel count), all() (one value copied to every channel) and make() (one argument per channel)
+// Generates VecTraits for the scalar `type` (cn=1, all()/make() return their argument unchanged) and for type1..type4, whose all()/make() forward to make_<type>1..make_<type>4.
+#define OPENCV_GPU_IMPLEMENT_VEC_TRAITS(type) \
+    template<> struct VecTraits<type> \
+    { \
+        typedef type elem_type; \
+        enum {cn=1}; \
+        static __device__ __host__ type all(type v) {return v;} \
+        static __device__ __host__ type make(type x) {return x;} \
+    }; \
+    template<> struct VecTraits<type ## 1> \
+    { \
+        typedef type elem_type; \
+        enum {cn=1}; \
+        static __device__ __host__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
+        static __device__ __host__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
+    }; \
+    template<> struct VecTraits<type ## 2> \
+    { \
+        typedef type elem_type; \
+        enum {cn=2}; \
+        static __device__ __host__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
+        static __device__ __host__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
+    }; \
+    template<> struct VecTraits<type ## 3> \
+    { \
+        typedef type elem_type; \
+        enum {cn=3}; \
+        static __device__ __host__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
+        static __device__ __host__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
+    }; \
+    template<> struct VecTraits<type ## 4> \
+    { \
+        typedef type elem_type; \
+        enum {cn=4}; \
+        static __device__ __host__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
+        static __device__ __host__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
+    };
+
+    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar) // one expansion per base type; each yields traits for the scalar and its 1- to 4-channel vector forms
+    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(char)
+    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(ushort)
+    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(short)
+    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(int)
+    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uint)
+    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(float)
+    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(double)
+
+#undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS
+
+    template<> struct VecTraits<schar> // hand-written scalar traits for schar; this header defines no schar-based vector traits
+    { 
+        typedef schar elem_type; 
+        enum {cn=1}; 
+        static __device__ __host__ schar all(schar v) {return v;} // single channel: the value is returned unchanged
+        static __device__ __host__ schar make(schar x) {return x;} // single channel: the value is returned unchanged
+    };
+}}}
+
+#endif // __OPENCV_GPU_VEC_TRAITS_HPP__
diff --git a/modules/gpu/src/opencv2/gpu/device/vecmath.hpp b/modules/gpu/src/opencv2/gpu/device/vecmath.hpp
deleted file mode 100644
index 8456861b9008ac00b8fc7b74e39ca4eaf6e8f427..0000000000000000000000000000000000000000
--- a/modules/gpu/src/opencv2/gpu/device/vecmath.hpp
+++ /dev/null
@@ -1,1097 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_GPU_VECMATH_HPP__
-#define __OPENCV_GPU_VECMATH_HPP__
-
-#include "internal_shared.hpp"
-#include "saturate_cast.hpp"
-
-namespace cv
-{
-    namespace gpu
-    {
-        namespace device
-        {
-            template<typename T, int N> struct TypeVec;
-
-            template<> struct TypeVec<uchar, 1> { typedef uchar vec_t; };
-            template<> struct TypeVec<uchar1, 1> { typedef uchar1 vec_t; };
-            template<> struct TypeVec<uchar, 2> { typedef uchar2 vec_t; };
-            template<> struct TypeVec<uchar2, 2> { typedef uchar2 vec_t; };
-            template<> struct TypeVec<uchar, 3> { typedef uchar3 vec_t; };
-            template<> struct TypeVec<uchar3, 3> { typedef uchar3 vec_t; };
-            template<> struct TypeVec<uchar, 4> { typedef uchar4 vec_t; };
-            template<> struct TypeVec<uchar4, 4> { typedef uchar4 vec_t; };
-
-            template<> struct TypeVec<char, 1> { typedef char vec_t; };
-            template<> struct TypeVec<schar, 1> { typedef char vec_t; };
-            template<> struct TypeVec<char1, 1> { typedef char1 vec_t; };
-            template<> struct TypeVec<char, 2> { typedef char2 vec_t; };
-            template<> struct TypeVec<schar, 2> { typedef char2 vec_t; };
-            template<> struct TypeVec<char2, 2> { typedef char2 vec_t; };
-            template<> struct TypeVec<char, 3> { typedef char3 vec_t; };
-            template<> struct TypeVec<schar, 3> { typedef char3 vec_t; };
-            template<> struct TypeVec<char3, 3> { typedef char3 vec_t; };
-            template<> struct TypeVec<char, 4> { typedef char4 vec_t; };
-            template<> struct TypeVec<schar, 4> { typedef char4 vec_t; };
-            template<> struct TypeVec<char4, 4> { typedef char4 vec_t; };
-
-            template<> struct TypeVec<ushort, 1> { typedef ushort vec_t; };
-            template<> struct TypeVec<ushort1, 1> { typedef ushort1 vec_t; };
-            template<> struct TypeVec<ushort, 2> { typedef ushort2 vec_t; };
-            template<> struct TypeVec<ushort2, 2> { typedef ushort2 vec_t; };
-            template<> struct TypeVec<ushort, 3> { typedef ushort3 vec_t; };
-            template<> struct TypeVec<ushort3, 3> { typedef ushort3 vec_t; };
-            template<> struct TypeVec<ushort, 4> { typedef ushort4 vec_t; };
-            template<> struct TypeVec<ushort4, 4> { typedef ushort4 vec_t; };
-
-            template<> struct TypeVec<short, 1> { typedef short vec_t; };
-            template<> struct TypeVec<short1, 1> { typedef short1 vec_t; };
-            template<> struct TypeVec<short, 2> { typedef short2 vec_t; };
-            template<> struct TypeVec<short2, 2> { typedef short2 vec_t; };
-            template<> struct TypeVec<short, 3> { typedef short3 vec_t; };
-            template<> struct TypeVec<short3, 3> { typedef short3 vec_t; };
-            template<> struct TypeVec<short, 4> { typedef short4 vec_t; };
-            template<> struct TypeVec<short4, 4> { typedef short4 vec_t; };
-
-            template<> struct TypeVec<uint, 1> { typedef uint vec_t; };
-            template<> struct TypeVec<uint1, 1> { typedef uint1 vec_t; };
-            template<> struct TypeVec<uint, 2> { typedef uint2 vec_t; };
-            template<> struct TypeVec<uint2, 2> { typedef uint2 vec_t; };
-            template<> struct TypeVec<uint, 3> { typedef uint3 vec_t; };
-            template<> struct TypeVec<uint3, 3> { typedef uint3 vec_t; };
-            template<> struct TypeVec<uint, 4> { typedef uint4 vec_t; };
-            template<> struct TypeVec<uint4, 4> { typedef uint4 vec_t; };
-
-            template<> struct TypeVec<int, 1> { typedef int vec_t; };
-            template<> struct TypeVec<int1, 1> { typedef int1 vec_t; };
-            template<> struct TypeVec<int, 2> { typedef int2 vec_t; };
-            template<> struct TypeVec<int2, 2> { typedef int2 vec_t; };
-            template<> struct TypeVec<int, 3> { typedef int3 vec_t; };
-            template<> struct TypeVec<int3, 3> { typedef int3 vec_t; };
-            template<> struct TypeVec<int, 4> { typedef int4 vec_t; };
-            template<> struct TypeVec<int4, 4> { typedef int4 vec_t; };
-
-            template<> struct TypeVec<float, 1> { typedef float vec_t; };
-            template<> struct TypeVec<float1, 1> { typedef float1 vec_t; };
-            template<> struct TypeVec<float, 2> { typedef float2 vec_t; };
-            template<> struct TypeVec<float2, 2> { typedef float2 vec_t; };
-            template<> struct TypeVec<float, 3> { typedef float3 vec_t; };
-            template<> struct TypeVec<float3, 3> { typedef float3 vec_t; };
-            template<> struct TypeVec<float, 4> { typedef float4 vec_t; };
-            template<> struct TypeVec<float4, 4> { typedef float4 vec_t; };
-
-            template<> struct TypeVec<double, 1> { typedef double vec_t; };
-            template<> struct TypeVec<double1, 1> { typedef double1 vec_t; };
-            template<> struct TypeVec<double, 2> { typedef double2 vec_t; };
-            template<> struct TypeVec<double2, 2> { typedef double2 vec_t; };
-            template<> struct TypeVec<double, 3> { typedef double3 vec_t; };
-            template<> struct TypeVec<double3, 3> { typedef double3 vec_t; };
-            template<> struct TypeVec<double, 4> { typedef double4 vec_t; };
-            template<> struct TypeVec<double4, 4> { typedef double4 vec_t; };
-
-            template<typename T> struct VecTraits;
-
-            template<> struct VecTraits<uchar> 
-            { 
-                typedef uchar elem_t; 
-                enum {cn=1};
-                static __device__ __forceinline__ __host__ uchar all(uchar v) {return v;}
-                static __device__ __forceinline__ __host__ uchar make(uchar x) {return x;}
-            };
-            template<> struct VecTraits<uchar1> 
-            { 
-                typedef uchar elem_t; 
-                enum {cn=1};
-                static __device__ __forceinline__ __host__ uchar1 all(uchar v) {return make_uchar1(v);}
-                static __device__ __forceinline__ __host__ uchar1 make(uchar x) {return make_uchar1(x);}
-            };
-            template<> struct VecTraits<uchar2> 
-            { 
-                typedef uchar elem_t; 
-                enum {cn=2}; 
-                static __device__ __forceinline__ __host__ uchar2 all(uchar v) {return make_uchar2(v, v);}
-                static __device__ __forceinline__ __host__ uchar2 make(uchar x, uchar y) {return make_uchar2(x, y);}
-            };
-            template<> struct VecTraits<uchar3> 
-            { 
-                typedef uchar elem_t; 
-                enum {cn=3}; 
-                static __device__ __forceinline__ __host__ uchar3 all(uchar v) {return make_uchar3(v, v, v);}
-                static __device__ __forceinline__ __host__ uchar3 make(uchar x, uchar y, uchar z) {return make_uchar3(x, y, z);}
-            };
-            template<> struct VecTraits<uchar4> 
-            { 
-                typedef uchar elem_t; 
-                enum {cn=4}; 
-                static __device__ __forceinline__ __host__ uchar4 all(uchar v) {return make_uchar4(v, v, v, v);}
-                static __device__ __forceinline__ __host__ uchar4 make(uchar x, uchar y, uchar z, uchar w) {return make_uchar4(x, y, z, w);}
-            };
-
-            template<> struct VecTraits<char> 
-            { 
-                typedef char elem_t; 
-                enum {cn=1}; 
-                static __device__ __forceinline__ __host__ char all(char v) {return v;}
-                static __device__ __forceinline__ __host__ char make(char x) {return x;}
-            };
-            template<> struct VecTraits<schar> 
-            { 
-                typedef schar elem_t; 
-                enum {cn=1}; 
-                static __device__ __forceinline__ __host__ schar all(schar v) {return v;}
-                static __device__ __forceinline__ __host__ schar make(schar x) {return x;}
-            };
-            template<> struct VecTraits<char1> 
-            { 
-                typedef schar elem_t; 
-                enum {cn=1}; 
-                static __device__ __forceinline__ __host__ char1 all(schar v) {return make_char1(v);}
-                static __device__ __forceinline__ __host__ char1 make(schar x) {return make_char1(x);}
-            };
-            template<> struct VecTraits<char2> 
-            { 
-                typedef schar elem_t; 
-                enum {cn=2}; 
-                static  __device__ __forceinline__ __host__ char2 all(schar v) {return make_char2(v, v);}
-                static  __device__ __forceinline__ __host__ char2 make(schar x, schar y) {return make_char2(x, y);}
-            };
-            template<> struct VecTraits<char3> 
-            { 
-                typedef schar elem_t; 
-                enum {cn=3}; 
-                static __device__ __forceinline__ __host__ char3 all(schar v) {return make_char3(v, v, v);}
-                static __device__ __forceinline__ __host__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);}
-            };
-            template<> struct VecTraits<char4> 
-            { 
-                typedef schar elem_t; 
-                enum {cn=4}; 
-                static __device__ __forceinline__ __host__ char4 all(schar v) {return make_char4(v, v, v, v);}
-                static __device__ __forceinline__ __host__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);}
-            };
-
-            template<> struct VecTraits<ushort> 
-            { 
-                typedef ushort elem_t; 
-                enum {cn=1}; 
-                static __device__ __forceinline__ __host__ ushort all(ushort v) {return v;}
-                static __device__ __forceinline__ __host__ ushort make(ushort x) {return x;}
-            };
-            template<> struct VecTraits<ushort1> 
-            { 
-                typedef ushort elem_t; 
-                enum {cn=1}; 
-                static __device__ __forceinline__ __host__ ushort1 all(ushort v) {return make_ushort1(v);}
-                static __device__ __forceinline__ __host__ ushort1 make(ushort x) {return make_ushort1(x);}
-            };
-            template<> struct VecTraits<ushort2> 
-            { 
-                typedef ushort elem_t; 
-                enum {cn=2}; 
-                static __device__ __forceinline__ __host__ ushort2 all(ushort v) {return make_ushort2(v, v);}
-                static __device__ __forceinline__ __host__ ushort2 make(ushort x, ushort y) {return make_ushort2(x, y);}
-            };
-            template<> struct VecTraits<ushort3> 
-            { 
-                typedef ushort elem_t; 
-                enum {cn=3}; 
-                static __device__ __forceinline__ __host__ ushort3 all(ushort v) {return make_ushort3(v, v, v);}
-                static __device__ __forceinline__ __host__ ushort3 make(ushort x, ushort y, ushort z) {return make_ushort3(x, y, z);}
-            };
-            template<> struct VecTraits<ushort4> 
-            { 
-                typedef ushort elem_t; 
-                enum {cn=4}; 
-                static __device__ __forceinline__ __host__ ushort4 all(ushort v) {return make_ushort4(v, v, v, v);}
-                static __device__ __forceinline__ __host__ ushort4 make(ushort x, ushort y, ushort z, ushort w) {return make_ushort4(x, y, z, w);}
-            };
-
-            template<> struct VecTraits<short> 
-            { 
-                typedef short elem_t; 
-                enum {cn=1}; 
-                static __device__ __forceinline__ __host__ short all(short v) {return v;}
-                static __device__ __forceinline__ __host__ short make(short x) {return x;}
-            };
-            template<> struct VecTraits<short1> 
-            { 
-                typedef short elem_t; 
-                enum {cn=1}; 
-                static __device__ __forceinline__ __host__ short1 all(short v) {return make_short1(v);}
-                static __device__ __forceinline__ __host__ short1 make(short x) {return make_short1(x);}
-            };
-            template<> struct VecTraits<short2> 
-            { 
-                typedef short elem_t; 
-                enum {cn=2}; 
-                static __device__ __forceinline__ __host__ short2 all(short v) {return make_short2(v, v);}
-                static __device__ __forceinline__ __host__ short2 make(short x, short y) {return make_short2(x, y);}
-            };
-            template<> struct VecTraits<short3> 
-            { 
-                typedef short elem_t; 
-                enum {cn=3}; 
-                static __device__ __forceinline__ __host__ short3 all(short v) {return make_short3(v, v, v);}
-                static __device__ __forceinline__ __host__ short3 make(short x, short y, short z) {return make_short3(x, y, z);}
-            };
-            template<> struct VecTraits<short4> 
-            { 
-                typedef short elem_t; 
-                enum {cn=4}; 
-                static __device__ __forceinline__ __host__ short4 all(short v) {return make_short4(v, v, v, v);}
-                static __device__ __forceinline__ __host__ short4 make(short x, short y, short z, short w) {return make_short4(x, y, z, w);}
-            };
-
-            template<> struct VecTraits<uint> 
-            { 
-                typedef uint elem_t; 
-                enum {cn=1}; 
-                static __device__ __forceinline__ __host__ uint all(uint v) {return v;}
-                static __device__ __forceinline__ __host__ uint make(uint x) {return x;}
-            };
-            template<> struct VecTraits<uint1> 
-            { 
-                typedef uint elem_t; 
-                enum {cn=1}; 
-                static __device__ __forceinline__ __host__ uint1 all(uint v) {return make_uint1(v);}
-                static __device__ __forceinline__ __host__ uint1 make(uint x) {return make_uint1(x);}
-            };
-            template<> struct VecTraits<uint2> 
-            { 
-                typedef uint elem_t; 
-                enum {cn=2}; 
-                static __device__ __forceinline__ __host__ uint2 all(uint v) {return make_uint2(v, v);}
-                static __device__ __forceinline__ __host__ uint2 make(uint x, uint y) {return make_uint2(x, y);}
-            };
-            template<> struct VecTraits<uint3> 
-            { 
-                typedef uint elem_t; 
-                enum {cn=3}; 
-                static __device__ __forceinline__ __host__ uint3 all(uint v) {return make_uint3(v, v, v);}
-                static __device__ __forceinline__ __host__ uint3 make(uint x, uint y, uint z) {return make_uint3(x, y, z);}
-            };
-            template<> struct VecTraits<uint4> 
-            { 
-                typedef uint elem_t; 
-                enum {cn=4}; 
-                static __device__ __forceinline__ __host__ uint4 all(uint v) {return make_uint4(v, v, v, v);}
-                static __device__ __forceinline__ __host__ uint4 make(uint x, uint y, uint z, uint w) {return make_uint4(x, y, z, w);}
-            };
-
-            template<> struct VecTraits<int> 
-            { 
-                typedef int elem_t; 
-                enum {cn=1}; 
-                static __device__ __forceinline__ __host__ int all(int v) {return v;}
-                static __device__ __forceinline__ __host__ int make(int x) {return x;}
-            };
-            template<> struct VecTraits<int1> 
-            { 
-                typedef int elem_t; 
-                enum {cn=1}; 
-                static __device__ __forceinline__ __host__ int1 all(int v) {return make_int1(v);}
-                static __device__ __forceinline__ __host__ int1 make(int x) {return make_int1(x);}
-            };
-            template<> struct VecTraits<int2> 
-            { 
-                typedef int elem_t; 
-                enum {cn=2}; 
-                static __device__ __forceinline__ __host__ int2 all(int v) {return make_int2(v, v);}
-                static __device__ __forceinline__ __host__ int2 make(int x, int y) {return make_int2(x, y);}
-            };
-            template<> struct VecTraits<int3> 
-            { 
-                typedef int elem_t; 
-                enum {cn=3}; 
-                static __device__ __forceinline__ __host__ int3 all(int v) {return make_int3(v, v, v);}
-                static __device__ __forceinline__ __host__ int3 make(int x, int y, int z) {return make_int3(x, y, z);}
-            };
-            template<> struct VecTraits<int4> 
-            { 
-                typedef int elem_t; 
-                enum {cn=4}; 
-                static __device__ __forceinline__ __host__ int4 all(int v) {return make_int4(v, v, v, v);}
-                static __device__ __forceinline__ __host__ int4 make(int x, int y, int z, int w) {return make_int4(x, y, z, w);}
-            };
-
-            template<> struct VecTraits<float> 
-            { 
-                typedef float elem_t; 
-                enum {cn=1}; 
-                static __device__ __forceinline__ __host__ float all(float v) {return v;}
-                static __device__ __forceinline__ __host__ float make(float x) {return x;}
-            };
-            template<> struct VecTraits<float1> 
-            { 
-                typedef float elem_t; 
-                enum {cn=1}; 
-                static __device__ __forceinline__ __host__ float1 all(float v) {return make_float1(v);}
-                static __device__ __forceinline__ __host__ float1 make(float x) {return make_float1(x);}
-            };
-            template<> struct VecTraits<float2> 
-            { 
-                typedef float elem_t; 
-                enum {cn=2}; 
-                static __device__ __forceinline__ __host__ float2 all(float v) {return make_float2(v, v);}
-                static __device__ __forceinline__ __host__ float2 make(float x, float y) {return make_float2(x, y);}
-            };
-            template<> struct VecTraits<float3> 
-            { 
-                typedef float elem_t; 
-                enum {cn=3}; 
-                static __device__ __forceinline__ __host__ float3 all(float v) {return make_float3(v, v, v);}
-                static __device__ __forceinline__ __host__ float3 make(float x, float y, float z) {return make_float3(x, y, z);}
-            };
-            template<> struct VecTraits<float4> 
-            { 
-                typedef float elem_t;
-                enum {cn=4}; 
-                static __device__ __forceinline__ __host__ float4 all(float v) {return make_float4(v, v, v, v);}
-                static __device__ __forceinline__ __host__ float4 make(float x, float y, float z, float w) {return make_float4(x, y, z, w);}
-            };
-
-            template<> struct VecTraits<double> 
-            { 
-                typedef double elem_t; 
-                enum {cn=1}; 
-                static __device__ __forceinline__ __host__ double all(double v) {return v;}
-                static __device__ __forceinline__ __host__ double make(double x) {return x;}
-            };
-            template<> struct VecTraits<double1> 
-            { 
-                typedef double elem_t; 
-                enum {cn=1}; 
-                static __device__ __forceinline__ __host__ double1 all(double v) {return make_double1(v);}
-                static __device__ __forceinline__ __host__ double1 make(double x) {return make_double1(x);}
-            };
-            template<> struct VecTraits<double2> 
-            { 
-                typedef double elem_t; 
-                enum {cn=2}; 
-                static __device__ __forceinline__ __host__ double2 all(double v) {return make_double2(v, v);}
-                static __device__ __forceinline__ __host__ double2 make(double x, double y) {return make_double2(x, y);}
-            };
-            template<> struct VecTraits<double3> 
-            { 
-                typedef double elem_t; 
-                enum {cn=3}; 
-                static __device__ __forceinline__ __host__ double3 all(double v) {return make_double3(v, v, v);}
-                static __device__ __forceinline__ __host__ double3 make(double x, double y, double z) {return make_double3(x, y, z);}
-            };
-            template<> struct VecTraits<double4> 
-            { 
-                typedef double elem_t;
-                enum {cn=4}; 
-                static __device__ __forceinline__ __host__ double4 all(double v) {return make_double4(v, v, v, v);}
-                static __device__ __forceinline__ __host__ double4 make(double x, double y, double z, double w) {return make_double4(x, y, z, w);}
-            };
-
-            template <int cn, typename VecD> struct SatCast;
-            template <typename VecD> struct SatCast<1, VecD>
-            {
-                template <typename VecS>
-                static __device__ __forceinline__ VecD cast(const VecS& v)
-                {
-                    typedef typename VecTraits<VecD>::elem_t D;
-                    return VecTraits<VecD>::make(saturate_cast<D>(v.x));
-                }
-            };
-            template <typename VecD> struct SatCast<2, VecD>
-            {
-                template <typename VecS>
-                static __device__ __forceinline__ VecD cast(const VecS& v)
-                {
-                    typedef typename VecTraits<VecD>::elem_t D;
-                    return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));
-                }
-            };
-            template <typename VecD> struct SatCast<3, VecD>
-            {
-                template <typename VecS>
-                static __device__ __forceinline__ VecD cast(const VecS& v)
-                {
-                    typedef typename VecTraits<VecD>::elem_t D;
-                    return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));
-                }
-            };
-            template <typename VecD> struct SatCast<4, VecD>
-            {
-                template <typename VecS>
-                static __device__ __forceinline__ VecD cast(const VecS& v)
-                {
-                    typedef typename VecTraits<VecD>::elem_t D;
-                    return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));
-                }
-            };
-
-            template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_caller(const VecS& v)
-            {
-                return SatCast<VecTraits<VecD>::cn, VecD>::cast(v);
-            }
-
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar1& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char1& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort1& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short1& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint1& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int1& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float1& v) {return saturate_cast_caller<_Tp>(v);}
-
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar2& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char2& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort2& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short2& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint2& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int2& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float2& v) {return saturate_cast_caller<_Tp>(v);}
-
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar3& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char3& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort3& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short3& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint3& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int3& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float3& v) {return saturate_cast_caller<_Tp>(v);}
-
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar4& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char4& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort4& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short4& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint4& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int4& v) {return saturate_cast_caller<_Tp>(v);}
-            template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float4& v) {return saturate_cast_caller<_Tp>(v);}
-
-            static __device__ __forceinline__  uchar1 operator+(const uchar1& a, const uchar1& b)
-            {
-                return make_uchar1(a.x + b.x);
-            }
-            static __device__ __forceinline__  uchar1 operator-(const uchar1& a, const uchar1& b)
-            {
-                return make_uchar1(a.x - b.x);
-            }
-            static __device__ __forceinline__  uchar1 operator*(const uchar1& a, const uchar1& b)
-            {
-                return make_uchar1(a.x * b.x);
-            }
-            static __device__ __forceinline__  uchar1 operator/(const uchar1& a, const uchar1& b)
-            {
-                return make_uchar1(a.x / b.x);
-            }
-            static __device__ __forceinline__ float1 operator*(const uchar1& a, float s)
-            {
-                return make_float1(a.x * s);
-            }
-
-            static __device__ __forceinline__  uchar2 operator+(const uchar2& a, const uchar2& b)
-            {
-                return make_uchar2(a.x + b.x, a.y + b.y);
-            }
-            static __device__ __forceinline__  uchar2 operator-(const uchar2& a, const uchar2& b)
-            {
-                return make_uchar2(a.x - b.x, a.y - b.y);
-            }
-            static __device__ __forceinline__  uchar2 operator*(const uchar2& a, const uchar2& b)
-            {
-                return make_uchar2(a.x * b.x, a.y * b.y);
-            }
-            static __device__ __forceinline__  uchar2 operator/(const uchar2& a, const uchar2& b)
-            {
-                return make_uchar2(a.x / b.x, a.y / b.y);
-            }
-            static __device__ __forceinline__ float2 operator*(const uchar2& a, float s)
-            {
-                return make_float2(a.x * s, a.y * s);
-            }
-
-            static __device__ __forceinline__  uchar3 operator+(const uchar3& a, const uchar3& b)
-            {
-                return make_uchar3(a.x + b.x, a.y + b.y, a.z + b.z);
-            }
-            static __device__ __forceinline__  uchar3 operator-(const uchar3& a, const uchar3& b)
-            {
-                return make_uchar3(a.x - b.x, a.y - b.y, a.z - b.z);
-            }
-            static __device__ __forceinline__  uchar3 operator*(const uchar3& a, const uchar3& b)
-            {
-                return make_uchar3(a.x * b.x, a.y * b.y, a.z * b.z);
-            }
-            static __device__ __forceinline__  uchar3 operator/(const uchar3& a, const uchar3& b)
-            {
-                return make_uchar3(a.x / b.x, a.y / b.y, a.z / b.z);
-            }
-            static __device__ __forceinline__ float3 operator*(const uchar3& a, float s)
-            {
-                return make_float3(a.x * s, a.y * s, a.z * s);
-            }
-
-            static __device__ __forceinline__  uchar4 operator+(const uchar4& a, const uchar4& b)
-            {
-                return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-            }
-            static __device__ __forceinline__  uchar4 operator-(const uchar4& a, const uchar4& b)
-            {
-                return make_uchar4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-            }
-            static __device__ __forceinline__  uchar4 operator*(const uchar4& a, const uchar4& b)
-            {
-                return make_uchar4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-            }
-            static __device__ __forceinline__  uchar4 operator/(const uchar4& a, const uchar4& b)
-            {
-                return make_uchar4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
-            }
-            static __device__ __forceinline__ float4 operator*(const uchar4& a, float s)
-            {
-                return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
-            }
-
-            static __device__ __forceinline__  char1 operator+(const char1& a, const char1& b)
-            {
-                return make_char1(a.x + b.x);
-            }
-            static __device__ __forceinline__  char1 operator-(const char1& a, const char1& b)
-            {
-                return make_char1(a.x - b.x);
-            }
-            static __device__ __forceinline__  char1 operator*(const char1& a, const char1& b)
-            {
-                return make_char1(a.x * b.x);
-            }
-            static __device__ __forceinline__  char1 operator/(const char1& a, const char1& b)
-            {
-                return make_char1(a.x / b.x);
-            }
-            static __device__ __forceinline__ float1 operator*(const char1& a, float s)
-            {
-                return make_float1(a.x * s);
-            }
-
-            static __device__ __forceinline__  char2 operator+(const char2& a, const char2& b)
-            {
-                return make_char2(a.x + b.x, a.y + b.y);
-            }
-            static __device__ __forceinline__  char2 operator-(const char2& a, const char2& b)
-            {
-                return make_char2(a.x - b.x, a.y - b.y);
-            }
-            static __device__ __forceinline__  char2 operator*(const char2& a, const char2& b)
-            {
-                return make_char2(a.x * b.x, a.y * b.y);
-            }
-            static __device__ __forceinline__  char2 operator/(const char2& a, const char2& b)
-            {
-                return make_char2(a.x / b.x, a.y / b.y);
-            }
-            static __device__ __forceinline__ float2 operator*(const char2& a, float s)
-            {
-                return make_float2(a.x * s, a.y * s);
-            }
-
-            static __device__ __forceinline__  char3 operator+(const char3& a, const char3& b)
-            {
-                return make_char3(a.x + b.x, a.y + b.y, a.z + b.z);
-            }
-            static __device__ __forceinline__  char3 operator-(const char3& a, const char3& b)
-            {
-                return make_char3(a.x - b.x, a.y - b.y, a.z - b.z);
-            }
-            static __device__ __forceinline__  char3 operator*(const char3& a, const char3& b)
-            {
-                return make_char3(a.x * b.x, a.y * b.y, a.z * b.z);
-            }
-            static __device__ __forceinline__  char3 operator/(const char3& a, const char3& b)
-            {
-                return make_char3(a.x / b.x, a.y / b.y, a.z / b.z);
-            }
-            static __device__ __forceinline__ float3 operator*(const char3& a, float s)
-            {
-                return make_float3(a.x * s, a.y * s, a.z * s);
-            }
-
-            static __device__ __forceinline__  char4 operator+(const char4& a, const char4& b)
-            {
-                return make_char4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-            }
-            static __device__ __forceinline__  char4 operator-(const char4& a, const char4& b)
-            {
-                return make_char4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-            }
-            static __device__ __forceinline__  char4 operator*(const char4& a, const char4& b)
-            {
-                return make_char4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-            }
-            static __device__ __forceinline__  char4 operator/(const char4& a, const char4& b)
-            {
-                return make_char4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
-            }
-            static __device__ __forceinline__ float4 operator*(const char4& a, float s)
-            {
-                return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
-            }
-
-            static __device__ __forceinline__  ushort1 operator+(const ushort1& a, const ushort1& b)
-            {
-                return make_ushort1(a.x + b.x);
-            }
-            static __device__ __forceinline__  ushort1 operator-(const ushort1& a, const ushort1& b)
-            {
-                return make_ushort1(a.x - b.x);
-            }
-            static __device__ __forceinline__  ushort1 operator*(const ushort1& a, const ushort1& b)
-            {
-                return make_ushort1(a.x * b.x);
-            }
-            static __device__ __forceinline__  ushort1 operator/(const ushort1& a, const ushort1& b)
-            {
-                return make_ushort1(a.x / b.x);
-            }
-            static __device__ __forceinline__ float1 operator*(const ushort1& a, float s)
-            {
-                return make_float1(a.x * s);
-            }
-
-            static __device__ __forceinline__  ushort2 operator+(const ushort2& a, const ushort2& b)
-            {
-                return make_ushort2(a.x + b.x, a.y + b.y);
-            }
-            static __device__ __forceinline__  ushort2 operator-(const ushort2& a, const ushort2& b)
-            {
-                return make_ushort2(a.x - b.x, a.y - b.y);
-            }
-            static __device__ __forceinline__  ushort2 operator*(const ushort2& a, const ushort2& b)
-            {
-                return make_ushort2(a.x * b.x, a.y * b.y);
-            }
-            static __device__ __forceinline__  ushort2 operator/(const ushort2& a, const ushort2& b)
-            {
-                return make_ushort2(a.x / b.x, a.y / b.y);
-            }
-            static __device__ __forceinline__ float2 operator*(const ushort2& a, float s)
-            {
-                return make_float2(a.x * s, a.y * s);
-            }
-
-            static __device__ __forceinline__  ushort3 operator+(const ushort3& a, const ushort3& b)
-            {
-                return make_ushort3(a.x + b.x, a.y + b.y, a.z + b.z);
-            }
-            static __device__ __forceinline__  ushort3 operator-(const ushort3& a, const ushort3& b)
-            {
-                return make_ushort3(a.x - b.x, a.y - b.y, a.z - b.z);
-            }
-            static __device__ __forceinline__  ushort3 operator*(const ushort3& a, const ushort3& b)
-            {
-                return make_ushort3(a.x * b.x, a.y * b.y, a.z * b.z);
-            }
-            static __device__ __forceinline__  ushort3 operator/(const ushort3& a, const ushort3& b)
-            {
-                return make_ushort3(a.x / b.x, a.y / b.y, a.z / b.z);
-            }
-            static __device__ __forceinline__ float3 operator*(const ushort3& a, float s)
-            {
-                return make_float3(a.x * s, a.y * s, a.z * s);
-            }
-
-            static __device__ __forceinline__  ushort4 operator+(const ushort4& a, const ushort4& b)
-            {
-                return make_ushort4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-            }
-            static __device__ __forceinline__  ushort4 operator-(const ushort4& a, const ushort4& b)
-            {
-                return make_ushort4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-            }
-            static __device__ __forceinline__  ushort4 operator*(const ushort4& a, const ushort4& b)
-            {
-                return make_ushort4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-            }
-            static __device__ __forceinline__  ushort4 operator/(const ushort4& a, const ushort4& b)
-            {
-                return make_ushort4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
-            }
-            static __device__ __forceinline__ float4 operator*(const ushort4& a, float s)
-            {
-                return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
-            }
-
-            static __device__ __forceinline__  short1 operator+(const short1& a, const short1& b)
-            {
-                return make_short1(a.x + b.x);
-            }
-            static __device__ __forceinline__  short1 operator-(const short1& a, const short1& b)
-            {
-                return make_short1(a.x - b.x);
-            }
-            static __device__ __forceinline__  short1 operator*(const short1& a, const short1& b)
-            {
-                return make_short1(a.x * b.x);
-            }
-            static __device__ __forceinline__  short1 operator/(const short1& a, const short1& b)
-            {
-                return make_short1(a.x / b.x);
-            }
-            static __device__ __forceinline__ float1 operator*(const short1& a, float s)
-            {
-                return make_float1(a.x * s);
-            }
-
-            static __device__ __forceinline__  short2 operator+(const short2& a, const short2& b)
-            {
-                return make_short2(a.x + b.x, a.y + b.y);
-            }
-            static __device__ __forceinline__  short2 operator-(const short2& a, const short2& b)
-            {
-                return make_short2(a.x - b.x, a.y - b.y);
-            }
-            static __device__ __forceinline__  short2 operator*(const short2& a, const short2& b)
-            {
-                return make_short2(a.x * b.x, a.y * b.y);
-            }
-            static __device__ __forceinline__  short2 operator/(const short2& a, const short2& b)
-            {
-                return make_short2(a.x / b.x, a.y / b.y);
-            }
-            static __device__ __forceinline__ float2 operator*(const short2& a, float s)
-            {
-                return make_float2(a.x * s, a.y * s);
-            }
-
-            static __device__ __forceinline__  short3 operator+(const short3& a, const short3& b)
-            {
-                return make_short3(a.x + b.x, a.y + b.y, a.z + b.z);
-            }
-            static __device__ __forceinline__  short3 operator-(const short3& a, const short3& b)
-            {
-                return make_short3(a.x - b.x, a.y - b.y, a.z - b.z);
-            }
-            static __device__ __forceinline__  short3 operator*(const short3& a, const short3& b)
-            {
-                return make_short3(a.x * b.x, a.y * b.y, a.z * b.z);
-            }
-            static __device__ __forceinline__  short3 operator/(const short3& a, const short3& b)
-            {
-                return make_short3(a.x / b.x, a.y / b.y, a.z / b.z);
-            }
-            static __device__ __forceinline__ float3 operator*(const short3& a, float s)
-            {
-                return make_float3(a.x * s, a.y * s, a.z * s);
-            }
-
-            static __device__ __forceinline__  short4 operator+(const short4& a, const short4& b)
-            {
-                return make_short4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-            }
-            static __device__ __forceinline__  short4 operator-(const short4& a, const short4& b)
-            {
-                return make_short4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-            }
-            static __device__ __forceinline__  short4 operator*(const short4& a, const short4& b)
-            {
-                return make_short4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-            }
-            static __device__ __forceinline__  short4 operator/(const short4& a, const short4& b)
-            {
-                return make_short4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
-            }
-            static __device__ __forceinline__ float4 operator*(const short4& a, float s)
-            {
-                return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
-            }
-
-            static __device__ __forceinline__  int1 operator+(const int1& a, const int1& b)
-            {
-                return make_int1(a.x + b.x);
-            }
-            static __device__ __forceinline__  int1 operator-(const int1& a, const int1& b)
-            {
-                return make_int1(a.x - b.x);
-            }
-            static __device__ __forceinline__  int1 operator*(const int1& a, const int1& b)
-            {
-                return make_int1(a.x * b.x);
-            }
-            static __device__ __forceinline__  int1 operator/(const int1& a, const int1& b)
-            {
-                return make_int1(a.x / b.x);
-            }
-            static __device__ __forceinline__ float1 operator*(const int1& a, float s)
-            {
-                return make_float1(a.x * s);
-            }
-
-            static __device__ __forceinline__  int2 operator+(const int2& a, const int2& b)
-            {
-                return make_int2(a.x + b.x, a.y + b.y);
-            }
-            static __device__ __forceinline__  int2 operator-(const int2& a, const int2& b)
-            {
-                return make_int2(a.x - b.x, a.y - b.y);
-            }
-            static __device__ __forceinline__  int2 operator*(const int2& a, const int2& b)
-            {
-                return make_int2(a.x * b.x, a.y * b.y);
-            }
-            static __device__ __forceinline__  int2 operator/(const int2& a, const int2& b)
-            {
-                return make_int2(a.x / b.x, a.y / b.y);
-            }
-            static __device__ __forceinline__ float2 operator*(const int2& a, float s)
-            {
-                return make_float2(a.x * s, a.y * s);
-            }
-
-            static __device__ __forceinline__  int3 operator+(const int3& a, const int3& b)
-            {
-                return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
-            }
-            static __device__ __forceinline__  int3 operator-(const int3& a, const int3& b)
-            {
-                return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
-            }
-            static __device__ __forceinline__  int3 operator*(const int3& a, const int3& b)
-            {
-                return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
-            }
-            static __device__ __forceinline__  int3 operator/(const int3& a, const int3& b)
-            {
-                return make_int3(a.x / b.x, a.y / b.y, a.z / b.z);
-            }
-            static __device__ __forceinline__ float3 operator*(const int3& a, float s)
-            {
-                return make_float3(a.x * s, a.y * s, a.z * s);
-            }
-
-            static __device__ __forceinline__  int4 operator+(const int4& a, const int4& b)
-            {
-                return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-            }
-            static __device__ __forceinline__  int4 operator-(const int4& a, const int4& b)
-            {
-                return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-            }
-            static __device__ __forceinline__  int4 operator*(const int4& a, const int4& b)
-            {
-                return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-            }
-            static __device__ __forceinline__  int4 operator/(const int4& a, const int4& b)
-            {
-                return make_int4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
-            }
-            static __device__ __forceinline__ float4 operator*(const int4& a, float s)
-            {
-                return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
-            }
-
-
-            static __device__ __forceinline__  uint1 operator+(const uint1& a, const uint1& b)
-            {
-                return make_uint1(a.x + b.x);
-            }
-            static __device__ __forceinline__  uint1 operator-(const uint1& a, const uint1& b)
-            {
-                return make_uint1(a.x - b.x);
-            }
-            static __device__ __forceinline__  uint1 operator*(const uint1& a, const uint1& b)
-            {
-                return make_uint1(a.x * b.x);
-            }
-            static __device__ __forceinline__  uint1 operator/(const uint1& a, const uint1& b)
-            {
-                return make_uint1(a.x / b.x);
-            }
-            static __device__ __forceinline__ float1 operator*(const uint1& a, float s)
-            {
-                return make_float1(a.x * s);
-            }
-
-            static __device__ __forceinline__  uint2 operator+(const uint2& a, const uint2& b)
-            {
-                return make_uint2(a.x + b.x, a.y + b.y);
-            }
-            static __device__ __forceinline__  uint2 operator-(const uint2& a, const uint2& b)
-            {
-                return make_uint2(a.x - b.x, a.y - b.y);
-            }
-            static __device__ __forceinline__  uint2 operator*(const uint2& a, const uint2& b)
-            {
-                return make_uint2(a.x * b.x, a.y * b.y);
-            }
-            static __device__ __forceinline__  uint2 operator/(const uint2& a, const uint2& b)
-            {
-                return make_uint2(a.x / b.x, a.y / b.y);
-            }
-            static __device__ __forceinline__ float2 operator*(const uint2& a, float s)
-            {
-                return make_float2(a.x * s, a.y * s);
-            }
-
-            static __device__ __forceinline__  uint3 operator+(const uint3& a, const uint3& b)
-            {
-                return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
-            }
-            static __device__ __forceinline__  uint3 operator-(const uint3& a, const uint3& b)
-            {
-                return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
-            }
-            static __device__ __forceinline__  uint3 operator*(const uint3& a, const uint3& b)
-            {
-                return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
-            }
-            static __device__ __forceinline__  uint3 operator/(const uint3& a, const uint3& b)
-            {
-                return make_uint3(a.x / b.x, a.y / b.y, a.z / b.z);
-            }
-            static __device__ __forceinline__ float3 operator*(const uint3& a, float s)
-            {
-                return make_float3(a.x * s, a.y * s, a.z * s);
-            }
-
-            static __device__ __forceinline__  uint4 operator+(const uint4& a, const uint4& b)
-            {
-                return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-            }
-            static __device__ __forceinline__  uint4 operator-(const uint4& a, const uint4& b)
-            {
-                return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-            }
-            static __device__ __forceinline__  uint4 operator*(const uint4& a, const uint4& b)
-            {
-                return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-            }
-            static __device__ __forceinline__  uint4 operator/(const uint4& a, const uint4& b)
-            {
-                return make_uint4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
-            }
-            static __device__ __forceinline__ float4 operator*(const uint4& a, float s)
-            {
-                return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
-            }
-
-            static __device__ __forceinline__  float1 operator+(const float1& a, const float1& b)
-            {
-                return make_float1(a.x + b.x);
-            }
-            static __device__ __forceinline__  float1 operator-(const float1& a, const float1& b)
-            {
-                return make_float1(a.x - b.x);
-            }
-            static __device__ __forceinline__  float1 operator*(const float1& a, const float1& b)
-            {
-                return make_float1(a.x * b.x);
-            }
-            static __device__ __forceinline__  float1 operator/(const float1& a, const float1& b)
-            {
-                return make_float1(a.x / b.x);
-            }
-            static __device__ __forceinline__ float1 operator*(const float1& a, float s)
-            {
-                return make_float1(a.x * s);
-            }
-
-            static __device__ __forceinline__  float2 operator+(const float2& a, const float2& b)
-            {
-                return make_float2(a.x + b.x, a.y + b.y);
-            }
-            static __device__ __forceinline__  float2 operator-(const float2& a, const float2& b)
-            {
-                return make_float2(a.x - b.x, a.y - b.y);
-            }
-            static __device__ __forceinline__  float2 operator*(const float2& a, const float2& b)
-            {
-                return make_float2(a.x * b.x, a.y * b.y);
-            }
-            static __device__ __forceinline__  float2 operator/(const float2& a, const float2& b)
-            {
-                return make_float2(a.x / b.x, a.y / b.y);
-            }
-            static __device__ __forceinline__ float2 operator*(const float2& a, float s)
-            {
-                return make_float2(a.x * s, a.y * s);
-            }
-
-            static __device__ __forceinline__  float3 operator+(const float3& a, const float3& b)
-            {
-                return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-            }
-            static __device__ __forceinline__  float3 operator-(const float3& a, const float3& b)
-            {
-                return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
-            }
-            static __device__ __forceinline__  float3 operator*(const float3& a, const float3& b)
-            {
-                return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
-            }
-            static __device__ __forceinline__  float3 operator/(const float3& a, const float3& b)
-            {
-                return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
-            }
-            static __device__ __forceinline__ float3 operator*(const float3& a, float s)
-            {
-                return make_float3(a.x * s, a.y * s, a.z * s);
-            }
-
-            static __device__ __forceinline__  float4 operator+(const float4& a, const float4& b)
-            {
-                return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-            }
-            static __device__ __forceinline__  float4 operator-(const float4& a, const float4& b)
-            {
-                return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-            }
-            static __device__ __forceinline__  float4 operator*(const float4& a, const float4& b)
-            {
-                return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-            }
-            static __device__ __forceinline__  float4 operator/(const float4& a, const float4& b)
-            {
-                return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
-            }
-            static __device__ __forceinline__ float4 operator*(const float4& a, float s)
-            {
-                return make_float4(a.x * s, a.y * s, a.z * s, a.w * s);
-            }
-        }            
-    }
-}
-
-#endif // __OPENCV_GPU_VECMATH_HPP__
\ No newline at end of file
diff --git a/modules/gpu/src/surf.cpp b/modules/gpu/src/surf.cpp
index 2ec6b2992b03058ce00fe283de93b43a83f08c0f..5ebe73919a03f433f3ab982d8472803bc360782a 100644
--- a/modules/gpu/src/surf.cpp
+++ b/modules/gpu/src/surf.cpp
@@ -95,9 +95,7 @@ namespace
 
             img_cols(img.cols), img_rows(img.rows),
 
-            use_mask(!mask.empty()),
-
-            upright(surf.upright)
+            use_mask(!mask.empty())
         {
             CV_Assert(!img.empty() && img.type() == CV_8UC1);
             CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
@@ -224,8 +222,6 @@ namespace
 
         bool use_mask;
 
-        bool upright;
-
         int maxCandidates;
         int maxFeatures;
 
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index eb762c1d7c3fa9a0b82f97030b6f6f63daf632bd..a101191ca7d7033990860fbe92cbecb57e8310c5 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -869,7 +869,7 @@ TEST(GaussianBlur)
 
 TEST(pyrDown)
 {
-    gpu::PyrDownBuf buf;
+    gpu::PyrDownBuf buf(Size(4000, 4000), CV_16SC3);
 
     for (int size = 4000; size >= 1000; size -= 1000)
     {
@@ -893,7 +893,7 @@ TEST(pyrDown)
 
 TEST(pyrUp)
 {
-    gpu::PyrUpBuf buf;
+    gpu::PyrUpBuf buf(Size(4000, 4000), CV_16SC3);
 
     for (int size = 4000; size >= 1000; size -= 1000)
     {
@@ -914,3 +914,26 @@ TEST(pyrUp)
         GPU_OFF;
     }
 }
+
+
+TEST(equalizeHist) // Runs equalizeHist and gpu::equalizeHist on the same generated CV_8UC1 input, per size.
+{
+    for (int size = 1000; size < 4000; size += 1000) // size takes 1000, 2000, 3000; 4000 is excluded by '<'
+    {
+        SUBTEST << "size " << size; // presumably labels this iteration in the report; macro is defined elsewhere
+
+        Mat src; gen(src, size, size, CV_8UC1, 0, 256); // NOTE(review): gen() looks like a fill with values in [0, 256) for a size x size image; confirm
+        Mat dst(src.size(), src.type()); // dst is constructed before CPU_ON, i.e. outside the CPU_ON/CPU_OFF pair
+
+        CPU_ON;
+        equalizeHist(src, dst);
+        CPU_OFF;
+
+        gpu::GpuMat d_src(src); // both GpuMat objects are constructed before GPU_ON, i.e. outside the GPU_ON/GPU_OFF pair
+        gpu::GpuMat d_dst(src.size(), src.type());
+
+        GPU_ON;
+        gpu::equalizeHist(d_src, d_dst);
+        GPU_OFF;
+    }
+}