Commit fcfa7208 authored by Vladislav Vinogradov

moved GpuMat and DevMem2D to core module, some code refactoring

Parent 8a148e39
......@@ -90,6 +90,10 @@ class Mat;
class SparseMat;
typedef Mat MatND;
namespace gpu {
class GpuMat;
}
class CV_EXPORTS MatExpr;
class CV_EXPORTS MatOp_Base;
class CV_EXPORTS MatArg;
......@@ -1627,6 +1631,10 @@ public:
template<typename _Tp> explicit Mat(const Point3_<_Tp>& pt, bool copyData=true);
//! builds matrix from comma initializer
template<typename _Tp> explicit Mat(const MatCommaInitializer_<_Tp>& commaInitializer);
//! download data from GpuMat
explicit Mat(const gpu::GpuMat& m);
//! destructor - calls release()
~Mat();
//! assignment operators
......
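With the gpu::GpuMat forward declaration in place, Mat gains an explicit download constructor. A minimal usage sketch (the variable names are illustrative, not part of this commit):

cv::gpu::GpuMat d_img(480, 640, CV_8UC1, cv::Scalar::all(0));

cv::Mat h_img(d_img); // explicit constructor: blocking device-to-host download
// cv::Mat h2 = d_img; // would not compile: the constructor is explicit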
......@@ -40,103 +40,118 @@
//
//M*/
#ifndef __OPENCV_GPU_MATRIX_OPERATIONS_HPP__
#define __OPENCV_GPU_MATRIX_OPERATIONS_HPP__
#ifndef __OPENCV_CORE_DevMem2D_HPP__
#define __OPENCV_CORE_DevMem2D_HPP__
#ifdef __CUDACC__
#define __CV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
#else
#define __CV_GPU_HOST_DEVICE__
#endif
namespace cv
{
namespace gpu
{
// Simple lightweight structures that encapsulate information about an image on the device.
// They are intended to be passed to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile
namespace gpu
{
///////////////////////////////////////////////////////////////////////
//////////////////////////////// CudaMem ////////////////////////////////
///////////////////////////////////////////////////////////////////////
template <bool expr> struct StaticAssert;
template <> struct StaticAssert<true> {static __CV_GPU_HOST_DEVICE__ void check(){}};
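StaticAssert is a pre-C++11 compile-time check: only the true specialization is defined, so instantiating check() with a false condition fails to compile. PtrElemStep_ below uses it to verify that sizeof(T) divides 256 before converting the byte pitch to an element stride (the pitch is assumed to be 256-byte aligned). A tiny illustration:

StaticAssert<(sizeof(int) <= 8)>::check(); // compiles: the condition is true
// StaticAssert<(sizeof(int) > 8)>::check(); // would not compile: no such specialization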
inline CudaMem::CudaMem() : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0) {}
inline CudaMem::CudaMem(int _rows, int _cols, int _type, int _alloc_type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
{
if( _rows > 0 && _cols > 0 )
create( _rows, _cols, _type, _alloc_type);
}
template<typename T> struct DevPtr
{
typedef T elem_type;
typedef int index_type;
inline CudaMem::CudaMem(Size _size, int _type, int _alloc_type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
{
if( _size.height > 0 && _size.width > 0 )
create( _size.height, _size.width, _type, _alloc_type);
}
enum { elem_size = sizeof(elem_type) };
inline CudaMem::CudaMem(const CudaMem& m) : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type)
{
if( refcount )
CV_XADD(refcount, 1);
}
T* data;
inline CudaMem::CudaMem(const Mat& m, int _alloc_type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
{
if( m.rows > 0 && m.cols > 0 )
create( m.size(), m.type(), _alloc_type);
__CV_GPU_HOST_DEVICE__ DevPtr() : data(0) {}
__CV_GPU_HOST_DEVICE__ DevPtr(T* data_) : data(data_) {}
Mat tmp = createMatHeader();
m.copyTo(tmp);
}
__CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
__CV_GPU_HOST_DEVICE__ operator T*() { return data; }
__CV_GPU_HOST_DEVICE__ operator const T*() const { return data; }
};
inline CudaMem::~CudaMem()
{
release();
template<typename T> struct PtrSz : public DevPtr<T>
{
__CV_GPU_HOST_DEVICE__ PtrSz() : size(0) {}
__CV_GPU_HOST_DEVICE__ PtrSz(T* data_, size_t size_) : DevPtr<T>(data_), size(size_) {}
}
size_t size;
};
inline CudaMem& CudaMem::operator = (const CudaMem& m)
{
if( this != &m )
template<typename T> struct PtrStep : public DevPtr<T>
{
if( m.refcount )
CV_XADD(m.refcount, 1);
release();
flags = m.flags;
rows = m.rows; cols = m.cols;
step = m.step; data = m.data;
datastart = m.datastart;
dataend = m.dataend;
refcount = m.refcount;
alloc_type = m.alloc_type;
}
return *this;
}
__CV_GPU_HOST_DEVICE__ PtrStep() : step(0) {}
__CV_GPU_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}
inline CudaMem CudaMem::clone() const
{
CudaMem m(size(), type(), alloc_type);
Mat to = m;
Mat from = *this;
from.copyTo(to);
return m;
}
/** \brief stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!! */
size_t step;
inline void CudaMem::create(Size _size, int _type, int _alloc_type) { create(_size.height, _size.width, _type, _alloc_type); }
__CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return ( T*)( ( char*)DevPtr<T>::data + y * step); }
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)DevPtr<T>::data + y * step); }
__CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
__CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
};
//CPP void CudaMem::create(int _rows, int _cols, int _type, int _alloc_type);
//CPP void CudaMem::release();
template <typename T> struct PtrStepSz : public PtrStep<T>
{
__CV_GPU_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}
__CV_GPU_HOST_DEVICE__ PtrStepSz(int rows_, int cols_, T* data_, size_t step_)
: PtrStep<T>(data_, step_), cols(cols_), rows(rows_) {}
inline Mat CudaMem::createMatHeader() const { return Mat(size(), type(), data, step); }
inline CudaMem::operator Mat() const { return createMatHeader(); }
int cols;
int rows;
};
inline CudaMem::operator GpuMat() const { return createGpuMatHeader(); }
//CPP GpuMat CudaMem::createGpuMatHeader() const;
template <typename T> struct DevMem2D_ : public PtrStepSz<T>
{
DevMem2D_() {}
DevMem2D_(int rows_, int cols_, T* data_, size_t step_) : PtrStepSz<T>(rows_, cols_, data_, step_) {}
inline bool CudaMem::isContinuous() const { return (flags & Mat::CONTINUOUS_FLAG) != 0; }
inline size_t CudaMem::elemSize() const { return CV_ELEM_SIZE(flags); }
inline size_t CudaMem::elemSize1() const { return CV_ELEM_SIZE1(flags); }
inline int CudaMem::type() const { return CV_MAT_TYPE(flags); }
inline int CudaMem::depth() const { return CV_MAT_DEPTH(flags); }
inline int CudaMem::channels() const { return CV_MAT_CN(flags); }
inline size_t CudaMem::step1() const { return step/elemSize1(); }
inline Size CudaMem::size() const { return Size(cols, rows); }
inline bool CudaMem::empty() const { return data == 0; }
template <typename U>
explicit DevMem2D_(const DevMem2D_<U>& d) : PtrStepSz<T>(d.rows, d.cols, (T*)d.data, d.step) {}
};
} /* end of namespace gpu */
template<typename T> struct PtrElemStep_ : public PtrStep<T>
{
PtrElemStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step)
{
StaticAssert<256 % sizeof(T) == 0>::check();
PtrStep<T>::step /= PtrStep<T>::elem_size;
}
__CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return PtrStep<T>::data + y * PtrStep<T>::step; }
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return PtrStep<T>::data + y * PtrStep<T>::step; }
__CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
__CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
};
} /* end of namespace cv */
template<typename T> struct PtrStep_ : public PtrStep<T>
{
PtrStep_() {}
PtrStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step) {}
};
typedef DevMem2D_<unsigned char> DevMem2Db;
typedef DevMem2Db DevMem2D;
typedef DevMem2D_<float> DevMem2Df;
typedef DevMem2D_<int> DevMem2Di;
typedef PtrStep<unsigned char> PtrStepb;
typedef PtrStep<float> PtrStepf;
typedef PtrStep<int> PtrStepi;
typedef PtrElemStep_<unsigned char> PtrElemStep;
typedef PtrElemStep_<float> PtrElemStepf;
typedef PtrElemStep_<int> PtrElemStepi;
}
}
#endif /* __OPENCV_GPU_MATRIX_OPERATIONS_HPP__ */
#endif /* __OPENCV_CORE_DevMem2D_HPP__ */
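These structures let nvcc-compiled translation units receive image geometry (data pointer, byte stride, size) without pulling in core headers that nvcc cannot compile. A minimal sketch of the intended usage pattern, with a hypothetical addScalarKernel that is not part of this commit:

__global__ void addScalarKernel(cv::gpu::PtrStepSz<float> img, float val)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < img.cols && y < img.rows)
        img(y, x) += val; // operator() applies the byte stride internally
}

void addScalarCaller(cv::gpu::DevMem2Df img, float val, cudaStream_t stream)
{
    const dim3 block(32, 8);
    const dim3 grid((img.cols + block.x - 1) / block.x,
                    (img.rows + block.y - 1) / block.y);
    addScalarKernel<<<grid, block, 0, stream>>>(img, val); // DevMem2Df is-a PtrStepSz<float>
}

On the host side a GpuMat converts implicitly to DevMem2D_<T> (see the conversion operators below), so wrapper code never has to touch raw pointers and pitches.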
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPUMAT_HPP__
#define __OPENCV_GPUMAT_HPP__
#include "opencv2/core/core.hpp"
#include "opencv2/core/devmem2d.hpp"
namespace cv { namespace gpu
{
//! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat.
class CV_EXPORTS GpuMat
{
public:
//! default constructor
GpuMat();
//! constructs GpuMatrix of the specified size and type (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)
GpuMat(int rows, int cols, int type);
GpuMat(Size size, int type);
//! constructs GpuMatrix and fills it with the specified value _s.
GpuMat(int rows, int cols, int type, Scalar s);
GpuMat(Size size, int type, Scalar s);
//! copy constructor
GpuMat(const GpuMat& m);
//! constructor for GpuMatrix headers pointing to user-allocated data
GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP);
GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP);
//! creates a matrix header for a part of the bigger matrix
GpuMat(const GpuMat& m, Range rowRange, Range colRange);
GpuMat(const GpuMat& m, Rect roi);
//! builds GpuMat from Mat. Performs a blocking upload to the device.
explicit GpuMat(const Mat& m);
//! destructor - calls release()
~GpuMat();
//! assignment operators
GpuMat& operator = (const GpuMat& m);
//! performs a blocking upload of data to GpuMat.
void upload(const Mat& m);
//! downloads data from device to host memory. Blocking call.
void download(Mat& m) const;
//! returns a new GpuMatrix header for the specified row
GpuMat row(int y) const;
//! returns a new GpuMatrix header for the specified column
GpuMat col(int x) const;
//! ... for the specified row span
GpuMat rowRange(int startrow, int endrow) const;
GpuMat rowRange(Range r) const;
//! ... for the specified column span
GpuMat colRange(int startcol, int endcol) const;
GpuMat colRange(Range r) const;
//! returns deep copy of the GpuMatrix, i.e. the data is copied
GpuMat clone() const;
//! copies the GpuMatrix content to "m".
// It calls m.create(this->size(), this->type()).
void copyTo(GpuMat& m) const;
//! copies those GpuMatrix elements to "m" that are marked with non-zero mask elements.
void copyTo(GpuMat& m, const GpuMat& mask) const;
//! converts GpuMatrix to another datatype with optional scaling. See cvConvertScale.
void convertTo(GpuMat& m, int rtype, double alpha = 1, double beta = 0) const;
void assignTo(GpuMat& m, int type=-1) const;
//! sets every GpuMatrix element to s
GpuMat& operator = (Scalar s);
//! sets some of the GpuMatrix elements to s, according to the mask
GpuMat& setTo(Scalar s, const GpuMat& mask = GpuMat());
//! creates alternative GpuMatrix header for the same data, with different
// number of channels and/or different number of rows. see cvReshape.
GpuMat reshape(int cn, int rows = 0) const;
//! allocates new GpuMatrix data unless the GpuMatrix already has specified size and type.
// previous data is unreferenced if needed.
void create(int rows, int cols, int type);
void create(Size size, int type);
//! decreases reference counter;
// deallocate the data when reference counter reaches 0.
void release();
//! swaps with other smart pointer
void swap(GpuMat& mat);
//! locates GpuMatrix header within a parent GpuMatrix. See below
void locateROI(Size& wholeSize, Point& ofs) const;
//! moves/resizes the current GpuMatrix ROI inside the parent GpuMatrix.
GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright);
//! extracts a rectangular sub-GpuMatrix
// (this is a generalized form of row, rowRange etc.)
GpuMat operator()(Range rowRange, Range colRange) const;
GpuMat operator()(Rect roi) const;
//! returns true iff the GpuMatrix data is continuous
// (i.e. when there are no gaps between successive rows).
// similar to CV_IS_GpuMat_CONT(cvGpuMat->type)
bool isContinuous() const;
//! returns element size in bytes,
// similar to CV_ELEM_SIZE(cvMat->type)
size_t elemSize() const;
//! returns the size of element channel in bytes.
size_t elemSize1() const;
//! returns element type, similar to CV_MAT_TYPE(cvMat->type)
int type() const;
//! returns element type, similar to CV_MAT_DEPTH(cvMat->type)
int depth() const;
//! returns element type, similar to CV_MAT_CN(cvMat->type)
int channels() const;
//! returns step/elemSize1()
size_t step1() const;
//! returns GpuMatrix size:
// width == number of columns, height == number of rows
Size size() const;
//! returns true if GpuMatrix data is NULL
bool empty() const;
//! returns pointer to y-th row
uchar* ptr(int y = 0);
const uchar* ptr(int y = 0) const;
//! template version of the above method
template<typename _Tp> _Tp* ptr(int y = 0);
template<typename _Tp> const _Tp* ptr(int y = 0) const;
template <typename _Tp> operator DevMem2D_<_Tp>() const;
template <typename _Tp> operator PtrStep_<_Tp>() const;
/*! includes several bit-fields:
- the magic signature
- continuity flag
- depth
- number of channels
*/
int flags;
//! the number of rows and columns
int rows, cols;
//! a distance between successive rows in bytes; includes the gap if any
size_t step;
//! pointer to the data
uchar* data;
//! pointer to the reference counter;
// when GpuMatrix points to user-allocated data, the pointer is NULL
int* refcount;
//! helper fields used in locateROI and adjustROI
uchar* datastart;
uchar* dataend;
};
//! Creates continuous GPU matrix
CV_EXPORTS void createContinuous(int rows, int cols, int type, GpuMat& m);
CV_EXPORTS GpuMat createContinuous(int rows, int cols, int type);
CV_EXPORTS void createContinuous(Size size, int type, GpuMat& m);
CV_EXPORTS GpuMat createContinuous(Size size, int type);
//! Ensures that the size of the given matrix is not less than (rows, cols)
//! and that its type matches the specified one
CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m);
CV_EXPORTS void ensureSizeIsEnough(Size size, int type, GpuMat& m);
class CV_EXPORTS GpuFuncTable
{
public:
virtual ~GpuFuncTable() {}
virtual void copy(const Mat& src, GpuMat& dst) const = 0;
virtual void copy(const GpuMat& src, Mat& dst) const = 0;
virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;
virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;
virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0;
virtual void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const = 0;
virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
virtual void free(void* devPtr) const = 0;
};
CV_EXPORTS void setGpuFuncTable(const GpuFuncTable* funcTbl);
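GpuFuncTable is the seam between the GpuMat interface, which now lives in core, and the CUDA routines that stay in the gpu module: core calls through the registered table, and the gpu module installs a CUDA-backed implementation via setGpuFuncTable(). A hypothetical fallback table of the kind core could default to when no backend is registered (class name and error text are assumptions, not from this commit):

class EmptyFuncTable : public GpuFuncTable
{
public:
    // every operation reports that no CUDA backend was registered
    void copy(const Mat&, GpuMat&) const { noGpu(); }
    void copy(const GpuMat&, Mat&) const { noGpu(); }
    void copy(const GpuMat&, GpuMat&) const { noGpu(); }
    void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { noGpu(); }
    void convert(const GpuMat&, GpuMat&) const { noGpu(); }
    void convert(const GpuMat&, GpuMat&, double, double) const { noGpu(); }
    void setTo(GpuMat&, Scalar, const GpuMat&) const { noGpu(); }
    void mallocPitch(void**, size_t*, size_t, size_t) const { noGpu(); }
    void free(void*) const {}

private:
    static void noGpu() { CV_Error(CV_StsNotImplemented, "The library is compiled without CUDA support"); }
};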
////////////////////////////////////////////////////////////////////////
inline GpuMat::GpuMat()
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
}
inline GpuMat::GpuMat(int rows_, int cols_, int type_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
if (rows_ > 0 && cols_ > 0)
create(rows_, cols_, type_);
}
inline GpuMat::GpuMat(Size size_, int type_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
if (size_.height > 0 && size_.width > 0)
create(size_.height, size_.width, type_);
}
inline GpuMat::GpuMat(int rows_, int cols_, int type_, Scalar s_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
if (rows_ > 0 && cols_ > 0)
{
create(rows_, cols_, type_);
setTo(s_);
}
}
inline GpuMat::GpuMat(Size size_, int type_, Scalar s_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
if (size_.height > 0 && size_.width > 0)
{
create(size_.height, size_.width, type_);
setTo(s_);
}
}
inline GpuMat::~GpuMat()
{
release();
}
inline GpuMat GpuMat::clone() const
{
GpuMat m;
copyTo(m);
return m;
}
inline void GpuMat::assignTo(GpuMat& m, int type) const
{
if (type < 0)
m = *this;
else
convertTo(m, type);
}
inline size_t GpuMat::step1() const
{
return step / elemSize1();
}
inline bool GpuMat::empty() const
{
return data == 0;
}
template<typename _Tp> inline _Tp* GpuMat::ptr(int y)
{
return (_Tp*)ptr(y);
}
template<typename _Tp> inline const _Tp* GpuMat::ptr(int y) const
{
return (const _Tp*)ptr(y);
}
inline void swap(GpuMat& a, GpuMat& b)
{
a.swap(b);
}
inline GpuMat GpuMat::row(int y) const
{
return GpuMat(*this, Range(y, y+1), Range::all());
}
inline GpuMat GpuMat::col(int x) const
{
return GpuMat(*this, Range::all(), Range(x, x+1));
}
inline GpuMat GpuMat::rowRange(int startrow, int endrow) const
{
return GpuMat(*this, Range(startrow, endrow), Range::all());
}
inline GpuMat GpuMat::rowRange(Range r) const
{
return GpuMat(*this, r, Range::all());
}
inline GpuMat GpuMat::colRange(int startcol, int endcol) const
{
return GpuMat(*this, Range::all(), Range(startcol, endcol));
}
inline GpuMat GpuMat::colRange(Range r) const
{
return GpuMat(*this, Range::all(), r);
}
inline void GpuMat::create(Size size_, int type_)
{
create(size_.height, size_.width, type_);
}
inline GpuMat GpuMat::operator()(Range rowRange, Range colRange) const
{
return GpuMat(*this, rowRange, colRange);
}
inline GpuMat GpuMat::operator()(Rect roi) const
{
return GpuMat(*this, roi);
}
inline bool GpuMat::isContinuous() const
{
return (flags & Mat::CONTINUOUS_FLAG) != 0;
}
inline size_t GpuMat::elemSize() const
{
return CV_ELEM_SIZE(flags);
}
inline size_t GpuMat::elemSize1() const
{
return CV_ELEM_SIZE1(flags);
}
inline int GpuMat::type() const
{
return CV_MAT_TYPE(flags);
}
inline int GpuMat::depth() const
{
return CV_MAT_DEPTH(flags);
}
inline int GpuMat::channels() const
{
return CV_MAT_CN(flags);
}
inline Size GpuMat::size() const
{
return Size(cols, rows);
}
inline uchar* GpuMat::ptr(int y)
{
CV_DbgAssert((unsigned)y < (unsigned)rows);
return data + step * y;
}
inline const uchar* GpuMat::ptr(int y) const
{
CV_DbgAssert((unsigned)y < (unsigned)rows);
return data + step * y;
}
inline GpuMat& GpuMat::operator = (Scalar s)
{
setTo(s);
return *this;
}
template <class T> inline GpuMat::operator DevMem2D_<T>() const
{
return DevMem2D_<T>(rows, cols, (T*)data, step);
}
template <class T> inline GpuMat::operator PtrStep_<T>() const
{
return PtrStep_<T>(static_cast< DevMem2D_<T> >(*this));
}
inline GpuMat createContinuous(int rows, int cols, int type)
{
GpuMat m;
createContinuous(rows, cols, type, m);
return m;
}
inline void createContinuous(Size size, int type, GpuMat& m)
{
createContinuous(size.height, size.width, type, m);
}
inline GpuMat createContinuous(Size size, int type)
{
GpuMat m;
createContinuous(size, type, m);
return m;
}
inline void ensureSizeIsEnough(Size size, int type, GpuMat& m)
{
ensureSizeIsEnough(size.height, size.width, type, m);
}
inline void createContinuous(int rows, int cols, int type, GpuMat& m)
{
int area = rows * cols;
if (!m.isContinuous() || m.type() != type || m.size().area() != area)
m.create(1, area, type);
m = m.reshape(0, rows);
}
inline void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m)
{
if (m.type() == type && m.rows >= rows && m.cols >= cols)
m = m(Rect(0, 0, cols, rows));
else
m.create(rows, cols, type);
}
}}
#endif // __OPENCV_GPUMAT_HPP__
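The two helpers above offer different guarantees: createContinuous allocates one gap-free block and reshapes it, so isContinuous() holds afterwards, while ensureSizeIsEnough reuses a sufficiently large buffer of the right type and hands back an ROI header instead of reallocating. A short usage sketch:

cv::gpu::GpuMat buf;

cv::gpu::createContinuous(100, 100, CV_32FC1, buf); // one contiguous allocation
CV_Assert(buf.isContinuous());

cv::gpu::ensureSizeIsEnough(50, 50, CV_32FC1, buf); // reuses the 100x100 block;
CV_Assert(buf.rows == 50 && buf.cols == 50);        // buf is now a 50x50 ROI header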
......@@ -3,7 +3,8 @@ set(name "gpu")
set(the_target "opencv_${name}")
project(${the_target})
set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann" "opencv_calib3d") #"opencv_features2d" "opencv_flann" "opencv_objdetect" - only headers needed
set(DEPS "opencv_core" "opencv_imgproc" "opencv_calib3d" "opencv_objdetect")
set(DEPS_HEADER ${DEPS} "opencv_features2d" "opencv_flann")
set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
......@@ -27,6 +28,13 @@ file(GLOB lib_device_hdrs_detail "src/opencv2/gpu/device/detail/*.h*")
source_group("Device" FILES ${lib_device_hdrs})
source_group("Device\\Detail" FILES ${lib_device_hdrs_detail})
foreach(d ${DEPS_HEADER})
if(${d} MATCHES "opencv_")
string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
include_directories("${d_dir}/include")
endif()
endforeach()
if (HAVE_CUDA)
file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")
file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")
......@@ -51,7 +59,6 @@ if (HAVE_CUDA)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fno-finite-math-only;")
endif()
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
......@@ -60,7 +67,7 @@ if (HAVE_CUDA)
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408 /wd4251")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
......@@ -74,17 +81,14 @@ if (HAVE_CUDA)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-DCVAPI_EXPORTS")
endif()
if(MSVC)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/wd4251")
endif()
CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda})
#CUDA_BUILD_CLEAN_TARGET()
endif()
foreach(d ${DEPS})
if(${d} MATCHES "opencv_")
string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
include_directories("${d_dir}/include")
endif()
endforeach()
add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${lib_device_hdrs_detail} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
# For dynamic link numbering conventions
......
......@@ -40,122 +40,4 @@
//
//M*/
#ifndef __OPENCV_GPU_DevMem2D_HPP__
#define __OPENCV_GPU_DevMem2D_HPP__
namespace cv
{
namespace gpu
{
// Simple lightweight structures that encapsulate information about an image on the device.
// They are intended to be passed to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile
#if defined(__CUDACC__)
#define __CV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
#else
#define __CV_GPU_HOST_DEVICE__
#endif
template <bool expr> struct StaticAssert;
template <> struct StaticAssert<true> {static __CV_GPU_HOST_DEVICE__ void check(){}};
template<typename T> struct DevPtr
{
typedef T elem_type;
typedef int index_type;
enum { elem_size = sizeof(elem_type) };
T* data;
__CV_GPU_HOST_DEVICE__ DevPtr() : data(0) {}
__CV_GPU_HOST_DEVICE__ DevPtr(T* data_) : data(data_) {}
__CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
__CV_GPU_HOST_DEVICE__ operator T*() { return data; }
__CV_GPU_HOST_DEVICE__ operator const T*() const { return data; }
};
template<typename T> struct PtrSz : public DevPtr<T>
{
__CV_GPU_HOST_DEVICE__ PtrSz() : size(0) {}
__CV_GPU_HOST_DEVICE__ PtrSz(T* data_, size_t size_) : DevPtr<T>(data_), size(size_) {}
size_t size;
};
template<typename T> struct PtrStep : public DevPtr<T>
{
__CV_GPU_HOST_DEVICE__ PtrStep() : step(0) {}
__CV_GPU_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}
/** \brief stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!! */
size_t step;
__CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return ( T*)( ( char*)DevPtr<T>::data + y * step); }
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)DevPtr<T>::data + y * step); }
__CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
__CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
};
template <typename T> struct PtrStepSz : public PtrStep<T>
{
__CV_GPU_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}
__CV_GPU_HOST_DEVICE__ PtrStepSz(int rows_, int cols_, T* data_, size_t step_)
: PtrStep<T>(data_, step_), cols(cols_), rows(rows_) {}
int cols;
int rows;
};
template <typename T> struct DevMem2D_ : public PtrStepSz<T>
{
DevMem2D_() {}
DevMem2D_(int rows_, int cols_, T *data_, size_t step_) : PtrStepSz<T>(rows_, cols_, data_, step_) {}
template <typename U>
explicit DevMem2D_(const DevMem2D_<U>& d) : PtrStepSz<T>(d.rows, d.cols, (T*)d.data, d.step) {}
};
template<typename T> struct PtrElemStep_ : public PtrStep<T>
{
PtrElemStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step)
{
StaticAssert<256 % sizeof(T) == 0>::check();
PtrStep<T>::step /= PtrStep<T>::elem_size;
}
__CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return PtrStep<T>::data + y * PtrStep<T>::step; }
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return PtrStep<T>::data + y * PtrStep<T>::step; }
__CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
__CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
};
template<typename T> struct PtrStep_ : public PtrStep<T>
{
PtrStep_() {}
PtrStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step) {}
};
#undef __CV_GPU_HOST_DEVICE__
typedef DevMem2D_<unsigned char> DevMem2Db;
typedef DevMem2Db DevMem2D;
typedef DevMem2D_<float> DevMem2Df;
typedef DevMem2D_<int> DevMem2Di;
typedef PtrStep<unsigned char> PtrStepb;
typedef PtrStep<float> PtrStepf;
typedef PtrStep<int> PtrStepi;
typedef PtrElemStep_<unsigned char> PtrElemStep;
typedef PtrElemStep_<float> PtrElemStepf;
typedef PtrElemStep_<int> PtrElemStepi;
}
}
#endif /* __OPENCV_GPU_DevMem2D_HPP__ */
#include "opencv2/core/devmem2d.hpp"
......@@ -40,427 +40,4 @@
//
//M*/
#ifndef __OPENCV_GPUMAT_HPP__
#define __OPENCV_GPUMAT_HPP__
#include "opencv2/core/core.hpp"
#include "opencv2/gpu/devmem2d.hpp"
namespace cv { namespace gpu
{
//! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat.
class CV_EXPORTS GpuMat
{
public:
//! returns lightweight DevMem2D_ structure for passing to nvcc-compiled code.
// Contains just image size, data ptr and step.
template <class T> operator DevMem2D_<T>() const;
template <class T> operator PtrStep_<T>() const;
template <class T> operator PtrStep<T>() const;
//! builds GpuMat from Mat. Performs a blocking upload to the device.
explicit GpuMat(const Mat& m);
//! performs a blocking upload of data to GpuMat.
void upload(const Mat& m);
//! downloads data from device to host memory. Blocking call.
void download(Mat& m) const;
operator Mat() const
{
Mat m;
download(m);
return m;
}
//! default constructor
GpuMat();
//! constructs GpuMatrix of the specified size and type (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)
GpuMat(int rows, int cols, int type);
GpuMat(Size size, int type);
//! constructs GpuMatrix and fills it with the specified value _s.
GpuMat(int rows, int cols, int type, const Scalar& s);
GpuMat(Size size, int type, const Scalar& s);
//! copy constructor
GpuMat(const GpuMat& m);
//! constructor for GpuMatrix headers pointing to user-allocated data
GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP);
GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP);
//! creates a matrix header for a part of the bigger matrix
GpuMat(const GpuMat& m, const Range& rowRange, const Range& colRange);
GpuMat(const GpuMat& m, const Rect& roi);
//! destructor - calls release()
~GpuMat();
//! assignment operators
GpuMat& operator = (const GpuMat& m);
//! returns a new GpuMatrix header for the specified row
GpuMat row(int y) const;
//! returns a new GpuMatrix header for the specified column
GpuMat col(int x) const;
//! ... for the specified row span
GpuMat rowRange(int startrow, int endrow) const;
GpuMat rowRange(const Range& r) const;
//! ... for the specified column span
GpuMat colRange(int startcol, int endcol) const;
GpuMat colRange(const Range& r) const;
//! returns deep copy of the GpuMatrix, i.e. the data is copied
GpuMat clone() const;
//! copies the GpuMatrix content to "m".
// It calls m.create(this->size(), this->type()).
void copyTo(GpuMat& m) const;
//! copies those GpuMatrix elements to "m" that are marked with non-zero mask elements.
void copyTo(GpuMat& m, const GpuMat& mask) const;
//! converts GpuMatrix to another datatype with optional scaling. See cvConvertScale.
void convertTo(GpuMat& m, int rtype, double alpha = 1, double beta = 0) const;
void assignTo(GpuMat& m, int type=-1) const;
//! sets every GpuMatrix element to s
GpuMat& operator = (const Scalar& s);
//! sets some of the GpuMatrix elements to s, according to the mask
GpuMat& setTo(const Scalar& s, const GpuMat& mask = GpuMat());
//! creates alternative GpuMatrix header for the same data, with different
// number of channels and/or different number of rows. see cvReshape.
GpuMat reshape(int cn, int rows = 0) const;
//! allocates new GpuMatrix data unless the GpuMatrix already has specified size and type.
// previous data is unreferenced if needed.
void create(int rows, int cols, int type);
void create(Size size, int type);
//! decreases reference counter;
// deallocate the data when reference counter reaches 0.
void release();
//! swaps with other smart pointer
void swap(GpuMat& mat);
//! locates GpuMatrix header within a parent GpuMatrix. See below
void locateROI(Size& wholeSize, Point& ofs) const;
//! moves/resizes the current GpuMatrix ROI inside the parent GpuMatrix.
GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright);
//! extracts a rectangular sub-GpuMatrix
// (this is a generalized form of row, rowRange etc.)
GpuMat operator()(Range rowRange, Range colRange) const;
GpuMat operator()(const Rect& roi) const;
//! returns true iff the GpuMatrix data is continuous
// (i.e. when there are no gaps between successive rows).
// similar to CV_IS_GpuMat_CONT(cvGpuMat->type)
bool isContinuous() const;
//! returns element size in bytes,
// similar to CV_ELEM_SIZE(cvMat->type)
size_t elemSize() const;
//! returns the size of element channel in bytes.
size_t elemSize1() const;
//! returns element type, similar to CV_MAT_TYPE(cvMat->type)
int type() const;
//! returns element type, similar to CV_MAT_DEPTH(cvMat->type)
int depth() const;
//! returns element type, similar to CV_MAT_CN(cvMat->type)
int channels() const;
//! returns step/elemSize1()
size_t step1() const;
//! returns GpuMatrix size:
// width == number of columns, height == number of rows
Size size() const;
//! returns true if GpuMatrix data is NULL
bool empty() const;
//! returns pointer to y-th row
uchar* ptr(int y = 0);
const uchar* ptr(int y = 0) const;
//! template version of the above method
template<typename _Tp> _Tp* ptr(int y = 0);
template<typename _Tp> const _Tp* ptr(int y = 0) const;
/*! includes several bit-fields:
- the magic signature
- continuity flag
- depth
- number of channels
*/
int flags;
//! the number of rows and columns
int rows, cols;
//! a distance between successive rows in bytes; includes the gap if any
size_t step;
//! pointer to the data
uchar* data;
//! pointer to the reference counter;
// when GpuMatrix points to user-allocated data, the pointer is NULL
int* refcount;
//! helper fields used in locateROI and adjustROI
uchar* datastart;
uchar* dataend;
};
//! Creates continuous GPU matrix
CV_EXPORTS void createContinuous(int rows, int cols, int type, GpuMat& m);
CV_EXPORTS GpuMat createContinuous(int rows, int cols, int type);
CV_EXPORTS void createContinuous(Size size, int type, GpuMat& m);
CV_EXPORTS GpuMat createContinuous(Size size, int type);
//! Ensures that the size of the given matrix is not less than (rows, cols)
//! and that its type matches the specified one
CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m);
CV_EXPORTS void ensureSizeIsEnough(Size size, int type, GpuMat& m);
////////////////////////////////////////////////////////////////////////
template <class T> inline GpuMat::operator DevMem2D_<T>() const { return DevMem2D_<T>(rows, cols, (T*)data, step); }
template <class T> inline GpuMat::operator PtrStep_<T>() const { return PtrStep_<T>(static_cast< DevMem2D_<T> >(*this)); }
template <class T> inline GpuMat::operator PtrStep<T>() const { return PtrStep<T>((T*)data, step); }
inline GpuMat::GpuMat()
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
}
inline GpuMat::GpuMat(int rows_, int cols_, int type_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
if (rows_ > 0 && cols_ > 0)
create(rows_, cols_, type_);
}
inline GpuMat::GpuMat(Size size_, int type_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
if (size_.height > 0 && size_.width > 0)
create(size_.height, size_.width, type_);
}
inline GpuMat::GpuMat(int rows_, int cols_, int type_, const Scalar& s_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
if (rows_ > 0 && cols_ > 0)
{
create(rows_, cols_, type_);
setTo(s_);
}
}
inline GpuMat::GpuMat(Size size_, int type_, const Scalar& s_)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
if (size_.height > 0 && size_.width > 0)
{
create(size_.height, size_.width, type_);
setTo(s_);
}
}
inline GpuMat::~GpuMat()
{
release();
}
inline GpuMat GpuMat::clone() const
{
GpuMat m;
copyTo(m);
return m;
}
inline void GpuMat::assignTo(GpuMat& m, int type) const
{
if (type < 0)
m = *this;
else
convertTo(m, type);
}
inline size_t GpuMat::step1() const
{
return step / elemSize1();
}
inline bool GpuMat::empty() const
{
return data == 0;
}
template<typename _Tp> inline _Tp* GpuMat::ptr(int y)
{
return (_Tp*)ptr(y);
}
template<typename _Tp> inline const _Tp* GpuMat::ptr(int y) const
{
return (const _Tp*)ptr(y);
}
inline void swap(GpuMat& a, GpuMat& b)
{
a.swap(b);
}
inline GpuMat GpuMat::row(int y) const
{
return GpuMat(*this, Range(y, y+1), Range::all());
}
inline GpuMat GpuMat::col(int x) const
{
return GpuMat(*this, Range::all(), Range(x, x+1));
}
inline GpuMat GpuMat::rowRange(int startrow, int endrow) const
{
return GpuMat(*this, Range(startrow, endrow), Range::all());
}
inline GpuMat GpuMat::rowRange(const Range& r) const
{
return GpuMat(*this, r, Range::all());
}
inline GpuMat GpuMat::colRange(int startcol, int endcol) const
{
return GpuMat(*this, Range::all(), Range(startcol, endcol));
}
inline GpuMat GpuMat::colRange(const Range& r) const
{
return GpuMat(*this, Range::all(), r);
}
inline void GpuMat::create(Size size_, int type_)
{
create(size_.height, size_.width, type_);
}
inline GpuMat GpuMat::operator()(Range rowRange, Range colRange) const
{
return GpuMat(*this, rowRange, colRange);
}
inline GpuMat GpuMat::operator()(const Rect& roi) const
{
return GpuMat(*this, roi);
}
inline bool GpuMat::isContinuous() const
{
return (flags & Mat::CONTINUOUS_FLAG) != 0;
}
inline size_t GpuMat::elemSize() const
{
return CV_ELEM_SIZE(flags);
}
inline size_t GpuMat::elemSize1() const
{
return CV_ELEM_SIZE1(flags);
}
inline int GpuMat::type() const
{
return CV_MAT_TYPE(flags);
}
inline int GpuMat::depth() const
{
return CV_MAT_DEPTH(flags);
}
inline int GpuMat::channels() const
{
return CV_MAT_CN(flags);
}
inline Size GpuMat::size() const
{
return Size(cols, rows);
}
inline unsigned char* GpuMat::ptr(int y)
{
CV_DbgAssert((unsigned)y < (unsigned)rows);
return data + step * y;
}
inline const unsigned char* GpuMat::ptr(int y) const
{
CV_DbgAssert((unsigned)y < (unsigned)rows);
return data + step * y;
}
inline GpuMat& GpuMat::operator = (const Scalar& s)
{
setTo(s);
return *this;
}
inline GpuMat createContinuous(int rows, int cols, int type)
{
GpuMat m;
createContinuous(rows, cols, type, m);
return m;
}
inline void createContinuous(Size size, int type, GpuMat& m)
{
createContinuous(size.height, size.width, type, m);
}
inline GpuMat createContinuous(Size size, int type)
{
GpuMat m;
createContinuous(size, type, m);
return m;
}
inline void ensureSizeIsEnough(Size size, int type, GpuMat& m)
{
ensureSizeIsEnough(size.height, size.width, type, m);
}
inline void createContinuous(int rows, int cols, int type, GpuMat& m)
{
int area = rows * cols;
if (!m.isContinuous() || m.type() != type || m.size().area() != area)
m.create(1, area, type);
m = m.reshape(0, rows);
}
inline void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m)
{
if (m.type() == type && m.rows >= rows && m.cols >= cols)
m = m(Rect(0, 0, cols, rows));
else
m.create(rows, cols, type);
}
}}
#endif // __OPENCV_GPUMAT_HPP__
#include "opencv2/core/gpumat.hpp"
......@@ -24,7 +24,7 @@ PERF_TEST_P(DevInfo_Size_MatType, transpose, testing::Combine(testing::ValuesIn(
transpose(src, dst);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
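This recurring perf-test change follows directly from the header refactoring: the implicit GpuMat-to-Mat conversion operator was removed, and downloads now go through the explicit Mat(const gpu::GpuMat&) constructor, so copy-initialization has to become direct initialization:

// Before: relied on the implicit conversion operator, now removed.
// Mat dst_host = dst;   // fails to compile against the new headers
// After: invoke the explicit download constructor directly.
Mat dst_host(dst);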
......@@ -55,7 +55,7 @@ PERF_TEST_P(DevInfo_Size_MatType_FlipCode, flip, testing::Combine(testing::Value
flip(src, dst, flipCode);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -85,7 +85,7 @@ PERF_TEST_P(DevInfo_Size_MatType, LUT, testing::Combine(testing::ValuesIn(device
LUT(src, lut, dst);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -115,8 +115,8 @@ PERF_TEST_P(DevInfo_Size, cartToPolar, testing::Combine(testing::ValuesIn(device
cartToPolar(x, y, magnitude, angle);
}
Mat magnitude_host = magnitude;
Mat angle_host = angle;
Mat magnitude_host(magnitude);
Mat angle_host(angle);
SANITY_CHECK(magnitude_host);
SANITY_CHECK(angle_host);
......@@ -147,8 +147,8 @@ PERF_TEST_P(DevInfo_Size, polarToCart, testing::Combine(testing::ValuesIn(device
polarToCart(magnitude, angle, x, y);
}
Mat x_host = x;
Mat y_host = angle;
Mat x_host(x);
Mat y_host(y);
SANITY_CHECK(x_host);
SANITY_CHECK(y_host);
......@@ -180,7 +180,7 @@ PERF_TEST_P(DevInfo_Size_MatType, addMat, testing::Combine(testing::ValuesIn(dev
add(a, b, c);
}
Mat c_host = c;
Mat c_host(c);
SANITY_CHECK(c_host);
}
......@@ -210,7 +210,7 @@ PERF_TEST_P(DevInfo_Size_MatType, addScalar, testing::Combine(testing::ValuesIn(
add(a, b, c);
}
Mat c_host = c;
Mat c_host(c);
SANITY_CHECK(c_host);
}
......@@ -241,7 +241,7 @@ PERF_TEST_P(DevInfo_Size_MatType, subtractMat, testing::Combine(testing::ValuesI
subtract(a, b, c);
}
Mat c_host = c;
Mat c_host(c);
SANITY_CHECK(c_host);
}
......@@ -270,7 +270,7 @@ PERF_TEST_P(DevInfo_Size, multiplyMat, testing::Combine(testing::ValuesIn(device
multiply(a, b, c);
}
Mat c_host = c;
Mat c_host(c);
SANITY_CHECK(c_host);
}
......@@ -300,7 +300,7 @@ PERF_TEST_P(DevInfo_Size_MatType, multiplyScalar, testing::Combine(testing::Valu
multiply(a, b, c);
}
Mat c_host = c;
Mat c_host(c);
SANITY_CHECK(c_host);
}
......@@ -327,7 +327,7 @@ PERF_TEST_P(DevInfo_Size, exp, testing::Combine(testing::ValuesIn(devices()),
exp(a, b);
}
Mat b_host = b;
Mat b_host(b);
SANITY_CHECK(b_host);
}
......@@ -356,7 +356,7 @@ PERF_TEST_P(DevInfo_Size_MatType, pow, testing::Combine(testing::ValuesIn(device
pow(src, 2.0, dst);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -389,7 +389,7 @@ PERF_TEST_P(DevInfo_Size_MatType_CmpOp, compare, testing::Combine(testing::Value
compare(src1, src2, dst, cmpop);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -418,7 +418,7 @@ PERF_TEST_P(DevInfo_Size_MatType, bitwise_not, testing::Combine(testing::ValuesI
bitwise_not(src, dst);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -449,7 +449,7 @@ PERF_TEST_P(DevInfo_Size_MatType, bitwise_and, testing::Combine(testing::ValuesI
bitwise_and(src1, src2, dst);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -480,7 +480,7 @@ PERF_TEST_P(DevInfo_Size_MatType, min, testing::Combine(testing::ValuesIn(device
min(src1, src2, dst);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -712,7 +712,7 @@ PERF_TEST_P(DevInfo_Size_MatType, addWeighted, testing::Combine(testing::ValuesI
addWeighted(src1, 0.5, src2, 0.5, 0.0, dst);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -743,7 +743,7 @@ PERF_TEST_P(DevInfo_Size_MatType_FlipCode, reduce, testing::Combine(testing::Val
reduce(src, dst, dim, CV_REDUCE_MIN);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -774,7 +774,7 @@ PERF_TEST_P(DevInfo_Size, gemm, testing::Combine(testing::ValuesIn(devices()),
gemm(src1, src2, 1.0, src3, 1.0, dst);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -20,7 +20,7 @@ PERF_TEST_P(DevInfo, transformPoints, testing::ValuesIn(devices()))
transformPoints(src, Mat::ones(1, 3, CV_32FC1), Mat::ones(1, 3, CV_32FC1), dst);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -45,7 +45,7 @@ PERF_TEST_P(DevInfo, projectPoints, testing::ValuesIn(devices()))
projectPoints(src, Mat::ones(1, 3, CV_32FC1), Mat::ones(1, 3, CV_32FC1), Mat::ones(3, 3, CV_32FC1), Mat(), dst);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......
......@@ -28,7 +28,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, boxFilter, testing::Combine(testing
filter->apply(src, dst);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -63,7 +63,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MorphOp_KernelSize, morphologyFilter, testing::
filter->apply(src, dst);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -96,7 +96,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(test
filter->apply(src, dst);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -130,7 +130,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, separableLinearFilter, testing::Com
filter->apply(src, dst, Rect(0, 0, src.cols, src.rows));
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -36,7 +36,7 @@ PERF_TEST_P(DevInfo_Size_MatType_Interpolation_BorderMode, remap, testing::Combi
remap(src, dst, xmap, ymap, interpolation, borderMode);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -63,7 +63,7 @@ PERF_TEST_P(DevInfo, meanShiftFiltering, testing::ValuesIn(devices()))
meanShiftFiltering(src, dst, 50, 50);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -91,8 +91,8 @@ PERF_TEST_P(DevInfo, meanShiftProc, testing::ValuesIn(devices()))
meanShiftProc(src, dstr, dstsp, 50, 50);
}
Mat dstr_host = dstr;
Mat dstsp_host = dstsp;
Mat dstr_host(dstr);
Mat dstsp_host(dstsp);
SANITY_CHECK(dstr_host);
SANITY_CHECK(dstsp_host);
......
......@@ -25,7 +25,7 @@ PERF_TEST_P(DevInfo_Size_MatType, merge, testing::Combine(testing::ValuesIn(devi
merge(src, dst);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -82,7 +82,7 @@ PERF_TEST_P(DevInfo_Size_MatType, setTo, testing::Combine(testing::ValuesIn(devi
src.setTo(val);
}
Mat src_host = src;
Mat src_host(src);
SANITY_CHECK(src_host);
}
......@@ -115,7 +115,7 @@ PERF_TEST_P(DevInfo_Size_MatType, setToMasked, testing::Combine(testing::ValuesI
src.setTo(val, mask);
}
src_host = src;
src.download(src_host);
SANITY_CHECK(src_host);
}
......@@ -148,7 +148,7 @@ PERF_TEST_P(DevInfo_Size_MatType, copyToMasked, testing::Combine(testing::Values
src.copyTo(dst, mask);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -182,7 +182,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MatType, convertTo, testing::Combine(testing::V
src.convertTo(dst, type2, a, b);
}
Mat dst_host = dst;
Mat dst_host(dst);
SANITY_CHECK(dst_host);
}
......@@ -425,16 +425,22 @@ void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
////////////////////////////////////////////////////////////////////////
// Polar <-> Cart
namespace cv { namespace gpu { namespace mathfunc
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace mathfunc
{
void cartToPolar_gpu(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, bool magSqr, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream);
void polarToCart_gpu(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream);
}}}
void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
}
END_OPENCV_DEVICE_NAMESPACE
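The BEGIN/END_OPENCV_DEVICE_NAMESPACE macros and the OPENCV_DEVICE_NAMESPACE_ qualifier used throughout these hunks replace the hand-written cv::gpu nesting for device code. Their definitions are not shown in this diff; presumably they expand to roughly the following (a sketch, not copied from the commit):

#define BEGIN_OPENCV_DEVICE_NAMESPACE namespace cv { namespace gpu { namespace device {
#define END_OPENCV_DEVICE_NAMESPACE   }}}
#define OPENCV_DEVICE_NAMESPACE       ::cv::gpu::device
#define OPENCV_DEVICE_NAMESPACE_      ::cv::gpu::device::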
namespace
{
inline void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;
CV_DbgAssert(x.size() == y.size() && x.type() == y.type());
CV_Assert(x.depth() == CV_32F);
......@@ -448,11 +454,13 @@ namespace
GpuMat mag1cn = mag ? mag->reshape(1) : GpuMat();
GpuMat angle1cn = angle ? angle->reshape(1) : GpuMat();
mathfunc::cartToPolar_gpu(x1cn, y1cn, mag1cn, magSqr, angle1cn, angleInDegrees, stream);
cartToPolar_gpu(x1cn, y1cn, mag1cn, magSqr, angle1cn, angleInDegrees, stream);
}
inline void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;
CV_DbgAssert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());
CV_Assert(mag.depth() == CV_32F);
......@@ -464,34 +472,33 @@ namespace
GpuMat x1cn = x.reshape(1);
GpuMat y1cn = y.reshape(1);
mathfunc::polarToCart_gpu(mag1cn, angle1cn, x1cn, y1cn, angleInDegrees, stream);
polarToCart_gpu(mag1cn, angle1cn, x1cn, y1cn, angleInDegrees, stream);
}
}
void cv::gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
{
::cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream));
cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream));
}
void cv::gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
{
::cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream));
cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream));
}
void cv::gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees, Stream& stream)
{
::cartToPolar_caller(x, y, 0, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
cartToPolar_caller(x, y, 0, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
}
void cv::gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& mag, GpuMat& angle, bool angleInDegrees, Stream& stream)
{
::cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
}
void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, Stream& stream)
{
::polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream));
polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream));
}
#endif /* !defined (HAVE_CUDA) */
......@@ -55,13 +55,19 @@ void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&,
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace bf
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace bilateral_filter
{
void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc);
void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc);
void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
}
END_OPENCV_DEVICE_NAMESPACE
void bilateral_filter_gpu(const DevMem2Db& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream);
void bilateral_filter_gpu(const DevMem2D_<short>& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream);
}}}
using namespace OPENCV_DEVICE_NAMESPACE_ bilateral_filter;
namespace
{
......@@ -105,7 +111,7 @@ namespace
short edge_disc = max<short>(short(1), short(ndisp * edge_threshold + 0.5));
short max_disc = short(ndisp * max_disc_threshold + 0.5);
bf::load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);
load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);
if (&dst != &disp)
{
......@@ -115,7 +121,7 @@ namespace
disp.copyTo(dst);
}
bf::bilateral_filter_gpu((DevMem2D_<T>)dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
bilateral_filter_gpu((DevMem2D_<T>)dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
}
typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
......
......@@ -52,15 +52,19 @@ void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const Gpu
#else
namespace cv { namespace gpu
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace blend
{
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, const PtrStep<T>& img1, const PtrStep<T>& img2,
const PtrStepf& weights1, const PtrStepf& weights2, PtrStep<T> result, cudaStream_t stream);
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);
}
END_OPENCV_DEVICE_NAMESPACE
void blendLinearCaller8UC4(int rows, int cols, const PtrStepb& img1, const PtrStepb& img2,
const PtrStepf& weights1, const PtrStepf& weights2, PtrStepb result, cudaStream_t stream);
}}
using namespace OPENCV_DEVICE_NAMESPACE_ blend;
void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
GpuMat& result, Stream& stream)
......
......@@ -82,7 +82,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, vector< vec
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace bf_match
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace bf_match
{
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance,
......@@ -103,9 +105,9 @@ namespace cv { namespace gpu { namespace bf_match
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
int cc, cudaStream_t stream);
}}}
}
namespace cv { namespace gpu { namespace bf_knnmatch
namespace bf_knnmatch
{
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
......@@ -126,9 +128,9 @@ namespace cv { namespace gpu { namespace bf_knnmatch
template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
int cc, cudaStream_t stream);
}}}
}
namespace cv { namespace gpu { namespace bf_radius_match
namespace bf_radius_match
{
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
......@@ -151,15 +153,17 @@ namespace cv { namespace gpu { namespace bf_radius_match
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream);
}}}
cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_)
{
}
END_OPENCV_DEVICE_NAMESPACE
////////////////////////////////////////////////////////////////////
// Train collection
cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_)
{
}
void cv::gpu::BruteForceMatcher_GPU_base::add(const vector<GpuMat>& descCollection)
{
trainDescCollection.insert(trainDescCollection.end(), descCollection.begin(), descCollection.end());
......@@ -195,7 +199,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
if (query.empty() || train.empty())
return;
using namespace cv::gpu::bf_match;
using namespace OPENCV_DEVICE_NAMESPACE_ bf_match;
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance,
......@@ -242,8 +246,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
if (trainIdx.empty() || distance.empty())
return;
Mat trainIdxCPU = trainIdx;
Mat distanceCPU = distance;
Mat trainIdxCPU(trainIdx);
Mat distanceCPU(distance);
matchConvert(trainIdxCPU, distanceCPU, matches);
}
......@@ -337,7 +341,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
if (query.empty() || trainCollection.empty())
return;
using namespace cv::gpu::bf_match;
using namespace OPENCV_DEVICE_NAMESPACE_ bf_match;
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
......@@ -384,9 +388,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
if (trainIdx.empty() || imgIdx.empty() || distance.empty())
return;
Mat trainIdxCPU = trainIdx;
Mat imgIdxCPU = imgIdx;
Mat distanceCPU = distance;
Mat trainIdxCPU(trainIdx);
Mat imgIdxCPU(imgIdx);
Mat distanceCPU(distance);
matchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, matches);
}
......@@ -448,7 +452,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
if (query.empty() || train.empty())
return;
using namespace cv::gpu::bf_knnmatch;
using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch;
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,
......@@ -511,8 +515,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainId
if (trainIdx.empty() || distance.empty())
return;
Mat trainIdxCPU = trainIdx;
Mat distanceCPU = distance;
Mat trainIdxCPU(trainIdx);
Mat distanceCPU(distance);
knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult);
}
......@@ -577,7 +581,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
if (query.empty() || trainCollection.empty())
return;
using namespace cv::gpu::bf_knnmatch;
using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch;
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
......@@ -630,9 +634,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainI
if (trainIdx.empty() || imgIdx.empty() || distance.empty())
return;
Mat trainIdxCPU = trainIdx;
Mat imgIdxCPU = imgIdx;
Mat distanceCPU = distance;
Mat trainIdxCPU(trainIdx);
Mat imgIdxCPU(imgIdx);
Mat distanceCPU(distance);
knnMatch2Convert(trainIdxCPU, imgIdxCPU, distanceCPU, matches, compactResult);
}
......@@ -758,7 +762,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
if (query.empty() || train.empty())
return;
using namespace cv::gpu::bf_radius_match;
using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match;
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
......@@ -819,9 +823,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai
if (trainIdx.empty() || distance.empty() || nMatches.empty())
return;
Mat trainIdxCPU = trainIdx;
Mat distanceCPU = distance;
Mat nMatchesCPU = nMatches;
Mat trainIdxCPU(trainIdx);
Mat distanceCPU(distance);
Mat nMatchesCPU(nMatches);
radiusMatchConvert(trainIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult);
}
......@@ -889,7 +893,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
if (query.empty() || empty())
return;
using namespace cv::gpu::bf_radius_match;
using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match;
typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
......@@ -953,10 +957,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai
if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty())
return;
Mat trainIdxCPU = trainIdx;
Mat imgIdxCPU = imgIdx;
Mat distanceCPU = distance;
Mat nMatchesCPU = nMatches;
Mat trainIdxCPU(trainIdx);
Mat imgIdxCPU(imgIdx);
Mat distanceCPU(distance);
Mat nMatchesCPU(nMatches);
radiusMatchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult);
}
......
......@@ -42,6 +42,10 @@
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
using namespace std;
#if !defined(HAVE_CUDA)
void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }
......@@ -52,13 +56,31 @@ void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat
#else
using namespace cv;
using namespace cv::gpu;
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace cv { namespace gpu { namespace transform_points
namespace transform_points
{
void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);
}}}
}
namespace project_points
{
void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
}
namespace solve_pnp_ransac
{
int maxNumIters();
void computeHypothesisScores(
const int num_hypotheses, const int num_points, const float* rot_matrices,
const float3* transl_vectors, const float3* object, const float2* image,
const float dist_threshold, int* hypothesis_scores);
}
END_OPENCV_DEVICE_NAMESPACE
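computeHypothesisScores scores each candidate (rotation, translation) pair by the number of points it explains within dist_threshold; the host side then keeps the best-scoring hypothesis. A hedged sketch of that selection step (buffer names and the download step are illustrative, not the actual implementation):

// Illustration only: pick the hypothesis with the highest inlier count.
// d_scores is assumed to be a device buffer of num_hypotheses ints filled by the call above.
std::vector<int> h_scores(num_hypotheses);
cudaMemcpy(&h_scores[0], d_scores, num_hypotheses * sizeof(int), cudaMemcpyDeviceToHost);
int best_idx = (int)(std::max_element(h_scores.begin(), h_scores.end()) - h_scores.begin());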
using namespace OPENCV_DEVICE_NAMESPACE;
namespace
{
......@@ -79,15 +101,9 @@ namespace
void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, Stream& stream)
{
::transformPointsCaller(src, rvec, tvec, dst, StreamAccessor::getStream(stream));
transformPointsCaller(src, rvec, tvec, dst, StreamAccessor::getStream(stream));
}
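transformPoints applies the rigid motion given by a Rodrigues rotation vector and a translation vector to a row of 3-D points on the GPU. A minimal usage sketch (sizes and values are illustrative):

// Rotate-and-translate 1000 points on the device.
cv::Mat rvec = cv::Mat::zeros(1, 3, CV_32F);                 // no rotation
cv::Mat tvec = (cv::Mat_<float>(1, 3) << 1.f, 2.f, 3.f);     // pure translation
cv::gpu::GpuMat src(1, 1000, CV_32FC3), dst;                 // points stored as float3
cv::gpu::Stream stream;
cv::gpu::transformPoints(src, rvec, tvec, dst, stream);      // dst = R*src + t, asynchronously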
namespace cv { namespace gpu { namespace project_points
{
void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
}}}
namespace
{
void projectPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, cudaStream_t stream)
......@@ -109,20 +125,9 @@ namespace
void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, Stream& stream)
{
::projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, StreamAccessor::getStream(stream));
projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, StreamAccessor::getStream(stream));
}
namespace cv { namespace gpu { namespace solve_pnp_ransac
{
int maxNumIters();
void computeHypothesisScores(
const int num_hypotheses, const int num_points, const float* rot_matrices,
const float3* transl_vectors, const float3* object, const float2* image,
const float dist_threshold, int* hypothesis_scores);
}}}
namespace
{
// Selects subset_size random different points from [0, num_points - 1] range
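One standard way to implement the helper this comment describes is rejection sampling; the sketch below is only an illustration, since the helper's actual name and body are collapsed out of this hunk:

// Illustration: draw subset_size distinct indices from [0, num_points - 1].
void selectRandomSubset(int num_points, int subset_size, std::vector<int>& subset)
{
    subset.clear();
    while ((int)subset.size() < subset_size)
    {
        int idx = rand() % num_points;
        if (std::find(subset.begin(), subset.end(), idx) == subset.end())
            subset.push_back(idx);   // keep only indices not drawn before
    }
}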
......
......@@ -46,7 +46,6 @@ using namespace cv;
using namespace cv::gpu;
using namespace std;
#if !defined (HAVE_CUDA)
cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU() { throw_nogpu(); }
......
......@@ -51,155 +51,158 @@ void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace device
{
#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
BEGIN_OPENCV_DEVICE_NAMESPACE
#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \
#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \
#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f)
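Each *_ALL invocation in the long lists that follow stamps out one forward declaration per supported depth. For example, OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb) expands to:

void bgr_to_rgb_8u (const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
void bgr_to_rgb_16u(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
void bgr_to_rgb_32f(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);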
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra)
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL
#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F
}}}
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra)
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL
#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F
END_OPENCV_DEVICE_NAMESPACE
using namespace OPENCV_DEVICE_NAMESPACE;
namespace
{
......
(2 file diffs collapsed and not shown.)
......@@ -43,65 +43,58 @@
#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace bf_krnls
{
__constant__ float* ctable_color;
__constant__ float* ctable_space;
__constant__ size_t ctable_space_step;
namespace bilateral_filter {
__constant__ int cndisp;
__constant__ int cradius;
__constant__ float* ctable_color;
__constant__ float* ctable_space;
__constant__ size_t ctable_space_step;
__constant__ short cedge_disc;
__constant__ short cmax_disc;
}
__constant__ int cndisp;
__constant__ int cradius;
namespace cv { namespace gpu { namespace bf
__constant__ short cedge_disc;
__constant__ short cmax_disc;
void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc)
{
void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc)
{
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) );
cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );
cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );
size_t table_space_step = table_space.step / sizeof(float);
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) );
cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) );
}
}}}
cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) );
}
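Note that each cudaMemcpyToSymbol call above copies its argument by value into constant memory: for ctable_color and ctable_space the copied value is the pointer itself (sizeof(table_color) is the size of a float*), so the lookup tables stay in global memory and the kernels dereference them through the constant-memory pointers.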
namespace bf_krnls
template <int channels>
struct DistRgbMax
{
template <int channels>
struct DistRgbMax
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
uchar x = abs(a[0] - b[0]);
uchar y = abs(a[1] - b[1]);
uchar z = abs(a[2] - b[2]);
return (max(max(x, y), z));
uchar x = ::abs(a[0] - b[0]);
uchar y = ::abs(a[1] - b[1]);
uchar z = ::abs(a[2] - b[2]);
return (::max(::max(x, y), z));
}
};
};
template <>
struct DistRgbMax<1>
{
template <>
struct DistRgbMax<1>
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
return abs(a[0] - b[0]);
return ::abs(a[0] - b[0]);
}
};
};
template <int channels, typename T>
__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
{
template <int channels, typename T>
__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
......@@ -115,12 +108,12 @@ namespace bf_krnls
dp[3] = *(disp + (y+1) * disp_step + x + 0);
dp[4] = *(disp + (y ) * disp_step + x + 1);
if(abs(dp[1] - dp[0]) >= cedge_disc || abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc)
if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc)
{
const int ymin = max(0, y - cradius);
const int xmin = max(0, x - cradius);
const int ymax = min(h - 1, y + cradius);
const int xmax = min(w - 1, x + cradius);
const int ymin = ::max(0, y - cradius);
const int xmin = ::max(0, x - cradius);
const int ymax = ::min(h - 1, y + cradius);
const int xmax = ::min(w - 1, x + cradius);
float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
......@@ -136,15 +129,15 @@ namespace bf_krnls
uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
const float weight = ctable_color[dist_rgb] * (ctable_space + abs(y-yi)* ctable_space_step)[abs(x-xi)];
const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];
const T disp_reg = disp_y[xi];
cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight;
cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight;
cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight;
cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight;
cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight;
cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight;
cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight;
cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight;
cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight;
cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight;
}
}
......@@ -180,14 +173,11 @@ namespace bf_krnls
*(disp + y * disp_step + x) = dp[id];
}
}
}
}
namespace cv { namespace gpu { namespace bf
template <typename T>
void bilateral_filter_caller(DevMem2D_<T> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{
template <typename T>
void bilateral_filter_caller(const DevMem2D_<T>& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(disp.cols, threads.x << 1);
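grid.x covers only half of the columns because each launch updates one color of a checkerboard: the `(y + t) & 1` offset in the kernel above shifts alternate rows, and the paired t = 0 / t = 1 launches per iteration below together touch every pixel. divUp is the usual round-up integer division, presumably defined in internal_shared.hpp along these lines:

// Assumed helper, not shown in this diff: ceil(total / grain) for positive ints.
static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }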
......@@ -198,18 +188,20 @@ namespace cv { namespace gpu { namespace bf
case 1:
for (int i = 0; i < iters; ++i)
{
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
case 3:
for (int i = 0; i < iters; ++i)
{
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
......@@ -219,15 +211,18 @@ namespace cv { namespace gpu { namespace bf
if (stream != 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
void bilateral_filter_gpu(const DevMem2Db& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream)
{
void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter_caller(disp, img, channels, iters, stream);
}
}
void bilateral_filter_gpu(const DevMem2D_<short>& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream)
{
void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter_caller(disp, img, channels, iters, stream);
}
}}}
}
} // namespace bilateral_filter
END_OPENCV_DEVICE_NAMESPACE
(35 file diffs collapsed and not shown.)