Merge pull request #974 from jet47:gpu-core-refactoring

81c6b46f · Roman Donchenko · OpenCV Buildbot · e16af9bd · 4f0d72bf · 81c6b46f
93 changed files
--- a/doc/check_docs2.py
+++ b/doc/check_docs2.py
@@ -201,9 +201,9 @@ def process_module(module, path):
            hdrlist.append(os.path.join(root, filename))

    if module == "gpu":
-        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "cuda_devptrs.hpp"))
-        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "gpumat.hpp"))
-        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "stream_accessor.hpp"))
+        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "gpu_types.hpp"))
+        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "gpu.hpp"))
+        hdrlist.append(os.path.join(path, "..", "core", "include", "opencv2", "core", "gpu_stream_accessor.hpp"))

    decls = []
    for hname in hdrlist:

--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -493,6 +493,9 @@ namespace ogl
 namespace gpu
 {
    class CV_EXPORTS GpuMat;
+    class CV_EXPORTS CudaMem;
+    class CV_EXPORTS Stream;
+    class CV_EXPORTS Event;
 }

 } // cv

--- a/modules/core/include/opencv2/core/cuda/common.hpp
+++ b/modules/core/include/opencv2/core/cuda/common.hpp
@@ -44,7 +44,7 @@
 #define __OPENCV_GPU_COMMON_HPP__

 #include <cuda_runtime.h>
-#include "opencv2/core/cuda_devptrs.hpp"
+#include "opencv2/core/gpu_types.hpp"
 #include "opencv2/core/cvdef.h"
 #include "opencv2/core/base.hpp"


--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
--- a/modules/core/include/opencv2/core/gpu.inl.hpp
+++ b/modules/core/include/opencv2/core/gpu.inl.hpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_CORE_GPUINL_HPP__
+#define __OPENCV_CORE_GPUINL_HPP__
+
+#include "opencv2/core/gpu.hpp"
+
+namespace cv { namespace gpu {
+
+//////////////////////////////// GpuMat ///////////////////////////////
+
+inline
+GpuMat::GpuMat()  // default constructor: every header field starts at zero, so data == 0 and empty() is true
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
+{}
+
+inline
+GpuMat::GpuMat(int rows_, int cols_, int type_)  // sized constructor: zeroed header first, then create() does the setup
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
+{
+    if (rows_ > 0 && cols_ > 0)  // a non-positive dimension skips create() and leaves the zeroed header as is
+        create(rows_, cols_, type_);
+}
+
+inline
+GpuMat::GpuMat(Size size_, int type_)  // Size-based variant of the (rows, cols, type) constructor
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
+{
+    if (size_.height > 0 && size_.width > 0)  // a non-positive dimension leaves the zeroed header as is
+        create(size_.height, size_.width, type_);  // height is passed as rows, width as cols
+}
+
+inline
+GpuMat::GpuMat(int rows_, int cols_, int type_, Scalar s_)  // sized constructor that also fills the matrix with s_
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
+{
+    if (rows_ > 0 && cols_ > 0)  // neither create() nor setTo() runs for a non-positive dimension
+    {
+        create(rows_, cols_, type_);
+        setTo(s_);  // single-argument setTo overload, called only after create()
+    }
+}
+
+inline
+GpuMat::GpuMat(Size size_, int type_, Scalar s_)  // Size-based variant of the (rows, cols, type, Scalar) constructor
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
+{
+    if (size_.height > 0 && size_.width > 0)  // neither create() nor setTo() runs for a non-positive dimension
+    {
+        create(size_.height, size_.width, type_);  // height is passed as rows, width as cols
+        setTo(s_);
+    }
+}
+
+inline
+GpuMat::GpuMat(const GpuMat& m)  // copy constructor: copies m's header fields, including the data and refcount pointers
+    : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend)
+{
+    if (refcount)  // a null refcount pointer is left untouched
+        CV_XADD(refcount, 1);  // otherwise the counter shared with m is incremented by one
+}
+
+inline
+GpuMat::GpuMat(InputArray arr) :  // builds a GpuMat from an InputArray by delegating to upload()
+    flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
+{
+    upload(arr);  // invoked on the zero-initialized header
+}
+
+inline
+GpuMat::~GpuMat()  // destructor: all cleanup is delegated to release()
+{
+    release();
+}
+
+inline
+GpuMat& GpuMat::operator =(const GpuMat& m)  // copy assignment implemented as copy-construct + swap
+{
+    if (this != &m)  // self-assignment does nothing
+    {
+        GpuMat temp(m);  // the copy constructor bumps m's refcount when it is non-null
+        swap(temp);  // temp now carries this object's previous fields; its destructor calls release() on them
+    }
+
+    return *this;
+}
+
+inline
+void GpuMat::create(Size size_, int type_)  // Size overload: forwards to create(rows, cols, type)
+{
+    create(size_.height, size_.width, type_);  // height is passed as rows, width as cols
+}
+
+inline
+void GpuMat::swap(GpuMat& b)  // exchanges all eight header fields with b, one std::swap per member
+{
+    std::swap(flags, b.flags);
+    std::swap(rows, b.rows);
+    std::swap(cols, b.cols);
+    std::swap(step, b.step);
+    std::swap(data, b.data);
+    std::swap(datastart, b.datastart);
+    std::swap(dataend, b.dataend);
+    std::swap(refcount, b.refcount);  // the refcount pointer is exchanged; the counter value is not modified
+}
+
+inline
+GpuMat GpuMat::clone() const  // returns a new GpuMat filled by copyTo()
+{
+    GpuMat m;  // starts as an empty, zeroed header
+    copyTo(m);
+    return m;
+}
+
+inline
+void GpuMat::copyTo(OutputArray dst, InputArray mask) const  // masked copy without an explicit stream argument
+{
+    copyTo(dst, mask, Stream::Null());  // forwards to the stream overload with the stream returned by Stream::Null()
+}
+
+inline
+GpuMat& GpuMat::setTo(Scalar s)  // stream-less overload: forwards to setTo(s, stream)
+{
+    return setTo(s, Stream::Null());  // uses the stream returned by Stream::Null()
+}
+
+inline
+GpuMat& GpuMat::setTo(Scalar s, InputArray mask)  // stream-less masked overload: forwards to setTo(s, mask, stream)
+{
+    return setTo(s, mask, Stream::Null());  // uses the stream returned by Stream::Null()
+}
+
+inline
+void GpuMat::convertTo(OutputArray dst, int rtype) const  // stream-less overload: forwards to convertTo(dst, rtype, stream)
+{
+    convertTo(dst, rtype, Stream::Null());  // uses the stream returned by Stream::Null()
+}
+
+inline
+void GpuMat::convertTo(OutputArray dst, int rtype, double alpha, double beta) const  // stream-less alpha/beta overload
+{
+    convertTo(dst, rtype, alpha, beta, Stream::Null());  // forwards to the five-argument overload
+}
+
+inline
+void GpuMat::convertTo(OutputArray dst, int rtype, double alpha, Stream& stream) const  // alpha-only overload with an explicit stream
+{
+    convertTo(dst, rtype, alpha, 0.0, stream);  // beta is fixed to 0.0
+}
+
+inline
+void GpuMat::assignTo(GpuMat& m, int _type) const  // assigns or converts this matrix into m depending on _type
+{
+    if (_type < 0)  // negative _type: plain assignment through operator=
+        m = *this;
+    else  // otherwise _type is passed to convertTo as the target type
+        convertTo(m, _type);
+}
+
+inline
+uchar* GpuMat::ptr(int y)  // pointer to the start of row y
+{
+    CV_DbgAssert( (unsigned)y < (unsigned)rows );  // the unsigned casts make a negative y fail this check as well
+    return data + step * y;  // row y starts y * step past data
+}
+
+inline
+const uchar* GpuMat::ptr(int y) const  // const overload of ptr(y), same check and arithmetic
+{
+    CV_DbgAssert( (unsigned)y < (unsigned)rows );  // the unsigned casts make a negative y fail this check as well
+    return data + step * y;
+}
+
+template<typename _Tp> inline
+_Tp* GpuMat::ptr(int y)  // typed variant: casts the row pointer from ptr(y) to _Tp*
+{
+    return (_Tp*)ptr(y);
+}
+
+template<typename _Tp> inline
+const _Tp* GpuMat::ptr(int y) const  // typed const variant: casts the row pointer from ptr(y) to const _Tp*
+{
+    return (const _Tp*)ptr(y);
+}
+
+template <class T> inline
+GpuMat::operator PtrStepSz<T>() const  // conversion to PtrStepSz<T> built from rows, cols, data and step
+{
+    return PtrStepSz<T>(rows, cols, (T*)data, step);  // data is cast to T*; no type check is performed here
+}
+
+template <class T> inline
+GpuMat::operator PtrStep<T>() const  // conversion to PtrStep<T> built from data and step only
+{
+    return PtrStep<T>((T*)data, step);  // data is cast to T*; rows and cols are not carried over
+}
+
+inline
+GpuMat GpuMat::row(int y) const  // GpuMat built from this one with row range [y, y+1) and all columns
+{
+    return GpuMat(*this, Range(y, y+1), Range::all());
+}
+
+inline
+GpuMat GpuMat::col(int x) const  // GpuMat built from this one with all rows and column range [x, x+1)
+{
+    return GpuMat(*this, Range::all(), Range(x, x+1));
+}
+
+inline
+GpuMat GpuMat::rowRange(int startrow, int endrow) const  // rows Range(startrow, endrow), all columns
+{
+    return GpuMat(*this, Range(startrow, endrow), Range::all());
+}
+
+inline
+GpuMat GpuMat::rowRange(Range r) const  // Range overload: rows r, all columns
+{
+    return GpuMat(*this, r, Range::all());
+}
+
+inline
+GpuMat GpuMat::colRange(int startcol, int endcol) const  // all rows, columns Range(startcol, endcol)
+{
+    return GpuMat(*this, Range::all(), Range(startcol, endcol));
+}
+
+inline
+GpuMat GpuMat::colRange(Range r) const  // Range overload: all rows, columns r
+{
+    return GpuMat(*this, Range::all(), r);
+}
+
+inline
+GpuMat GpuMat::operator ()(Range rowRange_, Range colRange_) const  // forwards to the (GpuMat, Range, Range) constructor
+{
+    return GpuMat(*this, rowRange_, colRange_);
+}
+
+inline
+GpuMat GpuMat::operator ()(Rect roi) const  // forwards to the (GpuMat, Rect) constructor
+{
+    return GpuMat(*this, roi);
+}
+
+inline
+bool GpuMat::isContinuous() const  // true when Mat::CONTINUOUS_FLAG is set in flags
+{
+    return (flags & Mat::CONTINUOUS_FLAG) != 0;
+}
+
+inline
+size_t GpuMat::elemSize() const  // element size derived from flags via CV_ELEM_SIZE
+{
+    return CV_ELEM_SIZE(flags);
+}
+
+inline
+size_t GpuMat::elemSize1() const  // value of CV_ELEM_SIZE1 applied to flags
+{
+    return CV_ELEM_SIZE1(flags);
+}
+
+inline
+int GpuMat::type() const  // type code extracted from flags via CV_MAT_TYPE
+{
+    return CV_MAT_TYPE(flags);
+}
+
+inline
+int GpuMat::depth() const  // depth code extracted from flags via CV_MAT_DEPTH
+{
+    return CV_MAT_DEPTH(flags);
+}
+
+inline
+int GpuMat::channels() const  // channel count extracted from flags via CV_MAT_CN
+{
+    return CV_MAT_CN(flags);
+}
+
+inline
+size_t GpuMat::step1() const  // step expressed in units of elemSize1()
+{
+    return step / elemSize1();
+}
+
+inline
+Size GpuMat::size() const  // Size is constructed as (cols, rows), i.e. width first
+{
+    return Size(cols, rows);
+}
+
+inline
+bool GpuMat::empty() const  // emptiness is decided by the data pointer alone; rows and cols are not consulted
+{
+    return data == 0;
+}
+
+static inline
+GpuMat createContinuous(int rows, int cols, int type)  // returning variant: wraps the four-argument createContinuous
+{
+    GpuMat m;
+    createContinuous(rows, cols, type, m);  // four-argument overload is declared outside this view
+    return m;
+}
+
+static inline
+void createContinuous(Size size, int type, OutputArray arr)  // Size variant: forwards height as rows, width as cols
+{
+    createContinuous(size.height, size.width, type, arr);
+}
+
+static inline
+GpuMat createContinuous(Size size, int type)  // returning Size variant built on the overload above
+{
+    GpuMat m;
+    createContinuous(size, type, m);
+    return m;
+}
+
+static inline
+void ensureSizeIsEnough(Size size, int type, OutputArray arr)  // Size variant: forwards to the (rows, cols, type, arr) overload
+{
+    ensureSizeIsEnough(size.height, size.width, type, arr);  // height is passed as rows, width as cols
+}
+
+static inline
+void swap(GpuMat& a, GpuMat& b)  // free-function swap: delegates to the member GpuMat::swap
+{
+    a.swap(b);
+}
+
+//////////////////////////////// CudaMem ////////////////////////////////
+
+inline
+CudaMem::CudaMem(AllocType alloc_type_)  // empty CudaMem: zeroed header, only alloc_type is taken from the argument
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
+{
+}
+
+inline
+CudaMem::CudaMem(const CudaMem& m)  // copy constructor: copies m's header fields, pointers and alloc_type
+    : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type)
+{
+    if( refcount )  // a null refcount pointer is left untouched
+        CV_XADD(refcount, 1);  // otherwise the counter shared with m is incremented by one
+}
+
+inline
+CudaMem::CudaMem(int rows_, int cols_, int type_, AllocType alloc_type_)  // sized constructor: zeroed header, then create()
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
+{
+    if (rows_ > 0 && cols_ > 0)  // a non-positive dimension skips create() and leaves the zeroed header as is
+        create(rows_, cols_, type_);
+}
+
+inline
+CudaMem::CudaMem(Size size_, int type_, AllocType alloc_type_)  // Size-based variant of the sized constructor
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
+{
+    if (size_.height > 0 && size_.width > 0)  // a non-positive dimension leaves the zeroed header as is
+        create(size_.height, size_.width, type_);  // height is passed as rows, width as cols
+}
+
+inline
+CudaMem::CudaMem(InputArray arr, AllocType alloc_type_)  // builds a CudaMem from the contents of an InputArray
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
+{
+    arr.getMat().copyTo(*this);  // the Mat obtained from arr is copied into this object via Mat::copyTo
+}
+
+inline
+CudaMem::~CudaMem()  // destructor: all cleanup is delegated to release()
+{
+    release();
+}
+
+inline
+CudaMem& CudaMem::operator =(const CudaMem& m)  // copy assignment implemented as copy-construct + swap
+{
+    if (this != &m)  // self-assignment does nothing
+    {
+        CudaMem temp(m);  // the copy constructor bumps m's refcount when it is non-null
+        swap(temp);  // temp now carries this object's previous fields; its destructor calls release() on them
+    }
+
+    return *this;
+}
+
+inline
+void CudaMem::swap(CudaMem& b)  // exchanges every member with b, one std::swap per field
+{
+    std::swap(flags, b.flags);
+    std::swap(rows, b.rows);
+    std::swap(cols, b.cols);
+    std::swap(step, b.step);
+    std::swap(data, b.data);
+    std::swap(datastart, b.datastart);
+    std::swap(dataend, b.dataend);
+    std::swap(refcount, b.refcount);  // the refcount pointer is exchanged; the counter value is not modified
+    std::swap(alloc_type, b.alloc_type);  // alloc_type travels together with the buffer fields
+}
+
+inline
+CudaMem CudaMem::clone() const  // returns a new CudaMem with the same size, type and alloc_type
+{
+    CudaMem m(size(), type(), alloc_type);
+    createMatHeader().copyTo(m);  // contents are copied through a Mat header built over this object's data
+    return m;
+}
+
+inline
+void CudaMem::create(Size size_, int type_)  // Size overload: forwards to create(rows, cols, type)
+{
+    create(size_.height, size_.width, type_);  // height is passed as rows, width as cols
+}
+
+inline
+Mat CudaMem::createMatHeader() const  // Mat built from this object's size(), type(), data pointer and step
+{
+    return Mat(size(), type(), data, step);  // presumably wraps data without copying (user-data Mat constructor) - confirm
+}
+
+inline
+bool CudaMem::isContinuous() const  // true when Mat::CONTINUOUS_FLAG is set in flags
+{
+    return (flags & Mat::CONTINUOUS_FLAG) != 0;
+}
+
+inline
+size_t CudaMem::elemSize() const  // element size derived from flags via CV_ELEM_SIZE
+{
+    return CV_ELEM_SIZE(flags);
+}
+
+inline
+size_t CudaMem::elemSize1() const  // value of CV_ELEM_SIZE1 applied to flags
+{
+    return CV_ELEM_SIZE1(flags);
+}
+
+inline
+int CudaMem::type() const  // type code extracted from flags via CV_MAT_TYPE
+{
+    return CV_MAT_TYPE(flags);
+}
+
+inline
+int CudaMem::depth() const  // depth code extracted from flags via CV_MAT_DEPTH
+{
+    return CV_MAT_DEPTH(flags);
+}
+
+inline
+int CudaMem::channels() const  // channel count extracted from flags via CV_MAT_CN
+{
+    return CV_MAT_CN(flags);
+}
+
+inline
+size_t CudaMem::step1() const  // step expressed in units of elemSize1()
+{
+    return step / elemSize1();
+}
+
+inline
+Size CudaMem::size() const  // Size is constructed as (cols, rows), i.e. width first
+{
+    return Size(cols, rows);
+}
+
+inline
+bool CudaMem::empty() const  // emptiness is decided by the data pointer alone; rows and cols are not consulted
+{
+    return data == 0;
+}
+
+static inline
+void swap(CudaMem& a, CudaMem& b)  // free-function swap: delegates to the member CudaMem::swap
+{
+    a.swap(b);
+}
+
+//////////////////////////////// Stream ///////////////////////////////
+
+inline
+void Stream::enqueueDownload(const GpuMat& src, OutputArray dst)  // thin wrapper over GpuMat::download
+{
+    src.download(dst, *this);  // this stream is passed as the stream argument
+}
+
+inline
+void Stream::enqueueUpload(InputArray src, GpuMat& dst)  // thin wrapper over GpuMat::upload
+{
+    dst.upload(src, *this);  // this stream is passed as the stream argument
+}
+
+inline
+void Stream::enqueueCopy(const GpuMat& src, OutputArray dst)  // thin wrapper over the unmasked GpuMat::copyTo
+{
+    src.copyTo(dst, *this);  // this stream is passed as the stream argument
+}
+
+inline
+void Stream::enqueueMemSet(GpuMat& src, Scalar val)  // thin wrapper over GpuMat::setTo(val, stream)
+{
+    src.setTo(val, *this);  // despite the name, src is the matrix being written
+}
+
+inline
+void Stream::enqueueMemSet(GpuMat& src, Scalar val, InputArray mask)  // masked variant: wraps GpuMat::setTo(val, mask, stream)
+{
+    src.setTo(val, mask, *this);  // despite the name, src is the matrix being written
+}
+
+inline
+void Stream::enqueueConvert(const GpuMat& src, OutputArray dst, int dtype, double alpha, double beta)  // thin wrapper over GpuMat::convertTo
+{
+    src.convertTo(dst, dtype, alpha, beta, *this);  // dtype is passed as convertTo's type argument, this stream as its stream
+}
+
+inline
+Stream::Stream(const Ptr<Impl>& impl)  // constructs a Stream around an existing implementation pointer
+    : impl_(impl)  // the Ptr is copied into impl_; nothing else is done here
+{
+}
+
+//////////////////////////////// Initialization & Info ////////////////////////
+
+inline
+bool TargetArchs::has(int major, int minor)  // true when either hasPtx or hasBin reports the given version
+{
+    return hasPtx(major, minor) || hasBin(major, minor);  // hasBin is evaluated only if hasPtx returned false
+}
+
+inline
+bool TargetArchs::hasEqualOrGreater(int major, int minor)  // true when either the Ptx or the Bin equal-or-greater query succeeds
+{
+    return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor);  // the Bin query runs only if the Ptx query returned false
+}
+
+inline
+DeviceInfo::DeviceInfo()  // default constructor: takes the device id from getDevice()
+{
+    device_id_ = getDevice();
+}
+
+inline
+DeviceInfo::DeviceInfo(int device_id)  // constructor for an explicit device id
+{
+    CV_Assert( device_id >= 0 && device_id < getCudaEnabledDeviceCount() );  // id must lie in [0, getCudaEnabledDeviceCount())
+    device_id_ = device_id;  // stored only after the range check
+}
+
+inline
+int DeviceInfo::deviceID() const  // returns the id stored at construction
+{
+    return device_id_;
+}
+
+inline
+size_t DeviceInfo::freeMemory() const  // returns the free-memory value reported by queryMemory()
+{
+    size_t _totalMemory, _freeMemory;  // both are filled by queryMemory; only _freeMemory is used
+    queryMemory(_totalMemory, _freeMemory);  // argument order is (total, free)
+    return _freeMemory;
+}
+
+inline
+size_t DeviceInfo::totalMemory() const  // returns the total-memory value reported by queryMemory()
+{
+    size_t _totalMemory, _freeMemory;  // both are filled by queryMemory; only _totalMemory is used
+    queryMemory(_totalMemory, _freeMemory);  // argument order is (total, free)
+    return _totalMemory;
+}
+
+inline
+bool DeviceInfo::supports(FeatureSet feature_set) const  // compares the device version against a FeatureSet value
+{
+    int version = major() * 10 + minor();  // e.g. major 3, minor 5 gives 35
+    return version >= feature_set;  // presumably FeatureSet enumerators use the same major*10+minor encoding - confirm against the enum
+}
+
+}} // namespace cv { namespace gpu {
+
+//////////////////////////////// Mat ////////////////////////////////
+
+namespace cv {
+
+inline
+Mat::Mat(const gpu::GpuMat& m)  // builds a Mat from a GpuMat by delegating to GpuMat::download
+    : flags(0), dims(0), rows(0), cols(0), data(0), refcount(0), datastart(0), dataend(0), datalimit(0), allocator(0), size(&rows)
+{
+    m.download(*this);  // invoked on the zero-initialized header set up above
+}
+
+}
+
+#endif // __OPENCV_CORE_GPUINL_HPP__
--- a/modules/core/include/opencv2/core/stream_accessor.hpp
+++ b/modules/core/include/opencv2/core/stream_accessor.hpp
@@ -40,28 +40,38 @@
 //
 //M*/

-#ifndef __OPENCV_CUDA_STREAM_ACCESSOR_HPP__
-#define __OPENCV_CUDA_STREAM_ACCESSOR_HPP__
+#ifndef __OPENCV_CORE_GPU_STREAM_ACCESSOR_HPP__
+#define __OPENCV_CORE_GPU_STREAM_ACCESSOR_HPP__

-#include <cuda_runtime.h>
-#include "opencv2/core/cvdef.h"
+#ifndef __cplusplus
+#  error gpu_stream_accessor.hpp header must be compiled as C++
+#endif

 // This is the only header file that depends on Cuda. All other headers are independent.
 // So if you use OpenCV binaries you do not need to install Cuda Toolkit.
 // But if you want to use the GPU yourself, you may get the cuda stream instance using the class below.
 // In this case you have to install Cuda Toolkit.

+#include <cuda_runtime.h>
+#include "opencv2/core/cvdef.h"
+
 namespace cv
 {
    namespace gpu
    {
        class Stream;
+        class Event;

        struct StreamAccessor
        {
            CV_EXPORTS static cudaStream_t getStream(const Stream& stream);
        };
+
+        struct EventAccessor  // counterpart of StreamAccessor for Event objects
+        {
+            CV_EXPORTS static cudaEvent_t getEvent(const Event& event);  // returns a cudaEvent_t for the given Event; only the declaration is visible here
+        };
    }
 }

-#endif /* __OPENCV_CUDA_STREAM_ACCESSOR_HPP__ */
+#endif /* __OPENCV_CORE_GPU_STREAM_ACCESSOR_HPP__ */
--- a/modules/core/include/opencv2/core/cuda_devptrs.hpp
+++ b/modules/core/include/opencv2/core/cuda_devptrs.hpp
@@ -40,10 +40,12 @@
 //
 //M*/

-#ifndef __OPENCV_CORE_DEVPTRS_HPP__
-#define __OPENCV_CORE_DEVPTRS_HPP__
+#ifndef __OPENCV_CORE_GPU_TYPES_HPP__
+#define __OPENCV_CORE_GPU_TYPES_HPP__

-#ifdef __cplusplus
+#ifndef __cplusplus
+#  error gpu_types.hpp header must be compiled as C++
+#endif

 #ifdef __CUDACC__
    #define __CV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
@@ -58,7 +60,7 @@ namespace cv
        // Simple lightweight structures that encapsulates information about an image on device.
        // It is intended to pass to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile

-        template<typename T> struct DevPtr
+        template <typename T> struct DevPtr
        {
            typedef T elem_type;
            typedef int index_type;
@@ -75,7 +77,7 @@ namespace cv
            __CV_GPU_HOST_DEVICE__ operator const T*() const { return data; }
        };

-        template<typename T> struct PtrSz : public DevPtr<T>
+        template <typename T> struct PtrSz : public DevPtr<T>
        {
            __CV_GPU_HOST_DEVICE__ PtrSz() : size(0) {}
            __CV_GPU_HOST_DEVICE__ PtrSz(T* data_, size_t size_) : DevPtr<T>(data_), size(size_) {}
@@ -83,12 +85,12 @@ namespace cv
            size_t size;
        };

-        template<typename T> struct PtrStep : public DevPtr<T>
+        template <typename T> struct PtrStep : public DevPtr<T>
        {
            __CV_GPU_HOST_DEVICE__ PtrStep() : step(0) {}
            __CV_GPU_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}

-            /** \brief stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!! */
+            //! stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!!
            size_t step;

            __CV_GPU_HOST_DEVICE__       T* ptr(int y = 0)       { return (      T*)( (      char*)DevPtr<T>::data + y * step); }
@@ -118,36 +120,7 @@ namespace cv
        typedef PtrStep<unsigned char> PtrStepb;
        typedef PtrStep<float> PtrStepf;
        typedef PtrStep<int> PtrStepi;
-
-
-#if defined __GNUC__
-    #define __CV_GPU_DEPR_BEFORE__
-    #define __CV_GPU_DEPR_AFTER__ __attribute__ ((deprecated))
-#elif defined(__MSVC__) //|| defined(__CUDACC__)
-    #pragma deprecated(DevMem2D_)
-    #define __CV_GPU_DEPR_BEFORE__ __declspec(deprecated)
-    #define __CV_GPU_DEPR_AFTER__
-#else
-    #define __CV_GPU_DEPR_BEFORE__
-    #define __CV_GPU_DEPR_AFTER__
-#endif
-
-        template <typename T> struct __CV_GPU_DEPR_BEFORE__ DevMem2D_ : public PtrStepSz<T>
-        {
-            DevMem2D_() {}
-            DevMem2D_(int rows_, int cols_, T* data_, size_t step_) : PtrStepSz<T>(rows_, cols_, data_, step_) {}
-
-            template <typename U>
-            explicit __CV_GPU_DEPR_BEFORE__ DevMem2D_(const DevMem2D_<U>& d) : PtrStepSz<T>(d.rows, d.cols, (T*)d.data, d.step) {}
-        } __CV_GPU_DEPR_AFTER__ ;
-
-        typedef DevMem2D_<unsigned char> DevMem2Db;
-        typedef DevMem2Db DevMem2D;
-        typedef DevMem2D_<float> DevMem2Df;
-        typedef DevMem2D_<int> DevMem2Di;
    }
 }

-#endif // __cplusplus
-
-#endif /* __OPENCV_CORE_DEVPTRS_HPP__ */
+#endif /* __OPENCV_CORE_GPU_TYPES_HPP__ */
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -77,7 +77,7 @@ public:
        STD_VECTOR_MAT    = 5 << KIND_SHIFT,
        EXPR              = 6 << KIND_SHIFT,
        OPENGL_BUFFER     = 7 << KIND_SHIFT,
-        OPENGL_TEXTURE    = 8 << KIND_SHIFT,
+        CUDA_MEM          = 8 << KIND_SHIFT,
        GPU_MAT           = 9 << KIND_SHIFT
    };

@@ -94,13 +94,12 @@ public:
    _InputArray(const double& val);
    _InputArray(const gpu::GpuMat& d_mat);
    _InputArray(const ogl::Buffer& buf);
-    _InputArray(const ogl::Texture2D& tex);
+    _InputArray(const gpu::CudaMem& cuda_mem);

    virtual Mat getMat(int i=-1) const;
    virtual void getMatVector(std::vector<Mat>& mv) const;
    virtual gpu::GpuMat getGpuMat() const;
    virtual ogl::Buffer getOGlBuffer() const;
-    virtual ogl::Texture2D getOGlTexture2D() const;

    virtual int kind() const;
    virtual Size size(int i=-1) const;
@@ -143,7 +142,7 @@ public:
    _OutputArray(std::vector<Mat>& vec);
    _OutputArray(gpu::GpuMat& d_mat);
    _OutputArray(ogl::Buffer& buf);
-    _OutputArray(ogl::Texture2D& tex);
+    _OutputArray(gpu::CudaMem& cuda_mem);
    template<typename _Tp> _OutputArray(std::vector<_Tp>& vec);
    template<typename _Tp> _OutputArray(std::vector<std::vector<_Tp> >& vec);
    template<typename _Tp> _OutputArray(std::vector<Mat_<_Tp> >& vec);
@@ -155,7 +154,7 @@ public:
    _OutputArray(const std::vector<Mat>& vec);
    _OutputArray(const gpu::GpuMat& d_mat);
    _OutputArray(const ogl::Buffer& buf);
-    _OutputArray(const ogl::Texture2D& tex);
+    _OutputArray(const gpu::CudaMem& cuda_mem);
    template<typename _Tp> _OutputArray(const std::vector<_Tp>& vec);
    template<typename _Tp> _OutputArray(const std::vector<std::vector<_Tp> >& vec);
    template<typename _Tp> _OutputArray(const std::vector<Mat_<_Tp> >& vec);
@@ -169,7 +168,7 @@ public:
    virtual Mat& getMatRef(int i=-1) const;
    virtual gpu::GpuMat& getGpuMatRef() const;
    virtual ogl::Buffer& getOGlBufferRef() const;
-    virtual ogl::Texture2D& getOGlTexture2DRef() const;
+    virtual gpu::CudaMem& getCudaMemRef() const;
    virtual void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
    virtual void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
    virtual void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;

--- a/modules/core/include/opencv2/core/opengl.hpp
+++ b/modules/core/include/opencv2/core/opengl.hpp
@@ -40,8 +40,12 @@
 //
 //M*/

-#ifndef __OPENCV_OPENGL_INTEROP_HPP__
-#define __OPENCV_OPENGL_INTEROP_HPP__
+#ifndef __OPENCV_CORE_OPENGL_HPP__
+#define __OPENCV_CORE_OPENGL_HPP__
+
+#ifndef __cplusplus
+#  error opengl.hpp header must be compiled as C++
+#endif

 #include "opencv2/core.hpp"

@@ -84,7 +88,7 @@ public:

    //! create buffer
    void create(int arows, int acols, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);
-    void create(Size asize, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false) { create(asize.height, asize.width, atype, target, autoRelease); }
+    void create(Size asize, int atype, Target target = ARRAY_BUFFER, bool autoRelease = false);

    //! release memory and delete buffer object
    void release();
@@ -92,11 +96,15 @@ public:
    //! set auto release mode (if true, release will be called in object's destructor)
    void setAutoRelease(bool flag);

-    //! copy from host/device memory
+    //! copy from host/device memory (blocking)
    void copyFrom(InputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false);
+    //! copy from device memory (non blocking)
+    void copyFrom(InputArray arr, gpu::Stream& stream, Target target = ARRAY_BUFFER, bool autoRelease = false);

-    //! copy to host/device memory
-    void copyTo(OutputArray arr, Target target = ARRAY_BUFFER, bool autoRelease = false) const;
+    //! copy to host/device memory (blocking)
+    void copyTo(OutputArray arr) const;
+    //! copy to device memory (non blocking)
+    void copyTo(OutputArray arr, gpu::Stream& stream) const;

    //! create copy of current buffer
    Buffer clone(Target target = ARRAY_BUFFER, bool autoRelease = false) const;
@@ -111,21 +119,26 @@ public:
    Mat mapHost(Access access);
    void unmapHost();

-    //! map to device memory
+    //! map to device memory (blocking)
    gpu::GpuMat mapDevice();
    void unmapDevice();

-    int rows() const { return rows_; }
-    int cols() const { return cols_; }
-    Size size() const { return Size(cols_, rows_); }
-    bool empty() const { return rows_ == 0 || cols_ == 0; }
+    //! map to device memory (non blocking)
+    gpu::GpuMat mapDevice(gpu::Stream& stream);
+    void unmapDevice(gpu::Stream& stream);

-    int type() const { return type_; }
-    int depth() const { return CV_MAT_DEPTH(type_); }
-    int channels() const { return CV_MAT_CN(type_); }
-    int elemSize() const { return CV_ELEM_SIZE(type_); }
-    int elemSize1() const { return CV_ELEM_SIZE1(type_); }
+    int rows() const;
+    int cols() const;
+    Size size() const;
+    bool empty() const;

+    int type() const;
+    int depth() const;
+    int channels() const;
+    int elemSize() const;
+    int elemSize1() const;
+
+    //! get OpenGL object id
    unsigned int bufId() const;

    class Impl;
@@ -165,7 +178,7 @@ public:

    //! create texture
    void create(int arows, int acols, Format aformat, bool autoRelease = false);
-    void create(Size asize, Format aformat, bool autoRelease = false) { create(asize.height, asize.width, aformat, autoRelease); }
+    void create(Size asize, Format aformat, bool autoRelease = false);

    //! release memory and delete texture object
    void release();
@@ -182,13 +195,14 @@ public:
    //! bind texture to current active texture unit for GL_TEXTURE_2D target
    void bind() const;

-    int rows() const { return rows_; }
-    int cols() const { return cols_; }
-    Size size() const { return Size(cols_, rows_); }
-    bool empty() const { return rows_ == 0 || cols_ == 0; }
+    int rows() const;
+    int cols() const;
+    Size size() const;
+    bool empty() const;

-    Format format() const { return format_; }
+    Format format() const;

+    //! get OpenGL object id
    unsigned int texId() const;

    class Impl;
@@ -224,8 +238,8 @@ public:

    void bind() const;

-    int size() const { return size_; }
-    bool empty() const { return size_ == 0; }
+    int size() const;
+    bool empty() const;

 private:
    int size_;
@@ -260,14 +274,14 @@ enum {
 CV_EXPORTS void render(const Arrays& arr, int mode = POINTS, Scalar color = Scalar::all(255));
 CV_EXPORTS void render(const Arrays& arr, InputArray indices, int mode = POINTS, Scalar color = Scalar::all(255));

-}} // namespace cv::gl
+}} // namespace cv::ogl

 namespace cv { namespace gpu {

 //! set a CUDA device to use OpenGL interoperability
 CV_EXPORTS void setGlDevice(int device = 0);

-}} // cv::gpu
+}}

 namespace cv {

@@ -276,4 +290,149 @@ template <> CV_EXPORTS void Ptr<cv::ogl::Texture2D::Impl>::delete_obj();

 }

-#endif // __OPENCV_OPENGL_INTEROP_HPP__
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+inline
+cv::ogl::Buffer::Buffer(int arows, int acols, int atype, Target target, bool autoRelease) : rows_(0), cols_(0), type_(0)  // zeroed fields, then create()
+{
+    create(arows, acols, atype, target, autoRelease);  // called unconditionally; there is no size check here
+}
+
+inline
+cv::ogl::Buffer::Buffer(Size asize, int atype, Target target, bool autoRelease) : rows_(0), cols_(0), type_(0)  // Size-based variant
+{
+    create(asize, atype, target, autoRelease);  // forwards to the Size overload of create()
+}
+
+inline
+void cv::ogl::Buffer::create(Size asize, int atype, Target target, bool autoRelease)  // Size overload: forwards to create(rows, cols, ...)
+{
+    create(asize.height, asize.width, atype, target, autoRelease);  // height is passed as rows, width as cols
+}
+
+inline
+int cv::ogl::Buffer::rows() const  // returns the stored rows_ value
+{
+    return rows_;
+}
+
+inline
+int cv::ogl::Buffer::cols() const  // returns the stored cols_ value
+{
+    return cols_;
+}
+
+inline
+cv::Size cv::ogl::Buffer::size() const  // Size is constructed as (cols_, rows_), i.e. width first
+{
+    return Size(cols_, rows_);
+}
+
+inline
+bool cv::ogl::Buffer::empty() const  // empty when either stored dimension is zero
+{
+    return rows_ == 0 || cols_ == 0;
+}
+
+inline
+int cv::ogl::Buffer::type() const  // returns the stored type_ value unchanged
+{
+    return type_;
+}
+
+inline
+int cv::ogl::Buffer::depth() const  // depth code extracted from type_ via CV_MAT_DEPTH
+{
+    return CV_MAT_DEPTH(type_);
+}
+
+inline
+int cv::ogl::Buffer::channels() const  // channel count extracted from type_ via CV_MAT_CN
+{
+    return CV_MAT_CN(type_);
+}
+
+inline
+int cv::ogl::Buffer::elemSize() const  // element size derived from type_ via CV_ELEM_SIZE; returned as int here
+{
+    return CV_ELEM_SIZE(type_);
+}
+
+inline
+int cv::ogl::Buffer::elemSize1() const  // value of CV_ELEM_SIZE1 applied to type_; returned as int here
+{
+    return CV_ELEM_SIZE1(type_);
+}
+
+///////
+
+inline
+cv::ogl::Texture2D::Texture2D(int arows, int acols, Format aformat, bool autoRelease) : rows_(0), cols_(0), format_(NONE)  // zeroed fields, then create()
+{
+    create(arows, acols, aformat, autoRelease);  // called unconditionally; there is no size check here
+}
+
+inline
+cv::ogl::Texture2D::Texture2D(Size asize, Format aformat, bool autoRelease) : rows_(0), cols_(0), format_(NONE)  // Size-based variant
+{
+    create(asize, aformat, autoRelease);  // forwards to the Size overload of create()
+}
+
+inline
+void cv::ogl::Texture2D::create(Size asize, Format aformat, bool autoRelease)  // Size overload: forwards to create(rows, cols, ...)
+{
+    create(asize.height, asize.width, aformat, autoRelease);  // height is passed as rows, width as cols
+}
+
+inline
+int cv::ogl::Texture2D::rows() const  // returns the stored rows_ value
+{
+    return rows_;
+}
+
+inline
+int cv::ogl::Texture2D::cols() const  // returns the stored cols_ value
+{
+    return cols_;
+}
+
+inline
+cv::Size cv::ogl::Texture2D::size() const  // Size is constructed as (cols_, rows_), i.e. width first
+{
+    return Size(cols_, rows_);
+}
+
+inline
+bool cv::ogl::Texture2D::empty() const  // empty when either stored dimension is zero
+{
+    return rows_ == 0 || cols_ == 0;
+}
+
+inline
+cv::ogl::Texture2D::Format cv::ogl::Texture2D::format() const  // returns the stored format_ value
+{
+    return format_;
+}
+
+///////
+
+inline
+cv::ogl::Arrays::Arrays() : size_(0)  // default constructor: size_ starts at zero, so empty() is true
+{
+}
+
+inline
+int cv::ogl::Arrays::size() const  // returns the stored size_ value
+{
+    return size_;
+}
+
+inline
+bool cv::ogl::Arrays::empty() const  // empty when size_ is zero
+{
+    return size_ == 0;
+}
+
+#endif /* __OPENCV_CORE_OPENGL_HPP__ */
--- a/modules/core/include/opencv2/core/gpu_private.hpp
+++ b/modules/core/include/opencv2/core/gpu_private.hpp
@@ -41,8 +41,8 @@
 //
 //M*/

-#ifndef __OPENCV_CORE_GPU_PRIVATE_HPP__
-#define __OPENCV_CORE_GPU_PRIVATE_HPP__
+#ifndef __OPENCV_CORE_PRIVATE_GPU_HPP__
+#define __OPENCV_CORE_PRIVATE_GPU_HPP__

 #ifndef __OPENCV_BUILD
 #  error this is a private header which should not be used from outside of the OpenCV library
@@ -53,11 +53,13 @@
 #include "opencv2/core/cvdef.h"
 #include "opencv2/core/base.hpp"

+#include "opencv2/core/gpu.hpp"
+
 #ifdef HAVE_CUDA
 #  include <cuda.h>
 #  include <cuda_runtime.h>
 #  include <npp.h>
-#  include "opencv2/core/stream_accessor.hpp"
+#  include "opencv2/core/gpu_stream_accessor.hpp"
 #  include "opencv2/core/cuda/common.hpp"

 #  define NPP_VERSION (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD)

--- a/modules/core/src/cuda/matrix_operations.cu
+++ b/modules/core/src/cuda/matrix_operations.cu
@@ -44,188 +44,113 @@
 #include "opencv2/core/cuda/transform.hpp"
 #include "opencv2/core/cuda/functional.hpp"
 #include "opencv2/core/cuda/type_traits.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"

-namespace cv { namespace gpu { namespace cudev
-{
-    void writeScalar(const uchar*);
-    void writeScalar(const schar*);
-    void writeScalar(const ushort*);
-    void writeScalar(const short int*);
-    void writeScalar(const int*);
-    void writeScalar(const float*);
-    void writeScalar(const double*);
-    void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
-    void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t);
-}}}
+#include "matrix_operations.hpp"

 namespace cv { namespace gpu { namespace cudev
 {
-    template <typename T> struct shift_and_sizeof;
-    template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
-    template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
-    template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };
-    template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
-    template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };
-    template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };
-    template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };
-
-    ///////////////////////////////////////////////////////////////////////////
-    ////////////////////////////////// CopyTo /////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////
+    // copyWithMask: typed worker that runs transform() with identity<T>() from src to dst under a mask

-    template <typename T> void copyToWithMask(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream)
+    template <typename T>
+    void copyWithMask(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream)
    {
-        if (colorMask)
-            cv::gpu::cudev::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, identity<T>(), SingleMask(mask), stream);
+        if (multiChannelMask) // the mask is handed over as-is, wrapped in SingleMask
+            cv::gpu::cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, identity<T>(), SingleMask(mask), stream);
        else
-            cv::gpu::cudev::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, identity<T>(), SingleMaskChannels(mask, cn), stream);
+            cv::gpu::cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, identity<T>(), SingleMaskChannels(mask, cn), stream); // mask is wrapped together with the channel count cn
    }

-    void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream)
+    void copyWithMask(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream)
    {
-        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream); // signature of the typed workers

-        static func_t tab[] =
+        static const func_t tab[] = // indexed directly by elemSize1; only indices 1, 2, 4 and 8 hold a worker
        {
            0,
-            copyToWithMask<unsigned char>,
-            copyToWithMask<unsigned short>,
+            copyWithMask<uchar>, // elemSize1 == 1
+            copyWithMask<ushort>, // elemSize1 == 2
            0,
-            copyToWithMask<int>,
+            copyWithMask<int>, // elemSize1 == 4
            0,
            0,
            0,
-            copyToWithMask<double>
+            copyWithMask<double> // elemSize1 == 8
        };

-        tab[elemSize1](src, dst, cn, mask, colorMask, stream);
+        const func_t func = tab[elemSize1]; // NOTE(review): elemSize1 is not range-checked before indexing the 9-entry table
+        CV_DbgAssert( func != 0 ); // rejects the empty table slots (indices 0, 3, 5, 6, 7)
+
+        func(src, dst, cn, mask, multiChannelMask, stream);
    }

    ///////////////////////////////////////////////////////////////////////////
-    ////////////////////////////////// SetTo //////////////////////////////////
-    ///////////////////////////////////////////////////////////////////////////
-
-    __constant__ uchar scalar_8u[4];
-    __constant__ schar scalar_8s[4];
-    __constant__ ushort scalar_16u[4];
-    __constant__ short scalar_16s[4];
-    __constant__ int scalar_32s[4];
-    __constant__ float scalar_32f[4];
-    __constant__ double scalar_64f[4];
-
-    template <typename T> __device__ __forceinline__ T readScalar(int i);
-    template <> __device__ __forceinline__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}
-    template <> __device__ __forceinline__ schar readScalar<schar>(int i) {return scalar_8s[i];}
-    template <> __device__ __forceinline__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}
-    template <> __device__ __forceinline__ short readScalar<short>(int i) {return scalar_16s[i];}
-    template <> __device__ __forceinline__ int readScalar<int>(int i) {return scalar_32s[i];}
-    template <> __device__ __forceinline__ float readScalar<float>(int i) {return scalar_32f[i];}
-    template <> __device__ __forceinline__ double readScalar<double>(int i) {return scalar_64f[i];}
-
-    void writeScalar(const uchar* vals)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) );
-    }
-    void writeScalar(const schar* vals)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );
-    }
-    void writeScalar(const ushort* vals)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );
-    }
-    void writeScalar(const short* vals)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );
-    }
-    void writeScalar(const int* vals)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );
-    }
-    void writeScalar(const float* vals)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );
-    }
-    void writeScalar(const double* vals)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );
-    }
+    // set: kernel (plus the host launchers that follow) writing a per-channel scalar into a matrix

-    template<typename T>
-    __global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)
+    template<typename T, class Mask>
+    __global__ void set(PtrStepSz<T> mat, const Mask mask, const int channels, const typename TypeVec<T, 4>::vec_type value)
    {
-        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
-        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
+        const int x = blockIdx.x * blockDim.x + threadIdx.x; // x counts single channel elements, see the bound check below
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;

-        if ((x < cols * channels ) && (y < rows))
-        {
-            size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
-            mat[idx] = readScalar<T>(x % channels);
-        }
-    }
+        if (x >= mat.cols * channels || y >= mat.rows) // threads outside the mat.cols * channels by mat.rows range do nothing
+            return;

-    template<typename T>
-    __global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)
-    {
-        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
-        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
+        const T scalar[4] = {value.x, value.y, value.z, value.w}; // unpack the 4-component vector so it can be indexed by channel

-        if ((x < cols * channels ) && (y < rows))
-            if (mask[y * step_mask + x / channels] != 0)
-            {
-                size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
-                mat[idx] = readScalar<T>(x % channels);
-            }
+        if (mask(y, x / channels)) // the mask is queried at column x / channels
+            mat(y, x) = scalar[x % channels]; // element x receives the component selected by x % channels
    }
+
    template <typename T>
-    void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream)
+    void set(PtrStepSz<T> mat, const T* scalar, int channels, cudaStream_t stream) // host launcher, variant without a mask argument
    {
-        writeScalar(scalar);
+        typedef typename TypeVec<T, 4>::vec_type scalar_t; // 4-component vector type used to pass the scalar to the kernel

-        dim3 threadsPerBlock(32, 8, 1);
-        dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
+        dim3 block(32, 8);
+        dim3 grid(divUp(mat.cols * channels, block.x), divUp(mat.rows, block.y)); // x extent is mat.cols * channels, y extent is mat.rows

-        set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
+        set<T><<<grid, block, 0, stream>>>(mat, WithOutMask(), channels, VecTraits<scalar_t>::make(scalar)); // WithOutMask() fills the kernel's mask slot; the scalar goes in as a kernel argument
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall ( cudaDeviceSynchronize() );
    }

-    template void set_to_gpu<uchar >(PtrStepSzb mat, const uchar*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set_to_gpu<schar >(PtrStepSzb mat, const schar*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set_to_gpu<ushort>(PtrStepSzb mat, const ushort* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set_to_gpu<short >(PtrStepSzb mat, const short*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set_to_gpu<int   >(PtrStepSzb mat, const int*    scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set_to_gpu<float >(PtrStepSzb mat, const float*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set_to_gpu<double>(PtrStepSzb mat, const double* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set<uchar >(PtrStepSz<uchar > mat, const uchar*  scalar, int channels, cudaStream_t stream); // explicit instantiations of the unmasked launcher start here
+    template void set<schar >(PtrStepSz<schar > mat, const schar*  scalar, int channels, cudaStream_t stream);
+    template void set<ushort>(PtrStepSz<ushort> mat, const ushort* scalar, int channels, cudaStream_t stream);
+    template void set<short >(PtrStepSz<short > mat, const short*  scalar, int channels, cudaStream_t stream);
+    template void set<int   >(PtrStepSz<int   > mat, const int*    scalar, int channels, cudaStream_t stream);
+    template void set<float >(PtrStepSz<float > mat, const float*  scalar, int channels, cudaStream_t stream);
+    template void set<double>(PtrStepSz<double> mat, const double* scalar, int channels, cudaStream_t stream);

    template <typename T>
-    void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream)
+    void set(PtrStepSz<T> mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream) // host launcher, variant taking a mask
    {
-        writeScalar(scalar);
+        typedef typename TypeVec<T, 4>::vec_type scalar_t; // 4-component vector type used to pass the scalar to the kernel

-        dim3 threadsPerBlock(32, 8, 1);
-        dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
+        dim3 block(32, 8);
+        dim3 grid(divUp(mat.cols * channels, block.x), divUp(mat.rows, block.y)); // x extent is mat.cols * channels, y extent is mat.rows

-        set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
+        set<T><<<grid, block, 0, stream>>>(mat, SingleMask(mask), channels, VecTraits<scalar_t>::make(scalar)); // the caller's mask is wrapped in SingleMask for the kernel
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall ( cudaDeviceSynchronize() );
    }

-    template void set_to_gpu<uchar >(PtrStepSzb mat, const uchar*  scalar, int channels, cudaStream_t stream);
-    template void set_to_gpu<schar >(PtrStepSzb mat, const schar*  scalar, int channels, cudaStream_t stream);
-    template void set_to_gpu<ushort>(PtrStepSzb mat, const ushort* scalar, int channels, cudaStream_t stream);
-    template void set_to_gpu<short >(PtrStepSzb mat, const short*  scalar, int channels, cudaStream_t stream);
-    template void set_to_gpu<int   >(PtrStepSzb mat, const int*    scalar, int channels, cudaStream_t stream);
-    template void set_to_gpu<float >(PtrStepSzb mat, const float*  scalar, int channels, cudaStream_t stream);
-    template void set_to_gpu<double>(PtrStepSzb mat, const double* scalar, int channels, cudaStream_t stream);
+    template void set<uchar >(PtrStepSz<uchar > mat, const uchar*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream); // explicit instantiations of the masked launcher start here
+    template void set<schar >(PtrStepSz<schar > mat, const schar*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set<ushort>(PtrStepSz<ushort> mat, const ushort* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set<short >(PtrStepSz<short > mat, const short*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set<int   >(PtrStepSz<int   > mat, const int*    scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set<float >(PtrStepSz<float > mat, const float*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+    template void set<double>(PtrStepSz<double> mat, const double* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);

    ///////////////////////////////////////////////////////////////////////////
-    //////////////////////////////// ConvertTo ////////////////////////////////
-    ///////////////////////////////////////////////////////////////////////////
+    // convert

    template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
    {
@@ -290,18 +215,11 @@ namespace cv { namespace gpu { namespace cudev
    template<typename T, typename D, typename S>
    void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
    {
-        cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
-        cudaSafeCall( cudaSetDoubleForDevice(&beta) );
        Convertor<T, D, S> op(static_cast<S>(alpha), static_cast<S>(beta));
        cv::gpu::cudev::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
    }

-#if defined  __clang__
-# pragma clang diagnostic push
-# pragma clang diagnostic ignored "-Wmissing-declarations"
-#endif
-
-    void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream)
+    void convert(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream)
    {
        typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);

@@ -372,11 +290,7 @@ namespace cv { namespace gpu { namespace cudev
            }
        };

-        caller_t func = tab[sdepth][ddepth];
+        const caller_t func = tab[sdepth][ddepth];
        func(src, dst, alpha, beta, stream);
    }
-
-#if defined __clang__
-# pragma clang diagnostic pop
-#endif
 }}} // namespace cv { namespace gpu { namespace cudev
--- a/modules/core/src/cuda/matrix_operations.hpp
+++ b/modules/core/src/cuda/matrix_operations.hpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+// Include guard: without it, a second inclusion of this header in the same
+// translation unit would repeat every declaration below.
+#ifndef __OPENCV_CORE_SRC_CUDA_MATRIX_OPERATIONS_HPP__
+#define __OPENCV_CORE_SRC_CUDA_MATRIX_OPERATIONS_HPP__
+
+#include "opencv2/core/cuda/common.hpp"
+
+// Declarations of the host-callable entry points that are defined in
+// matrix_operations.cu.
+namespace cv { namespace gpu { namespace cudev
+{
+    // Selects a typed worker by elemSize1 (the table in the .cu file has entries
+    // for 1, 2, 4 and 8 only) and forwards the remaining arguments to it.
+    void copyWithMask(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream);
+
+    // Launches the set kernel without a mask. Only the explicit instantiations
+    // provided in the .cu file exist: uchar, schar, ushort, short, int, float, double.
+    template <typename T>
+    void set(PtrStepSz<T> mat, const T* scalar, int channels, cudaStream_t stream);
+
+    // Same as above, with the kernel additionally given the mask.
+    // The same seven element types are instantiated.
+    template <typename T>
+    void set(PtrStepSz<T> mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
+
+    // Picks a caller from a table indexed by [sdepth][ddepth] and invokes it
+    // with src, dst, alpha, beta and stream.
+    void convert(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
+}}}
+
+#endif // __OPENCV_CORE_SRC_CUDA_MATRIX_OPERATIONS_HPP__
--- a/modules/core/src/cudastream.cpp
+++ b/modules/core/src/cudastream.cpp
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA)
-
-cv::gpu::Stream::Stream() { throw_no_cuda(); }
-cv::gpu::Stream::~Stream() {}
-cv::gpu::Stream::Stream(const Stream&) { throw_no_cuda(); }
-Stream& cv::gpu::Stream::operator=(const Stream&) { throw_no_cuda(); return *this; }
-bool cv::gpu::Stream::queryIfComplete() { throw_no_cuda(); return false; }
-void cv::gpu::Stream::waitForCompletion() { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueDownload(const GpuMat&, Mat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueDownload(const GpuMat&, CudaMem&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueUpload(const CudaMem&, GpuMat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueUpload(const Mat&, GpuMat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueCopy(const GpuMat&, GpuMat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueMemSet(GpuMat&, Scalar) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueMemSet(GpuMat&, Scalar, const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueConvert(const GpuMat&, GpuMat&, int, double, double) { throw_no_cuda(); }
-void cv::gpu::Stream::enqueueHostCallback(StreamCallback, void*) { throw_no_cuda(); }
-Stream& cv::gpu::Stream::Null() { throw_no_cuda(); static Stream s; return s; }
-cv::gpu::Stream::operator bool() const { throw_no_cuda(); return false; }
-cv::gpu::Stream::Stream(Impl*) { throw_no_cuda(); }
-void cv::gpu::Stream::create() { throw_no_cuda(); }
-void cv::gpu::Stream::release() { throw_no_cuda(); }
-
-#else /* !defined (HAVE_CUDA) */
-
-namespace cv { namespace gpu
-{
-    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
-    void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream);
-    void setTo(GpuMat& src, Scalar s, cudaStream_t stream);
-    void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
-}}
-
-struct Stream::Impl
-{
-    static cudaStream_t getStream(const Impl* impl)
-    {
-        return impl ? impl->stream : 0;
-    }
-
-    cudaStream_t stream;
-    int ref_counter;
-};
-
-cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream)
-{
-    return Stream::Impl::getStream(stream.impl);
-}
-
-cv::gpu::Stream::Stream() : impl(0)
-{
-    create();
-}
-
-cv::gpu::Stream::~Stream()
-{
-    release();
-}
-
-cv::gpu::Stream::Stream(const Stream& stream) : impl(stream.impl)
-{
-    if (impl)
-        CV_XADD(&impl->ref_counter, 1);
-}
-
-Stream& cv::gpu::Stream::operator =(const Stream& stream)
-{
-    if (this != &stream)
-    {
-        release();
-        impl = stream.impl;
-        if (impl)
-            CV_XADD(&impl->ref_counter, 1);
-    }
-
-    return *this;
-}
-
-bool cv::gpu::Stream::queryIfComplete()
-{
-    cudaStream_t stream = Impl::getStream(impl);
-    cudaError_t err = cudaStreamQuery(stream);
-
-    if (err == cudaErrorNotReady || err == cudaSuccess)
-        return err == cudaSuccess;
-
-    cudaSafeCall(err);
-    return false;
-}
-
-void cv::gpu::Stream::waitForCompletion()
-{
-    cudaStream_t stream = Impl::getStream(impl);
-    cudaSafeCall( cudaStreamSynchronize(stream) );
-}
-
-void cv::gpu::Stream::enqueueDownload(const GpuMat& src, Mat& dst)
-{
-    // if not -> allocation will be done, but after that dst will not point to page locked memory
-    CV_Assert( src.size() == dst.size() && src.type() == dst.type() );
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyDeviceToHost, stream) );
-}
-
-void cv::gpu::Stream::enqueueDownload(const GpuMat& src, CudaMem& dst)
-{
-    dst.create(src.size(), src.type(), CudaMem::ALLOC_PAGE_LOCKED);
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyDeviceToHost, stream) );
-}
-
-void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst)
-{
-    dst.create(src.size(), src.type());
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyHostToDevice, stream) );
-}
-
-void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst)
-{
-    dst.create(src.size(), src.type());
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyHostToDevice, stream) );
-}
-
-void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst)
-{
-    dst.create(src.size(), src.type());
-
-    cudaStream_t stream = Impl::getStream(impl);
-    size_t bwidth = src.cols * src.elemSize();
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, cudaMemcpyDeviceToDevice, stream) );
-}
-
-void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
-{
-    const int sdepth = src.depth();
-
-    if (sdepth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    cudaStream_t stream = Impl::getStream(impl);
-
-    if (val[0] == 0.0 && val[1] == 0.0 && val[2] == 0.0 && val[3] == 0.0)
-    {
-        cudaSafeCall( cudaMemset2DAsync(src.data, src.step, 0, src.cols * src.elemSize(), src.rows, stream) );
-        return;
-    }
-
-    if (sdepth == CV_8U)
-    {
-        int cn = src.channels();
-
-        if (cn == 1 || (cn == 2 && val[0] == val[1]) || (cn == 3 && val[0] == val[1] && val[0] == val[2]) || (cn == 4 && val[0] == val[1] && val[0] == val[2] && val[0] == val[3]))
-        {
-            int ival = saturate_cast<uchar>(val[0]);
-            cudaSafeCall( cudaMemset2DAsync(src.data, src.step, ival, src.cols * src.elemSize(), src.rows, stream) );
-            return;
-        }
-    }
-
-    setTo(src, val, stream);
-}
-
-void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
-{
-    const int sdepth = src.depth();
-
-    if (sdepth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    CV_Assert(mask.type() == CV_8UC1);
-
-    cudaStream_t stream = Impl::getStream(impl);
-
-    setTo(src, val, mask, stream);
-}
-
-void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int dtype, double alpha, double beta)
-{
-    if (dtype < 0)
-        dtype = src.type();
-    else
-        dtype = CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels());
-
-    const int sdepth = src.depth();
-    const int ddepth = CV_MAT_DEPTH(dtype);
-
-    if (sdepth == CV_64F || ddepth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    bool noScale = fabs(alpha - 1) < std::numeric_limits<double>::epsilon()
-                && fabs(beta) < std::numeric_limits<double>::epsilon();
-
-    if (sdepth == ddepth && noScale)
-    {
-        enqueueCopy(src, dst);
-        return;
-    }
-
-    dst.create(src.size(), dtype);
-
-    cudaStream_t stream = Impl::getStream(impl);
-    convertTo(src, dst, alpha, beta, stream);
-}
-
-#if CUDART_VERSION >= 5000
-
-namespace
-{
-    struct CallbackData
-    {
-        cv::gpu::Stream::StreamCallback callback;
-        void* userData;
-        Stream stream;
-    };
-
-    void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData)
-    {
-        CallbackData* data = reinterpret_cast<CallbackData*>(userData);
-        data->callback(data->stream, static_cast<int>(status), data->userData);
-        delete data;
-    }
-}
-
-#endif
-
-void cv::gpu::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
-{
-#if CUDART_VERSION >= 5000
-    CallbackData* data = new CallbackData;
-    data->callback = callback;
-    data->userData = userData;
-    data->stream = *this;
-
-    cudaStream_t stream = Impl::getStream(impl);
-
-    cudaSafeCall( cudaStreamAddCallback(stream, cudaStreamCallback, data, 0) );
-#else
-    (void) callback;
-    (void) userData;
-    CV_Error(CV_StsNotImplemented, "This function requires CUDA 5.0");
-#endif
-}
-
-cv::gpu::Stream& cv::gpu::Stream::Null()
-{
-    static Stream s((Impl*) 0);
-    return s;
-}
-
-cv::gpu::Stream::operator bool() const
-{
-    return impl && impl->stream;
-}
-
-cv::gpu::Stream::Stream(Impl* impl_) : impl(impl_)
-{
-}
-
-void cv::gpu::Stream::create()
-{
-    if (impl)
-        release();
-
-    cudaStream_t stream;
-    cudaSafeCall( cudaStreamCreate( &stream ) );
-
-    impl = (Stream::Impl*) fastMalloc(sizeof(Stream::Impl));
-
-    impl->stream = stream;
-    impl->ref_counter = 1;
-}
-
-void cv::gpu::Stream::release()
-{
-    if (impl && CV_XADD(&impl->ref_counter, -1) == 1)
-    {
-        cudaSafeCall( cudaStreamDestroy(impl->stream) );
-        cv::fastFree(impl);
-    }
-}
-
-#endif /* !defined (HAVE_CUDA) */
--- a/modules/core/src/matrix_operations.cpp
+++ b/modules/core/src/matrix_operations.cpp
@@ -7,11 +7,12 @@
 //  copy or use the software.
 //
 //
-//                           License Agreement
+//                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
@@ -45,217 +46,70 @@
 using namespace cv;
 using namespace cv::gpu;

-cv::gpu::CudaMem::CudaMem()
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
-{
-}
-
-cv::gpu::CudaMem::CudaMem(int _rows, int _cols, int _type, int _alloc_type)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
-{
-    if( _rows > 0 && _cols > 0 )
-        create( _rows, _cols, _type, _alloc_type);
-}
-
-cv::gpu::CudaMem::CudaMem(Size _size, int _type, int _alloc_type)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
-{
-    if( _size.height > 0 && _size.width > 0 )
-        create( _size.height, _size.width, _type, _alloc_type);
-}
-
-cv::gpu::CudaMem::CudaMem(const CudaMem& m)
-    : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type)
-{
-    if( refcount )
-        CV_XADD(refcount, 1);
-}
-
-cv::gpu::CudaMem::CudaMem(const Mat& m, int _alloc_type)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
-{
-    if( m.rows > 0 && m.cols > 0 )
-        create( m.size(), m.type(), _alloc_type);
-
-    Mat tmp = createMatHeader();
-    m.copyTo(tmp);
-}
-
-cv::gpu::CudaMem::~CudaMem()
-{
-    release();
-}
-
-CudaMem& cv::gpu::CudaMem::operator = (const CudaMem& m)
-{
-    if( this != &m )
-    {
-        if( m.refcount )
-            CV_XADD(m.refcount, 1);
-        release();
-        flags = m.flags;
-        rows = m.rows; cols = m.cols;
-        step = m.step; data = m.data;
-        datastart = m.datastart;
-        dataend = m.dataend;
-        refcount = m.refcount;
-        alloc_type = m.alloc_type;
-    }
-    return *this;
-}
-
-CudaMem cv::gpu::CudaMem::clone() const
-{
-    CudaMem m(size(), type(), alloc_type);
-    Mat to = m;
-    Mat from = *this;
-    from.copyTo(to);
-    return m;
-}
-
-void cv::gpu::CudaMem::create(Size _size, int _type, int _alloc_type)
-{
-    create(_size.height, _size.width, _type, _alloc_type);
-}
-
-Mat cv::gpu::CudaMem::createMatHeader() const
-{
-    return Mat(size(), type(), data, step);
-}
-
-cv::gpu::CudaMem::operator Mat() const
-{
-    return createMatHeader();
-}
-
-cv::gpu::CudaMem::operator GpuMat() const
-{
-    return createGpuMatHeader();
-}
-
-bool cv::gpu::CudaMem::isContinuous() const
-{
-    return (flags & Mat::CONTINUOUS_FLAG) != 0;
-}
-
-size_t cv::gpu::CudaMem::elemSize() const
-{
-    return CV_ELEM_SIZE(flags);
-}
-
-size_t cv::gpu::CudaMem::elemSize1() const
-{
-    return CV_ELEM_SIZE1(flags);
-}
-
-int cv::gpu::CudaMem::type() const
-{
-    return CV_MAT_TYPE(flags);
-}
-
-int cv::gpu::CudaMem::depth() const
-{
-    return CV_MAT_DEPTH(flags);
-}
-
-int cv::gpu::CudaMem::channels() const
-{
-    return CV_MAT_CN(flags);
-}
-
-size_t cv::gpu::CudaMem::step1() const
-{
-    return step/elemSize1();
-}
-
-Size cv::gpu::CudaMem::size() const
-{
-    return Size(cols, rows);
-}
-
-bool cv::gpu::CudaMem::empty() const
-{
-    return data == 0;
-}
-
-#if !defined (HAVE_CUDA)
-
-void cv::gpu::registerPageLocked(Mat&) { throw_no_cuda(); }
-void cv::gpu::unregisterPageLocked(Mat&) { throw_no_cuda(); }
-void cv::gpu::CudaMem::create(int, int, int, int) { throw_no_cuda(); }
-bool cv::gpu::CudaMem::canMapHostMemory() { throw_no_cuda(); return false; }
-void cv::gpu::CudaMem::release() { throw_no_cuda(); }
-GpuMat cv::gpu::CudaMem::createGpuMatHeader () const { throw_no_cuda(); return GpuMat(); }
-
-#else /* !defined (HAVE_CUDA) */
-
-void cv::gpu::registerPageLocked(Mat& m)
-{
-    cudaSafeCall( cudaHostRegister(m.ptr(), m.step * m.rows, cudaHostRegisterPortable) );
-}
-
-void cv::gpu::unregisterPageLocked(Mat& m)
-{
-    cudaSafeCall( cudaHostUnregister(m.ptr()) );
-}
-
-bool cv::gpu::CudaMem::canMapHostMemory()
-{
-    cudaDeviceProp prop;
-    cudaSafeCall( cudaGetDeviceProperties(&prop, getDevice()) );
-    return (prop.canMapHostMemory != 0) ? true : false;
-}
-
 namespace
 {
    size_t alignUpStep(size_t what, size_t alignment)
    {
-        size_t alignMask = alignment-1;
+        size_t alignMask = alignment - 1;
        size_t inverseAlignMask = ~alignMask;
        size_t res = (what + alignMask) & inverseAlignMask;
        return res;
    }
 }

-void cv::gpu::CudaMem::create(int _rows, int _cols, int _type, int _alloc_type)
+void cv::gpu::CudaMem::create(int rows_, int cols_, int type_)
 {
-    if (_alloc_type == ALLOC_ZEROCOPY && !canMapHostMemory())
-        CV_Error(cv::Error::GpuApiCallError, "ZeroCopy is not supported by current device");
+#ifndef HAVE_CUDA
+    (void) rows_;
+    (void) cols_;
+    (void) type_;
+    throw_no_cuda();
+#else
+    if (alloc_type == SHARED)
+    {
+        DeviceInfo devInfo;
+        CV_Assert( devInfo.canMapHostMemory() );
+    }
+
+    type_ &= Mat::TYPE_MASK;

-    _type &= Mat::TYPE_MASK;
-    if( rows == _rows && cols == _cols && type() == _type && data )
+    if (rows == rows_ && cols == cols_ && type() == type_ && data)
        return;
-    if( data )
+
+    if (data)
        release();
-    CV_DbgAssert( _rows >= 0 && _cols >= 0 );
-    if( _rows > 0 && _cols > 0 )
+
+    CV_DbgAssert( rows_ >= 0 && cols_ >= 0 );
+
+    if (rows_ > 0 && cols_ > 0)
    {
-        flags = Mat::MAGIC_VAL + Mat::CONTINUOUS_FLAG + _type;
-        rows = _rows;
-        cols = _cols;
-        step = elemSize()*cols;
-        if (_alloc_type == ALLOC_ZEROCOPY)
+        flags = Mat::MAGIC_VAL + Mat::CONTINUOUS_FLAG + type_;
+        rows = rows_;
+        cols = cols_;
+        step = elemSize() * cols;
+
+        if (alloc_type == SHARED)
        {
-            cudaDeviceProp prop;
-            cudaSafeCall( cudaGetDeviceProperties(&prop, getDevice()) );
-            step = alignUpStep(step, prop.textureAlignment);
+            DeviceInfo devInfo;
+            step = alignUpStep(step, devInfo.textureAlignment());
        }
+
        int64 _nettosize = (int64)step*rows;
        size_t nettosize = (size_t)_nettosize;
-        if( _nettosize != (int64)nettosize )
-            CV_Error(CV_StsNoMem, "Too big buffer is allocated");
+
+        if (_nettosize != (int64)nettosize)
+            CV_Error(cv::Error::StsNoMem, "Too big buffer is allocated");
+
        size_t datasize = alignSize(nettosize, (int)sizeof(*refcount));

-        //datastart = data = (uchar*)fastMalloc(datasize + sizeof(*refcount));
-        alloc_type = _alloc_type;
-        void *ptr = 0;
+        void* ptr = 0;

        switch (alloc_type)
        {
-        case ALLOC_PAGE_LOCKED:    cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocDefault) ); break;
-        case ALLOC_ZEROCOPY:       cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocMapped) );  break;
-        case ALLOC_WRITE_COMBINED: cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocWriteCombined) ); break;
-        default:                   CV_Error(cv::Error::StsBadFlag, "Invalid alloc type");
+        case PAGE_LOCKED:    cudaSafeCall( cudaHostAlloc(&ptr, datasize, cudaHostAllocDefault) ); break;
+        case SHARED:         cudaSafeCall( cudaHostAlloc(&ptr, datasize, cudaHostAllocMapped) );  break;
+        case WRITE_COMBINED: cudaSafeCall( cudaHostAlloc(&ptr, datasize, cudaHostAllocWriteCombined) ); break;
+        default:             CV_Error(cv::Error::StsBadFlag, "Invalid alloc type");
        }

        datastart = data =  (uchar*)ptr;
@@ -264,31 +118,98 @@ void cv::gpu::CudaMem::create(int _rows, int _cols, int _type, int _alloc_type)
        refcount = (int*)cv::fastMalloc(sizeof(*refcount));
        *refcount = 1;
    }
+#endif
 }

-GpuMat cv::gpu::CudaMem::createGpuMatHeader () const
+CudaMem cv::gpu::CudaMem::reshape(int new_cn, int new_rows) const
 {
-    CV_Assert( alloc_type == ALLOC_ZEROCOPY );
+    CudaMem hdr = *this;

-    GpuMat res;
+    int cn = channels();
+    if (new_cn == 0)
+        new_cn = cn;

-    void *pdev;
-    cudaSafeCall( cudaHostGetDevicePointer( &pdev, data, 0 ) );
-    res = GpuMat(rows, cols, type(), pdev, step);
+    int total_width = cols * cn;
+
+    if ((new_cn > total_width || total_width % new_cn != 0) && new_rows == 0)
+        new_rows = rows * total_width / new_cn;
+
+    if (new_rows != 0 && new_rows != rows)
+    {
+        int total_size = total_width * rows;
+
+        if (!isContinuous())
+            CV_Error(cv::Error::BadStep, "The matrix is not continuous, thus its number of rows can not be changed");
+
+        if ((unsigned)new_rows > (unsigned)total_size)
+            CV_Error(cv::Error::StsOutOfRange, "Bad new number of rows");
+
+        total_width = total_size / new_rows;
+
+        if (total_width * new_rows != total_size)
+            CV_Error(cv::Error::StsBadArg, "The total number of matrix elements is not divisible by the new number of rows");
+
+        hdr.rows = new_rows;
+        hdr.step = total_width * elemSize1();
+    }
+
+    int new_width = total_width / new_cn;
+
+    if (new_width * new_cn != total_width)
+        CV_Error(cv::Error::BadNumChannels, "The total width is not divisible by the new number of channels");
+
+    hdr.cols = new_width;
+    hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);

-    return res;
+    return hdr;
 }

 void cv::gpu::CudaMem::release()
 {
-    if( refcount && CV_XADD(refcount, -1) == 1 )
+#ifdef HAVE_CUDA
+    if (refcount && CV_XADD(refcount, -1) == 1)
    {
-        cudaSafeCall( cudaFreeHost(datastart ) );
+        cudaFreeHost(datastart);
        fastFree(refcount);
    }
+
    data = datastart = dataend = 0;
    step = rows = cols = 0;
    refcount = 0;
+#endif
+}
+
+GpuMat cv::gpu::CudaMem::createGpuMatHeader() const
+{
+#ifndef HAVE_CUDA
+    throw_no_cuda();
+    return GpuMat();
+#else
+    CV_Assert( alloc_type == SHARED );
+
+    void *pdev;
+    cudaSafeCall( cudaHostGetDevicePointer(&pdev, data, 0) );
+
+    return GpuMat(rows, cols, type(), pdev, step);
+#endif
 }

-#endif /* !defined (HAVE_CUDA) */
+void cv::gpu::registerPageLocked(Mat& m)
+{
+#ifndef HAVE_CUDA
+    (void) m;
+    throw_no_cuda();
+#else
+    CV_Assert( m.isContinuous() );
+    cudaSafeCall( cudaHostRegister(m.data, m.step * m.rows, cudaHostRegisterPortable) );
+#endif
+}
+
+void cv::gpu::unregisterPageLocked(Mat& m)
+{
+#ifndef HAVE_CUDA
+    (void) m;
+#else
+    cudaSafeCall( cudaHostUnregister(m.data) );
+#endif
+}
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
--- a/modules/core/src/gpu_mat.cpp
+++ b/modules/core/src/gpu_mat.cpp
--- a/modules/core/src/gpu_stream.cpp
+++ b/modules/core/src/gpu_stream.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+
+////////////////////////////////////////////////////////////////
+// Stream
+
+#ifndef HAVE_CUDA
+
+// Stand-in for Stream::Impl when the library is built without HAVE_CUDA.
+// Its only constructor calls throw_no_cuda().
+class cv::gpu::Stream::Impl
+{
+public:
+    Impl(void* ptr = 0)
+    {
+        (void) ptr; // parameter is intentionally unused
+        throw_no_cuda();
+    }
+};
+
+#else
+
+// CUDA build of Stream::Impl: holds the cudaStream_t handle used by Stream.
+class cv::gpu::Stream::Impl
+{
+public:
+    // Stream handle passed to the CUDA runtime calls in this file.
+    cudaStream_t stream;
+
+    // Default constructor and constructor taking an existing handle.
+    Impl();
+    Impl(cudaStream_t stream);
+
+    ~Impl();
+};
+
+// Creates a new CUDA stream with cudaStreamCreate(); the returned status is
+// passed through cudaSafeCall().
+cv::gpu::Stream::Impl::Impl() : stream(0)
+{
+    cudaSafeCall( cudaStreamCreate(&stream) );
+}
+
+// Stores the given handle as-is; no stream is created here.
+// Note that ~Impl() calls cudaStreamDestroy() on any non-zero handle it holds.
+cv::gpu::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_)
+{
+}
+
+// Destroys the stream when the handle is non-zero.
+// The status returned by cudaStreamDestroy() is not checked here.
+cv::gpu::Stream::Impl::~Impl()
+{
+    if (stream)
+        cudaStreamDestroy(stream);
+}
+
+// Returns the raw cudaStream_t held by the given Stream's implementation object.
+// Only compiled when HAVE_CUDA is defined.
+cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream)
+{
+    return stream.impl_->stream;
+}
+
+#endif
+
+// Default constructor.
+// Without HAVE_CUDA it calls throw_no_cuda(); otherwise it allocates a new
+// Impl, whose default constructor creates a CUDA stream.
+cv::gpu::Stream::Stream()
+{
+#ifndef HAVE_CUDA
+    throw_no_cuda();
+#else
+    impl_ = new Impl;
+#endif
+}
+
+// Queries the stream with cudaStreamQuery().
+// Returns true when the query reports cudaSuccess and false when it reports
+// cudaErrorNotReady. Any other status is handed to cudaSafeCall().
+// Without HAVE_CUDA it calls throw_no_cuda().
+bool cv::gpu::Stream::queryIfComplete() const
+{
+#ifndef HAVE_CUDA
+    throw_no_cuda();
+    return false;
+#else
+    cudaError_t err = cudaStreamQuery(impl_->stream);
+
+    // Both "finished" and "still running" are normal answers, not errors.
+    if (err == cudaErrorNotReady || err == cudaSuccess)
+        return err == cudaSuccess;
+
+    // Anything else is a real error status.
+    cudaSafeCall(err);
+    return false;
+#endif
+}
+
+// Calls cudaStreamSynchronize() on the underlying stream and checks the
+// status with cudaSafeCall(). Without HAVE_CUDA it calls throw_no_cuda().
+void cv::gpu::Stream::waitForCompletion()
+{
+#ifndef HAVE_CUDA
+    throw_no_cuda();
+#else
+    cudaSafeCall( cudaStreamSynchronize(impl_->stream) );
+#endif
+}
+
+// Calls cudaStreamWaitEvent() for this stream with the cudaEvent_t obtained
+// from `event` (flags argument 0); the status is checked with cudaSafeCall().
+// Without HAVE_CUDA it calls throw_no_cuda().
+void cv::gpu::Stream::waitEvent(const Event& event)
+{
+#ifndef HAVE_CUDA
+    (void) event;
+    throw_no_cuda();
+#else
+    cudaSafeCall( cudaStreamWaitEvent(impl_->stream, EventAccessor::getEvent(event), 0) );
+#endif
+}
+
+#if defined(HAVE_CUDA) && (CUDART_VERSION >= 5000)
+
+// File-local helpers for Stream::enqueueHostCallback().
+namespace
+{
+    // Pairs the user's callback with its user data so both can travel through
+    // the single void* argument of the CUDA stream callback.
+    struct CallbackData
+    {
+        Stream::StreamCallback callback;
+        void* userData;
+
+        CallbackData(Stream::StreamCallback callback_, void* userData_) : callback(callback_), userData(userData_) {}
+    };
+
+    // Trampoline registered with cudaStreamAddCallback(): forwards the status
+    // (cast to int) and the user data to the stored callback, then deletes the
+    // CallbackData record that enqueueHostCallback() allocated.
+    void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData)
+    {
+        CallbackData* data = reinterpret_cast<CallbackData*>(userData);
+        data->callback(static_cast<int>(status), data->userData);
+        delete data;
+    }
+}
+
+#endif
+
+// Registers `callback` on the underlying CUDA stream via cudaStreamAddCallback();
+// `userData` is forwarded to the callback unchanged.
+//
+// Without HAVE_CUDA this calls throw_no_cuda(). With a CUDA runtime older than
+// 5.0 it raises cv::Error::StsNotImplemented.
+void cv::gpu::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
+{
+#ifndef HAVE_CUDA
+    (void) callback;
+    (void) userData;
+    throw_no_cuda();
+#else
+    #if CUDART_VERSION < 5000
+        (void) callback;
+        (void) userData;
+        CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA 5.0");
+    #else
+        CallbackData* data = new CallbackData(callback, userData);
+
+        // cudaStreamCallback() is the only place that deletes `data`. If the
+        // registration fails it will never run, so release the record here
+        // before the error status is handed to cudaSafeCall().
+        const cudaError_t err = cudaStreamAddCallback(impl_->stream, cudaStreamCallback, data, 0);
+        if (err != cudaSuccess)
+            delete data;
+
+        cudaSafeCall(err);
+    #endif
+#endif
+}
+
+// Returns a reference to a function-local static Stream built from `new Impl(0)`.
+// NOTE(review): in the CUDA build the literal 0 presumably selects
+// Impl(cudaStream_t), giving a zero handle that ~Impl() will not destroy - confirm.
+// In the non-CUDA build the Impl constructor calls throw_no_cuda().
+Stream& cv::gpu::Stream::Null()
+{
+    static Stream s(new Impl(0));
+    return s;
+}
+
+// Conversion used for truth tests on a Stream.
+// Yields a non-null member pointer when the held cudaStream_t is non-zero and
+// 0 otherwise; without HAVE_CUDA it always yields 0.
+cv::gpu::Stream::operator bool_type() const
+{
+#ifndef HAVE_CUDA
+    return 0;
+#else
+    return (impl_->stream != 0) ? &Stream::this_type_does_not_support_comparisons : 0;
+#endif
+}
+
+// Ptr specialisation for Stream::Impl: releases the owned object with `delete`.
+template <> void cv::Ptr<Stream::Impl>::delete_obj()
+{
+    if (obj) delete obj;
+}
+
+////////////////////////////////////////////////////////////////
+// Event
+
+#ifndef HAVE_CUDA
+
+// Stand-in for Event::Impl when the library is built without HAVE_CUDA.
+// Its only constructor calls throw_no_cuda().
+class cv::gpu::Event::Impl
+{
+public:
+    Impl(unsigned int)
+    {
+        throw_no_cuda();
+    }
+};
+
+#else
+
+// CUDA build of Event::Impl: holds the cudaEvent_t handle used by Event.
+class cv::gpu::Event::Impl
+{
+public:
+    // Event handle passed to the CUDA runtime calls in this file.
+    cudaEvent_t event;
+
+    Impl(unsigned int flags);
+    ~Impl();
+};
+
+// Creates the event with cudaEventCreateWithFlags(), forwarding `flags`
+// unchanged; the status is checked with cudaSafeCall().
+cv::gpu::Event::Impl::Impl(unsigned int flags) : event(0)
+{
+    cudaSafeCall( cudaEventCreateWithFlags(&event, flags) );
+}
+
+// Destroys the event when the handle is non-zero.
+// The status returned by cudaEventDestroy() is not checked here.
+cv::gpu::Event::Impl::~Impl()
+{
+    if (event)
+        cudaEventDestroy(event);
+}
+
+// Returns the raw cudaEvent_t held by the given Event's implementation object.
+// Only compiled when HAVE_CUDA is defined.
+cudaEvent_t cv::gpu::EventAccessor::getEvent(const Event& event)
+{
+    return event.impl_->event;
+}
+
+#endif
+
+// Constructs an event.
+// Without HAVE_CUDA it calls throw_no_cuda(); otherwise it allocates an Impl,
+// which creates the CUDA event with the given flags.
+cv::gpu::Event::Event(CreateFlags flags)
+{
+#ifndef HAVE_CUDA
+    (void) flags;
+    throw_no_cuda();
+#else
+    impl_ = new Impl(flags);
+#endif
+}
+
+// Calls cudaEventRecord() for this event on the cudaStream_t obtained from
+// `stream`; the status is checked with cudaSafeCall().
+// Without HAVE_CUDA it calls throw_no_cuda().
+void cv::gpu::Event::record(Stream& stream)
+{
+#ifndef HAVE_CUDA
+    (void) stream;
+    throw_no_cuda();
+#else
+    cudaSafeCall( cudaEventRecord(impl_->event, StreamAccessor::getStream(stream)) );
+#endif
+}
+
+// Queries the event with cudaEventQuery().
+// Returns true when the query reports cudaSuccess and false when it reports
+// cudaErrorNotReady. Any other status is handed to cudaSafeCall().
+// Without HAVE_CUDA it calls throw_no_cuda().
+bool cv::gpu::Event::queryIfComplete() const
+{
+#ifndef HAVE_CUDA
+    throw_no_cuda();
+    return false;
+#else
+    cudaError_t err = cudaEventQuery(impl_->event);
+
+    // Both "finished" and "still pending" are normal answers, not errors.
+    if (err == cudaErrorNotReady || err == cudaSuccess)
+        return err == cudaSuccess;
+
+    // Anything else is a real error status.
+    cudaSafeCall(err);
+    return false;
+#endif
+}
+
+// Calls cudaEventSynchronize() on the underlying event and checks the status
+// with cudaSafeCall(). Without HAVE_CUDA it calls throw_no_cuda().
+void cv::gpu::Event::waitForCompletion()
+{
+#ifndef HAVE_CUDA
+    throw_no_cuda();
+#else
+    cudaSafeCall( cudaEventSynchronize(impl_->event) );
+#endif
+}
+
+// Returns the value reported by cudaEventElapsedTime() for the two events
+// (stored in the local `ms`); the status is checked with cudaSafeCall().
+// Without HAVE_CUDA it calls throw_no_cuda().
+float cv::gpu::Event::elapsedTime(const Event& start, const Event& end)
+{
+#ifndef HAVE_CUDA
+    (void) start;
+    (void) end;
+    throw_no_cuda();
+    return 0.0f;
+#else
+    float ms;
+    cudaSafeCall( cudaEventElapsedTime(&ms, start.impl_->event, end.impl_->event) );
+    return ms;
+#endif
+}
+
+// Ptr specialisation for Event::Impl: releases the owned object with `delete`.
+template <> void cv::Ptr<Event::Impl>::delete_obj()
+{
+    if (obj) delete obj;
+}
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -41,8 +41,6 @@
 //M*/

 #include "precomp.hpp"
-#include "opencv2/core/gpumat.hpp"
-#include "opencv2/core/opengl.hpp"

 /****************************************************************************************\
 *                           [scaled] Identity matrix initialization                      *
@@ -941,14 +939,15 @@ void scalarToRawData(const Scalar& s, void* _buf, int type, int unroll_to)
 \*************************************************************************************************/

 _InputArray::_InputArray() : flags(0), obj(0) {}
-_InputArray::~_InputArray() {}
 _InputArray::_InputArray(const Mat& m) : flags(MAT), obj((void*)&m) {}
 _InputArray::_InputArray(const std::vector<Mat>& vec) : flags(STD_VECTOR_MAT), obj((void*)&vec) {}
 _InputArray::_InputArray(const double& val) : flags(FIXED_TYPE + FIXED_SIZE + MATX + CV_64F), obj((void*)&val), sz(Size(1,1)) {}
 _InputArray::_InputArray(const MatExpr& expr) : flags(FIXED_TYPE + FIXED_SIZE + EXPR), obj((void*)&expr) {}
 _InputArray::_InputArray(const gpu::GpuMat& d_mat) : flags(GPU_MAT), obj((void*)&d_mat) {}
 _InputArray::_InputArray(const ogl::Buffer& buf) : flags(OPENGL_BUFFER), obj((void*)&buf) {}
-_InputArray::_InputArray(const ogl::Texture2D& tex) : flags(OPENGL_TEXTURE), obj((void*)&tex) {}
+_InputArray::_InputArray(const gpu::CudaMem& cuda_mem) : flags(CUDA_MEM), obj((void*)&cuda_mem) {}
+
+_InputArray::~_InputArray() {}

 Mat _InputArray::getMat(int i) const
 {
@@ -996,14 +995,37 @@ Mat _InputArray::getMat(int i) const
        return !v.empty() ? Mat(size(i), t, (void*)&v[0]) : Mat();
    }

-    CV_Assert( k == STD_VECTOR_MAT );
-    //if( k == STD_VECTOR_MAT )
+    if( k == STD_VECTOR_MAT )
    {
        const std::vector<Mat>& v = *(const std::vector<Mat>*)obj;
        CV_Assert( 0 <= i && i < (int)v.size() );

        return v[i];
    }
+
+    if( k == OPENGL_BUFFER )
+    {
+        CV_Assert( i < 0 );
+        CV_Error(cv::Error::StsNotImplemented, "You should explicitly call mapHost/unmapHost methods for ogl::Buffer object");
+        return Mat();
+    }
+
+    if( k == GPU_MAT )
+    {
+        CV_Assert( i < 0 );
+        CV_Error(cv::Error::StsNotImplemented, "You should explicitly call download method for gpu::GpuMat object");
+        return Mat();
+    }
+
+    CV_Assert( k == CUDA_MEM );
+    //if( k == CUDA_MEM )
+    {
+        CV_Assert( i < 0 );
+
+        const gpu::CudaMem* cuda_mem = (const gpu::CudaMem*)obj;
+
+        return cuda_mem->createMatHeader();
+    }
 }


@@ -1092,10 +1114,29 @@ gpu::GpuMat _InputArray::getGpuMat() const
 {
    int k = kind();

-    CV_Assert(k == GPU_MAT);
+    if (k == GPU_MAT)
+    {
+        const gpu::GpuMat* d_mat = (const gpu::GpuMat*)obj;
+        return *d_mat;
+    }
+
+    if (k == CUDA_MEM)
+    {
+        const gpu::CudaMem* cuda_mem = (const gpu::CudaMem*)obj;
+        return cuda_mem->createGpuMatHeader();
+    }
+
+    if (k == OPENGL_BUFFER)
+    {
+        CV_Error(cv::Error::StsNotImplemented, "You should explicitly call mapDevice/unmapDevice methods for ogl::Buffer object");
+        return gpu::GpuMat();
+    }

-    const gpu::GpuMat* d_mat = (const gpu::GpuMat*)obj;
-    return *d_mat;
+    if (k == NONE)
+        return gpu::GpuMat();
+
+    CV_Error(cv::Error::StsNotImplemented, "getGpuMat is available only for gpu::GpuMat and gpu::CudaMem");
+    return gpu::GpuMat();
 }

 ogl::Buffer _InputArray::getOGlBuffer() const
@@ -1108,16 +1149,6 @@ ogl::Buffer _InputArray::getOGlBuffer() const
    return *gl_buf;
 }

-ogl::Texture2D _InputArray::getOGlTexture2D() const
-{
-    int k = kind();
-
-    CV_Assert(k == OPENGL_TEXTURE);
-
-    const ogl::Texture2D* gl_tex = (const ogl::Texture2D*)obj;
-    return *gl_tex;
-}
-
 int _InputArray::kind() const
 {
    return flags & KIND_MASK;
@@ -1186,19 +1217,19 @@ Size _InputArray::size(int i) const
        return buf->size();
    }

-    if( k == OPENGL_TEXTURE )
+    if( k == GPU_MAT )
    {
        CV_Assert( i < 0 );
-        const ogl::Texture2D* tex = (const ogl::Texture2D*)obj;
-        return tex->size();
+        const gpu::GpuMat* d_mat = (const gpu::GpuMat*)obj;
+        return d_mat->size();
    }

-    CV_Assert( k == GPU_MAT );
-    //if( k == GPU_MAT )
+    CV_Assert( k == CUDA_MEM );
+    //if( k == CUDA_MEM )
    {
        CV_Assert( i < 0 );
-        const gpu::GpuMat* d_mat = (const gpu::GpuMat*)obj;
-        return d_mat->size();
+        const gpu::CudaMem* cuda_mem = (const gpu::CudaMem*)obj;
+        return cuda_mem->size();
    }
 }

@@ -1252,9 +1283,12 @@ int _InputArray::type(int i) const
    if( k == OPENGL_BUFFER )
        return ((const ogl::Buffer*)obj)->type();

-    CV_Assert( k == GPU_MAT );
-    //if( k == GPU_MAT )
+    if( k == GPU_MAT )
        return ((const gpu::GpuMat*)obj)->type();
+
+    CV_Assert( k == CUDA_MEM );
+    //if( k == CUDA_MEM )
+        return ((const gpu::CudaMem*)obj)->type();
 }

 int _InputArray::depth(int i) const
@@ -1304,29 +1338,29 @@ bool _InputArray::empty() const
    if( k == OPENGL_BUFFER )
        return ((const ogl::Buffer*)obj)->empty();

-    if( k == OPENGL_TEXTURE )
-        return ((const ogl::Texture2D*)obj)->empty();
-
-    CV_Assert( k == GPU_MAT );
-    //if( k == GPU_MAT )
+    if( k == GPU_MAT )
        return ((const gpu::GpuMat*)obj)->empty();
+
+    CV_Assert( k == CUDA_MEM );
+    //if( k == CUDA_MEM )
+        return ((const gpu::CudaMem*)obj)->empty();
 }


 _OutputArray::_OutputArray() {}
-_OutputArray::~_OutputArray() {}
 _OutputArray::_OutputArray(Mat& m) : _InputArray(m) {}
 _OutputArray::_OutputArray(std::vector<Mat>& vec) : _InputArray(vec) {}
 _OutputArray::_OutputArray(gpu::GpuMat& d_mat) : _InputArray(d_mat) {}
 _OutputArray::_OutputArray(ogl::Buffer& buf) : _InputArray(buf) {}
-_OutputArray::_OutputArray(ogl::Texture2D& tex) : _InputArray(tex) {}
+_OutputArray::_OutputArray(gpu::CudaMem& cuda_mem) : _InputArray(cuda_mem) {}

 _OutputArray::_OutputArray(const Mat& m) : _InputArray(m) {flags |= FIXED_SIZE|FIXED_TYPE;}
 _OutputArray::_OutputArray(const std::vector<Mat>& vec) : _InputArray(vec) {flags |= FIXED_SIZE;}
 _OutputArray::_OutputArray(const gpu::GpuMat& d_mat) : _InputArray(d_mat) {flags |= FIXED_SIZE|FIXED_TYPE;}
 _OutputArray::_OutputArray(const ogl::Buffer& buf) : _InputArray(buf) {flags |= FIXED_SIZE|FIXED_TYPE;}
-_OutputArray::_OutputArray(const ogl::Texture2D& tex) : _InputArray(tex) {flags |= FIXED_SIZE|FIXED_TYPE;}
+_OutputArray::_OutputArray(const gpu::CudaMem& cuda_mem) : _InputArray(cuda_mem) {flags |= FIXED_SIZE|FIXED_TYPE;}

+_OutputArray::~_OutputArray() {}

 bool _OutputArray::fixedSize() const
 {
@@ -1362,6 +1396,13 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, int
        ((ogl::Buffer*)obj)->create(_sz, mtype);
        return;
    }
+    if( k == CUDA_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
+    {
+        CV_Assert(!fixedSize() || ((gpu::CudaMem*)obj)->size() == _sz);
+        CV_Assert(!fixedType() || ((gpu::CudaMem*)obj)->type() == mtype);
+        ((gpu::CudaMem*)obj)->create(_sz, mtype);
+        return;
+    }
    int sizes[] = {_sz.height, _sz.width};
    create(2, sizes, mtype, i, allowTransposed, fixedDepthMask);
 }
@@ -1390,6 +1431,13 @@ void _OutputArray::create(int rows, int cols, int mtype, int i, bool allowTransp
        ((ogl::Buffer*)obj)->create(rows, cols, mtype);
        return;
    }
+    if( k == CUDA_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
+    {
+        CV_Assert(!fixedSize() || ((gpu::CudaMem*)obj)->size() == Size(cols, rows));
+        CV_Assert(!fixedType() || ((gpu::CudaMem*)obj)->type() == mtype);
+        ((gpu::CudaMem*)obj)->create(rows, cols, mtype);
+        return;
+    }
    int sizes[] = {rows, cols};
    create(2, sizes, mtype, i, allowTransposed, fixedDepthMask);
 }
@@ -1609,15 +1657,15 @@ void _OutputArray::release() const
        return;
    }

-    if( k == OPENGL_BUFFER )
+    if( k == CUDA_MEM )
    {
-        ((ogl::Buffer*)obj)->release();
+        ((gpu::CudaMem*)obj)->release();
        return;
    }

-    if( k == OPENGL_TEXTURE )
+    if( k == OPENGL_BUFFER )
    {
-        ((ogl::Texture2D*)obj)->release();
+        ((ogl::Buffer*)obj)->release();
        return;
    }

@@ -1693,11 +1741,11 @@ ogl::Buffer& _OutputArray::getOGlBufferRef() const
    return *(ogl::Buffer*)obj;
 }

-ogl::Texture2D& _OutputArray::getOGlTexture2DRef() const
+gpu::CudaMem& _OutputArray::getCudaMemRef() const
 {
    int k = kind();
-    CV_Assert( k == OPENGL_TEXTURE );
-    return *(ogl::Texture2D*)obj;
+    CV_Assert( k == CUDA_MEM );
+    return *(gpu::CudaMem*)obj;
 }

 static _OutputArray _none;

--- a/modules/core/src/opengl_interop.cpp
+++ b/modules/core/src/opengl_interop.cpp
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
--- a/modules/gpu/doc/data_structures.rst
+++ b/modules/gpu/doc/data_structures.rst
--- a/modules/gpu/doc/initalization_and_information.rst
+++ b/modules/gpu/doc/initalization_and_information.rst
--- a/modules/gpu/include/opencv2/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu.hpp
@@ -47,7 +47,7 @@
 #  error gpu.hpp header must be compiled as C++
 #endif

-#include "opencv2/core/gpumat.hpp"
+#include "opencv2/core/gpu.hpp"

 #if !defined(__OPENCV_BUILD) && !defined(OPENCV_GPU_SKIP_INCLUDE)
    #include "opencv2/opencv_modules.hpp"

--- a/modules/gpu/src/precomp.hpp
+++ b/modules/gpu/src/precomp.hpp
--- a/modules/gpu/test/test_opengl.cpp
+++ b/modules/gpu/test/test_opengl.cpp
--- a/modules/gpuarithm/include/opencv2/gpuarithm.hpp
+++ b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
--- a/modules/gpuarithm/src/arithm.cpp
+++ b/modules/gpuarithm/src/arithm.cpp
--- a/modules/gpuarithm/src/precomp.hpp
+++ b/modules/gpuarithm/src/precomp.hpp
--- a/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp
+++ b/modules/gpubgsegm/include/opencv2/gpubgsegm.hpp
--- a/modules/gpubgsegm/src/cuda/fgd.hpp
+++ b/modules/gpubgsegm/src/cuda/fgd.hpp
--- a/modules/gpubgsegm/src/gmg.cpp
+++ b/modules/gpubgsegm/src/gmg.cpp
--- a/modules/gpubgsegm/src/precomp.hpp
+++ b/modules/gpubgsegm/src/precomp.hpp
--- a/modules/gpucodec/include/opencv2/gpucodec.hpp
+++ b/modules/gpucodec/include/opencv2/gpucodec.hpp
--- a/modules/gpucodec/src/cuvid_video_source.h
+++ b/modules/gpucodec/src/cuvid_video_source.h
--- a/modules/gpucodec/src/frame_queue.h
+++ b/modules/gpucodec/src/frame_queue.h
--- a/modules/gpucodec/src/precomp.hpp
+++ b/modules/gpucodec/src/precomp.hpp
--- a/modules/gpucodec/src/video_decoder.h
+++ b/modules/gpucodec/src/video_decoder.h
--- a/modules/gpucodec/src/video_parser.h
+++ b/modules/gpucodec/src/video_parser.h
--- a/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
+++ b/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
--- a/modules/gpufeatures2d/src/brute_force_matcher.cpp
+++ b/modules/gpufeatures2d/src/brute_force_matcher.cpp
--- a/modules/gpufeatures2d/src/precomp.hpp
+++ b/modules/gpufeatures2d/src/precomp.hpp
--- a/modules/gpufilters/include/opencv2/gpufilters.hpp
+++ b/modules/gpufilters/include/opencv2/gpufilters.hpp
--- a/modules/gpufilters/src/filtering.cpp
+++ b/modules/gpufilters/src/filtering.cpp
--- a/modules/gpufilters/src/precomp.hpp
+++ b/modules/gpufilters/src/precomp.hpp
--- a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
+++ b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
--- a/modules/gpuimgproc/src/match_template.cpp
+++ b/modules/gpuimgproc/src/match_template.cpp
--- a/modules/gpuimgproc/src/precomp.hpp
+++ b/modules/gpuimgproc/src/precomp.hpp
--- a/modules/gpulegacy/include/opencv2/gpulegacy/private.hpp
+++ b/modules/gpulegacy/include/opencv2/gpulegacy/private.hpp
--- a/modules/gpulegacy/src/precomp.hpp
+++ b/modules/gpulegacy/src/precomp.hpp
--- a/modules/gpulegacy/test/test_precomp.hpp
+++ b/modules/gpulegacy/test/test_precomp.hpp
--- a/modules/gpuoptflow/include/opencv2/gpuoptflow.hpp
+++ b/modules/gpuoptflow/include/opencv2/gpuoptflow.hpp
--- a/modules/gpuoptflow/src/farneback.cpp
+++ b/modules/gpuoptflow/src/farneback.cpp
--- a/modules/gpuoptflow/src/precomp.hpp
+++ b/modules/gpuoptflow/src/precomp.hpp
--- a/modules/gpuoptflow/test/test_optflow.cpp
+++ b/modules/gpuoptflow/test/test_optflow.cpp
--- a/modules/gpustereo/include/opencv2/gpustereo.hpp
+++ b/modules/gpustereo/include/opencv2/gpustereo.hpp
--- a/modules/gpustereo/src/disparity_bilateral_filter.cpp
+++ b/modules/gpustereo/src/disparity_bilateral_filter.cpp
--- a/modules/gpustereo/src/precomp.hpp
+++ b/modules/gpustereo/src/precomp.hpp
--- a/modules/gpustereo/src/stereobm.cpp
+++ b/modules/gpustereo/src/stereobm.cpp
--- a/modules/gpustereo/src/stereobp.cpp
+++ b/modules/gpustereo/src/stereobp.cpp
--- a/modules/gpustereo/src/stereocsbp.cpp
+++ b/modules/gpustereo/src/stereocsbp.cpp
--- a/modules/gpuwarping/include/opencv2/gpuwarping.hpp
+++ b/modules/gpuwarping/include/opencv2/gpuwarping.hpp
--- a/modules/gpuwarping/src/precomp.hpp
+++ b/modules/gpuwarping/src/precomp.hpp
--- a/modules/gpuwarping/src/pyramids.cpp
+++ b/modules/gpuwarping/src/pyramids.cpp
--- a/modules/gpuwarping/src/resize.cpp
+++ b/modules/gpuwarping/src/resize.cpp
--- a/modules/highgui/include/opencv2/highgui.hpp
+++ b/modules/highgui/include/opencv2/highgui.hpp
--- a/modules/highgui/src/window.cpp
+++ b/modules/highgui/src/window.cpp
--- a/modules/nonfree/include/opencv2/nonfree/gpu.hpp
+++ b/modules/nonfree/include/opencv2/nonfree/gpu.hpp
--- a/modules/nonfree/src/precomp.hpp
+++ b/modules/nonfree/src/precomp.hpp
--- a/modules/photo/include/opencv2/photo/gpu.hpp
+++ b/modules/photo/include/opencv2/photo/gpu.hpp
--- a/modules/photo/src/denoising_gpu.cpp
+++ b/modules/photo/src/denoising_gpu.cpp
--- a/modules/softcascade/include/opencv2/softcascade.hpp
+++ b/modules/softcascade/include/opencv2/softcascade.hpp
--- a/modules/softcascade/src/cuda/channels.cu
+++ b/modules/softcascade/src/cuda/channels.cu
--- a/modules/softcascade/src/cuda_invoker.hpp
+++ b/modules/softcascade/src/cuda_invoker.hpp
--- a/modules/softcascade/src/detector_cuda.cpp
+++ b/modules/softcascade/src/detector_cuda.cpp
--- a/modules/softcascade/src/precomp.hpp
+++ b/modules/softcascade/src/precomp.hpp
--- a/modules/softcascade/test/test_cuda_softcascade.cpp
+++ b/modules/softcascade/test/test_cuda_softcascade.cpp
--- a/modules/softcascade/test/utility.hpp
+++ b/modules/softcascade/test/utility.hpp
--- a/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
+++ b/modules/stitching/include/opencv2/stitching/detail/warpers.hpp
--- a/modules/superres/perf/perf_precomp.hpp
+++ b/modules/superres/perf/perf_precomp.hpp
--- a/modules/superres/src/input_array_utility.cpp
+++ b/modules/superres/src/input_array_utility.cpp
--- a/modules/superres/src/input_array_utility.hpp
+++ b/modules/superres/src/input_array_utility.hpp
--- a/modules/superres/src/precomp.hpp
+++ b/modules/superres/src/precomp.hpp
--- a/modules/ts/include/opencv2/ts/gpu_test.hpp
+++ b/modules/ts/include/opencv2/ts/gpu_test.hpp
--- a/modules/ts/src/gpu_perf.cpp
+++ b/modules/ts/src/gpu_perf.cpp
--- a/modules/ts/src/ts_perf.cpp
+++ b/modules/ts/src/ts_perf.cpp
--- a/modules/videostab/include/opencv2/videostab/wobble_suppression.hpp
+++ b/modules/videostab/include/opencv2/videostab/wobble_suppression.hpp
--- a/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
+++ b/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
--- a/samples/gpu/driver_api_multi.cpp
+++ b/samples/gpu/driver_api_multi.cpp
--- a/samples/gpu/driver_api_stereo_multi.cpp
+++ b/samples/gpu/driver_api_stereo_multi.cpp
--- a/samples/gpu/multi.cpp
+++ b/samples/gpu/multi.cpp
--- a/samples/gpu/opengl.cpp
+++ b/samples/gpu/opengl.cpp
--- a/samples/gpu/performance/performance.cpp
+++ b/samples/gpu/performance/performance.cpp
--- a/samples/gpu/stereo_multi.cpp
+++ b/samples/gpu/stereo_multi.cpp