fixed bugs in page locked memory allocation

avoid extra gpu memory allocation in BP and CSBP

fixed bugs in page locked memory allocation
avoid extra gpu memory allocation in BP and CSBP
9a669b1c · Anatoly Baksheev · ba713f28 · 9a669b1c · 9a669b1c · 9a669b1c
6 changed file
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -68,7 +68,7 @@ namespace cv

        //////////////////////////////// GpuMat ////////////////////////////////
        class Stream;
-        class MatPL;
+        class CudaMem;

        //! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat.
        class CV_EXPORTS GpuMat
@@ -111,12 +111,16 @@ namespace cv

            //! pefroms blocking upload data to GpuMat. .
            void upload(const cv::Mat& m);
-            void upload(const MatPL& m, Stream& stream);

-            //! Downloads data from device to host memory. Blocking calls.
+            //! upload async
+            void upload(const CudaMem& m, Stream& stream);
+
+            //! downloads data from device to host memory. Blocking calls.
            operator Mat() const;
            void download(cv::Mat& m) const;
-            void download(MatPL& m, Stream& stream) const;
+
+            //! download async
+            void download(CudaMem& m, Stream& stream) const;

            //! returns a new GpuMatrix header for the specified row
            GpuMat row(int y) const;
@@ -223,52 +227,50 @@ namespace cv
            uchar* dataend;
        };

-        //////////////////////////////// MatPL ////////////////////////////////
-        // MatPL is limited cv::Mat with page locked memory allocation.
+        //////////////////////////////// CudaMem ////////////////////////////////
+        // CudaMem is limited cv::Mat with page locked memory allocation.
        // Page locked memory is only needed for async and faster coping to GPU.
        // It is convertable to cv::Mat header without reference counting
        // so you can use it with other opencv functions.

-        class CV_EXPORTS MatPL
+        class CV_EXPORTS CudaMem
        {
-        public:
+        public:            
+            enum  { ALLOC_PAGE_LOCKED = 1, ALLOC_ZEROCOPY = 2, ALLOC_WRITE_COMBINED = 4 };

-            //Supported.  Now behaviour is like ALLOC_DEFAULT.
-            enum  { ALLOC_PAGE_LOCKED = 0, ALLOC_ZEROCOPY = 1, ALLOC_WRITE_COMBINED = 4 };
+            CudaMem();
+            CudaMem(const CudaMem& m);

-            MatPL();
-            MatPL(const MatPL& m);
-
-            MatPL(int _rows, int _cols, int _type, int type_alloc = ALLOC_PAGE_LOCKED);
-            MatPL(Size _size, int _type, int type_alloc = ALLOC_PAGE_LOCKED);
+            CudaMem(int _rows, int _cols, int _type, int _alloc_type = ALLOC_PAGE_LOCKED);
+            CudaMem(Size _size, int _type, int _alloc_type = ALLOC_PAGE_LOCKED);


            //! creates from cv::Mat with coping data
-            explicit MatPL(const Mat& m, int type_alloc = ALLOC_PAGE_LOCKED);
+            explicit CudaMem(const Mat& m, int _alloc_type = ALLOC_PAGE_LOCKED);

-            ~MatPL();
+            ~CudaMem();

-            MatPL& operator = (const MatPL& m);
+            CudaMem& operator = (const CudaMem& m);

            //! returns deep copy of the matrix, i.e. the data is copied
-            MatPL clone() const;
+            CudaMem clone() const;

            //! allocates new matrix data unless the matrix already has specified size and type.
-            void create(int _rows, int _cols, int _type, int type_alloc = ALLOC_PAGE_LOCKED);
-            void create(Size _size, int _type, int type_alloc = ALLOC_PAGE_LOCKED);
+            void create(int _rows, int _cols, int _type, int _alloc_type = ALLOC_PAGE_LOCKED);
+            void create(Size _size, int _type, int _alloc_type = ALLOC_PAGE_LOCKED);

            //! decrements reference counter and released memory if needed.
            void release();

-            //! returns matrix header with disabled reference counting for MatPL data.
+            //! returns matrix header with disabled reference counting for CudaMem data.
            Mat createMatHeader() const;
            operator Mat() const;

            operator GpuMat() const;

+            //returns if host memory can be mapperd to gpu address space;
            static bool can_device_map_to_host();

-
            // Please see cv::Mat for descriptions
            bool isContinuous() const;
            size_t elemSize() const;
@@ -314,13 +316,13 @@ namespace cv
            void waitForCompletion();

            //! downloads asynchronously.
-            // Warning! cv::Mat must point to page locked memory (i.e. to MatPL data or to its subMat)
-            void enqueueDownload(const GpuMat& src, MatPL& dst);
+            // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat)
+            void enqueueDownload(const GpuMat& src, CudaMem& dst);
            void enqueueDownload(const GpuMat& src, Mat& dst);

            //! uploads asynchronously.
-            // Warning! cv::Mat must point to page locked memory (i.e. to MatPL data or to its ROI)
-            void enqueueUpload(const MatPL& src, GpuMat& dst);
+            // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its ROI)
+            void enqueueUpload(const CudaMem& src, GpuMat& dst);
            void enqueueUpload(const Mat& src, GpuMat& dst);

            void enqueueCopy(const GpuMat& src, GpuMat& dst);

--- a/modules/gpu/include/opencv2/gpu/matrix_operations.hpp
+++ b/modules/gpu/include/opencv2/gpu/matrix_operations.hpp
@@ -339,43 +339,43 @@ static inline void swap( GpuMat& a, GpuMat& b ) { a.swap(b); }


 ///////////////////////////////////////////////////////////////////////
-//////////////////////////////// MatPL ////////////////////////////////
+//////////////////////////////// CudaMem ////////////////////////////////
 ///////////////////////////////////////////////////////////////////////

-inline MatPL::MatPL()  : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) {}
-inline MatPL::MatPL(int _rows, int _cols, int _type, int type_alloc) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
+inline CudaMem::CudaMem()  : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0) {}
+inline CudaMem::CudaMem(int _rows, int _cols, int _type, int _alloc_type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
 {
    if( _rows > 0 && _cols > 0 )
-        create( _rows, _cols, _type , type_alloc);
+        create( _rows, _cols, _type, _alloc_type);
 }

-inline MatPL::MatPL(Size _size, int _type, int type_alloc) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
+inline CudaMem::CudaMem(Size _size, int _type, int _alloc_type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
 {
    if( _size.height > 0 && _size.width > 0 )
-        create( _size.height, _size.width, _type, type_alloc );
+        create( _size.height, _size.width, _type, _alloc_type);
 }

-inline MatPL::MatPL(const MatPL& m) : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(0), dataend(0)
+inline CudaMem::CudaMem(const CudaMem& m) : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type)
 {
    if( refcount )
        CV_XADD(refcount, 1);
 }

-inline MatPL::MatPL(const Mat& m, int type_alloc) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
+inline CudaMem::CudaMem(const Mat& m, int _alloc_type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
 {
    if( m.rows > 0 && m.cols > 0 )
-        create( m.size(), m.type() , type_alloc);
+        create( m.size(), m.type(), _alloc_type);

    Mat tmp = createMatHeader();
    m.copyTo(tmp);
 }

-inline MatPL::~MatPL()
+inline CudaMem::~CudaMem()
 {
    release();
 }

-inline MatPL& MatPL::operator = (const MatPL& m)
+inline CudaMem& CudaMem::operator = (const CudaMem& m)
 {
    if( this != &m )
    {
@@ -393,31 +393,31 @@ inline MatPL& MatPL::operator = (const MatPL& m)
    return *this;
 }

-inline MatPL MatPL::clone() const
+inline CudaMem CudaMem::clone() const
 {
-    MatPL m(size(), type());
+    CudaMem m(size(), type(), alloc_type);
    Mat to = m;
    Mat from = *this;
    from.copyTo(to);
    return m;
 }

-inline void MatPL::create(Size _size, int _type, int type_alloc) { create(_size.height, _size.width, _type, type_alloc); }
-//CCP void MatPL::create(int _rows, int _cols, int _type);
-//CPP void MatPL::release();
-
-inline Mat MatPL::createMatHeader() const { return Mat(size(), type(), data); }
-inline MatPL::operator Mat() const { return createMatHeader(); }
-
-inline bool MatPL::isContinuous() const { return (flags & Mat::CONTINUOUS_FLAG) != 0; }
-inline size_t MatPL::elemSize() const { return CV_ELEM_SIZE(flags); }
-inline size_t MatPL::elemSize1() const { return CV_ELEM_SIZE1(flags); }
-inline int MatPL::type() const { return CV_MAT_TYPE(flags); }
-inline int MatPL::depth() const { return CV_MAT_DEPTH(flags); }
-inline int MatPL::channels() const { return CV_MAT_CN(flags); }
-inline size_t MatPL::step1() const { return step/elemSize1(); }
-inline Size MatPL::size() const { return Size(cols, rows); }
-inline bool MatPL::empty() const { return data == 0; }
+inline void CudaMem::create(Size _size, int _type, int _alloc_type) { create(_size.height, _size.width, _type, _alloc_type); }
+//CCP void CudaMem::create(int _rows, int _cols, int _type, int _alloc_type);
+//CPP void CudaMem::release();
+
+inline Mat CudaMem::createMatHeader() const { return Mat(size(), type(), data); }
+inline CudaMem::operator Mat() const { return createMatHeader(); }
+
+inline bool CudaMem::isContinuous() const { return (flags & Mat::CONTINUOUS_FLAG) != 0; }
+inline size_t CudaMem::elemSize() const { return CV_ELEM_SIZE(flags); }
+inline size_t CudaMem::elemSize1() const { return CV_ELEM_SIZE1(flags); }
+inline int CudaMem::type() const { return CV_MAT_TYPE(flags); }
+inline int CudaMem::depth() const { return CV_MAT_DEPTH(flags); }
+inline int CudaMem::channels() const { return CV_MAT_CN(flags); }
+inline size_t CudaMem::step1() const { return step/elemSize1(); }
+inline Size CudaMem::size() const { return Size(cols, rows); }
+inline bool CudaMem::empty() const { return data == 0; }


 } /* end of namespace gpu */

--- a/modules/gpu/src/beliefpropagation_gpu.cpp
+++ b/modules/gpu/src/beliefpropagation_gpu.cpp
@@ -234,7 +234,7 @@ namespace
            if (disp.empty())
                disp.create(rows, cols, CV_16S);

-            out = ((disp.type() == CV_16S) ? disp : GpuMat(rows, cols, CV_16S));
+            out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
            out = zero;

            bp::output(rthis.msg_type, u, d, l, r, datas.front(), disp, stream);

--- a/modules/gpu/src/constantspacebp_gpu.cpp
+++ b/modules/gpu/src/constantspacebp_gpu.cpp
@@ -251,7 +251,7 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
    if (disp.empty())
        disp.create(rows, cols, CV_16S);

-    out = ((disp.type() == CV_16S) ? disp : GpuMat(rows, cols, CV_16S));
+    out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
    out = zero;

    csbp::compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),

--- a/modules/gpu/src/cudastream.cpp
+++ b/modules/gpu/src/cudastream.cpp
@@ -57,8 +57,8 @@ Stream& cv::gpu::Stream::operator=(const Stream& /*stream*/) { throw_nogpu(); re
 bool cv::gpu::Stream::queryIfComplete() { throw_nogpu(); return true; }
 void cv::gpu::Stream::waitForCompletion() { throw_nogpu(); }
 void cv::gpu::Stream::enqueueDownload(const GpuMat& /*src*/, Mat& /*dst*/) { throw_nogpu(); }
-void cv::gpu::Stream::enqueueDownload(const GpuMat& /*src*/, MatPL& /*dst*/) { throw_nogpu(); }
-void cv::gpu::Stream::enqueueUpload(const MatPL& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueDownload(const GpuMat& /*src*/, CudaMem& /*dst*/) { throw_nogpu(); }
+void cv::gpu::Stream::enqueueUpload(const CudaMem& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
 void cv::gpu::Stream::enqueueUpload(const Mat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
 void cv::gpu::Stream::enqueueCopy(const GpuMat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
 void cv::gpu::Stream::enqueueMemSet(const GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); }
@@ -150,9 +150,9 @@ void cv::gpu::Stream::enqueueDownload(const GpuMat& src, Mat& dst)
    CV_Assert(src.cols == dst.cols && src.rows == dst.rows && src.type() == dst.type() )
    devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost);
 }
-void cv::gpu::Stream::enqueueDownload(const GpuMat& src, MatPL& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost); }
+void cv::gpu::Stream::enqueueDownload(const GpuMat& src, CudaMem& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost); }

-void cv::gpu::Stream::enqueueUpload(const MatPL& src, GpuMat& dst){ devcopy(src, dst, impl->stream,   cudaMemcpyHostToDevice); }
+void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst){ devcopy(src, dst, impl->stream,   cudaMemcpyHostToDevice); }
 void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst)  { devcopy(src, dst, impl->stream,   cudaMemcpyHostToDevice); }
 void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToDevice); }


--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/gpu/src/matrix_operations.cpp
@@ -67,9 +67,9 @@ namespace cv
        void GpuMat::create(int /*_rows*/, int /*_cols*/, int /*_type*/) { throw_nogpu(); }
        void GpuMat::release() { throw_nogpu(); }

-        void MatPL::create(int /*_rows*/, int /*_cols*/, int /*_type*/, int /*type_alloc*/) { throw_nogpu(); }
-        bool MatPL::can_device_map_to_host() { throw_nogpu(); return false; }
-        void MatPL::release() { throw_nogpu(); }
+        void CudaMem::create(int /*_rows*/, int /*_cols*/, int /*_type*/, int /*type_alloc*/) { throw_nogpu(); }
+        bool CudaMem::can_device_map_to_host() { throw_nogpu(); return false; }
+        void CudaMem::release() { throw_nogpu(); }
    }

 }
@@ -83,7 +83,7 @@ void cv::gpu::GpuMat::upload(const Mat& m)
    cudaSafeCall( cudaMemcpy2D(data, step, m.data, m.step, cols * elemSize(), rows, cudaMemcpyHostToDevice) );
 }

-void cv::gpu::GpuMat::upload(const MatPL& m, Stream& stream)
+void cv::gpu::GpuMat::upload(const CudaMem& m, Stream& stream)
 {
    CV_DbgAssert(!m.empty());
    stream.enqueueUpload(m, *this);
@@ -96,7 +96,7 @@ void cv::gpu::GpuMat::download(cv::Mat& m) const
    cudaSafeCall( cudaMemcpy2D(m.data, m.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToHost) );
 }

-void cv::gpu::GpuMat::download(MatPL& m, Stream& stream) const
+void cv::gpu::GpuMat::download(CudaMem& m, Stream& stream) const
 {
    CV_DbgAssert(!m.empty());
    stream.enqueueDownload(*this, m);
@@ -210,15 +210,6 @@ GpuMat cv::gpu::GpuMat::reshape(int new_cn, int new_rows) const
    return hdr;
 }

-bool cv::gpu::MatPL::can_device_map_to_host()
-{
-        cudaDeviceProp prop;
-        cudaGetDeviceProperties(&prop, 0);
-
-        return (prop.canMapHostMemory != 0) ? true : false;
-}
-
-
 void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)
 {
    _type &= TYPE_MASK;
@@ -266,12 +257,21 @@ void cv::gpu::GpuMat::release()


 ///////////////////////////////////////////////////////////////////////
-//////////////////////////////// MatPL ////////////////////////////////
+//////////////////////////////// CudaMem //////////////////////////////
 ///////////////////////////////////////////////////////////////////////

-void cv::gpu::MatPL::create(int _rows, int _cols, int _type, int type_alloc)
+bool cv::gpu::CudaMem::can_device_map_to_host()
 {
-    alloc_type = type_alloc;
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, 0);
+    return (prop.canMapHostMemory != 0) ? true : false;
+}
+
+void cv::gpu::CudaMem::create(int _rows, int _cols, int _type, int _alloc_type)
+{   
+    if (_alloc_type == ALLOC_ZEROCOPY && !can_device_map_to_host())
+            cv::gpu::error("ZeroCopy is not supported by current device", __FILE__, __LINE__);
+
    _type &= TYPE_MASK;
    if( rows == _rows && cols == _cols && type() == _type && data )
        return;
@@ -279,7 +279,7 @@ void cv::gpu::MatPL::create(int _rows, int _cols, int _type, int type_alloc)
        release();
    CV_DbgAssert( _rows >= 0 && _cols >= 0 );
    if( _rows > 0 && _cols > 0 )
-    {
+    {        
        flags = Mat::MAGIC_VAL + Mat::CONTINUOUS_FLAG + _type;
        rows = _rows;
        cols = _cols;
@@ -291,24 +291,15 @@ void cv::gpu::MatPL::create(int _rows, int _cols, int _type, int type_alloc)
        size_t datasize = alignSize(nettosize, (int)sizeof(*refcount));

        //datastart = data = (uchar*)fastMalloc(datasize + sizeof(*refcount));
+        alloc_type = _alloc_type;
        void *ptr;
-
-        switch (type_alloc)
+        
+        switch (alloc_type)
        {
-            case ALLOC_PAGE_LOCKED:  cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocDefault) ); break;
-            case ALLOC_ZEROCOPY:
-                if (can_device_map_to_host() == true)
-                {
-                    cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocMapped) );
-                }
-                else
-                    cv::gpu::error("ZeroCopy is not supported by current device", __FILE__, __LINE__);
-                break;
-
+            case ALLOC_PAGE_LOCKED:    cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocDefault) ); break;
+            case ALLOC_ZEROCOPY:       cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocMapped) );  break;
            case ALLOC_WRITE_COMBINED: cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocWriteCombined) ); break;
-
-            default:
-                cv::gpu::error("Invalid alloc type", __FILE__, __LINE__);
+            default: cv::gpu::error("Invalid alloc type", __FILE__, __LINE__);
        }

        datastart = data =  (uchar*)ptr;
@@ -319,20 +310,22 @@ void cv::gpu::MatPL::create(int _rows, int _cols, int _type, int type_alloc)
    }
 }

-inline MatPL::operator GpuMat() const
+inline CudaMem::operator GpuMat() const
 {
+    GpuMat res;
    if (alloc_type == ALLOC_ZEROCOPY)
    {
-        void ** pdev;
-        cudaHostGetDevicePointer( pdev, this->data, 0 );
-        GpuMat m(this->rows, this->cols, this->type(), *pdev, this->step);
-        return m;
+        void *pdev;
+        cudaSafeCall( cudaHostGetDevicePointer( &pdev, data, 0 ) );
+        res = GpuMat(rows, cols, type(), pdev, step);
    }
    else
-        cv::gpu::error("", __FILE__, __LINE__);
+        cv::gpu::error("Zero-copy is not supported or memory was allocated without zero-copy flag", __FILE__, __LINE__);
+        
+    return res;
 }

-void cv::gpu::MatPL::release()
+void cv::gpu::CudaMem::release()
 {
    if( refcount && CV_XADD(refcount, -1) == 1 )
    {