diff --git a/modules/core/include/opencv2/core/core.hpp b/modules/core/include/opencv2/core/core.hpp index b61871262b6e3189d868421d32d55215674b37bd..d7d98d0b081e41d84e2423eb3ebcf2a09320adcb 100644 --- a/modules/core/include/opencv2/core/core.hpp +++ b/modules/core/include/opencv2/core/core.hpp @@ -90,6 +90,10 @@ class Mat; class SparseMat; typedef Mat MatND; +namespace gpu { + class GpuMat; +} + class CV_EXPORTS MatExpr; class CV_EXPORTS MatOp_Base; class CV_EXPORTS MatArg; @@ -1627,6 +1631,10 @@ public: template explicit Mat(const Point3_<_Tp>& pt, bool copyData=true); //! builds matrix from comma initializer template explicit Mat(const MatCommaInitializer_<_Tp>& commaInitializer); + + //! download data from GpuMat + explicit Mat(const gpu::GpuMat& m); + //! destructor - calls release() ~Mat(); //! assignment operators diff --git a/modules/core/include/opencv2/core/devmem2d.hpp b/modules/core/include/opencv2/core/devmem2d.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6ab70c5fe26a9e31d770ddc98d291f689a220caf --- /dev/null +++ b/modules/core/include/opencv2/core/devmem2d.hpp @@ -0,0 +1,157 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other GpuMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
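The core.hpp hunk above forward-declares cv::gpu::GpuMat and gives cv::Mat an explicit downloading constructor, so host code can pull device results back without core depending on the GPU headers. A minimal usage sketch (assumes OpenCV is built with the CUDA-enabled gpu module; the helper function name is illustrative):

    #include "opencv2/core/core.hpp"
    #include "opencv2/gpu/gpu.hpp"

    // Round-trips an image through device memory using the constructors added in this patch.
    void roundTrip(const cv::Mat& host)
    {
        cv::gpu::GpuMat device(host); // GpuMat(const Mat&): blocking upload
        cv::Mat back(device);         // Mat(const gpu::GpuMat&): blocking download
        CV_Assert(back.size() == host.size() && back.type() == host.type());
    }
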
+//
+//M*/
+
+#ifndef __OPENCV_CORE_DevMem2D_HPP__
+#define __OPENCV_CORE_DevMem2D_HPP__
+
+#ifdef __CUDACC__
+    #define __CV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
+#else
+    #define __CV_GPU_HOST_DEVICE__
+#endif
+
+namespace cv
+{
+    namespace gpu
+    {
+        // Simple lightweight structures that encapsulate information about an image on the device.
+        // It is intended to pass to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile
+
+        template <bool expr> struct StaticAssert;
+        template <> struct StaticAssert<true> {static __CV_GPU_HOST_DEVICE__ void check(){}};
+
+        template <typename T> struct DevPtr
+        {
+            typedef T elem_type;
+            typedef int index_type;
+
+            enum { elem_size = sizeof(elem_type) };
+
+            T* data;
+
+            __CV_GPU_HOST_DEVICE__ DevPtr() : data(0) {}
+            __CV_GPU_HOST_DEVICE__ DevPtr(T* data_) : data(data_) {}
+
+            __CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
+            __CV_GPU_HOST_DEVICE__ operator T*() { return data; }
+            __CV_GPU_HOST_DEVICE__ operator const T*() const { return data; }
+        };
+
+        template <typename T> struct PtrSz : public DevPtr<T>
+        {
+            __CV_GPU_HOST_DEVICE__ PtrSz() : size(0) {}
+            __CV_GPU_HOST_DEVICE__ PtrSz(T* data_, size_t size_) : DevPtr<T>(data_), size(size_) {}
+
+            size_t size;
+        };
+
+        template <typename T> struct PtrStep : public DevPtr<T>
+        {
+            __CV_GPU_HOST_DEVICE__ PtrStep() : step(0) {}
+            __CV_GPU_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}
+
+            /** \brief stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!! */
+            size_t step;
+
+            __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return ( T*)( ( char*)DevPtr<T>::data + y * step); }
+            __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)DevPtr<T>::data + y * step); }
+
+            __CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
+            __CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
+        };
+
+        template <typename T> struct PtrStepSz : public PtrStep<T>
+        {
+            __CV_GPU_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}
+            __CV_GPU_HOST_DEVICE__ PtrStepSz(int rows_, int cols_, T* data_, size_t step_)
+                : PtrStep<T>(data_, step_), cols(cols_), rows(rows_) {}
+
+            int cols;
+            int rows;
+        };
+
+        template <typename T> struct DevMem2D_ : public PtrStepSz<T>
+        {
+            DevMem2D_() {}
+            DevMem2D_(int rows_, int cols_, T* data_, size_t step_) : PtrStepSz<T>(rows_, cols_, data_, step_) {}
+
+            template <typename U>
+            explicit DevMem2D_(const DevMem2D_<U>& d) : PtrStepSz<T>(d.rows, d.cols, (T*)d.data, d.step) {}
+        };
+
+        template <typename T> struct PtrElemStep_ : public PtrStep<T>
+        {
+            PtrElemStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step)
+            {
+                StaticAssert<256 % sizeof(T) == 0>::check();
+
+                PtrStep<T>::step /= PtrStep<T>::elem_size;
+            }
+            __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return PtrStep<T>::data + y * PtrStep<T>::step; }
+            __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return PtrStep<T>::data + y * PtrStep<T>::step; }
+
+            __CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
+            __CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
+        };
+
+        template <typename T> struct PtrStep_ : public PtrStep<T>
+        {
+            PtrStep_() {}
+            PtrStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step) {}
+        };
+
+        typedef DevMem2D_<unsigned char> DevMem2Db;
+        typedef DevMem2Db DevMem2D;
+        typedef DevMem2D_<float> DevMem2Df;
+        typedef DevMem2D_<int> DevMem2Di;
+
+        typedef PtrStep<unsigned char> PtrStepb;
+        typedef PtrStep<float> PtrStepf;
+        typedef PtrStep<int> PtrStepi;
+
+        typedef PtrElemStep_<unsigned char> PtrElemStep;
+        typedef PtrElemStep_<float> PtrElemStepf;
+        typedef PtrElemStep_<int> PtrElemStepi;
+    }
+}
+
+#endif /* 
__OPENCV_GPU_DevMem2D_HPP__ */ diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp new file mode 100644 index 0000000000000000000000000000000000000000..accfb7c696c28b2db09272e6c1582358095a999d --- /dev/null +++ b/modules/core/include/opencv2/core/gpumat.hpp @@ -0,0 +1,471 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other GpuMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_GPUMAT_HPP__ +#define __OPENCV_GPUMAT_HPP__ + +#include "opencv2/core/core.hpp" +#include "opencv2/core/devmem2d.hpp" + +namespace cv { namespace gpu +{ + //! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat. + class CV_EXPORTS GpuMat + { + public: + //! default constructor + GpuMat(); + + //! constructs GpuMatrix of the specified size and type (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.) + GpuMat(int rows, int cols, int type); + GpuMat(Size size, int type); + + //! constucts GpuMatrix and fills it with the specified value _s. + GpuMat(int rows, int cols, int type, Scalar s); + GpuMat(Size size, int type, Scalar s); + + //! copy constructor + GpuMat(const GpuMat& m); + + //! constructor for GpuMatrix headers pointing to user-allocated data + GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP); + GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP); + + //! 
creates a matrix header for a part of the bigger matrix + GpuMat(const GpuMat& m, Range rowRange, Range colRange); + GpuMat(const GpuMat& m, Rect roi); + + //! builds GpuMat from Mat. Perfom blocking upload to device. + explicit GpuMat(const Mat& m); + + //! destructor - calls release() + ~GpuMat(); + + //! assignment operators + GpuMat& operator = (const GpuMat& m); + + //! pefroms blocking upload data to GpuMat. + void upload(const Mat& m); + + //! downloads data from device to host memory. Blocking calls. + void download(Mat& m) const; + + //! returns a new GpuMatrix header for the specified row + GpuMat row(int y) const; + //! returns a new GpuMatrix header for the specified column + GpuMat col(int x) const; + //! ... for the specified row span + GpuMat rowRange(int startrow, int endrow) const; + GpuMat rowRange(Range r) const; + //! ... for the specified column span + GpuMat colRange(int startcol, int endcol) const; + GpuMat colRange(Range r) const; + + //! returns deep copy of the GpuMatrix, i.e. the data is copied + GpuMat clone() const; + //! copies the GpuMatrix content to "m". + // It calls m.create(this->size(), this->type()). + void copyTo(GpuMat& m) const; + //! copies those GpuMatrix elements to "m" that are marked with non-zero mask elements. + void copyTo(GpuMat& m, const GpuMat& mask) const; + //! converts GpuMatrix to another datatype with optional scalng. See cvConvertScale. + void convertTo(GpuMat& m, int rtype, double alpha = 1, double beta = 0) const; + + void assignTo(GpuMat& m, int type=-1) const; + + //! sets every GpuMatrix element to s + GpuMat& operator = (Scalar s); + //! sets some of the GpuMatrix elements to s, according to the mask + GpuMat& setTo(Scalar s, const GpuMat& mask = GpuMat()); + //! creates alternative GpuMatrix header for the same data, with different + // number of channels and/or different number of rows. see cvReshape. + GpuMat reshape(int cn, int rows = 0) const; + + //! allocates new GpuMatrix data unless the GpuMatrix already has specified size and type. + // previous data is unreferenced if needed. + void create(int rows, int cols, int type); + void create(Size size, int type); + //! decreases reference counter; + // deallocate the data when reference counter reaches 0. + void release(); + + //! swaps with other smart pointer + void swap(GpuMat& mat); + + //! locates GpuMatrix header within a parent GpuMatrix. See below + void locateROI(Size& wholeSize, Point& ofs) const; + //! moves/resizes the current GpuMatrix ROI inside the parent GpuMatrix. + GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright); + //! extracts a rectangular sub-GpuMatrix + // (this is a generalized form of row, rowRange etc.) + GpuMat operator()(Range rowRange, Range colRange) const; + GpuMat operator()(Rect roi) const; + + //! returns true iff the GpuMatrix data is continuous + // (i.e. when there are no gaps between successive rows). + // similar to CV_IS_GpuMat_CONT(cvGpuMat->type) + bool isContinuous() const; + //! returns element size in bytes, + // similar to CV_ELEM_SIZE(cvMat->type) + size_t elemSize() const; + //! returns the size of element channel in bytes. + size_t elemSize1() const; + //! returns element type, similar to CV_MAT_TYPE(cvMat->type) + int type() const; + //! returns element type, similar to CV_MAT_DEPTH(cvMat->type) + int depth() const; + //! returns element type, similar to CV_MAT_CN(cvMat->type) + int channels() const; + //! returns step/elemSize1() + size_t step1() const; + //! 
returns GpuMatrix size: + // width == number of columns, height == number of rows + Size size() const; + //! returns true if GpuMatrix data is NULL + bool empty() const; + + //! returns pointer to y-th row + uchar* ptr(int y = 0); + const uchar* ptr(int y = 0) const; + + //! template version of the above method + template _Tp* ptr(int y = 0); + template const _Tp* ptr(int y = 0) const; + + template operator DevMem2D_<_Tp>() const; + template operator PtrStep_<_Tp>() const; + + /*! includes several bit-fields: + - the magic signature + - continuity flag + - depth + - number of channels + */ + int flags; + + //! the number of rows and columns + int rows, cols; + + //! a distance between successive rows in bytes; includes the gap if any + size_t step; + + //! pointer to the data + uchar* data; + + //! pointer to the reference counter; + // when GpuMatrix points to user-allocated data, the pointer is NULL + int* refcount; + + //! helper fields used in locateROI and adjustROI + uchar* datastart; + uchar* dataend; + }; + + //! Creates continuous GPU matrix + CV_EXPORTS void createContinuous(int rows, int cols, int type, GpuMat& m); + CV_EXPORTS GpuMat createContinuous(int rows, int cols, int type); + CV_EXPORTS void createContinuous(Size size, int type, GpuMat& m); + CV_EXPORTS GpuMat createContinuous(Size size, int type); + + //! Ensures that size of the given matrix is not less than (rows, cols) size + //! and matrix type is match specified one too + CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m); + CV_EXPORTS void ensureSizeIsEnough(Size size, int type, GpuMat& m); + + class CV_EXPORTS GpuFuncTable + { + public: + virtual ~GpuFuncTable() {} + + virtual void copy(const Mat& src, GpuMat& dst) const = 0; + virtual void copy(const GpuMat& src, Mat& dst) const = 0; + virtual void copy(const GpuMat& src, GpuMat& dst) const = 0; + + virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0; + + virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; + virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0; + + virtual void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const = 0; + + virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; + virtual void free(void* devPtr) const = 0; + }; + + CV_EXPORTS void setGpuFuncTable(const GpuFuncTable* funcTbl); + + //////////////////////////////////////////////////////////////////////// + + inline GpuMat::GpuMat() + : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) + { + } + + inline GpuMat::GpuMat(int rows_, int cols_, int type_) + : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) + { + if (rows_ > 0 && cols_ > 0) + create(rows_, cols_, type_); + } + + inline GpuMat::GpuMat(Size size_, int type_) + : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) + { + if (size_.height > 0 && size_.width > 0) + create(size_.height, size_.width, type_); + } + + inline GpuMat::GpuMat(int rows_, int cols_, int type_, Scalar s_) + : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) + { + if (rows_ > 0 && cols_ > 0) + { + create(rows_, cols_, type_); + setTo(s_); + } + } + + inline GpuMat::GpuMat(Size size_, int type_, Scalar s_) + : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) + { + if (size_.height > 0 && size_.width > 0) + { + create(size_.height, size_.width, 
type_); + setTo(s_); + } + } + + inline GpuMat::~GpuMat() + { + release(); + } + + inline GpuMat GpuMat::clone() const + { + GpuMat m; + copyTo(m); + return m; + } + + inline void GpuMat::assignTo(GpuMat& m, int type) const + { + if (type < 0) + m = *this; + else + convertTo(m, type); + } + + inline size_t GpuMat::step1() const + { + return step / elemSize1(); + } + + inline bool GpuMat::empty() const + { + return data == 0; + } + + template inline _Tp* GpuMat::ptr(int y) + { + return (_Tp*)ptr(y); + } + + template inline const _Tp* GpuMat::ptr(int y) const + { + return (const _Tp*)ptr(y); + } + + inline void swap(GpuMat& a, GpuMat& b) + { + a.swap(b); + } + + inline GpuMat GpuMat::row(int y) const + { + return GpuMat(*this, Range(y, y+1), Range::all()); + } + + inline GpuMat GpuMat::col(int x) const + { + return GpuMat(*this, Range::all(), Range(x, x+1)); + } + + inline GpuMat GpuMat::rowRange(int startrow, int endrow) const + { + return GpuMat(*this, Range(startrow, endrow), Range::all()); + } + + inline GpuMat GpuMat::rowRange(Range r) const + { + return GpuMat(*this, r, Range::all()); + } + + inline GpuMat GpuMat::colRange(int startcol, int endcol) const + { + return GpuMat(*this, Range::all(), Range(startcol, endcol)); + } + + inline GpuMat GpuMat::colRange(Range r) const + { + return GpuMat(*this, Range::all(), r); + } + + inline void GpuMat::create(Size size_, int type_) + { + create(size_.height, size_.width, type_); + } + + inline GpuMat GpuMat::operator()(Range rowRange, Range colRange) const + { + return GpuMat(*this, rowRange, colRange); + } + + inline GpuMat GpuMat::operator()(Rect roi) const + { + return GpuMat(*this, roi); + } + + inline bool GpuMat::isContinuous() const + { + return (flags & Mat::CONTINUOUS_FLAG) != 0; + } + + inline size_t GpuMat::elemSize() const + { + return CV_ELEM_SIZE(flags); + } + + inline size_t GpuMat::elemSize1() const + { + return CV_ELEM_SIZE1(flags); + } + + inline int GpuMat::type() const + { + return CV_MAT_TYPE(flags); + } + + inline int GpuMat::depth() const + { + return CV_MAT_DEPTH(flags); + } + + inline int GpuMat::channels() const + { + return CV_MAT_CN(flags); + } + + inline Size GpuMat::size() const + { + return Size(cols, rows); + } + + inline uchar* GpuMat::ptr(int y) + { + CV_DbgAssert((unsigned)y < (unsigned)rows); + return data + step * y; + } + + inline const uchar* GpuMat::ptr(int y) const + { + CV_DbgAssert((unsigned)y < (unsigned)rows); + return data + step * y; + } + + inline GpuMat& GpuMat::operator = (Scalar s) + { + setTo(s); + return *this; + } + + template inline GpuMat::operator DevMem2D_() const + { + return DevMem2D_(rows, cols, (T*)data, step); + } + + template inline GpuMat::operator PtrStep_() const + { + return PtrStep_(static_cast< DevMem2D_ >(*this)); + } + + inline GpuMat createContinuous(int rows, int cols, int type) + { + GpuMat m; + createContinuous(rows, cols, type, m); + return m; + } + + inline void createContinuous(Size size, int type, GpuMat& m) + { + createContinuous(size.height, size.width, type, m); + } + + inline GpuMat createContinuous(Size size, int type) + { + GpuMat m; + createContinuous(size, type, m); + return m; + } + + inline void ensureSizeIsEnough(Size size, int type, GpuMat& m) + { + ensureSizeIsEnough(size.height, size.width, type, m); + } + + inline void createContinuous(int rows, int cols, int type, GpuMat& m) + { + int area = rows * cols; + if (!m.isContinuous() || m.type() != type || m.size().area() != area) + m.create(1, area, type); + m = m.reshape(0, rows); + } + + inline void 
ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m) + { + if (m.type() == type && m.rows >= rows && m.cols >= cols) + m = m(Rect(0, 0, cols, rows)); + else + m.create(rows, cols, type); + } +}} + +#endif // __OPENCV_GPUMAT_HPP__ diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2dffee4c28239da85bfa947ac762c9cda8d5a822 --- /dev/null +++ b/modules/core/src/gpumat.cpp @@ -0,0 +1,460 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" +#include "opencv2/core/gpumat.hpp" + +using namespace std; +using namespace cv; +using namespace cv::gpu; + +cv::gpu::GpuMat::GpuMat(const GpuMat& m) + : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend) +{ + if (refcount) + CV_XADD(refcount, 1); +} + +cv::gpu::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t step_) : + flags(Mat::MAGIC_VAL + (type_ & TYPE_MASK)), rows(rows_), cols(cols_), + step(step_), data((uchar*)data_), refcount(0), + datastart((uchar*)data_), dataend((uchar*)data_) +{ + size_t minstep = cols * elemSize(); + + if (step == Mat::AUTO_STEP) + { + step = minstep; + flags |= Mat::CONTINUOUS_FLAG; + } + else + { + if (rows == 1) + step = minstep; + + CV_DbgAssert(step >= minstep); + + flags |= step == minstep ? 
Mat::CONTINUOUS_FLAG : 0; + } + dataend += step * (rows - 1) + minstep; +} + +cv::gpu::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) : + flags(Mat::MAGIC_VAL + (type_ & TYPE_MASK)), rows(size_.height), cols(size_.width), + step(step_), data((uchar*)data_), refcount(0), + datastart((uchar*)data_), dataend((uchar*)data_) +{ + size_t minstep = cols * elemSize(); + + if (step == Mat::AUTO_STEP) + { + step = minstep; + flags |= Mat::CONTINUOUS_FLAG; + } + else + { + if (rows == 1) + step = minstep; + + CV_DbgAssert(step >= minstep); + + flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0; + } + dataend += step * (rows - 1) + minstep; +} + +cv::gpu::GpuMat::GpuMat(const GpuMat& m, Range rowRange, Range colRange) +{ + flags = m.flags; + step = m.step; refcount = m.refcount; + data = m.data; datastart = m.datastart; dataend = m.dataend; + + if (rowRange == Range::all()) + rows = m.rows; + else + { + CV_Assert(0 <= rowRange.start && rowRange.start <= rowRange.end && rowRange.end <= m.rows); + + rows = rowRange.size(); + data += step*rowRange.start; + } + + if (colRange == Range::all()) + cols = m.cols; + else + { + CV_Assert(0 <= colRange.start && colRange.start <= colRange.end && colRange.end <= m.cols); + + cols = colRange.size(); + data += colRange.start*elemSize(); + flags &= cols < m.cols ? ~Mat::CONTINUOUS_FLAG : -1; + } + + if (rows == 1) + flags |= Mat::CONTINUOUS_FLAG; + + if (refcount) + CV_XADD(refcount, 1); + + if (rows <= 0 || cols <= 0) + rows = cols = 0; +} + +cv::gpu::GpuMat::GpuMat(const GpuMat& m, Rect roi) : + flags(m.flags), rows(roi.height), cols(roi.width), + step(m.step), data(m.data + roi.y*step), refcount(m.refcount), + datastart(m.datastart), dataend(m.dataend) +{ + flags &= roi.width < m.cols ? ~Mat::CONTINUOUS_FLAG : -1; + data += roi.x * elemSize(); + + CV_Assert(0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && 0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= m.rows); + + if (refcount) + CV_XADD(refcount, 1); + + if (rows <= 0 || cols <= 0) + rows = cols = 0; +} + +cv::gpu::GpuMat::GpuMat(const Mat& m) : + flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) +{ + upload(m); +} + +GpuMat& cv::gpu::GpuMat::operator = (const GpuMat& m) +{ + if (this != &m) + { + GpuMat temp(m); + swap(temp); + } + + return *this; +} + +void cv::gpu::GpuMat::swap(GpuMat& b) +{ + std::swap(flags, b.flags); + std::swap(rows, b.rows); + std::swap(cols, b.cols); + std::swap(step, b.step); + std::swap(data, b.data); + std::swap(datastart, b.datastart); + std::swap(dataend, b.dataend); + std::swap(refcount, b.refcount); +} + +void cv::gpu::GpuMat::locateROI(Size& wholeSize, Point& ofs) const +{ + size_t esz = elemSize(); + ptrdiff_t delta1 = data - datastart; + ptrdiff_t delta2 = dataend - datastart; + + CV_DbgAssert(step > 0); + + if (delta1 == 0) + ofs.x = ofs.y = 0; + else + { + ofs.y = static_cast(delta1 / step); + ofs.x = static_cast((delta1 - step * ofs.y) / esz); + + CV_DbgAssert(data == datastart + ofs.y * step + ofs.x * esz); + } + + size_t minstep = (ofs.x + cols) * esz; + + wholeSize.height = std::max(static_cast((delta2 - minstep) / step + 1), ofs.y + rows); + wholeSize.width = std::max(static_cast((delta2 - step * (wholeSize.height - 1)) / esz), ofs.x + cols); +} + +GpuMat& cv::gpu::GpuMat::adjustROI(int dtop, int dbottom, int dleft, int dright) +{ + Size wholeSize; + Point ofs; + locateROI(wholeSize, ofs); + + size_t esz = elemSize(); + + int row1 = std::max(ofs.y - dtop, 0); + int row2 = std::min(ofs.y + rows + 
dbottom, wholeSize.height); + + int col1 = std::max(ofs.x - dleft, 0); + int col2 = std::min(ofs.x + cols + dright, wholeSize.width); + + data += (row1 - ofs.y) * step + (col1 - ofs.x) * esz; + rows = row2 - row1; + cols = col2 - col1; + + if (esz * cols == step || rows == 1) + flags |= Mat::CONTINUOUS_FLAG; + else + flags &= ~Mat::CONTINUOUS_FLAG; + + return *this; +} + +GpuMat cv::gpu::GpuMat::reshape(int new_cn, int new_rows) const +{ + GpuMat hdr = *this; + + int cn = channels(); + if (new_cn == 0) + new_cn = cn; + + int total_width = cols * cn; + + if ((new_cn > total_width || total_width % new_cn != 0) && new_rows == 0) + new_rows = rows * total_width / new_cn; + + if (new_rows != 0 && new_rows != rows) + { + int total_size = total_width * rows; + + if (!isContinuous()) + CV_Error(CV_BadStep, "The matrix is not continuous, thus its number of rows can not be changed"); + + if ((unsigned)new_rows > (unsigned)total_size) + CV_Error(CV_StsOutOfRange, "Bad new number of rows"); + + total_width = total_size / new_rows; + + if (total_width * new_rows != total_size) + CV_Error(CV_StsBadArg, "The total number of matrix elements is not divisible by the new number of rows"); + + hdr.rows = new_rows; + hdr.step = total_width * elemSize1(); + } + + int new_width = total_width / new_cn; + + if (new_width * new_cn != total_width) + CV_Error(CV_BadNumChannels, "The total width is not divisible by the new number of channels"); + + hdr.cols = new_width; + hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT); + + return hdr; +} + +cv::Mat::Mat(const GpuMat& m) : flags(0), dims(0), rows(0), cols(0), data(0), refcount(0), datastart(0), dataend(0), datalimit(0), allocator(0), size(&rows) +{ + m.download(*this); +} + +namespace +{ + void throw_nogpu() + { + CV_Error(CV_GpuNotSupported, "The library is compiled without GPU support"); + } + + class EmptyFuncTable : public GpuFuncTable + { + public: + void copy(const Mat&, GpuMat&) const { throw_nogpu(); } + void copy(const GpuMat&, Mat&) const { throw_nogpu(); } + void copy(const GpuMat&, GpuMat&) const { throw_nogpu(); } + + void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu(); } + + void convert(const GpuMat&, GpuMat&) const { throw_nogpu(); } + void convert(const GpuMat&, GpuMat&, double, double) const { throw_nogpu(); } + + void setTo(GpuMat&, Scalar, const GpuMat&) const { throw_nogpu(); } + + void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu(); } + void free(void*) const {} + }; + + const GpuFuncTable* g_funcTbl = 0; + + const GpuFuncTable* gpuFuncTable() + { + static EmptyFuncTable empty; + return g_funcTbl ? 
g_funcTbl : ∅ + } +} + +void cv::gpu::setGpuFuncTable(const GpuFuncTable* funcTbl) +{ + g_funcTbl = funcTbl; +} + +void cv::gpu::GpuMat::upload(const Mat& m) +{ + CV_DbgAssert(!m.empty()); + + create(m.size(), m.type()); + + gpuFuncTable()->copy(m, *this); +} + +void cv::gpu::GpuMat::download(Mat& m) const +{ + CV_DbgAssert(!empty()); + + m.create(size(), type()); + + gpuFuncTable()->copy(*this, m); +} + +void cv::gpu::GpuMat::copyTo(GpuMat& m) const +{ + CV_DbgAssert(!empty()); + + m.create(size(), type()); + + gpuFuncTable()->copy(*this, m); +} + +void cv::gpu::GpuMat::copyTo(GpuMat& mat, const GpuMat& mask) const +{ + if (mask.empty()) + copyTo(mat); + else + { + mat.create(size(), type()); + + gpuFuncTable()->copyWithMask(*this, mat, mask); + } +} + +void cv::gpu::GpuMat::convertTo(GpuMat& dst, int rtype, double alpha, double beta) const +{ + bool noScale = fabs(alpha - 1) < numeric_limits::epsilon() && fabs(beta) < numeric_limits::epsilon(); + + if (rtype < 0) + rtype = type(); + else + rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels()); + + int sdepth = depth(); + int ddepth = CV_MAT_DEPTH(rtype); + if (sdepth == ddepth && noScale) + { + copyTo(dst); + return; + } + + GpuMat temp; + const GpuMat* psrc = this; + if (sdepth != ddepth && psrc == &dst) + { + temp = *this; + psrc = &temp; + } + + dst.create(size(), rtype); + + if (noScale) + gpuFuncTable()->convert(*psrc, dst); + else + gpuFuncTable()->convert(*psrc, dst, alpha, beta); +} + +GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask) +{ + CV_Assert(mask.empty() || mask.type() == CV_8UC1); + CV_DbgAssert(!empty()); + + gpuFuncTable()->setTo(*this, s, mask); + + return *this; +} + +void cv::gpu::GpuMat::create(int _rows, int _cols, int _type) +{ + _type &= TYPE_MASK; + + if (rows == _rows && cols == _cols && type() == _type && data) + return; + + if (data) + release(); + + CV_DbgAssert(_rows >= 0 && _cols >= 0); + + if (_rows > 0 && _cols > 0) + { + flags = Mat::MAGIC_VAL + _type; + rows = _rows; + cols = _cols; + + size_t esz = elemSize(); + + void* devPtr; + gpuFuncTable()->mallocPitch(&devPtr, &step, esz * cols, rows); + + // Single row must be continuous + if (rows == 1) + step = esz * cols; + + if (esz * cols == step) + flags |= Mat::CONTINUOUS_FLAG; + + int64 _nettosize = static_cast(step) * rows; + size_t nettosize = static_cast(_nettosize); + + datastart = data = static_cast(devPtr); + dataend = data + nettosize; + + refcount = static_cast(fastMalloc(sizeof(*refcount))); + *refcount = 1; + } +} + +void cv::gpu::GpuMat::release() +{ + if (refcount && CV_XADD(refcount, -1) == 1) + { + fastFree(refcount); + + gpuFuncTable()->free(datastart); + } + + data = datastart = dataend = 0; + step = rows = cols = 0; + refcount = 0; +} diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt index 74ccc320e396e482adc9c413de40a707499e87d0..fcef9b9f629a75f90c3e3086e0f88270cc8c65ff 100644 --- a/modules/gpu/CMakeLists.txt +++ b/modules/gpu/CMakeLists.txt @@ -3,7 +3,8 @@ set(name "gpu") set(the_target "opencv_${name}") project(${the_target}) -set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann" "opencv_calib3d") #"opencv_features2d" "opencv_flann" "opencv_objdetect" - only headers needed +set(DEPS "opencv_core" "opencv_imgproc" "opencv_calib3d" "opencv_objdetect") +set(DEPS_HEADER ${DEPS} "opencv_features2d" "opencv_flann") set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu) include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include" @@ -27,6 +28,13 @@ file(GLOB 
lib_device_hdrs_detail "src/opencv2/gpu/device/detail/*.h*") source_group("Device" FILES ${lib_device_hdrs}) source_group("Device\\Detail" FILES ${lib_device_hdrs_detail}) +foreach(d ${DEPS_HEADER}) + if(${d} MATCHES "opencv_") + string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d}) + include_directories("${d_dir}/include") + endif() +endforeach() + if (HAVE_CUDA) file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp") file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu") @@ -50,7 +58,6 @@ if (HAVE_CUDA) if (APPLE) set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fno-finite-math-only;") endif() - string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") @@ -60,7 +67,7 @@ if (HAVE_CUDA) #string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") #string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") #string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408 /wd4251") string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}") @@ -69,22 +76,19 @@ if (HAVE_CUDA) string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") endif() - + if (BUILD_SHARED_LIBS) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-DCVAPI_EXPORTS") - endif() + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-DCVAPI_EXPORTS") + endif() + if(MSVC) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/wd4251") + endif() + CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda}) #CUDA_BUILD_CLEAN_TARGET() endif() -foreach(d ${DEPS}) - if(${d} MATCHES "opencv_") - string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d}) - include_directories("${d_dir}/include") - endif() -endforeach() - add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${lib_device_hdrs_detail} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs}) # For dynamic link numbering convenions diff --git a/modules/gpu/include/opencv2/gpu/devmem2d.hpp b/modules/gpu/include/opencv2/gpu/devmem2d.hpp index e454f000c8d5d5975c38f19fd33b806183b96302..33af66afa1d2ec9c906b1cb85b3214eaa102bb38 100644 --- a/modules/gpu/include/opencv2/gpu/devmem2d.hpp +++ b/modules/gpu/include/opencv2/gpu/devmem2d.hpp @@ -40,122 +40,4 @@ // //M*/ -#ifndef __OPENCV_GPU_DevMem2D_HPP__ -#define __OPENCV_GPU_DevMem2D_HPP__ - - -namespace cv -{ - namespace gpu - { - // Simple lightweight structures that encapsulates information about an image on device. - // It is intended to pass to nvcc-compiled code. 
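These wrappers (now hosted in opencv2/core/devmem2d.hpp, per the hunk above) carry a raw device pointer plus a stride in bytes; ptr(y) and operator()(y, x) hide the byte arithmetic. A host-side sketch of the indexing contract — the padded buffer and sizes below are illustrative assumptions, and the functions compile on the host because __CV_GPU_HOST_DEVICE__ expands to nothing there:

    #include "opencv2/core/devmem2d.hpp"
    #include <vector>

    // step is in BYTES, so a padded row pitch works transparently through ptr().
    void fillRow(cv::gpu::PtrStepSz<float> img, int y, float value)
    {
        float* row = img.ptr(y);
        for (int x = 0; x < img.cols; ++x)
            row[x] = value;
    }

    void demo()
    {
        const int rows = 4, cols = 3;
        const size_t stepBytes = 5 * sizeof(float);          // deliberately padded pitch
        std::vector<float> buf(rows * 5, 0.f);
        cv::gpu::PtrStepSz<float> view(rows, cols, &buf[0], stepBytes);
        fillRow(view, 2, 1.f);                                // touches only row 2
    }
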
GpuMat depends on headers that nvcc can't compile - -#if defined(__CUDACC__) - #define __CV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__ -#else - #define __CV_GPU_HOST_DEVICE__ -#endif - - template struct StaticAssert; - template <> struct StaticAssert {static __CV_GPU_HOST_DEVICE__ void check(){}}; - - template struct DevPtr - { - typedef T elem_type; - typedef int index_type; - - enum { elem_size = sizeof(elem_type) }; - - T* data; - - __CV_GPU_HOST_DEVICE__ DevPtr() : data(0) {} - __CV_GPU_HOST_DEVICE__ DevPtr(T* data_) : data(data_) {} - - __CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; } - __CV_GPU_HOST_DEVICE__ operator T*() { return data; } - __CV_GPU_HOST_DEVICE__ operator const T*() const { return data; } - }; - - template struct PtrSz : public DevPtr - { - __CV_GPU_HOST_DEVICE__ PtrSz() : size(0) {} - __CV_GPU_HOST_DEVICE__ PtrSz(T* data_, size_t size_) : DevPtr(data_), size(size_) {} - - size_t size; - }; - - template struct PtrStep : public DevPtr - { - __CV_GPU_HOST_DEVICE__ PtrStep() : step(0) {} - __CV_GPU_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr(data_), step(step_) {} - - /** \brief stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!! */ - size_t step; - - __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return ( T*)( ( char*)DevPtr::data + y * step); } - __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)DevPtr::data + y * step); } - - __CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; } - __CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; } - }; - - template struct PtrStepSz : public PtrStep - { - __CV_GPU_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {} - __CV_GPU_HOST_DEVICE__ PtrStepSz(int rows_, int cols_, T* data_, size_t step_) - : PtrStep(data_, step_), cols(cols_), rows(rows_) {} - - int cols; - int rows; - }; - - template struct DevMem2D_ : public PtrStepSz - { - DevMem2D_() {} - DevMem2D_(int rows_, int cols_, T *data_, size_t step_) : PtrStepSz(rows_, cols_, data_, step_) {} - - template - explicit DevMem2D_(const DevMem2D_& d) : PtrStepSz(d.rows, d.cols, (T*)d.data, d.step) {} - }; - - template struct PtrElemStep_ : public PtrStep - { - PtrElemStep_(const DevMem2D_& mem) : PtrStep(mem.data, mem.step) - { - StaticAssert<256 % sizeof(T) == 0>::check(); - - PtrStep::step /= PtrStep::elem_size; - } - __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return PtrStep::data + y * PtrStep::step; } - __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return PtrStep::data + y * PtrStep::step; } - - __CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; } - __CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; } - }; - - template struct PtrStep_ : public PtrStep - { - PtrStep_() {} - PtrStep_(const DevMem2D_& mem) : PtrStep(mem.data, mem.step) {} - }; - -#undef __CV_GPU_HOST_DEVICE__ - - - typedef DevMem2D_ DevMem2Db; - typedef DevMem2Db DevMem2D; - typedef DevMem2D_ DevMem2Df; - typedef DevMem2D_ DevMem2Di; - - typedef PtrStep PtrStepb; - typedef PtrStep PtrStepf; - typedef PtrStep PtrStepi; - - typedef PtrElemStep_ PtrElemStep; - typedef PtrElemStep_ PtrElemStepf; - typedef PtrElemStep_ PtrElemStepi; - } -} - -#endif /* __OPENCV_GPU_DevMem2D_HPP__ */ +#include "opencv2/core/devmem2d.hpp" diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 
38f6a955b90ce109bfe329281f4c494db029a483..ffa32fbe371f73481ce757543ebf9a1e45d0194a 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -43,1520 +43,1530 @@ #ifndef __OPENCV_GPU_HPP__ #define __OPENCV_GPU_HPP__ +#ifndef SKIP_INCLUDES #include -#include "opencv2/core/core.hpp" +#endif + +#include "opencv2/core/gpumat.hpp" #include "opencv2/imgproc/imgproc.hpp" #include "opencv2/objdetect/objdetect.hpp" #include "opencv2/features2d/features2d.hpp" -#include "opencv2/gpu/gpumat.hpp" -namespace cv -{ - namespace gpu - { - //////////////////////////////// Initialization & Info //////////////////////// - - //! This is the only function that do not throw exceptions if the library is compiled without Cuda. - CV_EXPORTS int getCudaEnabledDeviceCount(); - - //! Functions below throw cv::Expception if the library is compiled without Cuda. - - CV_EXPORTS void setDevice(int device); - CV_EXPORTS int getDevice(); - - //! Explicitly destroys and cleans up all resources associated with the current device in the current process. - //! Any subsequent API call to this device will reinitialize the device. - CV_EXPORTS void resetDevice(); - - enum FeatureSet - { - FEATURE_SET_COMPUTE_10 = 10, - FEATURE_SET_COMPUTE_11 = 11, - FEATURE_SET_COMPUTE_12 = 12, - FEATURE_SET_COMPUTE_13 = 13, - FEATURE_SET_COMPUTE_20 = 20, - FEATURE_SET_COMPUTE_21 = 21, - GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11, - SHARED_ATOMICS = FEATURE_SET_COMPUTE_12, - NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13 - }; +namespace cv { namespace gpu { - // Gives information about what GPU archs this OpenCV GPU module was - // compiled for - class CV_EXPORTS TargetArchs - { - public: - static bool builtWith(FeatureSet feature_set); - static bool has(int major, int minor); - static bool hasPtx(int major, int minor); - static bool hasBin(int major, int minor); - static bool hasEqualOrLessPtx(int major, int minor); - static bool hasEqualOrGreater(int major, int minor); - static bool hasEqualOrGreaterPtx(int major, int minor); - static bool hasEqualOrGreaterBin(int major, int minor); - private: - TargetArchs(); - }; +//////////////////////////////// Initialization & Info //////////////////////// - // Gives information about the given GPU - class CV_EXPORTS DeviceInfo - { - public: - // Creates DeviceInfo object for the current GPU - DeviceInfo() : device_id_(getDevice()) { query(); } +//! This is the only function that do not throw exceptions if the library is compiled without Cuda. +CV_EXPORTS int getCudaEnabledDeviceCount(); - // Creates DeviceInfo object for the given GPU - DeviceInfo(int device_id) : device_id_(device_id) { query(); } +//! Functions below throw cv::Expception if the library is compiled without Cuda. - string name() const { return name_; } - - // Return compute capability versions - int majorVersion() const { return majorVersion_; } - int minorVersion() const { return minorVersion_; } +CV_EXPORTS void setDevice(int device); +CV_EXPORTS int getDevice(); - int multiProcessorCount() const { return multi_processor_count_; } +//! Explicitly destroys and cleans up all resources associated with the current device in the current process. +//! Any subsequent API call to this device will reinitialize the device. 
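The initialization block that follows exposes device enumeration and per-device queries. A hedged start-up sketch (the helper name is illustrative; getCudaEnabledDeviceCount is the one call that is safe even in a CUDA-less build):

    #include "opencv2/gpu/gpu.hpp"

    // Picks the first device this build of the gpu module can actually run on.
    int pickCompatibleDevice()
    {
        const int count = cv::gpu::getCudaEnabledDeviceCount(); // 0 when built without CUDA
        for (int id = 0; id < count; ++id)
        {
            cv::gpu::DeviceInfo info(id);
            if (info.isCompatible())
            {
                cv::gpu::setDevice(id);
                return id;
            }
        }
        return -1; // no usable device found
    }
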
+CV_EXPORTS void resetDevice(); - size_t freeMemory() const; - size_t totalMemory() const; +enum FeatureSet +{ + FEATURE_SET_COMPUTE_10 = 10, + FEATURE_SET_COMPUTE_11 = 11, + FEATURE_SET_COMPUTE_12 = 12, + FEATURE_SET_COMPUTE_13 = 13, + FEATURE_SET_COMPUTE_20 = 20, + FEATURE_SET_COMPUTE_21 = 21, + GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11, + SHARED_ATOMICS = FEATURE_SET_COMPUTE_12, + NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13 +}; + +// Gives information about what GPU archs this OpenCV GPU module was +// compiled for +class CV_EXPORTS TargetArchs +{ +public: + static bool builtWith(FeatureSet feature_set); + static bool has(int major, int minor); + static bool hasPtx(int major, int minor); + static bool hasBin(int major, int minor); + static bool hasEqualOrLessPtx(int major, int minor); + static bool hasEqualOrGreater(int major, int minor); + static bool hasEqualOrGreaterPtx(int major, int minor); + static bool hasEqualOrGreaterBin(int major, int minor); +private: + TargetArchs(); +}; + +// Gives information about the given GPU +class CV_EXPORTS DeviceInfo +{ +public: + // Creates DeviceInfo object for the current GPU + DeviceInfo() : device_id_(getDevice()) { query(); } - // Checks whether device supports the given feature - bool supports(FeatureSet feature_set) const; + // Creates DeviceInfo object for the given GPU + DeviceInfo(int device_id) : device_id_(device_id) { query(); } - // Checks whether the GPU module can be run on the given device - bool isCompatible() const; + std::string name() const { return name_; } - int deviceID() const { return device_id_; } + // Return compute capability versions + int majorVersion() const { return majorVersion_; } + int minorVersion() const { return minorVersion_; } - private: - void query(); - void queryMemory(size_t& free_memory, size_t& total_memory) const; + int multiProcessorCount() const { return multi_processor_count_; } - int device_id_; + size_t freeMemory() const; + size_t totalMemory() const; - string name_; - int multi_processor_count_; - int majorVersion_; - int minorVersion_; - }; + // Checks whether device supports the given feature + bool supports(FeatureSet feature_set) const; - //////////////////////////////// Error handling //////////////////////// + // Checks whether the GPU module can be run on the given device + bool isCompatible() const; - CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func); + int deviceID() const { return device_id_; } - //////////////////////////////// CudaMem //////////////////////////////// - // CudaMem is limited cv::Mat with page locked memory allocation. - // Page locked memory is only needed for async and faster coping to GPU. - // It is convertable to cv::Mat header without reference counting - // so you can use it with other opencv functions. +private: + void query(); + void queryMemory(size_t& free_memory, size_t& total_memory) const; - // Page-locks the matrix m memory and maps it for the device(s) - CV_EXPORTS void registerPageLocked(Mat& m); - // Unmaps the memory of matrix m, and makes it pageable again. 
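TargetArchs reports what the binary was compiled for, while DeviceInfo::supports reports what the current hardware provides; both have to agree before a feature can be relied on. A small sketch using the declarations above (the helper name is an assumption):

    #include "opencv2/gpu/gpu.hpp"

    // True only when the module was built for, and the device exposes, native doubles.
    bool canUseNativeDouble()
    {
        using namespace cv::gpu;
        return TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE);
    }
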
- CV_EXPORTS void unregisterPageLocked(Mat& m); + int device_id_; - class CV_EXPORTS CudaMem - { - public: - enum { ALLOC_PAGE_LOCKED = 1, ALLOC_ZEROCOPY = 2, ALLOC_WRITE_COMBINED = 4 }; + std::string name_; + int multi_processor_count_; + int majorVersion_; + int minorVersion_; +}; - CudaMem(); - CudaMem(const CudaMem& m); +//////////////////////////////// Error handling //////////////////////// - CudaMem(int rows, int cols, int type, int _alloc_type = ALLOC_PAGE_LOCKED); - CudaMem(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED); +CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func); +//////////////////////////////// CudaMem //////////////////////////////// +// CudaMem is limited cv::Mat with page locked memory allocation. +// Page locked memory is only needed for async and faster coping to GPU. +// It is convertable to cv::Mat header without reference counting +// so you can use it with other opencv functions. - //! creates from cv::Mat with coping data - explicit CudaMem(const Mat& m, int alloc_type = ALLOC_PAGE_LOCKED); +// Page-locks the matrix m memory and maps it for the device(s) +CV_EXPORTS void registerPageLocked(Mat& m); +// Unmaps the memory of matrix m, and makes it pageable again. +CV_EXPORTS void unregisterPageLocked(Mat& m); - ~CudaMem(); +class CV_EXPORTS CudaMem +{ +public: + enum { ALLOC_PAGE_LOCKED = 1, ALLOC_ZEROCOPY = 2, ALLOC_WRITE_COMBINED = 4 }; - CudaMem& operator = (const CudaMem& m); + CudaMem(); + CudaMem(const CudaMem& m); - //! returns deep copy of the matrix, i.e. the data is copied - CudaMem clone() const; + CudaMem(int rows, int cols, int type, int _alloc_type = ALLOC_PAGE_LOCKED); + CudaMem(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED); - //! allocates new matrix data unless the matrix already has specified size and type. - void create(int rows, int cols, int type, int alloc_type = ALLOC_PAGE_LOCKED); - void create(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED); - //! decrements reference counter and released memory if needed. - void release(); + //! creates from cv::Mat with coping data + explicit CudaMem(const Mat& m, int alloc_type = ALLOC_PAGE_LOCKED); - //! returns matrix header with disabled reference counting for CudaMem data. - Mat createMatHeader() const; - operator Mat() const; + ~CudaMem(); - //! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware. - GpuMat createGpuMatHeader() const; - operator GpuMat() const; + CudaMem& operator = (const CudaMem& m); - //returns if host memory can be mapperd to gpu address space; - static bool canMapHostMemory(); + //! returns deep copy of the matrix, i.e. the data is copied + CudaMem clone() const; - // Please see cv::Mat for descriptions - bool isContinuous() const; - size_t elemSize() const; - size_t elemSize1() const; - int type() const; - int depth() const; - int channels() const; - size_t step1() const; - Size size() const; - bool empty() const; + //! allocates new matrix data unless the matrix already has specified size and type. + void create(int rows, int cols, int type, int alloc_type = ALLOC_PAGE_LOCKED); + void create(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED); + //! decrements reference counter and released memory if needed. + void release(); - // Please see cv::Mat for descriptions - int flags; - int rows, cols; - size_t step; + //! returns matrix header with disabled reference counting for CudaMem data. 
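CudaMem allocates page-locked host memory and can be viewed as a cv::Mat header without copying, which is what the asynchronous Stream transfers further below expect. A sketch of a pinned staging buffer (blocking upload shown for brevity; Stream::enqueueUpload would be the asynchronous variant, and the function name is illustrative):

    #include "opencv2/gpu/gpu.hpp"

    void stageAndUpload(const cv::Mat& src, cv::gpu::GpuMat& dst)
    {
        cv::gpu::CudaMem pinned(src.size(), src.type(), cv::gpu::CudaMem::ALLOC_PAGE_LOCKED);
        cv::Mat header = pinned.createMatHeader(); // shares the pinned buffer, no copy
        src.copyTo(header);                        // fill the pinned buffer on the host
        dst.upload(pinned);                        // CudaMem converts to Mat; this call blocks
    }
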
+ Mat createMatHeader() const; + operator Mat() const; - uchar* data; - int* refcount; + //! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware. + GpuMat createGpuMatHeader() const; + operator GpuMat() const; - uchar* datastart; - uchar* dataend; + //returns if host memory can be mapperd to gpu address space; + static bool canMapHostMemory(); - int alloc_type; - }; + // Please see cv::Mat for descriptions + bool isContinuous() const; + size_t elemSize() const; + size_t elemSize1() const; + int type() const; + int depth() const; + int channels() const; + size_t step1() const; + Size size() const; + bool empty() const; - //////////////////////////////// CudaStream //////////////////////////////// - // Encapculates Cuda Stream. Provides interface for async coping. - // Passed to each function that supports async kernel execution. - // Reference counting is enabled - class CV_EXPORTS Stream - { - public: - Stream(); - ~Stream(); + // Please see cv::Mat for descriptions + int flags; + int rows, cols; + size_t step; + + uchar* data; + int* refcount; + + uchar* datastart; + uchar* dataend; + + int alloc_type; +}; + +//////////////////////////////// CudaStream //////////////////////////////// +// Encapculates Cuda Stream. Provides interface for async coping. +// Passed to each function that supports async kernel execution. +// Reference counting is enabled + +class CV_EXPORTS Stream +{ +public: + Stream(); + ~Stream(); - Stream(const Stream&); - Stream& operator=(const Stream&); + Stream(const Stream&); + Stream& operator=(const Stream&); - bool queryIfComplete(); - void waitForCompletion(); + bool queryIfComplete(); + void waitForCompletion(); - //! downloads asynchronously. - // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat) - void enqueueDownload(const GpuMat& src, CudaMem& dst); - void enqueueDownload(const GpuMat& src, Mat& dst); + //! downloads asynchronously. + // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its subMat) + void enqueueDownload(const GpuMat& src, CudaMem& dst); + void enqueueDownload(const GpuMat& src, Mat& dst); - //! uploads asynchronously. - // Warning! cv::Mat must point to page locked memory (i.e. to CudaMem data or to its ROI) - void enqueueUpload(const CudaMem& src, GpuMat& dst); - void enqueueUpload(const Mat& src, GpuMat& dst); + //! uploads asynchronously. + // Warning! cv::Mat must point to page locked memory (i.e. 
to CudaMem data or to its ROI) + void enqueueUpload(const CudaMem& src, GpuMat& dst); + void enqueueUpload(const Mat& src, GpuMat& dst); - void enqueueCopy(const GpuMat& src, GpuMat& dst); + void enqueueCopy(const GpuMat& src, GpuMat& dst); - void enqueueMemSet(GpuMat& src, Scalar val); - void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask); + void enqueueMemSet(GpuMat& src, Scalar val); + void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask); - // converts matrix type, ex from float to uchar depending on type - void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0); + // converts matrix type, ex from float to uchar depending on type + void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0); - static Stream& Null(); + static Stream& Null(); - operator bool() const; + operator bool() const; - private: - void create(); - void release(); +private: + void create(); + void release(); - struct Impl; - Impl *impl; + struct Impl; + Impl *impl; - friend struct StreamAccessor; - - explicit Stream(Impl* impl); - }; + friend struct StreamAccessor; + + explicit Stream(Impl* impl); +}; - //////////////////////////////// Filter Engine //////////////////////////////// - - /*! - The Base Class for 1D or Row-wise Filters - - This is the base class for linear or non-linear filters that process 1D data. - In particular, such filters are used for the "horizontal" filtering parts in separable filters. - */ - class CV_EXPORTS BaseRowFilter_GPU - { - public: - BaseRowFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {} - virtual ~BaseRowFilter_GPU() {} - virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0; - int ksize, anchor; - }; - - /*! - The Base Class for Column-wise Filters - - This is the base class for linear or non-linear filters that process columns of 2D arrays. - Such filters are used for the "vertical" filtering parts in separable filters. - */ - class CV_EXPORTS BaseColumnFilter_GPU - { - public: - BaseColumnFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {} - virtual ~BaseColumnFilter_GPU() {} - virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0; - int ksize, anchor; - }; - - /*! - The Base Class for Non-Separable 2D Filters. - - This is the base class for linear or non-linear 2D filters. - */ - class CV_EXPORTS BaseFilter_GPU - { - public: - BaseFilter_GPU(const Size& ksize_, const Point& anchor_) : ksize(ksize_), anchor(anchor_) {} - virtual ~BaseFilter_GPU() {} - virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0; - Size ksize; - Point anchor; - }; - - /*! - The Base Class for Filter Engine. - - The class can be used to apply an arbitrary filtering operation to an image. - It contains all the necessary intermediate buffers. - */ - class CV_EXPORTS FilterEngine_GPU - { - public: - virtual ~FilterEngine_GPU() {} - - virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0; - }; - - //! returns the non-separable filter engine with the specified filter - CV_EXPORTS Ptr createFilter2D_GPU(const Ptr& filter2D, int srcType, int dstType); - - //! 
returns the separable filter engine with the specified filters - CV_EXPORTS Ptr createSeparableFilter_GPU(const Ptr& rowFilter, - const Ptr& columnFilter, int srcType, int bufType, int dstType); - CV_EXPORTS Ptr createSeparableFilter_GPU(const Ptr& rowFilter, - const Ptr& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf); - - //! returns horizontal 1D box filter - //! supports only CV_8UC1 source type and CV_32FC1 sum type - CV_EXPORTS Ptr getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1); - - //! returns vertical 1D box filter - //! supports only CV_8UC1 sum type and CV_32FC1 dst type - CV_EXPORTS Ptr getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1); - - //! returns 2D box filter - //! supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type - CV_EXPORTS Ptr getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1)); - - //! returns box filter engine - CV_EXPORTS Ptr createBoxFilter_GPU(int srcType, int dstType, const Size& ksize, - const Point& anchor = Point(-1,-1)); - - //! returns 2D morphological filter - //! only MORPH_ERODE and MORPH_DILATE are supported - //! supports CV_8UC1 and CV_8UC4 types - //! kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height - CV_EXPORTS Ptr getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize, - Point anchor=Point(-1,-1)); - - //! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported. - CV_EXPORTS Ptr createMorphologyFilter_GPU(int op, int type, const Mat& kernel, - const Point& anchor = Point(-1,-1), int iterations = 1); - CV_EXPORTS Ptr createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf, - const Point& anchor = Point(-1,-1), int iterations = 1); - - //! returns 2D filter with the specified kernel - //! supports CV_8UC1 and CV_8UC4 types - CV_EXPORTS Ptr getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize, - Point anchor = Point(-1, -1)); - - //! returns the non-separable linear filter engine - CV_EXPORTS Ptr createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, - const Point& anchor = Point(-1,-1)); - - //! returns the primitive row filter with the specified kernel. - //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type. - //! there are two version of algorithm: NPP and OpenCV. - //! NPP calls when srcType == CV_8UC1 or srcType == CV_8UC4 and bufType == srcType, - //! otherwise calls OpenCV version. - //! NPP supports only BORDER_CONSTANT border type. - //! OpenCV version supports only CV_32F as buffer depth and - //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types. - CV_EXPORTS Ptr getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, - int anchor = -1, int borderType = BORDER_DEFAULT); - - //! returns the primitive column filter with the specified kernel. - //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type. - //! there are two version of algorithm: NPP and OpenCV. - //! NPP calls when dstType == CV_8UC1 or dstType == CV_8UC4 and bufType == dstType, - //! otherwise calls OpenCV version. - //! NPP supports only BORDER_CONSTANT border type. - //! OpenCV version supports only CV_32F as buffer depth and - //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types. 
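The create*Filter_GPU factories return a reusable FilterEngine_GPU that owns its intermediate buffers, so an engine should be created once and applied many times. A sketch with the box-filter factory declared above (the 5x5 kernel is an arbitrary choice; per the comments, the source must be CV_8UC1 or CV_8UC4):

    #include "opencv2/gpu/gpu.hpp"

    void smooth(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst)
    {
        cv::Ptr<cv::gpu::FilterEngine_GPU> box =
            cv::gpu::createBoxFilter_GPU(src.type(), src.type(), cv::Size(5, 5));
        box->apply(src, dst); // whole-image ROI by default
    }
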
- CV_EXPORTS Ptr getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, - int anchor = -1, int borderType = BORDER_DEFAULT); - - //! returns the separable linear filter engine - CV_EXPORTS Ptr createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, - const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, - int columnBorderType = -1); - CV_EXPORTS Ptr createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, - const Mat& columnKernel, GpuMat& buf, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, - int columnBorderType = -1); - - //! returns filter engine for the generalized Sobel operator - CV_EXPORTS Ptr createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, - int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); - CV_EXPORTS Ptr createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf, - int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); - - //! returns the Gaussian filter engine - CV_EXPORTS Ptr createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, - int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); - CV_EXPORTS Ptr createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0, - int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); - - //! returns maximum filter - CV_EXPORTS Ptr getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1)); - - //! returns minimum filter - CV_EXPORTS Ptr getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1)); - - //! smooths the image using the normalized box filter - //! supports CV_8UC1, CV_8UC4 types - CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null()); - - //! a synonym for normalized box filter - static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null()) { boxFilter(src, dst, -1, ksize, anchor, stream); } - - //! erodes the image (applies the local minimum operator) - CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1); - CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null()); - - //! dilates the image (applies the local maximum operator) - CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1); - CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null()); - - //! applies an advanced morphological operation to the image - CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1); - CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null()); - - //! applies non-separable 2D linear filter to the image - CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), Stream& stream = Stream::Null()); - - //! 
applies separable 2D linear filter to the image - CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, - Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); - CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf, - Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); - - //! applies generalized Sobel operator to the image - CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1, - int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); - CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize = 3, double scale = 1, - int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); - - //! applies the vertical or horizontal Scharr operator to the image - CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1, - int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); - CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale = 1, - int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); - - //! smooths the image using Gaussian filter. - CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0, - int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); - CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0, - int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); - - //! applies Laplacian operator to the image - //! supports only ksize = 1 and ksize = 3 - CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, Stream& stream = Stream::Null()); - - - ////////////////////////////// Arithmetics /////////////////////////////////// - - //! implements generalized matrix product algorithm GEMM from BLAS - CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha, - const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null()); - - //! transposes the matrix - //! supports matrix with element size = 1, 4 and 8 bytes (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc) - CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst, Stream& stream = Stream::Null()); - - //! reverses the order of the rows, columns or both in a matrix - //! supports CV_8UC1, CV_8UC4 types - CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode, Stream& stream = Stream::Null()); - - //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i)) - //! destination array will have the depth type as lut and the same channels number as source - //! supports CV_8UC1, CV_8UC3 types - CV_EXPORTS void LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null()); - - //! makes multi-channel array out of several single-channel arrays - CV_EXPORTS void merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream = Stream::Null()); - - //! makes multi-channel array out of several single-channel arrays - CV_EXPORTS void merge(const vector& src, GpuMat& dst, Stream& stream = Stream::Null()); - - //! 
copies each plane of a multi-channel array to a dedicated array - CV_EXPORTS void split(const GpuMat& src, GpuMat* dst, Stream& stream = Stream::Null()); - - //! copies each plane of a multi-channel array to a dedicated array - CV_EXPORTS void split(const GpuMat& src, vector& dst, Stream& stream = Stream::Null()); - - //! computes magnitude of complex (x(i).re, x(i).im) vector - //! supports only CV_32FC2 type - CV_EXPORTS void magnitude(const GpuMat& x, GpuMat& magnitude, Stream& stream = Stream::Null()); - - //! computes squared magnitude of complex (x(i).re, x(i).im) vector - //! supports only CV_32FC2 type - CV_EXPORTS void magnitudeSqr(const GpuMat& x, GpuMat& magnitude, Stream& stream = Stream::Null()); - - //! computes magnitude of each (x(i), y(i)) vector - //! supports only floating-point source - CV_EXPORTS void magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null()); - - //! computes squared magnitude of each (x(i), y(i)) vector - //! supports only floating-point source - CV_EXPORTS void magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null()); - - //! computes angle (angle(i)) of each (x(i), y(i)) vector - //! supports only floating-point source - CV_EXPORTS void phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null()); - - //! converts Cartesian coordinates to polar - //! supports only floating-point source - CV_EXPORTS void cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null()); - - //! converts polar coordinates to Cartesian - //! supports only floating-point source - CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees = false, Stream& stream = Stream::Null()); +//////////////////////////////// Filter Engine //////////////////////////////// +/*! +The Base Class for 1D or Row-wise Filters - //////////////////////////// Per-element operations //////////////////////////////////// - - //! adds one matrix to another (c = a + b) - CV_EXPORTS void add(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null()); - //! adds scalar to a matrix (c = a + s) - CV_EXPORTS void add(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null()); - - //! subtracts one matrix from another (c = a - b) - CV_EXPORTS void subtract(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null()); - //! subtracts scalar from a matrix (c = a - s) - CV_EXPORTS void subtract(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null()); - - //! computes element-wise weighted product of the two arrays (c = scale * a * b) - CV_EXPORTS void multiply(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null()); - //! weighted multiplies matrix to a scalar (c = scale * a * s) - CV_EXPORTS void multiply(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null()); +This is the base class for linear or non-linear filters that process 1D data. +In particular, such filters are used for the "horizontal" filtering parts in separable filters. 
+*/ +class CV_EXPORTS BaseRowFilter_GPU +{ +public: + BaseRowFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {} + virtual ~BaseRowFilter_GPU() {} + virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0; + int ksize, anchor; +}; + +/*! +The Base Class for Column-wise Filters + +This is the base class for linear or non-linear filters that process columns of 2D arrays. +Such filters are used for the "vertical" filtering parts in separable filters. +*/ +class CV_EXPORTS BaseColumnFilter_GPU +{ +public: + BaseColumnFilter_GPU(int ksize_, int anchor_) : ksize(ksize_), anchor(anchor_) {} + virtual ~BaseColumnFilter_GPU() {} + virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0; + int ksize, anchor; +}; + +/*! +The Base Class for Non-Separable 2D Filters. + +This is the base class for linear or non-linear 2D filters. +*/ +class CV_EXPORTS BaseFilter_GPU +{ +public: + BaseFilter_GPU(const Size& ksize_, const Point& anchor_) : ksize(ksize_), anchor(anchor_) {} + virtual ~BaseFilter_GPU() {} + virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0; + Size ksize; + Point anchor; +}; + +/*! +The Base Class for Filter Engine. + +The class can be used to apply an arbitrary filtering operation to an image. +It contains all the necessary intermediate buffers. +*/ +class CV_EXPORTS FilterEngine_GPU +{ +public: + virtual ~FilterEngine_GPU() {} + + virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0; +}; + +//! returns the non-separable filter engine with the specified filter +CV_EXPORTS Ptr createFilter2D_GPU(const Ptr& filter2D, int srcType, int dstType); + +//! returns the separable filter engine with the specified filters +CV_EXPORTS Ptr createSeparableFilter_GPU(const Ptr& rowFilter, + const Ptr& columnFilter, int srcType, int bufType, int dstType); +CV_EXPORTS Ptr createSeparableFilter_GPU(const Ptr& rowFilter, + const Ptr& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf); + +//! returns horizontal 1D box filter +//! supports only CV_8UC1 source type and CV_32FC1 sum type +CV_EXPORTS Ptr getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1); + +//! returns vertical 1D box filter +//! supports only CV_8UC1 sum type and CV_32FC1 dst type +CV_EXPORTS Ptr getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1); + +//! returns 2D box filter +//! supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type +CV_EXPORTS Ptr getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1)); + +//! returns box filter engine +CV_EXPORTS Ptr createBoxFilter_GPU(int srcType, int dstType, const Size& ksize, + const Point& anchor = Point(-1,-1)); + +//! returns 2D morphological filter +//! only MORPH_ERODE and MORPH_DILATE are supported +//! supports CV_8UC1 and CV_8UC4 types +//! kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height +CV_EXPORTS Ptr getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize, + Point anchor=Point(-1,-1)); + +//! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported. 
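A short sketch of the two layers these classes describe: a primitive per-call filter (BaseFilter_GPU) obtained from one of the get*_GPU factories, wrapped into a buffer-owning FilterEngine_GPU with createFilter2D_GPU. Only the factory functions and apply() declared above are used; the helper function name is illustrative.

    #include <opencv2/gpu/gpu.hpp>

    void blurOnGpu(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst)
    {
        CV_Assert(src.type() == CV_8UC1);   // getBoxFilter_GPU supports CV_8UC1 and CV_8UC4

        // Primitive non-separable 2D filter returned by the factory.
        cv::Ptr<cv::gpu::BaseFilter_GPU> box =
            cv::gpu::getBoxFilter_GPU(CV_8UC1, CV_8UC1, cv::Size(3, 3));

        // Engine wraps the primitive and owns the intermediate buffers / ROI handling.
        cv::Ptr<cv::gpu::FilterEngine_GPU> engine =
            cv::gpu::createFilter2D_GPU(box, CV_8UC1, CV_8UC1);

        engine->apply(src, dst);            // Rect(0,0,-1,-1) means "whole image"
    }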
+CV_EXPORTS Ptr createMorphologyFilter_GPU(int op, int type, const Mat& kernel, + const Point& anchor = Point(-1,-1), int iterations = 1); +CV_EXPORTS Ptr createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf, + const Point& anchor = Point(-1,-1), int iterations = 1); + +//! returns 2D filter with the specified kernel +//! supports CV_8UC1 and CV_8UC4 types +CV_EXPORTS Ptr getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize, + Point anchor = Point(-1, -1)); + +//! returns the non-separable linear filter engine +CV_EXPORTS Ptr createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, + const Point& anchor = Point(-1,-1)); + +//! returns the primitive row filter with the specified kernel. +//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type. +//! there are two version of algorithm: NPP and OpenCV. +//! NPP calls when srcType == CV_8UC1 or srcType == CV_8UC4 and bufType == srcType, +//! otherwise calls OpenCV version. +//! NPP supports only BORDER_CONSTANT border type. +//! OpenCV version supports only CV_32F as buffer depth and +//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types. +CV_EXPORTS Ptr getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, + int anchor = -1, int borderType = BORDER_DEFAULT); + +//! returns the primitive column filter with the specified kernel. +//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type. +//! there are two version of algorithm: NPP and OpenCV. +//! NPP calls when dstType == CV_8UC1 or dstType == CV_8UC4 and bufType == dstType, +//! otherwise calls OpenCV version. +//! NPP supports only BORDER_CONSTANT border type. +//! OpenCV version supports only CV_32F as buffer depth and +//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types. +CV_EXPORTS Ptr getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, + int anchor = -1, int borderType = BORDER_DEFAULT); + +//! returns the separable linear filter engine +CV_EXPORTS Ptr createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, + const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, + int columnBorderType = -1); +CV_EXPORTS Ptr createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, + const Mat& columnKernel, GpuMat& buf, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, + int columnBorderType = -1); + +//! returns filter engine for the generalized Sobel operator +CV_EXPORTS Ptr createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, + int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); +CV_EXPORTS Ptr createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf, + int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); + +//! returns the Gaussian filter engine +CV_EXPORTS Ptr createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, + int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); +CV_EXPORTS Ptr createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0, + int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); + +//! returns maximum filter +CV_EXPORTS Ptr getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1)); + +//! 
returns minimum filter +CV_EXPORTS Ptr getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1)); + +//! smooths the image using the normalized box filter +//! supports CV_8UC1, CV_8UC4 types +CV_EXPORTS void boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null()); + +//! a synonym for normalized box filter +static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null()) +{ + boxFilter(src, dst, -1, ksize, anchor, stream); +} - //! computes element-wise weighted quotient of the two arrays (c = a / b) - CV_EXPORTS void divide(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null()); - //! computes element-wise weighted quotient of matrix and scalar (c = a / s) - CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null()); - //! computes element-wise weighted reciprocal of an array (dst = scale/src2) - CV_EXPORTS void divide(double scale, const GpuMat& src2, GpuMat& dst, int dtype = -1, Stream& stream = Stream::Null()); +//! erodes the image (applies the local minimum operator) +CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1); +CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, + Point anchor = Point(-1, -1), int iterations = 1, + Stream& stream = Stream::Null()); - //! computes the weighted sum of two arrays (dst = alpha*src1 + beta*src2 + gamma) - CV_EXPORTS void addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, - int dtype = -1, Stream& stream = Stream::Null()); +//! dilates the image (applies the local maximum operator) +CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1); +CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, + Point anchor = Point(-1, -1), int iterations = 1, + Stream& stream = Stream::Null()); - //! adds scaled array to another one (dst = alpha*src1 + src2) - static inline void scaleAdd(const GpuMat& src1, double alpha, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null()) - { - addWeighted(src1, alpha, src2, 1.0, 0.0, dst, -1, stream); - } +//! applies an advanced morphological operation to the image +CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1); +CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, + Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null()); - //! computes element-wise absolute difference of two arrays (c = abs(a - b)) - CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null()); - //! computes element-wise absolute difference of array and scalar (c = abs(a - s)) - CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream = Stream::Null()); +//! applies non-separable 2D linear filter to the image +CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), Stream& stream = Stream::Null()); - //! computes exponent of each matrix element (b = e**a) - //! 
supports only CV_32FC1 type - CV_EXPORTS void exp(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null()); - - //! computes power of each matrix element: - // (dst(i,j) = pow( src(i,j) , power), if src.type() is integer - // (dst(i,j) = pow(fabs(src(i,j)), power), otherwise - //! supports all, except depth == CV_64F - CV_EXPORTS void pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null()); +//! applies separable 2D linear filter to the image +CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, + Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); +CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf, + Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, + Stream& stream = Stream::Null()); + +//! applies generalized Sobel operator to the image +CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1, + int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); +CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize = 3, double scale = 1, + int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); + +//! applies the vertical or horizontal Scharr operator to the image +CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1, + int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); +CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale = 1, + int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); + +//! smooths the image using Gaussian filter. +CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0, + int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); +CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0, + int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); + +//! applies Laplacian operator to the image +//! supports only ksize = 1 and ksize = 3 +CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, Stream& stream = Stream::Null()); + + +////////////////////////////// Arithmetics /////////////////////////////////// + +//! implements generalized matrix product algorithm GEMM from BLAS +CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha, + const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null()); + +//! transposes the matrix +//! supports matrix with element size = 1, 4 and 8 bytes (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc) +CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst, Stream& stream = Stream::Null()); + +//! reverses the order of the rows, columns or both in a matrix +//! supports CV_8UC1, CV_8UC4 types +CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode, Stream& stream = Stream::Null()); + +//! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i)) +//! destination array will have the depth type as lut and the same channels number as source +//! 
supports CV_8UC1, CV_8UC3 types +CV_EXPORTS void LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null()); + +//! makes multi-channel array out of several single-channel arrays +CV_EXPORTS void merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream = Stream::Null()); + +//! makes multi-channel array out of several single-channel arrays +CV_EXPORTS void merge(const vector& src, GpuMat& dst, Stream& stream = Stream::Null()); + +//! copies each plane of a multi-channel array to a dedicated array +CV_EXPORTS void split(const GpuMat& src, GpuMat* dst, Stream& stream = Stream::Null()); + +//! copies each plane of a multi-channel array to a dedicated array +CV_EXPORTS void split(const GpuMat& src, vector& dst, Stream& stream = Stream::Null()); + +//! computes magnitude of complex (x(i).re, x(i).im) vector +//! supports only CV_32FC2 type +CV_EXPORTS void magnitude(const GpuMat& x, GpuMat& magnitude, Stream& stream = Stream::Null()); + +//! computes squared magnitude of complex (x(i).re, x(i).im) vector +//! supports only CV_32FC2 type +CV_EXPORTS void magnitudeSqr(const GpuMat& x, GpuMat& magnitude, Stream& stream = Stream::Null()); + +//! computes magnitude of each (x(i), y(i)) vector +//! supports only floating-point source +CV_EXPORTS void magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null()); + +//! computes squared magnitude of each (x(i), y(i)) vector +//! supports only floating-point source +CV_EXPORTS void magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null()); + +//! computes angle (angle(i)) of each (x(i), y(i)) vector +//! supports only floating-point source +CV_EXPORTS void phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null()); + +//! converts Cartesian coordinates to polar +//! supports only floating-point source +CV_EXPORTS void cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null()); + +//! converts polar coordinates to Cartesian +//! supports only floating-point source +CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees = false, Stream& stream = Stream::Null()); - //! computes natural logarithm of absolute value of each matrix element: b = log(abs(a)) - //! supports only CV_32FC1 type - CV_EXPORTS void log(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null()); - //! compares elements of two arrays (c = a b) - //! supports CV_8UC4, CV_32FC1 types - CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null()); +//////////////////////////// Per-element operations //////////////////////////////////// - //! performs per-elements bit-wise inversion - CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null()); +//! adds one matrix to another (c = a + b) +CV_EXPORTS void add(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null()); +//! adds scalar to a matrix (c = a + s) +CV_EXPORTS void add(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null()); - //! 
calculates per-element bit-wise disjunction of two arrays - CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null()); +//! subtracts one matrix from another (c = a - b) +CV_EXPORTS void subtract(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null()); +//! subtracts scalar from a matrix (c = a - s) +CV_EXPORTS void subtract(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null()); - //! calculates per-element bit-wise conjunction of two arrays - CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null()); +//! computes element-wise weighted product of the two arrays (c = scale * a * b) +CV_EXPORTS void multiply(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null()); +//! weighted multiplies matrix to a scalar (c = scale * a * s) +CV_EXPORTS void multiply(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null()); - //! calculates per-element bit-wise "exclusive or" operation - CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null()); +//! computes element-wise weighted quotient of the two arrays (c = a / b) +CV_EXPORTS void divide(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null()); +//! computes element-wise weighted quotient of matrix and scalar (c = a / s) +CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null()); +//! computes element-wise weighted reciprocal of an array (dst = scale/src2) +CV_EXPORTS void divide(double scale, const GpuMat& src2, GpuMat& dst, int dtype = -1, Stream& stream = Stream::Null()); - //! computes per-element minimum of two arrays (dst = min(src1, src2)) - CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null()); +//! computes the weighted sum of two arrays (dst = alpha*src1 + beta*src2 + gamma) +CV_EXPORTS void addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, + int dtype = -1, Stream& stream = Stream::Null()); - //! computes per-element minimum of array and scalar (dst = min(src1, src2)) - CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null()); +//! adds scaled array to another one (dst = alpha*src1 + src2) +static inline void scaleAdd(const GpuMat& src1, double alpha, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null()) +{ + addWeighted(src1, alpha, src2, 1.0, 0.0, dst, -1, stream); +} - //! computes per-element maximum of two arrays (dst = max(src1, src2)) - CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null()); +//! computes element-wise absolute difference of two arrays (c = abs(a - b)) +CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null()); +//! computes element-wise absolute difference of array and scalar (c = abs(a - s)) +CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream = Stream::Null()); - //! 
computes per-element maximum of array and scalar (dst = max(src1, src2)) - CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null()); +//! computes exponent of each matrix element (b = e**a) +//! supports only CV_32FC1 type +CV_EXPORTS void exp(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null()); +//! computes power of each matrix element: +// (dst(i,j) = pow( src(i,j) , power), if src.type() is integer +// (dst(i,j) = pow(fabs(src(i,j)), power), otherwise +//! supports all, except depth == CV_64F +CV_EXPORTS void pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null()); - ////////////////////////////// Image processing ////////////////////////////// +//! computes natural logarithm of absolute value of each matrix element: b = log(abs(a)) +//! supports only CV_32FC1 type +CV_EXPORTS void log(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null()); - //! DST[x,y] = SRC[xmap[x,y],ymap[x,y]] - //! supports only CV_32FC1 map type - CV_EXPORTS void remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap, - int interpolation, int borderMode = BORDER_CONSTANT, const Scalar& borderValue = Scalar(), - Stream& stream = Stream::Null()); +//! compares elements of two arrays (c = a b) +//! supports CV_8UC4, CV_32FC1 types +CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null()); - //! Does mean shift filtering on GPU. - CV_EXPORTS void meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, - TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), Stream& stream = Stream::Null()); +//! performs per-elements bit-wise inversion +CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null()); - //! Does mean shift procedure on GPU. - CV_EXPORTS void meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, - TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), Stream& stream = Stream::Null()); +//! calculates per-element bit-wise disjunction of two arrays +CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null()); - //! Does mean shift segmentation with elimination of small regions. - CV_EXPORTS void meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize, - TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1)); +//! calculates per-element bit-wise conjunction of two arrays +CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null()); - //! Does coloring of disparity image: [0..ndisp) -> [0..240, 1, 1] in HSV. - //! Supported types of input disparity: CV_8U, CV_16S. - //! Output disparity has CV_8UC4 type in BGRA format (alpha = 255). - CV_EXPORTS void drawColorDisp(const GpuMat& src_disp, GpuMat& dst_disp, int ndisp, Stream& stream = Stream::Null()); - - //! Reprojects disparity image to 3D space. - //! Supports CV_8U and CV_16S types of input disparity. - //! The output is a 4-channel floating-point (CV_32FC4) matrix. - //! Each element of this matrix will contain the 3D coordinates of the point (x,y,z,1), computed from the disparity map. - //! Q is the 4x4 perspective transformation matrix that can be obtained with cvStereoRectify. 
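A sketch of chaining the per-element primitives declared in this hunk on a user-supplied Stream so the enqueued operations can overlap with host work. addWeighted and absdiff follow the signatures above; Stream::waitForCompletion() is assumed from the gpu module's Stream class rather than shown in this hunk.

    #include <opencv2/gpu/gpu.hpp>

    // Cross-fade two same-size CV_32FC1 images and compute their per-pixel
    // absolute difference, enqueuing both operations on one Stream.
    void blendAndDiff(const cv::gpu::GpuMat& a, const cv::gpu::GpuMat& b,
                      cv::gpu::GpuMat& blend, cv::gpu::GpuMat& diff,
                      double alpha, cv::gpu::Stream& stream)
    {
        // blend = alpha*a + (1-alpha)*b + 0
        cv::gpu::addWeighted(a, alpha, b, 1.0 - alpha, 0.0, blend, -1, stream);

        // diff = |a - b|, same type as the inputs
        cv::gpu::absdiff(a, b, diff, stream);

        stream.waitForCompletion();   // assumed Stream member: block until both ops finish
    }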
- CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, Stream& stream = Stream::Null()); - - //! converts image from one color space to another - CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null()); - - //! applies fixed threshold to the image - CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null()); - - //! resizes the image - //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC - CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null()); - - //! warps the image using affine transformation - //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC - CV_EXPORTS void warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR, Stream& stream = Stream::Null()); - - //! warps the image using perspective transformation - //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC - CV_EXPORTS void warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR, Stream& stream = Stream::Null()); - - //! builds plane warping maps - CV_EXPORTS void buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, const Mat &T, float scale, - GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null()); - - //! builds cylindrical warping maps - CV_EXPORTS void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale, - GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null()); - - //! builds spherical warping maps - CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale, - GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null()); - - //! rotate 8bit single or four channel image - //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC - //! supports CV_8UC1, CV_8UC4 types - CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, - int interpolation = INTER_LINEAR, Stream& stream = Stream::Null()); - - //! copies 2D array to a larger destination array and pads borders with user-specifiable constant - CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, - const Scalar& value = Scalar(), Stream& stream = Stream::Null()); - - //! computes the integral image - //! sum will have CV_32S type, but will contain unsigned int values - //! supports only CV_8UC1 source type - CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null()); - - //! buffered version - CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& stream = Stream::Null()); - - //! computes the integral image and integral for the squared image - //! sum will have CV_32S type, sqsum - CV32F type - //! supports only CV_8UC1 source type - CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& stream = Stream::Null()); - - //! computes squared integral image - //! result matrix will have 64F type, but will contain 64U values - //! supports source images of 8UC1 type only - CV_EXPORTS void sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& stream = Stream::Null()); - - //! computes vertical sum, supports only CV_32FC1 images - CV_EXPORTS void columnSum(const GpuMat& src, GpuMat& sum); - - //! 
computes the standard deviation of integral images - //! supports only CV_32SC1 source type and CV_32FC1 sqr type - //! output will have CV_32FC1 type - CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& stream = Stream::Null()); - - //! computes Harris cornerness criteria at each image pixel - CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, - int borderType = BORDER_REFLECT101); - CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, - int borderType = BORDER_REFLECT101); - CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, - int borderType = BORDER_REFLECT101, Stream& stream = Stream::Null()); - - //! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria - CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101); - CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType=BORDER_REFLECT101); - CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, - int borderType=BORDER_REFLECT101, Stream& stream = Stream::Null()); - - //! performs per-element multiplication of two full (not packed) Fourier spectrums - //! supports 32FC2 matrixes only (interleaved format) - CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream = Stream::Null()); - - //! performs per-element multiplication of two full (not packed) Fourier spectrums - //! supports 32FC2 matrixes only (interleaved format) - CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null()); - - //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix. - //! Param dft_size is the size of DFT transform. - //! - //! If the source matrix is not continous, then additional copy will be done, - //! so to avoid copying ensure the source matrix is continous one. If you want to use - //! preallocated output ensure it is continuous too, otherwise it will be reallocated. - //! - //! Being implemented via CUFFT real-to-complex transform result contains only non-redundant values - //! in CUFFT's format. Result as full complex matrix for such kind of transform cannot be retrieved. - //! - //! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format. - CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream = Stream::Null()); - - //! computes convolution (or cross-correlation) of two images using discrete Fourier transform - //! supports source images of 32FC1 type only - //! 
result matrix will have 32FC1 type - struct CV_EXPORTS ConvolveBuf; - CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false); - CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null()); - - struct CV_EXPORTS ConvolveBuf - { - ConvolveBuf() {} - ConvolveBuf(Size image_size, Size templ_size) - { create(image_size, templ_size); } - void create(Size image_size, Size templ_size); - void create(Size image_size, Size templ_size, Size block_size); - - private: - static Size estimateBlockSize(Size result_size, Size templ_size); - friend void convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream& stream); - - Size result_size; - Size block_size; - Size dft_size; - int spect_len; - - GpuMat image_spect, templ_spect, result_spect; - GpuMat image_block, templ_block, result_data; - }; - - //! computes the proximity map for the raster template and the image where the template is searched for - CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream = Stream::Null()); - - //! smoothes the source image and downsamples it - CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null()); - - //! upsamples the source image and then smoothes it - CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null()); - - //! performs linear blending of two images - //! to avoid accuracy errors sum of weigths shouldn't be very close to zero - CV_EXPORTS void blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2, - GpuMat& result, Stream& stream = Stream::Null()); +//! calculates per-element bit-wise "exclusive or" operation +CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null()); - - struct CV_EXPORTS CannyBuf; - - CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false); - CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false); - CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false); - CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false); +//! computes per-element minimum of two arrays (dst = min(src1, src2)) +CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null()); - struct CV_EXPORTS CannyBuf - { - CannyBuf() {} - explicit CannyBuf(const Size& image_size, int apperture_size = 3) {create(image_size, apperture_size);} - CannyBuf(const GpuMat& dx_, const GpuMat& dy_); +//! computes per-element minimum of array and scalar (dst = min(src1, src2)) +CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null()); - void create(const Size& image_size, int apperture_size = 3); - - void release(); +//! 
computes per-element maximum of two arrays (dst = max(src1, src2)) +CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null()); - GpuMat dx, dy; - GpuMat dx_buf, dy_buf; - GpuMat edgeBuf; - GpuMat trackBuf1, trackBuf2; - Ptr filterDX, filterDY; - }; +//! computes per-element maximum of array and scalar (dst = max(src1, src2)) +CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null()); + + +////////////////////////////// Image processing ////////////////////////////// + +//! DST[x,y] = SRC[xmap[x,y],ymap[x,y]] +//! supports only CV_32FC1 map type +CV_EXPORTS void remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap, + int interpolation, int borderMode = BORDER_CONSTANT, const Scalar& borderValue = Scalar(), + Stream& stream = Stream::Null()); + +//! Does mean shift filtering on GPU. +CV_EXPORTS void meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, + TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), + Stream& stream = Stream::Null()); + +//! Does mean shift procedure on GPU. +CV_EXPORTS void meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, + TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1), + Stream& stream = Stream::Null()); + +//! Does mean shift segmentation with elimination of small regions. +CV_EXPORTS void meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize, + TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1)); + +//! Does coloring of disparity image: [0..ndisp) -> [0..240, 1, 1] in HSV. +//! Supported types of input disparity: CV_8U, CV_16S. +//! Output disparity has CV_8UC4 type in BGRA format (alpha = 255). +CV_EXPORTS void drawColorDisp(const GpuMat& src_disp, GpuMat& dst_disp, int ndisp, Stream& stream = Stream::Null()); + +//! Reprojects disparity image to 3D space. +//! Supports CV_8U and CV_16S types of input disparity. +//! The output is a 4-channel floating-point (CV_32FC4) matrix. +//! Each element of this matrix will contain the 3D coordinates of the point (x,y,z,1), computed from the disparity map. +//! Q is the 4x4 perspective transformation matrix that can be obtained with cvStereoRectify. +CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, Stream& stream = Stream::Null()); + +//! converts image from one color space to another +CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null()); + +//! applies fixed threshold to the image +CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null()); + +//! resizes the image +//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC +CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null()); + +//! warps the image using affine transformation +//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC +CV_EXPORTS void warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR, Stream& stream = Stream::Null()); + +//! warps the image using perspective transformation +//! 
Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC +CV_EXPORTS void warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags = INTER_LINEAR, Stream& stream = Stream::Null()); + +//! builds plane warping maps +CV_EXPORTS void buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, const Mat &T, float scale, + GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null()); + +//! builds cylindrical warping maps +CV_EXPORTS void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale, + GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null()); + +//! builds spherical warping maps +CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale, + GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null()); + +//! rotate 8bit single or four channel image +//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC +//! supports CV_8UC1, CV_8UC4 types +CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, + int interpolation = INTER_LINEAR, Stream& stream = Stream::Null()); + +//! copies 2D array to a larger destination array and pads borders with user-specifiable constant +CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, + const Scalar& value = Scalar(), Stream& stream = Stream::Null()); + +//! computes the integral image +//! sum will have CV_32S type, but will contain unsigned int values +//! supports only CV_8UC1 source type +CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null()); + +//! buffered version +CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& stream = Stream::Null()); + +//! computes the integral image and integral for the squared image +//! sum will have CV_32S type, sqsum - CV32F type +//! supports only CV_8UC1 source type +CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& stream = Stream::Null()); + +//! computes squared integral image +//! result matrix will have 64F type, but will contain 64U values +//! supports source images of 8UC1 type only +CV_EXPORTS void sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& stream = Stream::Null()); + +//! computes vertical sum, supports only CV_32FC1 images +CV_EXPORTS void columnSum(const GpuMat& src, GpuMat& sum); + +//! computes the standard deviation of integral images +//! supports only CV_32SC1 source type and CV_32FC1 sqr type +//! output will have CV_32FC1 type +CV_EXPORTS void rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& stream = Stream::Null()); + +//! computes Harris cornerness criteria at each image pixel +CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType = BORDER_REFLECT101); +CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, double k, int borderType = BORDER_REFLECT101); +CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, + int borderType = BORDER_REFLECT101, Stream& stream = Stream::Null()); + +//! 
computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria +CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101); +CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, int blockSize, int ksize, int borderType=BORDER_REFLECT101); +CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, + int borderType=BORDER_REFLECT101, Stream& stream = Stream::Null()); + +//! performs per-element multiplication of two full (not packed) Fourier spectrums +//! supports 32FC2 matrixes only (interleaved format) +CV_EXPORTS void mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB=false, Stream& stream = Stream::Null()); + +//! performs per-element multiplication of two full (not packed) Fourier spectrums +//! supports 32FC2 matrixes only (interleaved format) +CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null()); + +//! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix. +//! Param dft_size is the size of DFT transform. +//! +//! If the source matrix is not continous, then additional copy will be done, +//! so to avoid copying ensure the source matrix is continous one. If you want to use +//! preallocated output ensure it is continuous too, otherwise it will be reallocated. +//! +//! Being implemented via CUFFT real-to-complex transform result contains only non-redundant values +//! in CUFFT's format. Result as full complex matrix for such kind of transform cannot be retrieved. +//! +//! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format. +CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream = Stream::Null()); + +//! computes convolution (or cross-correlation) of two images using discrete Fourier transform +//! supports source images of 32FC1 type only +//! result matrix will have 32FC1 type +struct CV_EXPORTS ConvolveBuf; +CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false); +CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null()); + +struct CV_EXPORTS ConvolveBuf +{ + ConvolveBuf() {} + ConvolveBuf(Size image_size, Size templ_size) + { create(image_size, templ_size); } + void create(Size image_size, Size templ_size); + void create(Size image_size, Size templ_size, Size block_size); + +private: + static Size estimateBlockSize(Size result_size, Size templ_size); + friend void convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream& stream); + + Size result_size; + Size block_size; + Size dft_size; + int spect_len; + + GpuMat image_spect, templ_spect, result_spect; + GpuMat image_block, templ_block, result_data; +}; + +//! computes the proximity map for the raster template and the image where the template is searched for +CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream = Stream::Null()); + +//! smoothes the source image and downsamples it +CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null()); + +//! 
upsamples the source image and then smoothes it +CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null()); + +//! performs linear blending of two images +//! to avoid accuracy errors sum of weigths shouldn't be very close to zero +CV_EXPORTS void blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2, + GpuMat& result, Stream& stream = Stream::Null()); - ////////////////////////////// Matrix reductions ////////////////////////////// - - //! computes mean value and standard deviation of all or selected array elements - //! supports only CV_8UC1 type - CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev); - - //! computes norm of array - //! supports NORM_INF, NORM_L1, NORM_L2 - //! supports all matrices except 64F - CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2); - - //! computes norm of array - //! supports NORM_INF, NORM_L1, NORM_L2 - //! supports all matrices except 64F - CV_EXPORTS double norm(const GpuMat& src1, int normType, GpuMat& buf); - - //! computes norm of the difference between two arrays - //! supports NORM_INF, NORM_L1, NORM_L2 - //! supports only CV_8UC1 type - CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2); - - //! computes sum of array elements - //! supports only single channel images - CV_EXPORTS Scalar sum(const GpuMat& src); - - //! computes sum of array elements - //! supports only single channel images - CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf); - - //! computes sum of array elements absolute values - //! supports only single channel images - CV_EXPORTS Scalar absSum(const GpuMat& src); - - //! computes sum of array elements absolute values - //! supports only single channel images - CV_EXPORTS Scalar absSum(const GpuMat& src, GpuMat& buf); - - //! computes squared sum of array elements - //! supports only single channel images - CV_EXPORTS Scalar sqrSum(const GpuMat& src); - - //! computes squared sum of array elements - //! supports only single channel images - CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf); - - //! finds global minimum and maximum array elements and returns their values - CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat()); - - //! finds global minimum and maximum array elements and returns their values - CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf); - - //! finds global minimum and maximum array elements and returns their values with locations - CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0, - const GpuMat& mask=GpuMat()); - - //! finds global minimum and maximum array elements and returns their values with locations - CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, - const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf); - - //! counts non-zero array elements - CV_EXPORTS int countNonZero(const GpuMat& src); - - //! counts non-zero array elements - CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf); - - //! 
reduces a matrix to a vector - CV_EXPORTS void reduce(const GpuMat& mtx, GpuMat& vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null()); - - - ///////////////////////////// Calibration 3D ////////////////////////////////// - - CV_EXPORTS void transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, - GpuMat& dst, Stream& stream = Stream::Null()); - - CV_EXPORTS void projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, - const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, - Stream& stream = Stream::Null()); - - CV_EXPORTS void solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat, - const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess=false, - int num_iters=100, float max_dist=8.0, int min_inlier_count=100, - vector* inliers=NULL); - - //////////////////////////////// Image Labeling //////////////////////////////// - - //!performs labeling via graph cuts - CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& stream = Stream::Null()); - - ////////////////////////////////// Histograms ////////////////////////////////// - - //! Compute levels with even distribution. levels will have 1 row and nLevels cols and CV_32SC1 type. - CV_EXPORTS void evenLevels(GpuMat& levels, int nLevels, int lowerLevel, int upperLevel); - //! Calculates histogram with evenly distributed bins for signle channel source. - //! Supports CV_8UC1, CV_16UC1 and CV_16SC1 source types. - //! Output hist will have one row and histSize cols and CV_32SC1 type. - CV_EXPORTS void histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null()); - CV_EXPORTS void histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null()); - //! Calculates histogram with evenly distributed bins for four-channel source. - //! All channels of source are processed separately. - //! Supports CV_8UC4, CV_16UC4 and CV_16SC4 source types. - //! Output hist[i] will have one row and histSize[i] cols and CV_32SC1 type. - CV_EXPORTS void histEven(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null()); - CV_EXPORTS void histEven(const GpuMat& src, GpuMat hist[4], GpuMat& buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null()); - //! Calculates histogram with bins determined by levels array. - //! levels must have one row and CV_32SC1 type if source has integer type or CV_32FC1 otherwise. - //! Supports CV_8UC1, CV_16UC1, CV_16SC1 and CV_32FC1 source types. - //! Output hist will have one row and (levels.cols-1) cols and CV_32SC1 type. - CV_EXPORTS void histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, Stream& stream = Stream::Null()); - CV_EXPORTS void histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, Stream& stream = Stream::Null()); - //! Calculates histogram with bins determined by levels array. - //! All levels must have one row and CV_32SC1 type if source has integer type or CV_32FC1 otherwise. - //! All channels of source are processed separately. - //! Supports CV_8UC4, CV_16UC4, CV_16SC4 and CV_32FC4 source types. - //! Output hist[i] will have one row and (levels[i].cols-1) cols and CV_32SC1 type. 
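As a usage note for the gpu::convolve / ConvolveBuf pair declared earlier in this header: the buffer object lets repeated convolutions of same-sized inputs reuse the internal FFT workspaces instead of reallocating them on every call. Below is a minimal sketch, not part of the header itself; it assumes 32-bit floating point single-channel inputs (the only type convolve supports here), and the include path and helper function name are illustrative assumptions.

#include <opencv2/gpu/gpu.hpp>   // assumed include path for this module in OpenCV 2.x

void convolveExample(const cv::Mat& image32f, const cv::Mat& kernel32f)
{
    // Upload CV_32FC1 inputs; the result is also CV_32FC1.
    cv::gpu::GpuMat d_image(image32f), d_templ(kernel32f), d_result;

    // Pre-size the FFT working buffers once and reuse them across calls.
    cv::gpu::ConvolveBuf buf(image32f.size(), kernel32f.size());

    // ccorr = true would compute cross-correlation instead of convolution.
    cv::gpu::convolve(d_image, d_templ, d_result, false, buf);

    cv::Mat result;
    d_result.download(result);   // blocking copy back to host memory
}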
- CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null()); - CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], GpuMat& buf, Stream& stream = Stream::Null()); - - //! Calculates histogram for 8u one channel image - //! Output hist will have one row, 256 cols and CV32SC1 type. - CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null()); - CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null()); - //! normalizes the grayscale image brightness and contrast by normalizing its histogram - CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()); - CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream = Stream::Null()); - CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null()); - - //////////////////////////////// StereoBM_GPU //////////////////////////////// - - class CV_EXPORTS StereoBM_GPU - { - public: - enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 }; - - enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 }; - - //! the default constructor - StereoBM_GPU(); - //! the full constructor taking the camera-specific preset, number of disparities and the SAD window size. ndisparities must be multiple of 8. - StereoBM_GPU(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ); - - //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair - //! Output disparity has CV_8U type. - void operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null()); - - //! Some heuristics that tries to estmate - // if current GPU will be faster than CPU in this algorithm. - // It queries current active device. - static bool checkIfGpuCallReasonable(); - - int preset; - int ndisp; - int winSize; - - // If avergeTexThreshold == 0 => post procesing is disabled - // If avergeTexThreshold != 0 then disparity is set 0 in each point (x,y) where for left image - // SumOfHorizontalGradiensInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold - // i.e. input left image is low textured. - float avergeTexThreshold; - private: - GpuMat minSSD, leBuf, riBuf; - }; - - ////////////////////////// StereoBeliefPropagation /////////////////////////// - // "Efficient Belief Propagation for Early Vision" - // P.Felzenszwalb - - class CV_EXPORTS StereoBeliefPropagation - { - public: - enum { DEFAULT_NDISP = 64 }; - enum { DEFAULT_ITERS = 5 }; - enum { DEFAULT_LEVELS = 5 }; - - static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels); - - //! the default constructor - explicit StereoBeliefPropagation(int ndisp = DEFAULT_NDISP, - int iters = DEFAULT_ITERS, - int levels = DEFAULT_LEVELS, - int msg_type = CV_32F); - - //! the full constructor taking the number of disparities, number of BP iterations on each level, - //! number of levels, truncation of data cost, data weight, - //! truncation of discontinuity cost and discontinuity single jump - //! DataTerm = data_weight * min(fabs(I2-I1), max_data_term) - //! DiscTerm = min(disc_single_jump * fabs(f1-f2), max_disc_term) - //! 
please see paper for more details - StereoBeliefPropagation(int ndisp, int iters, int levels, - float max_data_term, float data_weight, - float max_disc_term, float disc_single_jump, - int msg_type = CV_32F); - - //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair, - //! if disparity is empty output type will be CV_16S else output type will be disparity.type(). - void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null()); - - - //! version for user specified data term - void operator()(const GpuMat& data, GpuMat& disparity, Stream& stream = Stream::Null()); - - int ndisp; - - int iters; - int levels; - - float max_data_term; - float data_weight; - float max_disc_term; - float disc_single_jump; - - int msg_type; - private: - GpuMat u, d, l, r, u2, d2, l2, r2; - std::vector datas; - GpuMat out; - }; - - /////////////////////////// StereoConstantSpaceBP /////////////////////////// - // "A Constant-Space Belief Propagation Algorithm for Stereo Matching" - // Qingxiong Yang, Liang Wang, Narendra Ahuja - // http://vision.ai.uiuc.edu/~qyang6/ - - class CV_EXPORTS StereoConstantSpaceBP - { - public: - enum { DEFAULT_NDISP = 128 }; - enum { DEFAULT_ITERS = 8 }; - enum { DEFAULT_LEVELS = 4 }; - enum { DEFAULT_NR_PLANE = 4 }; - - static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane); - - //! the default constructor - explicit StereoConstantSpaceBP(int ndisp = DEFAULT_NDISP, - int iters = DEFAULT_ITERS, - int levels = DEFAULT_LEVELS, - int nr_plane = DEFAULT_NR_PLANE, - int msg_type = CV_32F); - - //! the full constructor taking the number of disparities, number of BP iterations on each level, - //! number of levels, number of active disparity on the first level, truncation of data cost, data weight, - //! truncation of discontinuity cost, discontinuity single jump and minimum disparity threshold - StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane, - float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, - int min_disp_th = 0, - int msg_type = CV_32F); - - //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair, - //! if disparity is empty output type will be CV_16S else output type will be disparity.type(). - void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null()); - - int ndisp; - - int iters; - int levels; - - int nr_plane; - - float max_data_term; - float data_weight; - float max_disc_term; - float disc_single_jump; - - int min_disp_th; - - int msg_type; - - bool use_local_init_data_cost; - private: - GpuMat u[2], d[2], l[2], r[2]; - GpuMat disp_selected_pyr[2]; - - GpuMat data_cost; - GpuMat data_cost_selected; - - GpuMat temp; - - GpuMat out; - }; - - /////////////////////////// DisparityBilateralFilter /////////////////////////// - // Disparity map refinement using joint bilateral filtering given a single color image. - // Qingxiong Yang, Liang Wang, Narendra Ahuja - // http://vision.ai.uiuc.edu/~qyang6/ - - class CV_EXPORTS DisparityBilateralFilter - { - public: - enum { DEFAULT_NDISP = 64 }; - enum { DEFAULT_RADIUS = 3 }; - enum { DEFAULT_ITERS = 1 }; +struct CV_EXPORTS CannyBuf; - //! 
the default constructor - explicit DisparityBilateralFilter(int ndisp = DEFAULT_NDISP, int radius = DEFAULT_RADIUS, int iters = DEFAULT_ITERS); +CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false); +CV_EXPORTS void Canny(const GpuMat& image, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false); +CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false); +CV_EXPORTS void Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& edges, double low_thresh, double high_thresh, bool L2gradient = false); - //! the full constructor taking the number of disparities, filter radius, - //! number of iterations, truncation of data continuity, truncation of disparity continuity - //! and filter range sigma - DisparityBilateralFilter(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold, float sigma_range); +struct CV_EXPORTS CannyBuf +{ + CannyBuf() {} + explicit CannyBuf(const Size& image_size, int apperture_size = 3) {create(image_size, apperture_size);} + CannyBuf(const GpuMat& dx_, const GpuMat& dy_); - //! the disparity map refinement operator. Refine disparity map using joint bilateral filtering given a single color image. - //! disparity must have CV_8U or CV_16S type, image must have CV_8UC1 or CV_8UC3 type. - void operator()(const GpuMat& disparity, const GpuMat& image, GpuMat& dst, Stream& stream = Stream::Null()); + void create(const Size& image_size, int apperture_size = 3); + + void release(); - private: - int ndisp; - int radius; - int iters; + GpuMat dx, dy; + GpuMat dx_buf, dy_buf; + GpuMat edgeBuf; + GpuMat trackBuf1, trackBuf2; + Ptr filterDX, filterDY; +}; - float edge_threshold; - float max_disc_threshold; - float sigma_range; +////////////////////////////// Matrix reductions ////////////////////////////// - GpuMat table_color; - GpuMat table_space; - }; +//! computes mean value and standard deviation of all or selected array elements +//! supports only CV_8UC1 type +CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev); + +//! computes norm of array +//! supports NORM_INF, NORM_L1, NORM_L2 +//! supports all matrices except 64F +CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2); + +//! computes norm of array +//! supports NORM_INF, NORM_L1, NORM_L2 +//! supports all matrices except 64F +CV_EXPORTS double norm(const GpuMat& src1, int normType, GpuMat& buf); + +//! computes norm of the difference between two arrays +//! supports NORM_INF, NORM_L1, NORM_L2 +//! supports only CV_8UC1 type +CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2); + +//! computes sum of array elements +//! supports only single channel images +CV_EXPORTS Scalar sum(const GpuMat& src); + +//! computes sum of array elements +//! supports only single channel images +CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf); + +//! computes sum of array elements absolute values +//! supports only single channel images +CV_EXPORTS Scalar absSum(const GpuMat& src); + +//! computes sum of array elements absolute values +//! supports only single channel images +CV_EXPORTS Scalar absSum(const GpuMat& src, GpuMat& buf); + +//! computes squared sum of array elements +//! supports only single channel images +CV_EXPORTS Scalar sqrSum(const GpuMat& src); + +//! 
computes squared sum of array elements +//! supports only single channel images +CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf); + +//! finds global minimum and maximum array elements and returns their values +CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat()); + +//! finds global minimum and maximum array elements and returns their values +CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf); + +//! finds global minimum and maximum array elements and returns their values with locations +CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0, + const GpuMat& mask=GpuMat()); + +//! finds global minimum and maximum array elements and returns their values with locations +CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, + const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf); + +//! counts non-zero array elements +CV_EXPORTS int countNonZero(const GpuMat& src); + +//! counts non-zero array elements +CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf); + +//! reduces a matrix to a vector +CV_EXPORTS void reduce(const GpuMat& mtx, GpuMat& vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null()); + + +///////////////////////////// Calibration 3D ////////////////////////////////// + +CV_EXPORTS void transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, + GpuMat& dst, Stream& stream = Stream::Null()); + +CV_EXPORTS void projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, + const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, + Stream& stream = Stream::Null()); + +CV_EXPORTS void solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat, + const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess=false, + int num_iters=100, float max_dist=8.0, int min_inlier_count=100, + std::vector* inliers=NULL); + +//////////////////////////////// Image Labeling //////////////////////////////// + +//!performs labeling via graph cuts +CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, + GpuMat& buf, Stream& stream = Stream::Null()); + +////////////////////////////////// Histograms ////////////////////////////////// + +//! Compute levels with even distribution. levels will have 1 row and nLevels cols and CV_32SC1 type. +CV_EXPORTS void evenLevels(GpuMat& levels, int nLevels, int lowerLevel, int upperLevel); +//! Calculates histogram with evenly distributed bins for signle channel source. +//! Supports CV_8UC1, CV_16UC1 and CV_16SC1 source types. +//! Output hist will have one row and histSize cols and CV_32SC1 type. +CV_EXPORTS void histEven(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null()); +CV_EXPORTS void histEven(const GpuMat& src, GpuMat& hist, GpuMat& buf, int histSize, int lowerLevel, int upperLevel, Stream& stream = Stream::Null()); +//! Calculates histogram with evenly distributed bins for four-channel source. +//! All channels of source are processed separately. +//! Supports CV_8UC4, CV_16UC4 and CV_16SC4 source types. +//! Output hist[i] will have one row and histSize[i] cols and CV_32SC1 type. 
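The buf-taking overloads of the reductions above (sum, absSum, sqrSum, norm, minMax, minMaxLoc, countNonZero) exist so that scratch memory is allocated once and then reused when the same reduction runs many times. A minimal sketch follows, assuming a CV_8UC1 image already resident on the device; the loop, variable names and helper function name are illustrative only.

// assumes #include <opencv2/gpu/gpu.hpp> (OpenCV 2.x)
void reductionExample(const cv::gpu::GpuMat& d_src)    // assume CV_8UC1 input
{
    cv::gpu::GpuMat sumBuf, minMaxBuf;                  // sized on first use, reused afterwards

    for (int i = 0; i < 100; ++i)
    {
        // Per-iteration results; the buffers are not reallocated after the first pass.
        cv::Scalar total = cv::gpu::sum(d_src, sumBuf);

        double minVal = 0.0, maxVal = 0.0;
        cv::gpu::minMax(d_src, &minVal, &maxVal, cv::gpu::GpuMat(), minMaxBuf);
    }
}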
+CV_EXPORTS void histEven(const GpuMat& src, GpuMat hist[4], int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null()); +CV_EXPORTS void histEven(const GpuMat& src, GpuMat hist[4], GpuMat& buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream = Stream::Null()); +//! Calculates histogram with bins determined by levels array. +//! levels must have one row and CV_32SC1 type if source has integer type or CV_32FC1 otherwise. +//! Supports CV_8UC1, CV_16UC1, CV_16SC1 and CV_32FC1 source types. +//! Output hist will have one row and (levels.cols-1) cols and CV_32SC1 type. +CV_EXPORTS void histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, Stream& stream = Stream::Null()); +CV_EXPORTS void histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, Stream& stream = Stream::Null()); +//! Calculates histogram with bins determined by levels array. +//! All levels must have one row and CV_32SC1 type if source has integer type or CV_32FC1 otherwise. +//! All channels of source are processed separately. +//! Supports CV_8UC4, CV_16UC4, CV_16SC4 and CV_32FC4 source types. +//! Output hist[i] will have one row and (levels[i].cols-1) cols and CV_32SC1 type. +CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null()); +CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], GpuMat& buf, Stream& stream = Stream::Null()); + +//! Calculates histogram for 8u one channel image +//! Output hist will have one row, 256 cols and CV32SC1 type. +CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null()); +CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null()); + +//! normalizes the grayscale image brightness and contrast by normalizing its histogram +CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()); +CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream& stream = Stream::Null()); +CV_EXPORTS void equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null()); + +//////////////////////////////// StereoBM_GPU //////////////////////////////// + +class CV_EXPORTS StereoBM_GPU +{ +public: + enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 }; + enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 }; - //////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector ////////////// + //! the default constructor + StereoBM_GPU(); + //! the full constructor taking the camera-specific preset, number of disparities and the SAD window size. ndisparities must be multiple of 8. + StereoBM_GPU(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ); - struct CV_EXPORTS HOGDescriptor - { - enum { DEFAULT_WIN_SIGMA = -1 }; - enum { DEFAULT_NLEVELS = 64 }; - enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL }; + //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair + //! Output disparity has CV_8U type. + void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null()); - HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16), - Size block_stride=Size(8, 8), Size cell_size=Size(8, 8), - int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA, - double threshold_L2hys=0.2, bool gamma_correction=true, - int nlevels=DEFAULT_NLEVELS); + //! 
Some heuristics that tries to estmate + // if current GPU will be faster than CPU in this algorithm. + // It queries current active device. + static bool checkIfGpuCallReasonable(); - size_t getDescriptorSize() const; - size_t getBlockHistogramSize() const; + int preset; + int ndisp; + int winSize; - void setSVMDetector(const vector& detector); + // If avergeTexThreshold == 0 => post procesing is disabled + // If avergeTexThreshold != 0 then disparity is set 0 in each point (x,y) where for left image + // SumOfHorizontalGradiensInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold + // i.e. input left image is low textured. + float avergeTexThreshold; - static vector getDefaultPeopleDetector(); - static vector getPeopleDetector48x96(); - static vector getPeopleDetector64x128(); +private: + GpuMat minSSD, leBuf, riBuf; +}; - void detect(const GpuMat& img, vector& found_locations, - double hit_threshold=0, Size win_stride=Size(), - Size padding=Size()); +////////////////////////// StereoBeliefPropagation /////////////////////////// +// "Efficient Belief Propagation for Early Vision" +// P.Felzenszwalb - void detectMultiScale(const GpuMat& img, vector& found_locations, - double hit_threshold=0, Size win_stride=Size(), - Size padding=Size(), double scale0=1.05, - int group_threshold=2); +class CV_EXPORTS StereoBeliefPropagation +{ +public: + enum { DEFAULT_NDISP = 64 }; + enum { DEFAULT_ITERS = 5 }; + enum { DEFAULT_LEVELS = 5 }; + + static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels); + + //! the default constructor + explicit StereoBeliefPropagation(int ndisp = DEFAULT_NDISP, + int iters = DEFAULT_ITERS, + int levels = DEFAULT_LEVELS, + int msg_type = CV_32F); + + //! the full constructor taking the number of disparities, number of BP iterations on each level, + //! number of levels, truncation of data cost, data weight, + //! truncation of discontinuity cost and discontinuity single jump + //! DataTerm = data_weight * min(fabs(I2-I1), max_data_term) + //! DiscTerm = min(disc_single_jump * fabs(f1-f2), max_disc_term) + //! please see paper for more details + StereoBeliefPropagation(int ndisp, int iters, int levels, + float max_data_term, float data_weight, + float max_disc_term, float disc_single_jump, + int msg_type = CV_32F); + + //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair, + //! if disparity is empty output type will be CV_16S else output type will be disparity.type(). + void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null()); + + + //! 
version for user specified data term + void operator()(const GpuMat& data, GpuMat& disparity, Stream& stream = Stream::Null()); + + int ndisp; + + int iters; + int levels; + + float max_data_term; + float data_weight; + float max_disc_term; + float disc_single_jump; + + int msg_type; +private: + GpuMat u, d, l, r, u2, d2, l2, r2; + std::vector datas; + GpuMat out; +}; + +/////////////////////////// StereoConstantSpaceBP /////////////////////////// +// "A Constant-Space Belief Propagation Algorithm for Stereo Matching" +// Qingxiong Yang, Liang Wang, Narendra Ahuja +// http://vision.ai.uiuc.edu/~qyang6/ + +class CV_EXPORTS StereoConstantSpaceBP +{ +public: + enum { DEFAULT_NDISP = 128 }; + enum { DEFAULT_ITERS = 8 }; + enum { DEFAULT_LEVELS = 4 }; + enum { DEFAULT_NR_PLANE = 4 }; - void getDescriptors(const GpuMat& img, Size win_stride, - GpuMat& descriptors, - int descr_format=DESCR_FORMAT_COL_BY_COL); + static void estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane); - Size win_size; - Size block_size; - Size block_stride; - Size cell_size; - int nbins; - double win_sigma; - double threshold_L2hys; - bool gamma_correction; - int nlevels; + //! the default constructor + explicit StereoConstantSpaceBP(int ndisp = DEFAULT_NDISP, + int iters = DEFAULT_ITERS, + int levels = DEFAULT_LEVELS, + int nr_plane = DEFAULT_NR_PLANE, + int msg_type = CV_32F); - protected: - void computeBlockHistograms(const GpuMat& img); - void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle); + //! the full constructor taking the number of disparities, number of BP iterations on each level, + //! number of levels, number of active disparity on the first level, truncation of data cost, data weight, + //! truncation of discontinuity cost, discontinuity single jump and minimum disparity threshold + StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane, + float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, + int min_disp_th = 0, + int msg_type = CV_32F); - double getWinSigma() const; - bool checkDetectorSize() const; + //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair, + //! if disparity is empty output type will be CV_16S else output type will be disparity.type(). + void operator()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null()); - static int numPartsWithin(int size, int part_size, int stride); - static Size numPartsWithin(Size size, Size part_size, Size stride); + int ndisp; - // Coefficients of the separating plane - float free_coef; - GpuMat detector; - - // Results of the last classification step - GpuMat labels, labels_buf; - Mat labels_host; - - // Results of the last histogram evaluation step - GpuMat block_hists, block_hists_buf; - - // Gradients conputation results - GpuMat grad, qangle, grad_buf, qangle_buf; - - // returns subbuffer with required size, reallocates buffer if nessesary. 
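To illustrate the stereo classes declared above (StereoBM_GPU and the belief-propagation matchers), here is a minimal sketch assuming an already rectified 8-bit grayscale pair; the helper function name is illustrative and the parameter values simply repeat the defaults documented in this header.

// assumes #include <opencv2/gpu/gpu.hpp> (OpenCV 2.x)
void stereoExample(const cv::Mat& left8u, const cv::Mat& right8u)
{
    cv::gpu::GpuMat d_left(left8u), d_right(right8u), d_disp;

    // Block matching: preset, number of disparities (must be a multiple of 8), SAD window size.
    cv::gpu::StereoBM_GPU bm(cv::gpu::StereoBM_GPU::BASIC_PRESET, 64, 19);
    bm(d_left, d_right, d_disp);                        // disparity comes back as CV_8U

    // Belief propagation as an alternative; with an empty output it produces CV_16S disparities.
    cv::gpu::StereoBeliefPropagation bp(64, 5, 5);
    cv::gpu::GpuMat d_dispBP;
    bp(d_left, d_right, d_dispBP);

    cv::Mat disparity;
    d_disp.download(disparity);
}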
- static GpuMat getBuffer(const Size& sz, int type, GpuMat& buf); - static GpuMat getBuffer(int rows, int cols, int type, GpuMat& buf); - - std::vector image_scales; - }; - - - ////////////////////////////////// BruteForceMatcher ////////////////////////////////// - - class CV_EXPORTS BruteForceMatcher_GPU_base - { - public: - enum DistType {L1Dist = 0, L2Dist, HammingDist}; - - explicit BruteForceMatcher_GPU_base(DistType distType = L2Dist); - - // Add descriptors to train descriptor collection - void add(const std::vector& descCollection); - - // Get train descriptors collection - const std::vector& getTrainDescriptors() const; - - // Clear train descriptors collection - void clear(); - - // Return true if there are not train descriptors in collection - bool empty() const; - - // Return true if the matcher supports mask in match methods - bool isMaskSupported() const; - - // Find one best match for each query descriptor - void matchSingle(const GpuMat& query, const GpuMat& train, - GpuMat& trainIdx, GpuMat& distance, - const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null()); - - // Download trainIdx and distance and convert it to CPU vector with DMatch - static void matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector& matches); - // Convert trainIdx and distance to vector with DMatch - static void matchConvert(const Mat& trainIdx, const Mat& distance, std::vector& matches); - - // Find one best match for each query descriptor - void match(const GpuMat& query, const GpuMat& train, std::vector& matches, const GpuMat& mask = GpuMat()); - - // Make gpu collection of trains and masks in suitable format for matchCollection function - void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, const std::vector& masks = std::vector()); - - // Find one best match from train collection for each query descriptor - void matchCollection(const GpuMat& query, const GpuMat& trainCollection, - GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, - const GpuMat& masks = GpuMat(), Stream& stream = Stream::Null()); - - // Download trainIdx, imgIdx and distance and convert it to vector with DMatch - static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector& matches); - // Convert trainIdx, imgIdx and distance to vector with DMatch - static void matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector& matches); - - // Find one best match from train collection for each query descriptor. - void match(const GpuMat& query, std::vector& matches, const std::vector& masks = std::vector()); - - // Find k best matches for each query descriptor (in increasing order of distances) - void knnMatchSingle(const GpuMat& query, const GpuMat& train, - GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, - const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null()); - - // Download trainIdx and distance and convert it to vector with DMatch - // compactResult is used when mask is not empty. If compactResult is false matches - // vector will have the same size as queryDescriptors rows. If compactResult is true - // matches vector will not contain matches for fully masked out query descriptors. 
- static void knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, - std::vector< std::vector >& matches, bool compactResult = false); - // Convert trainIdx and distance to vector with DMatch - static void knnMatchConvert(const Mat& trainIdx, const Mat& distance, - std::vector< std::vector >& matches, bool compactResult = false); - - // Find k best matches for each query descriptor (in increasing order of distances). - // compactResult is used when mask is not empty. If compactResult is false matches - // vector will have the same size as queryDescriptors rows. If compactResult is true - // matches vector will not contain matches for fully masked out query descriptors. - void knnMatch(const GpuMat& query, const GpuMat& train, - std::vector< std::vector >& matches, int k, const GpuMat& mask = GpuMat(), - bool compactResult = false); - - // Find k best matches from train collection for each query descriptor (in increasing order of distances) - void knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection, - GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, - const GpuMat& maskCollection = GpuMat(), Stream& stream = Stream::Null()); - - // Download trainIdx and distance and convert it to vector with DMatch - // compactResult is used when mask is not empty. If compactResult is false matches - // vector will have the same size as queryDescriptors rows. If compactResult is true - // matches vector will not contain matches for fully masked out query descriptors. - static void knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, - std::vector< std::vector >& matches, bool compactResult = false); - // Convert trainIdx and distance to vector with DMatch - static void knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, - std::vector< std::vector >& matches, bool compactResult = false); - - // Find k best matches for each query descriptor (in increasing order of distances). - // compactResult is used when mask is not empty. If compactResult is false matches - // vector will have the same size as queryDescriptors rows. If compactResult is true - // matches vector will not contain matches for fully masked out query descriptors. - void knnMatch(const GpuMat& query, std::vector< std::vector >& matches, int k, - const std::vector& masks = std::vector(), bool compactResult = false); - - // Find best matches for each query descriptor which have distance less than maxDistance. - // nMatches.at(0, queryIdx) will contain matches count for queryIdx. - // carefully nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches, - // because it didn't have enough memory. - // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10), - // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches - // Matches doesn't sorted. - void radiusMatchSingle(const GpuMat& query, const GpuMat& train, - GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, - const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null()); - - // Download trainIdx, nMatches and distance and convert it to vector with DMatch. - // matches will be sorted in increasing order of distances. - // compactResult is used when mask is not empty. If compactResult is false matches - // vector will have the same size as queryDescriptors rows. 
If compactResult is true - // matches vector will not contain matches for fully masked out query descriptors. - static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches, - std::vector< std::vector >& matches, bool compactResult = false); - // Convert trainIdx, nMatches and distance to vector with DMatch. - static void radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches, - std::vector< std::vector >& matches, bool compactResult = false); - - // Find best matches for each query descriptor which have distance less than maxDistance - // in increasing order of distances). - void radiusMatch(const GpuMat& query, const GpuMat& train, - std::vector< std::vector >& matches, float maxDistance, - const GpuMat& mask = GpuMat(), bool compactResult = false); - - // Find best matches for each query descriptor which have distance less than maxDistance. - // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10), - // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches - // Matches doesn't sorted. - void radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, - const std::vector& masks = std::vector(), Stream& stream = Stream::Null()); - - // Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch. - // matches will be sorted in increasing order of distances. - // compactResult is used when mask is not empty. If compactResult is false matches - // vector will have the same size as queryDescriptors rows. If compactResult is true - // matches vector will not contain matches for fully masked out query descriptors. - static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches, - std::vector< std::vector >& matches, bool compactResult = false); - // Convert trainIdx, nMatches and distance to vector with DMatch. - static void radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches, - std::vector< std::vector >& matches, bool compactResult = false); - - // Find best matches from train collection for each query descriptor which have distance less than - // maxDistance (in increasing order of distances). 
- void radiusMatch(const GpuMat& query, std::vector< std::vector >& matches, float maxDistance, - const std::vector& masks = std::vector(), bool compactResult = false); - - DistType distType; - - private: - std::vector trainDescCollection; - }; - - template - class CV_EXPORTS BruteForceMatcher_GPU; - - template - class CV_EXPORTS BruteForceMatcher_GPU< L1 > : public BruteForceMatcher_GPU_base - { - public: - explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(L1Dist) {} - explicit BruteForceMatcher_GPU(L1 /*d*/) : BruteForceMatcher_GPU_base(L1Dist) {} - }; - template - class CV_EXPORTS BruteForceMatcher_GPU< L2 > : public BruteForceMatcher_GPU_base - { - public: - explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(L2Dist) {} - explicit BruteForceMatcher_GPU(L2 /*d*/) : BruteForceMatcher_GPU_base(L2Dist) {} - }; - template <> class CV_EXPORTS BruteForceMatcher_GPU< Hamming > : public BruteForceMatcher_GPU_base - { - public: - explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(HammingDist) {} - explicit BruteForceMatcher_GPU(Hamming /*d*/) : BruteForceMatcher_GPU_base(HammingDist) {} - }; - - ////////////////////////////////// CascadeClassifier_GPU ////////////////////////////////////////// - // The cascade classifier class for object detection. - class CV_EXPORTS CascadeClassifier_GPU - { - public: - CascadeClassifier_GPU(); - CascadeClassifier_GPU(const string& filename); - ~CascadeClassifier_GPU(); - - bool empty() const; - bool load(const string& filename); - void release(); - - /* returns number of detected objects */ - int detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size()); - - bool findLargestObject; - bool visualizeInPlace; - - Size getClassifierSize() const; - private: - - struct CascadeClassifierImpl; - CascadeClassifierImpl* impl; - }; - - ////////////////////////////////// SURF ////////////////////////////////////////// - - class CV_EXPORTS SURF_GPU : public CvSURFParams - { - public: - enum KeypointLayout - { - SF_X = 0, - SF_Y, - SF_LAPLACIAN, - SF_SIZE, - SF_DIR, - SF_HESSIAN, - SF_FEATURE_STRIDE - }; - - //! the default constructor - SURF_GPU(); - //! the full constructor taking all the necessary parameters - explicit SURF_GPU(double _hessianThreshold, int _nOctaves=4, - int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false); - - //! returns the descriptor size in float's (64 or 128) - int descriptorSize() const; - - //! upload host keypoints to device memory - void uploadKeypoints(const vector& keypoints, GpuMat& keypointsGPU); - //! download keypoints from device to host memory - void downloadKeypoints(const GpuMat& keypointsGPU, vector& keypoints); - - //! download descriptors from device to host memory - void downloadDescriptors(const GpuMat& descriptorsGPU, vector& descriptors); - - //! finds the keypoints using fast hessian detector used in SURF - //! supports CV_8UC1 images - //! keypoints will have nFeature cols and 6 rows - //! keypoints.ptr(SF_X)[i] will contain x coordinate of i'th feature - //! keypoints.ptr(SF_Y)[i] will contain y coordinate of i'th feature - //! keypoints.ptr(SF_LAPLACIAN)[i] will contain laplacian sign of i'th feature - //! keypoints.ptr(SF_SIZE)[i] will contain size of i'th feature - //! keypoints.ptr(SF_DIR)[i] will contain orientation of i'th feature - //! 
keypoints.ptr(SF_HESSIAN)[i] will contain response of i'th feature - void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints); - //! finds the keypoints and computes their descriptors. - //! Optionally it can compute descriptors for the user-provided keypoints and recompute keypoints direction - void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors, - bool useProvidedKeypoints = false); - - void operator()(const GpuMat& img, const GpuMat& mask, std::vector& keypoints); - void operator()(const GpuMat& img, const GpuMat& mask, std::vector& keypoints, GpuMat& descriptors, - bool useProvidedKeypoints = false); - - void operator()(const GpuMat& img, const GpuMat& mask, std::vector& keypoints, std::vector& descriptors, - bool useProvidedKeypoints = false); - - void releaseMemory(); - - //! max keypoints = min(keypointsRatio * img.size().area(), 65535) - float keypointsRatio; - - GpuMat sum, mask1, maskSum, intBuffer; - - GpuMat det, trace; - - GpuMat maxPosBuffer; - }; - - ////////////////////////////////// Optical Flow ////////////////////////////////////////// - - class CV_EXPORTS BroxOpticalFlow - { - public: - BroxOpticalFlow(float alpha_, float gamma_, float scale_factor_, int inner_iterations_, int outer_iterations_, int solver_iterations_) : - alpha(alpha_), gamma(gamma_), scale_factor(scale_factor_), - inner_iterations(inner_iterations_), outer_iterations(outer_iterations_), solver_iterations(solver_iterations_) - { - } - - //! Compute optical flow - //! frame0 - source frame (supports only CV_32FC1 type) - //! frame1 - frame to track (with the same size and type as frame0) - //! u - flow horizontal component (along x axis) - //! v - flow vertical component (along y axis) - void operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& stream = Stream::Null()); - - //! flow smoothness - float alpha; - - //! gradient constancy importance - float gamma; - - //! pyramid scale factor - float scale_factor; - - //! number of lagged non-linearity iterations (inner loop) - int inner_iterations; - - //! number of warping iterations (number of pyramid levels) - int outer_iterations; - - //! number of linear system solver iterations - int solver_iterations; - - GpuMat buf; - }; - - //! Interpolate frames (images) using provided optical flow (displacement field). - //! frame0 - frame 0 (32-bit floating point images, single channel) - //! frame1 - frame 1 (the same type and size) - //! fu - forward horizontal displacement - //! fv - forward vertical displacement - //! bu - backward horizontal displacement - //! bv - backward vertical displacement - //! pos - new frame position - //! newFrame - new frame - //! buf - temporary buffer, will have width x 6*height size, CV_32FC1 type and contain 6 GpuMat; - //! occlusion masks 0, occlusion masks 1, - //! interpolated forward flow 0, interpolated forward flow 1, - //! interpolated backward flow 0, interpolated backward flow 1 - //! - CV_EXPORTS void interpolateFrames(const GpuMat& frame0, const GpuMat& frame1, - const GpuMat& fu, const GpuMat& fv, - const GpuMat& bu, const GpuMat& bv, - float pos, GpuMat& newFrame, GpuMat& buf, - Stream& stream = Stream::Null()); + int iters; + int levels; - } + int nr_plane; - //! Speckle filtering - filters small connected components on diparity image. - //! It sets pixel (x,y) to newVal if it coresponds to small CC with size < maxSpeckleSize. - //! 
Threshold for border between CC is diffThreshold; - CV_EXPORTS void filterSpeckles( Mat& img, uchar newVal, int maxSpeckleSize, uchar diffThreshold, Mat& buf); + float max_data_term; + float data_weight; + float max_disc_term; + float disc_single_jump; -} -#include "opencv2/gpu/matrix_operations.hpp" + int min_disp_th; + + int msg_type; + + bool use_local_init_data_cost; +private: + GpuMat u[2], d[2], l[2], r[2]; + GpuMat disp_selected_pyr[2]; + + GpuMat data_cost; + GpuMat data_cost_selected; + + GpuMat temp; + + GpuMat out; +}; + +/////////////////////////// DisparityBilateralFilter /////////////////////////// +// Disparity map refinement using joint bilateral filtering given a single color image. +// Qingxiong Yang, Liang Wang, Narendra Ahuja +// http://vision.ai.uiuc.edu/~qyang6/ + +class CV_EXPORTS DisparityBilateralFilter +{ +public: + enum { DEFAULT_NDISP = 64 }; + enum { DEFAULT_RADIUS = 3 }; + enum { DEFAULT_ITERS = 1 }; + + //! the default constructor + explicit DisparityBilateralFilter(int ndisp = DEFAULT_NDISP, int radius = DEFAULT_RADIUS, int iters = DEFAULT_ITERS); + + //! the full constructor taking the number of disparities, filter radius, + //! number of iterations, truncation of data continuity, truncation of disparity continuity + //! and filter range sigma + DisparityBilateralFilter(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold, float sigma_range); + + //! the disparity map refinement operator. Refine disparity map using joint bilateral filtering given a single color image. + //! disparity must have CV_8U or CV_16S type, image must have CV_8UC1 or CV_8UC3 type. + void operator()(const GpuMat& disparity, const GpuMat& image, GpuMat& dst, Stream& stream = Stream::Null()); + +private: + int ndisp; + int radius; + int iters; + + float edge_threshold; + float max_disc_threshold; + float sigma_range; + + GpuMat table_color; + GpuMat table_space; +}; + + +//////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector ////////////// + +struct CV_EXPORTS HOGDescriptor +{ + enum { DEFAULT_WIN_SIGMA = -1 }; + enum { DEFAULT_NLEVELS = 64 }; + enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL }; + + HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16), + Size block_stride=Size(8, 8), Size cell_size=Size(8, 8), + int nbins=9, double win_sigma=DEFAULT_WIN_SIGMA, + double threshold_L2hys=0.2, bool gamma_correction=true, + int nlevels=DEFAULT_NLEVELS); + + size_t getDescriptorSize() const; + size_t getBlockHistogramSize() const; + + void setSVMDetector(const vector& detector); + + static vector getDefaultPeopleDetector(); + static vector getPeopleDetector48x96(); + static vector getPeopleDetector64x128(); + + void detect(const GpuMat& img, vector& found_locations, + double hit_threshold=0, Size win_stride=Size(), + Size padding=Size()); + + void detectMultiScale(const GpuMat& img, vector& found_locations, + double hit_threshold=0, Size win_stride=Size(), + Size padding=Size(), double scale0=1.05, + int group_threshold=2); + + void getDescriptors(const GpuMat& img, Size win_stride, + GpuMat& descriptors, + int descr_format=DESCR_FORMAT_COL_BY_COL); + + Size win_size; + Size block_size; + Size block_stride; + Size cell_size; + int nbins; + double win_sigma; + double threshold_L2hys; + bool gamma_correction; + int nlevels; + +protected: + void computeBlockHistograms(const GpuMat& img); + void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle); + + double getWinSigma() const; + bool 
checkDetectorSize() const; + + static int numPartsWithin(int size, int part_size, int stride); + static Size numPartsWithin(Size size, Size part_size, Size stride); + + // Coefficients of the separating plane + float free_coef; + GpuMat detector; + + // Results of the last classification step + GpuMat labels, labels_buf; + Mat labels_host; + + // Results of the last histogram evaluation step + GpuMat block_hists, block_hists_buf; + + // Gradients conputation results + GpuMat grad, qangle, grad_buf, qangle_buf; + + // returns subbuffer with required size, reallocates buffer if nessesary. + static GpuMat getBuffer(const Size& sz, int type, GpuMat& buf); + static GpuMat getBuffer(int rows, int cols, int type, GpuMat& buf); + + std::vector image_scales; +}; + + +////////////////////////////////// BruteForceMatcher ////////////////////////////////// + +class CV_EXPORTS BruteForceMatcher_GPU_base +{ +public: + enum DistType {L1Dist = 0, L2Dist, HammingDist}; + + explicit BruteForceMatcher_GPU_base(DistType distType = L2Dist); + + // Add descriptors to train descriptor collection + void add(const std::vector& descCollection); + + // Get train descriptors collection + const std::vector& getTrainDescriptors() const; + + // Clear train descriptors collection + void clear(); + + // Return true if there are not train descriptors in collection + bool empty() const; + + // Return true if the matcher supports mask in match methods + bool isMaskSupported() const; + + // Find one best match for each query descriptor + void matchSingle(const GpuMat& query, const GpuMat& train, + GpuMat& trainIdx, GpuMat& distance, + const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null()); + + // Download trainIdx and distance and convert it to CPU vector with DMatch + static void matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector& matches); + // Convert trainIdx and distance to vector with DMatch + static void matchConvert(const Mat& trainIdx, const Mat& distance, std::vector& matches); + + // Find one best match for each query descriptor + void match(const GpuMat& query, const GpuMat& train, std::vector& matches, const GpuMat& mask = GpuMat()); + + // Make gpu collection of trains and masks in suitable format for matchCollection function + void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, const std::vector& masks = std::vector()); + + // Find one best match from train collection for each query descriptor + void matchCollection(const GpuMat& query, const GpuMat& trainCollection, + GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, + const GpuMat& masks = GpuMat(), Stream& stream = Stream::Null()); + + // Download trainIdx, imgIdx and distance and convert it to vector with DMatch + static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector& matches); + // Convert trainIdx, imgIdx and distance to vector with DMatch + static void matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector& matches); + + // Find one best match from train collection for each query descriptor. 
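As a usage note for the HOGDescriptor declared just above: a minimal pedestrian-detection sketch, assuming an 8-bit input of a layout this GPU HOG accepts (single-channel or four-channel in this OpenCV generation). The cv::Rect element type of found_locations is an assumption, since the collapsed declarations above elide template arguments, and the helper function name is illustrative.

// assumes #include <opencv2/gpu/gpu.hpp> (OpenCV 2.x)
void hogExample(const cv::Mat& frame)
{
    cv::gpu::HOGDescriptor hog;   // default 64x128 window, 16x16 blocks, 8x8 cells, 9 bins
    hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());

    cv::gpu::GpuMat d_frame(frame);

    std::vector<cv::Rect> found;  // element type assumed
    hog.detectMultiScale(d_frame, found, 0.0, cv::Size(8, 8));

    for (size_t i = 0; i < found.size(); ++i)
    {
        // use found[i], e.g. draw the detection on the host image
    }
}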
+ void match(const GpuMat& query, std::vector& matches, const std::vector& masks = std::vector()); + + // Find k best matches for each query descriptor (in increasing order of distances) + void knnMatchSingle(const GpuMat& query, const GpuMat& train, + GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, + const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null()); + + // Download trainIdx and distance and convert it to vector with DMatch + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + static void knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, + std::vector< std::vector >& matches, bool compactResult = false); + // Convert trainIdx and distance to vector with DMatch + static void knnMatchConvert(const Mat& trainIdx, const Mat& distance, + std::vector< std::vector >& matches, bool compactResult = false); + + // Find k best matches for each query descriptor (in increasing order of distances). + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + void knnMatch(const GpuMat& query, const GpuMat& train, + std::vector< std::vector >& matches, int k, const GpuMat& mask = GpuMat(), + bool compactResult = false); + + // Find k best matches from train collection for each query descriptor (in increasing order of distances) + void knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection, + GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, + const GpuMat& maskCollection = GpuMat(), Stream& stream = Stream::Null()); + + // Download trainIdx and distance and convert it to vector with DMatch + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + static void knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, + std::vector< std::vector >& matches, bool compactResult = false); + // Convert trainIdx and distance to vector with DMatch + static void knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, + std::vector< std::vector >& matches, bool compactResult = false); + + // Find k best matches for each query descriptor (in increasing order of distances). + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + void knnMatch(const GpuMat& query, std::vector< std::vector >& matches, int k, + const std::vector& masks = std::vector(), bool compactResult = false); + + // Find best matches for each query descriptor which have distance less than maxDistance. + // nMatches.at(0, queryIdx) will contain matches count for queryIdx. + // carefully nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches, + // because it didn't have enough memory. 
+ // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10), + // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches + // Matches doesn't sorted. + void radiusMatchSingle(const GpuMat& query, const GpuMat& train, + GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, + const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null()); + + // Download trainIdx, nMatches and distance and convert it to vector with DMatch. + // matches will be sorted in increasing order of distances. + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches, + std::vector< std::vector >& matches, bool compactResult = false); + // Convert trainIdx, nMatches and distance to vector with DMatch. + static void radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches, + std::vector< std::vector >& matches, bool compactResult = false); + + // Find best matches for each query descriptor which have distance less than maxDistance + // in increasing order of distances). + void radiusMatch(const GpuMat& query, const GpuMat& train, + std::vector< std::vector >& matches, float maxDistance, + const GpuMat& mask = GpuMat(), bool compactResult = false); + + // Find best matches for each query descriptor which have distance less than maxDistance. + // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10), + // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches + // Matches doesn't sorted. + void radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, + const std::vector& masks = std::vector(), Stream& stream = Stream::Null()); + + // Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch. + // matches will be sorted in increasing order of distances. + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches, + std::vector< std::vector >& matches, bool compactResult = false); + // Convert trainIdx, nMatches and distance to vector with DMatch. + static void radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches, + std::vector< std::vector >& matches, bool compactResult = false); + + // Find best matches from train collection for each query descriptor which have distance less than + // maxDistance (in increasing order of distances). 
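A minimal matching sketch for BruteForceMatcher_GPU, assuming the query and train descriptors are CV_32F GpuMats (for instance produced by the SURF_GPU class declared later in this header). The L2<float> distance functor and the cv::DMatch element type are taken from the corresponding CPU matcher API and are assumptions here, since the collapsed declarations above elide template arguments.

// assumes #include <opencv2/gpu/gpu.hpp> (OpenCV 2.x)
void matchExample(const cv::gpu::GpuMat& queryDescs, const cv::gpu::GpuMat& trainDescs)
{
    cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;   // L2 distance for float descriptors

    // One best match per query descriptor.
    std::vector<cv::DMatch> matches;
    matcher.match(queryDescs, trainDescs, matches);

    // Two nearest neighbours per query, e.g. for a ratio test.
    std::vector< std::vector<cv::DMatch> > knn;
    matcher.knnMatch(queryDescs, trainDescs, knn, 2);

    // All matches closer than a given distance threshold (results are not sorted).
    std::vector< std::vector<cv::DMatch> > close;
    matcher.radiusMatch(queryDescs, trainDescs, close, 0.25f);
}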
+ void radiusMatch(const GpuMat& query, std::vector< std::vector >& matches, float maxDistance, + const std::vector& masks = std::vector(), bool compactResult = false); + + DistType distType; + +private: + std::vector trainDescCollection; +}; + +template +class CV_EXPORTS BruteForceMatcher_GPU; + +template +class CV_EXPORTS BruteForceMatcher_GPU< L1 > : public BruteForceMatcher_GPU_base +{ +public: + explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(L1Dist) {} + explicit BruteForceMatcher_GPU(L1 /*d*/) : BruteForceMatcher_GPU_base(L1Dist) {} +}; +template +class CV_EXPORTS BruteForceMatcher_GPU< L2 > : public BruteForceMatcher_GPU_base +{ +public: + explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(L2Dist) {} + explicit BruteForceMatcher_GPU(L2 /*d*/) : BruteForceMatcher_GPU_base(L2Dist) {} +}; +template <> class CV_EXPORTS BruteForceMatcher_GPU< Hamming > : public BruteForceMatcher_GPU_base +{ +public: + explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(HammingDist) {} + explicit BruteForceMatcher_GPU(Hamming /*d*/) : BruteForceMatcher_GPU_base(HammingDist) {} +}; + +////////////////////////////////// CascadeClassifier_GPU ////////////////////////////////////////// +// The cascade classifier class for object detection. +class CV_EXPORTS CascadeClassifier_GPU +{ +public: + CascadeClassifier_GPU(); + CascadeClassifier_GPU(const std::string& filename); + ~CascadeClassifier_GPU(); + + bool empty() const; + bool load(const std::string& filename); + void release(); + + /* returns number of detected objects */ + int detectMultiScale(const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size()); + + bool findLargestObject; + bool visualizeInPlace; + + Size getClassifierSize() const; +private: + + struct CascadeClassifierImpl; + CascadeClassifierImpl* impl; +}; + +////////////////////////////////// SURF ////////////////////////////////////////// + +class CV_EXPORTS SURF_GPU : public CvSURFParams +{ +public: + enum KeypointLayout + { + SF_X = 0, + SF_Y, + SF_LAPLACIAN, + SF_SIZE, + SF_DIR, + SF_HESSIAN, + SF_FEATURE_STRIDE + }; + + //! the default constructor + SURF_GPU(); + //! the full constructor taking all the necessary parameters + explicit SURF_GPU(double _hessianThreshold, int _nOctaves=4, + int _nOctaveLayers=2, bool _extended=false, float _keypointsRatio=0.01f, bool _upright = false); + + //! returns the descriptor size in float's (64 or 128) + int descriptorSize() const; + + //! upload host keypoints to device memory + void uploadKeypoints(const vector& keypoints, GpuMat& keypointsGPU); + //! download keypoints from device to host memory + void downloadKeypoints(const GpuMat& keypointsGPU, vector& keypoints); + + //! download descriptors from device to host memory + void downloadDescriptors(const GpuMat& descriptorsGPU, vector& descriptors); + + //! finds the keypoints using fast hessian detector used in SURF + //! supports CV_8UC1 images + //! keypoints will have nFeature cols and 6 rows + //! keypoints.ptr(SF_X)[i] will contain x coordinate of i'th feature + //! keypoints.ptr(SF_Y)[i] will contain y coordinate of i'th feature + //! keypoints.ptr(SF_LAPLACIAN)[i] will contain laplacian sign of i'th feature + //! keypoints.ptr(SF_SIZE)[i] will contain size of i'th feature + //! keypoints.ptr(SF_DIR)[i] will contain orientation of i'th feature + //! keypoints.ptr(SF_HESSIAN)[i] will contain response of i'th feature + void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints); + //! 
+    //! finds the keypoints and computes their descriptors.
+    //! Optionally it can compute descriptors for the user-provided keypoints and recompute the keypoints' direction
+    void operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors,
+        bool useProvidedKeypoints = false);
+
+    void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints);
+    void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors,
+        bool useProvidedKeypoints = false);
+
+    void operator()(const GpuMat& img, const GpuMat& mask, std::vector<KeyPoint>& keypoints, std::vector<float>& descriptors,
+        bool useProvidedKeypoints = false);
+
+    void releaseMemory();
+
+    //! max keypoints = min(keypointsRatio * img.size().area(), 65535)
+    float keypointsRatio;
+
+    GpuMat sum, mask1, maskSum, intBuffer;
+
+    GpuMat det, trace;
+
+    GpuMat maxPosBuffer;
+};
+
+////////////////////////////////// Optical Flow //////////////////////////////////////////
+
+class CV_EXPORTS BroxOpticalFlow
+{
+public:
+    BroxOpticalFlow(float alpha_, float gamma_, float scale_factor_, int inner_iterations_, int outer_iterations_, int solver_iterations_) :
+        alpha(alpha_), gamma(gamma_), scale_factor(scale_factor_),
+        inner_iterations(inner_iterations_), outer_iterations(outer_iterations_), solver_iterations(solver_iterations_)
+    {
+    }
+
+    //! Compute optical flow
+    //! frame0 - source frame (supports only CV_32FC1 type)
+    //! frame1 - frame to track (with the same size and type as frame0)
+    //! u - flow horizontal component (along x axis)
+    //! v - flow vertical component (along y axis)
+    void operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& stream = Stream::Null());
+
+    //! flow smoothness
+    float alpha;
+
+    //! gradient constancy importance
+    float gamma;
+
+    //! pyramid scale factor
+    float scale_factor;
+
+    //! number of lagged non-linearity iterations (inner loop)
+    int inner_iterations;
+
+    //! number of warping iterations (number of pyramid levels)
+    int outer_iterations;
+
+    //! number of linear system solver iterations
+    int solver_iterations;
+
+    GpuMat buf;
+};
+
+//! Interpolate frames (images) using provided optical flow (displacement field).
+//! frame0   - frame 0 (32-bit floating point images, single channel)
+//! frame1   - frame 1 (the same type and size)
+//! fu       - forward horizontal displacement
+//! fv       - forward vertical displacement
+//! bu       - backward horizontal displacement
+//! bv       - backward vertical displacement
+//! pos      - new frame position
+//! newFrame - new frame
+//! buf      - temporary buffer, will have width x 6*height size, CV_32FC1 type and contain 6 GpuMat:
+//!            occlusion masks 0, occlusion masks 1,
+//!            interpolated forward flow 0, interpolated forward flow 1,
+//!            interpolated backward flow 0, interpolated backward flow 1
+//!
+CV_EXPORTS void interpolateFrames(const GpuMat& frame0, const GpuMat& frame1,
+    const GpuMat& fu, const GpuMat& fv,
+    const GpuMat& bu, const GpuMat& bv,
+    float pos, GpuMat& newFrame, GpuMat& buf,
+    Stream& stream = Stream::Null());
+
+} // namespace gpu
+
+//! Speckle filtering - filters small connected components on disparity image.
+//! It sets pixel (x,y) to newVal if it corresponds to a small CC with size < maxSpeckleSize.
+//!
Threshold for border between CC is diffThreshold; +CV_EXPORTS void filterSpeckles(Mat& img, uchar newVal, int maxSpeckleSize, uchar diffThreshold, Mat& buf); + +} // namespace cv #endif /* __OPENCV_GPU_HPP__ */ diff --git a/modules/gpu/include/opencv2/gpu/gpumat.hpp b/modules/gpu/include/opencv2/gpu/gpumat.hpp index e36a94a17ae14fc77681b5e08c3384257214eafd..3baff61281050f6fa9611bef8ec77c9359e9eb52 100644 --- a/modules/gpu/include/opencv2/gpu/gpumat.hpp +++ b/modules/gpu/include/opencv2/gpu/gpumat.hpp @@ -40,427 +40,4 @@ // //M*/ -#ifndef __OPENCV_GPUMAT_HPP__ -#define __OPENCV_GPUMAT_HPP__ - -#include "opencv2/core/core.hpp" -#include "opencv2/gpu/devmem2d.hpp" - -namespace cv { namespace gpu -{ - //! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat. - class CV_EXPORTS GpuMat - { - public: - //! returns lightweight DevMem2D_ structure for passing to nvcc-compiled code. - // Contains just image size, data ptr and step. - template operator DevMem2D_() const; - template operator PtrStep_() const; - template operator PtrStep() const; - - - - - - //! builds GpuMat from Mat. Perfom blocking upload to device. - explicit GpuMat(const Mat& m); - - //! pefroms blocking upload data to GpuMat. - void upload(const Mat& m); - - //! downloads data from device to host memory. Blocking calls. - void download(Mat& m) const; - operator Mat() const - { - Mat m; - download(m); - return m; - } - - - - - - - //! default constructor - GpuMat(); - - //! constructs GpuMatrix of the specified size and type (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.) - GpuMat(int rows, int cols, int type); - GpuMat(Size size, int type); - - //! constucts GpuMatrix and fills it with the specified value _s. - GpuMat(int rows, int cols, int type, const Scalar& s); - GpuMat(Size size, int type, const Scalar& s); - - //! copy constructor - GpuMat(const GpuMat& m); - - //! constructor for GpuMatrix headers pointing to user-allocated data - GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP); - GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP); - - //! creates a matrix header for a part of the bigger matrix - GpuMat(const GpuMat& m, const Range& rowRange, const Range& colRange); - GpuMat(const GpuMat& m, const Rect& roi); - - //! destructor - calls release() - ~GpuMat(); - - //! assignment operators - GpuMat& operator = (const GpuMat& m); - - //! returns a new GpuMatrix header for the specified row - GpuMat row(int y) const; - //! returns a new GpuMatrix header for the specified column - GpuMat col(int x) const; - //! ... for the specified row span - GpuMat rowRange(int startrow, int endrow) const; - GpuMat rowRange(const Range& r) const; - //! ... for the specified column span - GpuMat colRange(int startcol, int endcol) const; - GpuMat colRange(const Range& r) const; - - //! returns deep copy of the GpuMatrix, i.e. the data is copied - GpuMat clone() const; - //! copies the GpuMatrix content to "m". - // It calls m.create(this->size(), this->type()). - void copyTo(GpuMat& m) const; - //! copies those GpuMatrix elements to "m" that are marked with non-zero mask elements. - void copyTo(GpuMat& m, const GpuMat& mask) const; - //! converts GpuMatrix to another datatype with optional scalng. See cvConvertScale. - void convertTo(GpuMat& m, int rtype, double alpha = 1, double beta = 0) const; - - void assignTo(GpuMat& m, int type=-1) const; - - //! sets every GpuMatrix element to s - GpuMat& operator = (const Scalar& s); - //! 
sets some of the GpuMatrix elements to s, according to the mask - GpuMat& setTo(const Scalar& s, const GpuMat& mask = GpuMat()); - //! creates alternative GpuMatrix header for the same data, with different - // number of channels and/or different number of rows. see cvReshape. - GpuMat reshape(int cn, int rows = 0) const; - - //! allocates new GpuMatrix data unless the GpuMatrix already has specified size and type. - // previous data is unreferenced if needed. - void create(int rows, int cols, int type); - void create(Size size, int type); - //! decreases reference counter; - // deallocate the data when reference counter reaches 0. - void release(); - - //! swaps with other smart pointer - void swap(GpuMat& mat); - - //! locates GpuMatrix header within a parent GpuMatrix. See below - void locateROI(Size& wholeSize, Point& ofs) const; - //! moves/resizes the current GpuMatrix ROI inside the parent GpuMatrix. - GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright); - //! extracts a rectangular sub-GpuMatrix - // (this is a generalized form of row, rowRange etc.) - GpuMat operator()(Range rowRange, Range colRange) const; - GpuMat operator()(const Rect& roi) const; - - //! returns true iff the GpuMatrix data is continuous - // (i.e. when there are no gaps between successive rows). - // similar to CV_IS_GpuMat_CONT(cvGpuMat->type) - bool isContinuous() const; - //! returns element size in bytes, - // similar to CV_ELEM_SIZE(cvMat->type) - size_t elemSize() const; - //! returns the size of element channel in bytes. - size_t elemSize1() const; - //! returns element type, similar to CV_MAT_TYPE(cvMat->type) - int type() const; - //! returns element type, similar to CV_MAT_DEPTH(cvMat->type) - int depth() const; - //! returns element type, similar to CV_MAT_CN(cvMat->type) - int channels() const; - //! returns step/elemSize1() - size_t step1() const; - //! returns GpuMatrix size: - // width == number of columns, height == number of rows - Size size() const; - //! returns true if GpuMatrix data is NULL - bool empty() const; - - //! returns pointer to y-th row - uchar* ptr(int y = 0); - const uchar* ptr(int y = 0) const; - - //! template version of the above method - template _Tp* ptr(int y = 0); - template const _Tp* ptr(int y = 0) const; - - /*! includes several bit-fields: - - the magic signature - - continuity flag - - depth - - number of channels - */ - int flags; - - //! the number of rows and columns - int rows, cols; - - //! a distance between successive rows in bytes; includes the gap if any - size_t step; - - //! pointer to the data - uchar* data; - - //! pointer to the reference counter; - // when GpuMatrix points to user-allocated data, the pointer is NULL - int* refcount; - - //! helper fields used in locateROI and adjustROI - uchar* datastart; - uchar* dataend; - }; - - //! Creates continuous GPU matrix - CV_EXPORTS void createContinuous(int rows, int cols, int type, GpuMat& m); - CV_EXPORTS GpuMat createContinuous(int rows, int cols, int type); - CV_EXPORTS void createContinuous(Size size, int type, GpuMat& m); - CV_EXPORTS GpuMat createContinuous(Size size, int type); - - //! Ensures that size of the given matrix is not less than (rows, cols) size - //! 
and matrix type is match specified one too - CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m); - CV_EXPORTS void ensureSizeIsEnough(Size size, int type, GpuMat& m); - - //////////////////////////////////////////////////////////////////////// - - template inline GpuMat::operator DevMem2D_() const { return DevMem2D_(rows, cols, (T*)data, step); } - template inline GpuMat::operator PtrStep_() const { return PtrStep_(static_cast< DevMem2D_ >(*this)); } - template inline GpuMat::operator PtrStep() const { return PtrStep((T*)data, step); } - - - - - - - inline GpuMat::GpuMat() - : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) - { - } - - inline GpuMat::GpuMat(int rows_, int cols_, int type_) - : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) - { - if (rows_ > 0 && cols_ > 0) - create(rows_, cols_, type_); - } - - inline GpuMat::GpuMat(Size size_, int type_) - : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) - { - if (size_.height > 0 && size_.width > 0) - create(size_.height, size_.width, type_); - } - - inline GpuMat::GpuMat(int rows_, int cols_, int type_, const Scalar& s_) - : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) - { - if (rows_ > 0 && cols_ > 0) - { - create(rows_, cols_, type_); - setTo(s_); - } - } - - inline GpuMat::GpuMat(Size size_, int type_, const Scalar& s_) - : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0) - { - if (size_.height > 0 && size_.width > 0) - { - create(size_.height, size_.width, type_); - setTo(s_); - } - } - - inline GpuMat::~GpuMat() - { - release(); - } - - inline GpuMat GpuMat::clone() const - { - GpuMat m; - copyTo(m); - return m; - } - - inline void GpuMat::assignTo(GpuMat& m, int type) const - { - if (type < 0) - m = *this; - else - convertTo(m, type); - } - - inline size_t GpuMat::step1() const - { - return step / elemSize1(); - } - - inline bool GpuMat::empty() const - { - return data == 0; - } - - template inline _Tp* GpuMat::ptr(int y) - { - return (_Tp*)ptr(y); - } - - template inline const _Tp* GpuMat::ptr(int y) const - { - return (const _Tp*)ptr(y); - } - - inline void swap(GpuMat& a, GpuMat& b) - { - a.swap(b); - } - - inline GpuMat GpuMat::row(int y) const - { - return GpuMat(*this, Range(y, y+1), Range::all()); - } - - inline GpuMat GpuMat::col(int x) const - { - return GpuMat(*this, Range::all(), Range(x, x+1)); - } - - inline GpuMat GpuMat::rowRange(int startrow, int endrow) const - { - return GpuMat(*this, Range(startrow, endrow), Range::all()); - } - - inline GpuMat GpuMat::rowRange(const Range& r) const - { - return GpuMat(*this, r, Range::all()); - } - - inline GpuMat GpuMat::colRange(int startcol, int endcol) const - { - return GpuMat(*this, Range::all(), Range(startcol, endcol)); - } - - inline GpuMat GpuMat::colRange(const Range& r) const - { - return GpuMat(*this, Range::all(), r); - } - - inline void GpuMat::create(Size size_, int type_) - { - create(size_.height, size_.width, type_); - } - - inline GpuMat GpuMat::operator()(Range rowRange, Range colRange) const - { - return GpuMat(*this, rowRange, colRange); - } - - inline GpuMat GpuMat::operator()(const Rect& roi) const - { - return GpuMat(*this, roi); - } - - inline bool GpuMat::isContinuous() const - { - return (flags & Mat::CONTINUOUS_FLAG) != 0; - } - - inline size_t GpuMat::elemSize() const - { - return CV_ELEM_SIZE(flags); - } - - inline size_t GpuMat::elemSize1() 
const - { - return CV_ELEM_SIZE1(flags); - } - - inline int GpuMat::type() const - { - return CV_MAT_TYPE(flags); - } - - inline int GpuMat::depth() const - { - return CV_MAT_DEPTH(flags); - } - - inline int GpuMat::channels() const - { - return CV_MAT_CN(flags); - } - - inline Size GpuMat::size() const - { - return Size(cols, rows); - } - - inline unsigned char* GpuMat::ptr(int y) - { - CV_DbgAssert((unsigned)y < (unsigned)rows); - return data + step * y; - } - - inline const unsigned char* GpuMat::ptr(int y) const - { - CV_DbgAssert((unsigned)y < (unsigned)rows); - return data + step * y; - } - - inline GpuMat& GpuMat::operator = (const Scalar& s) - { - setTo(s); - return *this; - } - - inline GpuMat createContinuous(int rows, int cols, int type) - { - GpuMat m; - createContinuous(rows, cols, type, m); - return m; - } - - inline void createContinuous(Size size, int type, GpuMat& m) - { - createContinuous(size.height, size.width, type, m); - } - - inline GpuMat createContinuous(Size size, int type) - { - GpuMat m; - createContinuous(size, type, m); - return m; - } - - inline void ensureSizeIsEnough(Size size, int type, GpuMat& m) - { - ensureSizeIsEnough(size.height, size.width, type, m); - } - - inline void createContinuous(int rows, int cols, int type, GpuMat& m) - { - int area = rows * cols; - if (!m.isContinuous() || m.type() != type || m.size().area() != area) - m.create(1, area, type); - m = m.reshape(0, rows); - } - - inline void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m) - { - if (m.type() == type && m.rows >= rows && m.cols >= cols) - m = m(Rect(0, 0, cols, rows)); - else - m.create(rows, cols, type); - } -}} - -#endif // __OPENCV_GPUMAT_HPP__ +#include "opencv2/core/gpumat.hpp" diff --git a/modules/gpu/include/opencv2/gpu/matrix_operations.hpp b/modules/gpu/include/opencv2/gpu/matrix_operations.hpp deleted file mode 100644 index 5a6b1bbfc41c3a327078cbaacc49412bd3de9cbb..0000000000000000000000000000000000000000 --- a/modules/gpu/include/opencv2/gpu/matrix_operations.hpp +++ /dev/null @@ -1,142 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other GpuMaterials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. 
-// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_GPU_MATRIX_OPERATIONS_HPP__ -#define __OPENCV_GPU_MATRIX_OPERATIONS_HPP__ - -namespace cv -{ - -namespace gpu -{ -/////////////////////////////////////////////////////////////////////// -//////////////////////////////// CudaMem //////////////////////////////// -/////////////////////////////////////////////////////////////////////// - -inline CudaMem::CudaMem() : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0) {} -inline CudaMem::CudaMem(int _rows, int _cols, int _type, int _alloc_type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0) -{ - if( _rows > 0 && _cols > 0 ) - create( _rows, _cols, _type, _alloc_type); -} - -inline CudaMem::CudaMem(Size _size, int _type, int _alloc_type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0) -{ - if( _size.height > 0 && _size.width > 0 ) - create( _size.height, _size.width, _type, _alloc_type); -} - -inline CudaMem::CudaMem(const CudaMem& m) : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type) -{ - if( refcount ) - CV_XADD(refcount, 1); -} - -inline CudaMem::CudaMem(const Mat& m, int _alloc_type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0) -{ - if( m.rows > 0 && m.cols > 0 ) - create( m.size(), m.type(), _alloc_type); - - Mat tmp = createMatHeader(); - m.copyTo(tmp); -} - -inline CudaMem::~CudaMem() -{ - release(); - -} - -inline CudaMem& CudaMem::operator = (const CudaMem& m) -{ - if( this != &m ) - { - if( m.refcount ) - CV_XADD(m.refcount, 1); - release(); - flags = m.flags; - rows = m.rows; cols = m.cols; - step = m.step; data = m.data; - datastart = m.datastart; - dataend = m.dataend; - refcount = m.refcount; - alloc_type = m.alloc_type; - } - return *this; -} - -inline CudaMem CudaMem::clone() const -{ - CudaMem m(size(), type(), alloc_type); - Mat to = m; - Mat from = *this; - from.copyTo(to); - return m; -} - -inline void CudaMem::create(Size _size, int _type, int _alloc_type) { create(_size.height, _size.width, _type, _alloc_type); } - - -//CCP void CudaMem::create(int _rows, int _cols, int _type, int _alloc_type); -//CPP void CudaMem::release(); - -inline Mat CudaMem::createMatHeader() const { return Mat(size(), type(), data, step); } -inline CudaMem::operator Mat() const { return createMatHeader(); } - -inline CudaMem::operator GpuMat() const { return createGpuMatHeader(); } -//CPP GpuMat CudaMem::createGpuMatHeader() const; - -inline bool CudaMem::isContinuous() const { return (flags & Mat::CONTINUOUS_FLAG) 
!= 0; } -inline size_t CudaMem::elemSize() const { return CV_ELEM_SIZE(flags); } -inline size_t CudaMem::elemSize1() const { return CV_ELEM_SIZE1(flags); } -inline int CudaMem::type() const { return CV_MAT_TYPE(flags); } -inline int CudaMem::depth() const { return CV_MAT_DEPTH(flags); } -inline int CudaMem::channels() const { return CV_MAT_CN(flags); } -inline size_t CudaMem::step1() const { return step/elemSize1(); } -inline Size CudaMem::size() const { return Size(cols, rows); } -inline bool CudaMem::empty() const { return data == 0; } - -} /* end of namespace gpu */ - -} /* end of namespace cv */ - -#endif /* __OPENCV_GPU_MATRIX_OPERATIONS_HPP__ */ diff --git a/modules/gpu/perf/perf_arithm.cpp b/modules/gpu/perf/perf_arithm.cpp index 8e34023bb6fa2bbf481845c1800f2a02b6b5435e..d740388ffca81140e828652ca861882b26321fce 100644 --- a/modules/gpu/perf/perf_arithm.cpp +++ b/modules/gpu/perf/perf_arithm.cpp @@ -24,7 +24,7 @@ PERF_TEST_P(DevInfo_Size_MatType, transpose, testing::Combine(testing::ValuesIn( transpose(src, dst); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } @@ -55,7 +55,7 @@ PERF_TEST_P(DevInfo_Size_MatType_FlipCode, flip, testing::Combine(testing::Value flip(src, dst, flipCode); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } @@ -85,7 +85,7 @@ PERF_TEST_P(DevInfo_Size_MatType, LUT, testing::Combine(testing::ValuesIn(device LUT(src, lut, dst); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } @@ -115,8 +115,8 @@ PERF_TEST_P(DevInfo_Size, cartToPolar, testing::Combine(testing::ValuesIn(device cartToPolar(x, y, magnitude, angle); } - Mat magnitude_host = magnitude; - Mat angle_host = angle; + Mat magnitude_host(magnitude); + Mat angle_host(angle); SANITY_CHECK(magnitude_host); SANITY_CHECK(angle_host); @@ -147,8 +147,8 @@ PERF_TEST_P(DevInfo_Size, polarToCart, testing::Combine(testing::ValuesIn(device polarToCart(magnitude, angle, x, y); } - Mat x_host = x; - Mat y_host = angle; + Mat x_host(x); + Mat y_host(y); SANITY_CHECK(x_host); SANITY_CHECK(y_host); @@ -180,7 +180,7 @@ PERF_TEST_P(DevInfo_Size_MatType, addMat, testing::Combine(testing::ValuesIn(dev add(a, b, c); } - Mat c_host = c; + Mat c_host(c); SANITY_CHECK(c_host); } @@ -210,7 +210,7 @@ PERF_TEST_P(DevInfo_Size_MatType, addScalar, testing::Combine(testing::ValuesIn( add(a, b, c); } - Mat c_host = c; + Mat c_host(c); SANITY_CHECK(c_host); } @@ -241,7 +241,7 @@ PERF_TEST_P(DevInfo_Size_MatType, subtractMat, testing::Combine(testing::ValuesI subtract(a, b, c); } - Mat c_host = c; + Mat c_host(c); SANITY_CHECK(c_host); } @@ -270,7 +270,7 @@ PERF_TEST_P(DevInfo_Size, multiplyMat, testing::Combine(testing::ValuesIn(device multiply(a, b, c); } - Mat c_host = c; + Mat c_host(c); SANITY_CHECK(c_host); } @@ -300,7 +300,7 @@ PERF_TEST_P(DevInfo_Size_MatType, multiplyScalar, testing::Combine(testing::Valu multiply(a, b, c); } - Mat c_host = c; + Mat c_host(c); SANITY_CHECK(c_host); } @@ -327,7 +327,7 @@ PERF_TEST_P(DevInfo_Size, exp, testing::Combine(testing::ValuesIn(devices()), exp(a, b); } - Mat b_host = b; + Mat b_host(b); SANITY_CHECK(b_host); } @@ -356,7 +356,7 @@ PERF_TEST_P(DevInfo_Size_MatType, pow, testing::Combine(testing::ValuesIn(device pow(src, 2.0, dst); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } @@ -389,7 +389,7 @@ PERF_TEST_P(DevInfo_Size_MatType_CmpOp, compare, testing::Combine(testing::Value compare(src1, src2, dst, cmpop); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } @@ -418,7 +418,7 @@ 
PERF_TEST_P(DevInfo_Size_MatType, bitwise_not, testing::Combine(testing::ValuesI bitwise_not(src, dst); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } @@ -449,7 +449,7 @@ PERF_TEST_P(DevInfo_Size_MatType, bitwise_and, testing::Combine(testing::ValuesI bitwise_and(src1, src2, dst); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } @@ -480,7 +480,7 @@ PERF_TEST_P(DevInfo_Size_MatType, min, testing::Combine(testing::ValuesIn(device min(src1, src2, dst); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } @@ -712,7 +712,7 @@ PERF_TEST_P(DevInfo_Size_MatType, addWeighted, testing::Combine(testing::ValuesI addWeighted(src1, 0.5, src2, 0.5, 0.0, dst); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } @@ -743,7 +743,7 @@ PERF_TEST_P(DevInfo_Size_MatType_FlipCode, reduce, testing::Combine(testing::Val reduce(src, dst, dim, CV_REDUCE_MIN); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } @@ -774,7 +774,7 @@ PERF_TEST_P(DevInfo_Size, gemm, testing::Combine(testing::ValuesIn(devices()), gemm(src1, src2, 1.0, src3, 1.0, dst); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } diff --git a/modules/gpu/perf/perf_calib3d.cpp b/modules/gpu/perf/perf_calib3d.cpp index e84f87b3bace17768e7ecc95d566620f0e6d1f41..4ac922e99bc0b57589a77ccb7d942fd9707d804d 100644 --- a/modules/gpu/perf/perf_calib3d.cpp +++ b/modules/gpu/perf/perf_calib3d.cpp @@ -20,7 +20,7 @@ PERF_TEST_P(DevInfo, transformPoints, testing::ValuesIn(devices())) transformPoints(src, Mat::ones(1, 3, CV_32FC1), Mat::ones(1, 3, CV_32FC1), dst); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } @@ -45,7 +45,7 @@ PERF_TEST_P(DevInfo, projectPoints, testing::ValuesIn(devices())) projectPoints(src, Mat::ones(1, 3, CV_32FC1), Mat::ones(1, 3, CV_32FC1), Mat::ones(3, 3, CV_32FC1), Mat(), dst); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } diff --git a/modules/gpu/perf/perf_filters.cpp b/modules/gpu/perf/perf_filters.cpp index 55125f31bf78243998b77a04a0d8d82d9af2a1f4..da813bff624bdfea869629385579877ef9929a04 100644 --- a/modules/gpu/perf/perf_filters.cpp +++ b/modules/gpu/perf/perf_filters.cpp @@ -28,7 +28,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, boxFilter, testing::Combine(testing filter->apply(src, dst); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } @@ -63,7 +63,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MorphOp_KernelSize, morphologyFilter, testing:: filter->apply(src, dst); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } @@ -96,7 +96,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(test filter->apply(src, dst); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } @@ -130,7 +130,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, separableLinearFilter, testing::Com filter->apply(src, dst, Rect(0, 0, src.cols, src.rows)); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp index 3f7264125e1a8a94b0aa5c9b1855c6f397333555..7b8ffe4d46d4bc67ad959b040e4fdcbc107fde13 100644 --- a/modules/gpu/perf/perf_imgproc.cpp +++ b/modules/gpu/perf/perf_imgproc.cpp @@ -36,7 +36,7 @@ PERF_TEST_P(DevInfo_Size_MatType_Interpolation_BorderMode, remap, testing::Combi remap(src, dst, xmap, ymap, interpolation, borderMode); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } @@ -63,7 +63,7 @@ 
PERF_TEST_P(DevInfo, meanShiftFiltering, testing::ValuesIn(devices())) meanShiftFiltering(src, dst, 50, 50); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } @@ -91,8 +91,8 @@ PERF_TEST_P(DevInfo, meanShiftProc, testing::ValuesIn(devices())) meanShiftProc(src, dstr, dstsp, 50, 50); } - Mat dstr_host = dstr; - Mat dstsp_host = dstsp; + Mat dstr_host(dstr); + Mat dstsp_host(dstsp); SANITY_CHECK(dstr_host); SANITY_CHECK(dstsp_host); diff --git a/modules/gpu/perf/perf_matop.cpp b/modules/gpu/perf/perf_matop.cpp index d1505f5611a6896b46334fb25de56345cf4dd226..ba66a0c6c070124c80dab48e0b798e68a85e9c24 100644 --- a/modules/gpu/perf/perf_matop.cpp +++ b/modules/gpu/perf/perf_matop.cpp @@ -25,7 +25,7 @@ PERF_TEST_P(DevInfo_Size_MatType, merge, testing::Combine(testing::ValuesIn(devi merge(src, dst); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } @@ -82,7 +82,7 @@ PERF_TEST_P(DevInfo_Size_MatType, setTo, testing::Combine(testing::ValuesIn(devi src.setTo(val); } - Mat src_host = src; + Mat src_host(src); SANITY_CHECK(src_host); } @@ -115,7 +115,7 @@ PERF_TEST_P(DevInfo_Size_MatType, setToMasked, testing::Combine(testing::ValuesI src.setTo(val, mask); } - src_host = src; + src.download(src_host); SANITY_CHECK(src_host); } @@ -148,7 +148,7 @@ PERF_TEST_P(DevInfo_Size_MatType, copyToMasked, testing::Combine(testing::Values src.copyTo(dst, mask); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } @@ -182,7 +182,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MatType, convertTo, testing::Combine(testing::V src.convertTo(dst, type2, a, b); } - Mat dst_host = dst; + Mat dst_host(dst); SANITY_CHECK(dst_host); } diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index fabb3dfbc7d87f7ead684a544ef92670cd0888eb..a47d222910094bcc590a577ece2032630ae007cc 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -425,16 +425,22 @@ void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream) //////////////////////////////////////////////////////////////////////// // Polar <-> Cart -namespace cv { namespace gpu { namespace mathfunc +BEGIN_OPENCV_DEVICE_NAMESPACE + +namespace mathfunc { - void cartToPolar_gpu(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, bool magSqr, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream); - void polarToCart_gpu(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream); -}}} + void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream); + void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream); +} + +END_OPENCV_DEVICE_NAMESPACE namespace { inline void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream) { + using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc; + CV_DbgAssert(x.size() == y.size() && x.type() == y.type()); CV_Assert(x.depth() == CV_32F); @@ -448,11 +454,13 @@ namespace GpuMat mag1cn = mag ? mag->reshape(1) : GpuMat(); GpuMat angle1cn = angle ? 
angle->reshape(1) : GpuMat(); - mathfunc::cartToPolar_gpu(x1cn, y1cn, mag1cn, magSqr, angle1cn, angleInDegrees, stream); + cartToPolar_gpu(x1cn, y1cn, mag1cn, magSqr, angle1cn, angleInDegrees, stream); } inline void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream) { + using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc; + CV_DbgAssert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type()); CV_Assert(mag.depth() == CV_32F); @@ -464,34 +472,33 @@ namespace GpuMat x1cn = x.reshape(1); GpuMat y1cn = y.reshape(1); - mathfunc::polarToCart_gpu(mag1cn, angle1cn, x1cn, y1cn, angleInDegrees, stream); + polarToCart_gpu(mag1cn, angle1cn, x1cn, y1cn, angleInDegrees, stream); } } void cv::gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream) { - ::cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream)); + cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream)); } void cv::gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream) { - ::cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream)); + cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream)); } void cv::gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees, Stream& stream) { - ::cartToPolar_caller(x, y, 0, false, &angle, angleInDegrees, StreamAccessor::getStream(stream)); + cartToPolar_caller(x, y, 0, false, &angle, angleInDegrees, StreamAccessor::getStream(stream)); } void cv::gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& mag, GpuMat& angle, bool angleInDegrees, Stream& stream) { - ::cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream)); + cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream)); } void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, Stream& stream) { - ::polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream)); + polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream)); } - #endif /* !defined (HAVE_CUDA) */ diff --git a/modules/gpu/src/bilateral_filter.cpp b/modules/gpu/src/bilateral_filter.cpp index bc2bec24a084fda927e5571c221e4e6a93d82525..12c159a483ed6413e3a0b4bd89f5a90b2b11a7b9 100644 --- a/modules/gpu/src/bilateral_filter.cpp +++ b/modules/gpu/src/bilateral_filter.cpp @@ -55,13 +55,19 @@ void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&, #else /* !defined (HAVE_CUDA) */ -namespace cv { namespace gpu { namespace bf +BEGIN_OPENCV_DEVICE_NAMESPACE + +namespace bilateral_filter { - void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc); + void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc); + + void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream); + void bilateral_filter_gpu(DevMem2D_ disp, DevMem2Db img, int channels, int iters, cudaStream_t stream); +} + +END_OPENCV_DEVICE_NAMESPACE - void bilateral_filter_gpu(const DevMem2Db& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream); - void bilateral_filter_gpu(const DevMem2D_& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream); 
-}}} +using namespace OPENCV_DEVICE_NAMESPACE_ bilateral_filter; namespace { @@ -105,7 +111,7 @@ namespace short edge_disc = max(short(1), short(ndisp * edge_threshold + 0.5)); short max_disc = short(ndisp * max_disc_threshold + 0.5); - bf::load_constants(table_color.ptr(), table_space, ndisp, radius, edge_disc, max_disc); + load_constants(table_color.ptr(), table_space, ndisp, radius, edge_disc, max_disc); if (&dst != &disp) { @@ -115,7 +121,7 @@ namespace disp.copyTo(dst); } - bf::bilateral_filter_gpu((DevMem2D_)dst, img, img.channels(), iters, StreamAccessor::getStream(stream)); + bilateral_filter_gpu((DevMem2D_)dst, img, img.channels(), iters, StreamAccessor::getStream(stream)); } typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold, diff --git a/modules/gpu/src/blend.cpp b/modules/gpu/src/blend.cpp index 54345f616330135ac4039641e39eb1e7bb5a6571..4c4afc5c71cd643cc8d54bd4d98754dcda287079 100644 --- a/modules/gpu/src/blend.cpp +++ b/modules/gpu/src/blend.cpp @@ -52,15 +52,19 @@ void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const Gpu #else -namespace cv { namespace gpu +BEGIN_OPENCV_DEVICE_NAMESPACE + +namespace blend { template - void blendLinearCaller(int rows, int cols, int cn, const PtrStep& img1, const PtrStep& img2, - const PtrStepf& weights1, const PtrStepf& weights2, PtrStep result, cudaStream_t stream); + void blendLinearCaller(int rows, int cols, int cn, PtrStep img1, PtrStep img2, PtrStepf weights1, PtrStepf weights2, PtrStep result, cudaStream_t stream); + + void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream); +} + +END_OPENCV_DEVICE_NAMESPACE - void blendLinearCaller8UC4(int rows, int cols, const PtrStepb& img1, const PtrStepb& img2, - const PtrStepf& weights1, const PtrStepf& weights2, PtrStepb result, cudaStream_t stream); -}} +using namespace OPENCV_DEVICE_NAMESPACE_ blend; void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2, GpuMat& result, Stream& stream) diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp index 4f9bd5cd720986f08592f3b5457ee153734a8c44..1d9314603880f553e95309143b02df1f7999c6bd 100644 --- a/modules/gpu/src/brute_force_matcher.cpp +++ b/modules/gpu/src/brute_force_matcher.cpp @@ -82,7 +82,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, vector< vec #else /* !defined (HAVE_CUDA) */ -namespace cv { namespace gpu { namespace bf_match +BEGIN_OPENCV_DEVICE_NAMESPACE + +namespace bf_match { template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, @@ -103,9 +105,9 @@ namespace cv { namespace gpu { namespace bf_match template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); -}}} +} -namespace cv { namespace gpu { namespace bf_knnmatch +namespace bf_knnmatch { template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, @@ -126,9 +128,9 @@ namespace cv { namespace gpu { namespace bf_knnmatch template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const 
DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -}}} +} -namespace cv { namespace gpu { namespace bf_radius_match +namespace bf_radius_match { template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, @@ -151,15 +153,17 @@ namespace cv { namespace gpu { namespace bf_radius_match template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); -}}} - -cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_) -{ } +END_OPENCV_DEVICE_NAMESPACE + //////////////////////////////////////////////////////////////////// // Train collection +cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_) +{ +} + void cv::gpu::BruteForceMatcher_GPU_base::add(const vector& descCollection) { trainDescCollection.insert(trainDescCollection.end(), descCollection.begin(), descCollection.end()); @@ -195,7 +199,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const if (query.empty() || train.empty()) return; - using namespace cv::gpu::bf_match; + using namespace OPENCV_DEVICE_NAMESPACE_ bf_match; typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, @@ -242,8 +246,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, if (trainIdx.empty() || distance.empty()) return; - Mat trainIdxCPU = trainIdx; - Mat distanceCPU = distance; + Mat trainIdxCPU(trainIdx); + Mat distanceCPU(distance); matchConvert(trainIdxCPU, distanceCPU, matches); } @@ -337,7 +341,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c if (query.empty() || trainCollection.empty()) return; - using namespace cv::gpu::bf_match; + using namespace OPENCV_DEVICE_NAMESPACE_ bf_match; typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, @@ -384,9 +388,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, if (trainIdx.empty() || imgIdx.empty() || distance.empty()) return; - Mat trainIdxCPU = trainIdx; - Mat imgIdxCPU = imgIdx; - Mat distanceCPU = distance; + Mat trainIdxCPU(trainIdx); + Mat imgIdxCPU(imgIdx); + Mat distanceCPU(distance); matchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, matches); } @@ -448,7 +452,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co if (query.empty() || train.empty()) return; - using namespace cv::gpu::bf_knnmatch; + using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch; typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, @@ -511,8 +515,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainId if (trainIdx.empty() || distance.empty()) return; - Mat trainIdxCPU = trainIdx; - Mat distanceCPU = distance; + Mat trainIdxCPU(trainIdx); + Mat distanceCPU(distance); knnMatchConvert(trainIdxCPU, 
distanceCPU, matches, compactResult); } @@ -577,7 +581,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer if (query.empty() || trainCollection.empty()) return; - using namespace cv::gpu::bf_knnmatch; + using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch; typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, @@ -630,9 +634,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainI if (trainIdx.empty() || imgIdx.empty() || distance.empty()) return; - Mat trainIdxCPU = trainIdx; - Mat imgIdxCPU = imgIdx; - Mat distanceCPU = distance; + Mat trainIdxCPU(trainIdx); + Mat imgIdxCPU(imgIdx); + Mat distanceCPU(distance); knnMatch2Convert(trainIdxCPU, imgIdxCPU, distanceCPU, matches, compactResult); } @@ -758,7 +762,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query, if (query.empty() || train.empty()) return; - using namespace cv::gpu::bf_radius_match; + using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match; typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, @@ -819,9 +823,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai if (trainIdx.empty() || distance.empty() || nMatches.empty()) return; - Mat trainIdxCPU = trainIdx; - Mat distanceCPU = distance; - Mat nMatchesCPU = nMatches; + Mat trainIdxCPU(trainIdx); + Mat distanceCPU(distance); + Mat nMatchesCPU(nMatches); radiusMatchConvert(trainIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult); } @@ -889,7 +893,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu if (query.empty() || empty()) return; - using namespace cv::gpu::bf_radius_match; + using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match; typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, @@ -953,10 +957,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty()) return; - Mat trainIdxCPU = trainIdx; - Mat imgIdxCPU = imgIdx; - Mat distanceCPU = distance; - Mat nMatchesCPU = nMatches; + Mat trainIdxCPU(trainIdx); + Mat imgIdxCPU(imgIdx); + Mat distanceCPU(distance); + Mat nMatchesCPU(nMatches); radiusMatchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult); } diff --git a/modules/gpu/src/calib3d.cpp b/modules/gpu/src/calib3d.cpp index 301ea8167ec3d621e4cf469d7fe3d7515f7dd8cd..8e6e8389d9595effbba411d882f0debcf302d54d 100644 --- a/modules/gpu/src/calib3d.cpp +++ b/modules/gpu/src/calib3d.cpp @@ -42,6 +42,10 @@ #include "precomp.hpp" +using namespace cv; +using namespace cv::gpu; +using namespace std; + #if !defined(HAVE_CUDA) void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); } @@ -52,13 +56,31 @@ void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat #else -using namespace cv; -using namespace cv::gpu; +BEGIN_OPENCV_DEVICE_NAMESPACE -namespace cv { namespace gpu { namespace transform_points +namespace transform_points { void call(const DevMem2D_ src, const float* rot, 
const float* transl, DevMem2D_ dst, cudaStream_t stream); -}}} +} + +namespace project_points +{ + void call(const DevMem2D_ src, const float* rot, const float* transl, const float* proj, DevMem2D_ dst, cudaStream_t stream); +} + +namespace solve_pnp_ransac +{ + int maxNumIters(); + + void computeHypothesisScores( + const int num_hypotheses, const int num_points, const float* rot_matrices, + const float3* transl_vectors, const float3* object, const float2* image, + const float dist_threshold, int* hypothesis_scores); +} + +END_OPENCV_DEVICE_NAMESPACE + +using namespace OPENCV_DEVICE_NAMESPACE; namespace { @@ -79,15 +101,9 @@ namespace void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, Stream& stream) { - ::transformPointsCaller(src, rvec, tvec, dst, StreamAccessor::getStream(stream)); + transformPointsCaller(src, rvec, tvec, dst, StreamAccessor::getStream(stream)); } -namespace cv { namespace gpu { namespace project_points -{ - void call(const DevMem2D_ src, const float* rot, const float* transl, const float* proj, DevMem2D_ dst, cudaStream_t stream); -}}} - - namespace { void projectPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, cudaStream_t stream) @@ -109,20 +125,9 @@ namespace void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, Stream& stream) { - ::projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, StreamAccessor::getStream(stream)); + projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, StreamAccessor::getStream(stream)); } - -namespace cv { namespace gpu { namespace solve_pnp_ransac -{ - int maxNumIters(); - - void computeHypothesisScores( - const int num_hypotheses, const int num_points, const float* rot_matrices, - const float3* transl_vectors, const float3* object, const float2* image, - const float dist_threshold, int* hypothesis_scores); -}}} - namespace { // Selects subset_size random different points from [0, num_points - 1] range diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp index 0af5fa252fb64d854fb9fa8bc4d50d6e5a215e6a..a6b7da9f7995589b102b26f2b9cf37858a60a7ea 100644 --- a/modules/gpu/src/cascadeclassifier.cpp +++ b/modules/gpu/src/cascadeclassifier.cpp @@ -46,7 +46,6 @@ using namespace cv; using namespace cv::gpu; using namespace std; - #if !defined (HAVE_CUDA) cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU() { throw_nogpu(); } diff --git a/modules/gpu/src/color.cpp b/modules/gpu/src/color.cpp index 69b00300a032e7414459a9bd9eccb01ef80f83ea..c4f8b609ce19c524e6dfcacd324823d506a51436 100644 --- a/modules/gpu/src/color.cpp +++ b/modules/gpu/src/color.cpp @@ -51,155 +51,158 @@ void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu( #else /* !defined (HAVE_CUDA) */ -namespace cv { namespace gpu { namespace device -{ - #define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \ - void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream); - - #define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \ - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \ - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u) \ - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f) - - #define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \ - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \ - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f) \ - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \ - 
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f) - - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba) - - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555) - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565) - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555) - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565) - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555) - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565) - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555) - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565) - - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb) - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb) - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr) - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr) - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba) - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba) - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra) - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra) - - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra) - - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555) - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565) - - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray) - OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray) - - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray) - - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4) - - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra) - - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4) - - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra) - - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4) - - 
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra) - OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra) - - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4) - - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra) - - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4) - - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr) - OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra) - - #undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE - #undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL - #undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F -}}} +BEGIN_OPENCV_DEVICE_NAMESPACE + +#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \ + void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream); + +#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \ + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \ + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u) \ + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f) + +#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \ + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \ + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f) \ + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \ + OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f) + +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba) + +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555) +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565) +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555) +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565) +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555) +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565) +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555) +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565) + +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb) +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb) +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr) 
+OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr) +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba) +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba) +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra) +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra) + +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra) + +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555) +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565) + +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray) +OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray) + +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray) + +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4) + +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra) + +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4) + +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra) + +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4) + +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra) +OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra) + +OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv) +OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv) +OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4) +OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4) +OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv) +OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv) +OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4) +OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4) + +OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb) +OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba) +OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb) 
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba)
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr)
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra)
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr)
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra)
+
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls)
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls)
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4)
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4)
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls)
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls)
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4)
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4)
+
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb)
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba)
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb)
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba)
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr)
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra)
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr)
+OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra)
+
+#undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE
+#undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL
+#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F
+
+END_OPENCV_DEVICE_NAMESPACE
+
+using namespace OPENCV_DEVICE_NAMESPACE;
 namespace {
diff --git a/modules/gpu/src/cuda/bf_knnmatch.cu b/modules/gpu/src/cuda/bf_knnmatch.cu
index f53af9e6fae47a2ed55819767bbca92506a57421..c8b1171e22f7bdc9b45f9035baca2a991ad8ec3b 100644
--- a/modules/gpu/src/cuda/bf_knnmatch.cu
+++ b/modules/gpu/src/cuda/bf_knnmatch.cu
@@ -45,1115 +45,1117 @@
 #include "opencv2/gpu/device/vec_distance.hpp"
 #include "opencv2/gpu/device/datamov_utils.hpp"
-using namespace cv::gpu;
-using namespace cv::gpu::device;
+BEGIN_OPENCV_DEVICE_NAMESPACE
-namespace cv { namespace gpu { namespace bf_knnmatch
-{
-    ///////////////////////////////////////////////////////////////////////////////
-    // Reduction
+namespace bf_knnmatch {
-    template <int BLOCK_SIZE>
-    __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
-                                  int& bestTrainIdx1, int& bestTrainIdx2,
-                                  float* s_distance, int* s_trainIdx)
-    {
-        float myBestDistance1 = numeric_limits<float>::max();
-        float myBestDistance2 = numeric_limits<float>::max();
-        int myBestTrainIdx1 = -1;
-        int myBestTrainIdx2 = -1;
+///////////////////////////////////////////////////////////////////////////////
+// Reduction
+
+template <int BLOCK_SIZE>
+__device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
+                              int& bestTrainIdx1, int& bestTrainIdx2,
+                              float* s_distance, int* s_trainIdx)
+{
+    float myBestDistance1 = numeric_limits<float>::max();
+    float myBestDistance2 = numeric_limits<float>::max();
+    int myBestTrainIdx1 = -1;
+    int myBestTrainIdx2 = -1;
-        s_distance += threadIdx.y * BLOCK_SIZE;
-        s_trainIdx += threadIdx.y * BLOCK_SIZE;
+    s_distance += threadIdx.y * BLOCK_SIZE;
+    s_trainIdx += threadIdx.y * BLOCK_SIZE;
-        s_distance[threadIdx.x] = bestDistance1;
-        s_trainIdx[threadIdx.x] = bestTrainIdx1;
+    s_distance[threadIdx.x] = bestDistance1;
+    s_trainIdx[threadIdx.x] = bestTrainIdx1;
-        __syncthreads();
+    __syncthreads();
-        if (threadIdx.x == 0)
+    if (threadIdx.x == 0)
+    {
+        #pragma unroll
+        for (int i = 0; i < BLOCK_SIZE; ++i)
         {
-            #pragma unroll
-            for (int i = 0; i < BLOCK_SIZE; ++i)
+            float val = s_distance[i];
+
+            if (val < myBestDistance1)
+            {
+                myBestDistance2 = myBestDistance1;
+                myBestTrainIdx2 = myBestTrainIdx1;
+
+                myBestDistance1 = val;
+                myBestTrainIdx1 = s_trainIdx[i];
+            }
+            else if (val < myBestDistance2)
             {
-                float val = s_distance[i];
-
-                if (val < myBestDistance1)
-                {
-                    myBestDistance2 = myBestDistance1;
-                    myBestTrainIdx2 = myBestTrainIdx1;
-
-
myBestDistance1 = val; - myBestTrainIdx1 = s_trainIdx[i]; - } - else if (val < myBestDistance2) - { - myBestDistance2 = val; - myBestTrainIdx2 = s_trainIdx[i]; - } + myBestDistance2 = val; + myBestTrainIdx2 = s_trainIdx[i]; } } + } - __syncthreads(); + __syncthreads(); - s_distance[threadIdx.x] = bestDistance2; - s_trainIdx[threadIdx.x] = bestTrainIdx2; + s_distance[threadIdx.x] = bestDistance2; + s_trainIdx[threadIdx.x] = bestTrainIdx2; - __syncthreads(); + __syncthreads(); - if (threadIdx.x == 0) + if (threadIdx.x == 0) + { + #pragma unroll + for (int i = 0; i < BLOCK_SIZE; ++i) { - #pragma unroll - for (int i = 0; i < BLOCK_SIZE; ++i) - { - float val = s_distance[i]; + float val = s_distance[i]; - if (val < myBestDistance2) - { - myBestDistance2 = val; - myBestTrainIdx2 = s_trainIdx[i]; - } + if (val < myBestDistance2) + { + myBestDistance2 = val; + myBestTrainIdx2 = s_trainIdx[i]; } } + } - bestDistance1 = myBestDistance1; - bestDistance2 = myBestDistance2; + bestDistance1 = myBestDistance1; + bestDistance2 = myBestDistance2; - bestTrainIdx1 = myBestTrainIdx1; - bestTrainIdx2 = myBestTrainIdx2; - } + bestTrainIdx1 = myBestTrainIdx1; + bestTrainIdx2 = myBestTrainIdx2; +} - template - __device__ void findBestMatch(float& bestDistance1, float& bestDistance2, - int& bestTrainIdx1, int& bestTrainIdx2, - int& bestImgIdx1, int& bestImgIdx2, - float* s_distance, int* s_trainIdx, int* s_imgIdx) - { - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; - int myBestImgIdx1 = -1; - int myBestImgIdx2 = -1; +template +__device__ void findBestMatch(float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + int& bestImgIdx1, int& bestImgIdx2, + float* s_distance, int* s_trainIdx, int* s_imgIdx) +{ + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + int myBestImgIdx1 = -1; + int myBestImgIdx2 = -1; - s_distance += threadIdx.y * BLOCK_SIZE; - s_trainIdx += threadIdx.y * BLOCK_SIZE; - s_imgIdx += threadIdx.y * BLOCK_SIZE; + s_distance += threadIdx.y * BLOCK_SIZE; + s_trainIdx += threadIdx.y * BLOCK_SIZE; + s_imgIdx += threadIdx.y * BLOCK_SIZE; - s_distance[threadIdx.x] = bestDistance1; - s_trainIdx[threadIdx.x] = bestTrainIdx1; - s_imgIdx[threadIdx.x] = bestImgIdx1; + s_distance[threadIdx.x] = bestDistance1; + s_trainIdx[threadIdx.x] = bestTrainIdx1; + s_imgIdx[threadIdx.x] = bestImgIdx1; - __syncthreads(); + __syncthreads(); - if (threadIdx.x == 0) + if (threadIdx.x == 0) + { + #pragma unroll + for (int i = 0; i < BLOCK_SIZE; ++i) { - #pragma unroll - for (int i = 0; i < BLOCK_SIZE; ++i) + float val = s_distance[i]; + + if (val < myBestDistance1) + { + myBestDistance2 = myBestDistance1; + myBestTrainIdx2 = myBestTrainIdx1; + myBestImgIdx2 = myBestImgIdx1; + + myBestDistance1 = val; + myBestTrainIdx1 = s_trainIdx[i]; + myBestImgIdx1 = s_imgIdx[i]; + } + else if (val < myBestDistance2) { - float val = s_distance[i]; - - if (val < myBestDistance1) - { - myBestDistance2 = myBestDistance1; - myBestTrainIdx2 = myBestTrainIdx1; - myBestImgIdx2 = myBestImgIdx1; - - myBestDistance1 = val; - myBestTrainIdx1 = s_trainIdx[i]; - myBestImgIdx1 = s_imgIdx[i]; - } - else if (val < myBestDistance2) - { - myBestDistance2 = val; - myBestTrainIdx2 = s_trainIdx[i]; - myBestImgIdx2 = s_imgIdx[i]; - } + myBestDistance2 = val; + myBestTrainIdx2 = s_trainIdx[i]; + myBestImgIdx2 = s_imgIdx[i]; } } + 
} - __syncthreads(); + __syncthreads(); - s_distance[threadIdx.x] = bestDistance2; - s_trainIdx[threadIdx.x] = bestTrainIdx2; - s_imgIdx[threadIdx.x] = bestImgIdx2; + s_distance[threadIdx.x] = bestDistance2; + s_trainIdx[threadIdx.x] = bestTrainIdx2; + s_imgIdx[threadIdx.x] = bestImgIdx2; - __syncthreads(); + __syncthreads(); - if (threadIdx.x == 0) + if (threadIdx.x == 0) + { + #pragma unroll + for (int i = 0; i < BLOCK_SIZE; ++i) { - #pragma unroll - for (int i = 0; i < BLOCK_SIZE; ++i) + float val = s_distance[i]; + + if (val < myBestDistance2) { - float val = s_distance[i]; - - if (val < myBestDistance2) - { - myBestDistance2 = val; - myBestTrainIdx2 = s_trainIdx[i]; - myBestImgIdx2 = s_imgIdx[i]; - } + myBestDistance2 = val; + myBestTrainIdx2 = s_trainIdx[i]; + myBestImgIdx2 = s_imgIdx[i]; } } + } - bestDistance1 = myBestDistance1; - bestDistance2 = myBestDistance2; + bestDistance1 = myBestDistance1; + bestDistance2 = myBestDistance2; - bestTrainIdx1 = myBestTrainIdx1; - bestTrainIdx2 = myBestTrainIdx2; + bestTrainIdx1 = myBestTrainIdx1; + bestTrainIdx2 = myBestTrainIdx2; - bestImgIdx1 = myBestImgIdx1; - bestImgIdx2 = myBestImgIdx2; - } + bestImgIdx1 = myBestImgIdx1; + bestImgIdx2 = myBestImgIdx2; +} - /////////////////////////////////////////////////////////////////////////////// - // Match Unrolled Cached +/////////////////////////////////////////////////////////////////////////////// +// Match Unrolled Cached - template - __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_& query, U* s_query) +template +__device__ void loadQueryToSmem(int queryIdx, const DevMem2D_& query, U* s_query) +{ + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0; + } +} + +template +__device__ void loopUnrolledCached(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + int& bestImgIdx1, int& bestImgIdx2) +{ + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) + { + Dist dist; + #pragma unroll for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) { const int loadX = threadIdx.x + i * BLOCK_SIZE; - s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? 
query.ptr(min(queryIdx, query.rows - 1))[loadX] : 0; - } - } - template - __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance1, float& bestDistance2, - int& bestTrainIdx1, int& bestTrainIdx2, - int& bestImgIdx1, int& bestImgIdx2) - { - for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) - { - Dist dist; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + if (loadX < train.cols) { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + T val; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - - if (loadX < train.cols) - { - T val; + ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } - ForceGlob::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; - } + __syncthreads(); - __syncthreads(); + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + __syncthreads(); + } - __syncthreads(); - } + typename Dist::result_type distVal = dist; - typename Dist::result_type distVal = dist; + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; - const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + { + if (distVal < bestDistance1) + { + bestImgIdx2 = bestImgIdx1; + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; - if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + bestImgIdx1 = imgIdx; + bestDistance1 = distVal; + bestTrainIdx1 = trainIdx; + } + else if (distVal < bestDistance2) { - if (distVal < bestDistance1) - { - bestImgIdx2 = bestImgIdx1; - bestDistance2 = bestDistance1; - bestTrainIdx2 = bestTrainIdx1; - - bestImgIdx1 = imgIdx; - bestDistance1 = distVal; - bestTrainIdx1 = trainIdx; - } - else if (distVal < bestDistance2) - { - bestImgIdx2 = imgIdx; - bestDistance2 = distVal; - bestTrainIdx2 = trainIdx; - } + bestImgIdx2 = imgIdx; + bestDistance2 = distVal; + bestTrainIdx2 = trainIdx; } } } +} - template - __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) - { - extern __shared__ int smem[]; +template +__global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) +{ + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); - loadQueryToSmem(queryIdx, query, s_query); + loadQueryToSmem(queryIdx, query, s_query); - float 
myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; - loopUnrolledCached(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); + loopUnrolledCached(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); - __syncthreads(); + __syncthreads(); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); - bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); - } + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); } +} - template - void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, - cudaStream_t stream) - { - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); +template +void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, + cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? 
MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - matchUnrolledCached<<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + matchUnrolledCached<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - template - __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) - { - extern __shared__ int smem[]; +template +__global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) +{ + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); - loadQueryToSmem(queryIdx, query, s_query); + loadQueryToSmem(queryIdx, query, s_query); - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; - int myBestImgIdx1 = -1; - int myBestImgIdx2 = -1; + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + int myBestImgIdx1 = -1; + int myBestImgIdx2 = -1; - Mask m = mask; + Mask m = mask; - for (int imgIdx = 0; imgIdx < n; ++imgIdx) - { - const DevMem2D_ train = trains[imgIdx]; - m.next(); - loopUnrolledCached(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); - } + for (int imgIdx = 0; imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loopUnrolledCached(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); + } - __syncthreads(); + __syncthreads(); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); - bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); - bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); - } + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); + 
bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); } +} - template - void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, - cudaStream_t stream) - { - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); +template +void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, + cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - matchUnrolledCached<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + matchUnrolledCached<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - /////////////////////////////////////////////////////////////////////////////// - // Match Unrolled +/////////////////////////////////////////////////////////////////////////////// +// Match Unrolled - template - __device__ void loopUnrolled(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance1, float& bestDistance2, - int& bestTrainIdx1, int& bestTrainIdx2, - int& bestImgIdx1, int& bestImgIdx2) +template +__device__ void loopUnrolled(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + int& bestImgIdx1, int& bestImgIdx2) +{ + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) { - for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) + Dist dist; + + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) { - Dist dist; + const int loadX = threadIdx.x + i * BLOCK_SIZE; - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + if (loadX < query.cols) + { + T val; - if (loadX < query.cols) - { - T val; + ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; - ForceGlob::Load(query.ptr(min(queryIdx, query.rows - 1)), loadX, val); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } - ForceGlob::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + 
threadIdx.y] = val; - } + __syncthreads(); - __syncthreads(); + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + __syncthreads(); + } - __syncthreads(); - } + typename Dist::result_type distVal = dist; - typename Dist::result_type distVal = dist; + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; - const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + { + if (distVal < bestDistance1) + { + bestImgIdx2 = bestImgIdx1; + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; - if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + bestImgIdx1 = imgIdx; + bestDistance1 = distVal; + bestTrainIdx1 = trainIdx; + } + else if (distVal < bestDistance2) { - if (distVal < bestDistance1) - { - bestImgIdx2 = bestImgIdx1; - bestDistance2 = bestDistance1; - bestTrainIdx2 = bestTrainIdx1; - - bestImgIdx1 = imgIdx; - bestDistance1 = distVal; - bestTrainIdx1 = trainIdx; - } - else if (distVal < bestDistance2) - { - bestImgIdx2 = imgIdx; - bestDistance2 = distVal; - bestTrainIdx2 = trainIdx; - } + bestImgIdx2 = imgIdx; + bestDistance2 = distVal; + bestTrainIdx2 = trainIdx; } } } +} - template - __global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) - { - extern __shared__ int smem[]; +template +__global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) +{ + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; - loopUnrolled(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); + loopUnrolled(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); - __syncthreads(); + __syncthreads(); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); - bestDistance[queryIdx] = 
make_float2(myBestDistance1, myBestDistance2); - } + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); } +} - template - void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, - cudaStream_t stream) - { - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); +template +void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, + cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - matchUnrolled<<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + matchUnrolled<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - template - __global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) - { - extern __shared__ int smem[]; +template +__global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) +{ + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; - int myBestImgIdx1 = -1; - int myBestImgIdx2 = -1; + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + int myBestImgIdx1 = -1; + int myBestImgIdx2 = -1; - Mask m = mask; + Mask m = mask; - for (int imgIdx = 0; imgIdx < n; ++imgIdx) - { - const DevMem2D_ train = trains[imgIdx]; - m.next(); - loopUnrolled(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); - } + for (int imgIdx = 0; imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loopUnrolled(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); + } - __syncthreads(); + __syncthreads(); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); - 
findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); - bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); - bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); - } + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); } +} - template - void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, - cudaStream_t stream) - { - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); +template +void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, + cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - matchUnrolled<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + matchUnrolled<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - /////////////////////////////////////////////////////////////////////////////// - // Match +/////////////////////////////////////////////////////////////////////////////// +// Match - template - __device__ void loop(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance1, float& bestDistance2, - int& bestTrainIdx1, int& bestTrainIdx2, - int& bestImgIdx1, int& bestImgIdx2) +template +__device__ void loop(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance1, float& bestDistance2, + int& bestTrainIdx1, int& bestTrainIdx2, + int& bestImgIdx1, int& bestImgIdx2) +{ + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) { - for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) + Dist dist; + + for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) { - Dist dist; + const int loadX = threadIdx.x + i * BLOCK_SIZE; - for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + if (loadX < query.cols) + { + T val; - if (loadX < 
query.cols) - { - T val; + ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; - ForceGlob::Load(query.ptr(min(queryIdx, query.rows - 1)), loadX, val); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } - ForceGlob::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; - } + __syncthreads(); - __syncthreads(); + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + __syncthreads(); + } - __syncthreads(); - } + typename Dist::result_type distVal = dist; - typename Dist::result_type distVal = dist; + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; - const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + { + if (distVal < bestDistance1) + { + bestImgIdx2 = bestImgIdx1; + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; - if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx)) + bestImgIdx1 = imgIdx; + bestDistance1 = distVal; + bestTrainIdx1 = trainIdx; + } + else if (distVal < bestDistance2) { - if (distVal < bestDistance1) - { - bestImgIdx2 = bestImgIdx1; - bestDistance2 = bestDistance1; - bestTrainIdx2 = bestTrainIdx1; - - bestImgIdx1 = imgIdx; - bestDistance1 = distVal; - bestTrainIdx1 = trainIdx; - } - else if (distVal < bestDistance2) - { - bestImgIdx2 = imgIdx; - bestDistance2 = distVal; - bestTrainIdx2 = trainIdx; - } + bestImgIdx2 = imgIdx; + bestDistance2 = distVal; + bestTrainIdx2 = trainIdx; } } } +} - template - __global__ void match(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) - { - extern __shared__ int smem[]; +template +__global__ void match(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) +{ + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; - loop(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); + loop(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2); - __syncthreads(); + __syncthreads(); - float* s_distance = 
(float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); - bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); - } + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); } +} - template - void match(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& distance, - cudaStream_t stream) - { - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); +template +void match(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& distance, + cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - match<<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + match<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - template - __global__ void match(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) - { - extern __shared__ int smem[]; +template +__global__ void match(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) +{ + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - float myBestDistance1 = numeric_limits::max(); - float myBestDistance2 = numeric_limits::max(); - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; - int myBestImgIdx1 = -1; - int myBestImgIdx2 = -1; + float myBestDistance1 = numeric_limits::max(); + float myBestDistance2 = numeric_limits::max(); + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + int myBestImgIdx1 = -1; + int myBestImgIdx2 = -1; - Mask m = mask; + Mask m = mask; - for (int imgIdx = 0; imgIdx < n; ++imgIdx) - { - const DevMem2D_ train = trains[imgIdx]; - m.next(); - loop(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); - } + for (int imgIdx = 0; imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loop(queryIdx, query, imgIdx, train, m, s_query, 
s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2); + } - __syncthreads(); + __syncthreads(); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); + findBestMatch(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); - bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); - bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); - } + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2); + bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2); + bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2); } +} - template - void match(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, - cudaStream_t stream) - { - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); +template +void match(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2D_& trainIdx, const DevMem2D_& imgIdx, const DevMem2D_& distance, + cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - match<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + match<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - /////////////////////////////////////////////////////////////////////////////// - // knnMatch 2 dispatcher +/////////////////////////////////////////////////////////////////////////////// +// knnMatch 2 dispatcher - template - void match2Dispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, - int cc, cudaStream_t stream) +template +void match2Dispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Db& trainIdx, const DevMem2Db& distance, + int cc, cudaStream_t stream) +{ + if (query.cols <= 64) { - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - } - 
else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - }*/ - else - { - match<16, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); - } + matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); } - - template - void match2Dispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, - int cc, cudaStream_t stream) + else if (query.cols <= 128) { - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); - }*/ - else - { - match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); - } + matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); } - - /////////////////////////////////////////////////////////////////////////////// - // Calc distance kernel - - template - __global__ void calcDistanceUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, PtrStepf allDist) + /*else if (query.cols <= 256) { - extern __shared__ int smem[]; + matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + }*/ + else + { + match<16, Dist>(query, train, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ > (distance), stream); + } +} - const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; - const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; +template +void match2Dispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, + int cc, cudaStream_t stream) +{ + if (query.cols <= 64) + { + 
matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 128) + { + matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + /*else if (query.cols <= 256) + { + matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + }*/ + else + { + match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_ >(trainIdx), static_cast< DevMem2D_ >(imgIdx), static_cast< DevMem2D_ > (distance), stream); + } +} - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); +/////////////////////////////////////////////////////////////////////////////// +// Calc distance kernel - Dist dist; +template +__global__ void calcDistanceUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, PtrStepf allDist) +{ + extern __shared__ int smem[]; - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; + const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; - if (loadX < query.cols) - { - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX]; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; - } - else - { - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - } + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - __syncthreads(); + Dist dist; - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; - __syncthreads(); + if (loadX < query.cols) + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX]; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; + } + else + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; } - if (queryIdx < query.rows && trainIdx < train.rows) - { - float distVal = numeric_limits::max(); + __syncthreads(); - if (mask(queryIdx, trainIdx)) - distVal = (typename Dist::result_type)dist; + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - 
allDist.ptr(queryIdx)[trainIdx] = distVal; - } + __syncthreads(); } - template - void calcDistanceUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream) + if (queryIdx < query.rows && trainIdx < train.rows) { - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); - - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + float distVal = numeric_limits::max(); - calcDistanceUnrolled<<>>(query, train, mask, allDist); - cudaSafeCall( cudaGetLastError() ); + if (mask(queryIdx, trainIdx)) + distVal = (typename Dist::result_type)dist; - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); + allDist.ptr(queryIdx)[trainIdx] = distVal; } +} - template - __global__ void calcDistance(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, PtrStepf allDist) - { - extern __shared__ int smem[]; +template +void calcDistanceUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); - const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; - const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + calcDistanceUnrolled<<>>(query, train, mask, allDist); + cudaSafeCall( cudaGetLastError() ); - Dist dist; + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; +template +__global__ void calcDistance(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, PtrStepf allDist) +{ + extern __shared__ int smem[]; - if (loadX < query.cols) - { - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX]; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; - } - else - { - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - } + const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; + const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; - __syncthreads(); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + Dist dist; - __syncthreads(); - } + for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; - if (queryIdx < query.rows && trainIdx < train.rows) + if (loadX < query.cols) { - float distVal = numeric_limits::max(); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX]; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX]; + } + else + { + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + 
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + } - if (mask(queryIdx, trainIdx)) - distVal = (typename Dist::result_type)dist; + __syncthreads(); - allDist.ptr(queryIdx)[trainIdx] = distVal; - } + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + + __syncthreads(); } - template - void calcDistance(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream) + if (queryIdx < query.rows && trainIdx < train.rows) { - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + float distVal = numeric_limits::max(); - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - - calcDistance<<>>(query, train, mask, allDist); - cudaSafeCall( cudaGetLastError() ); + if (mask(queryIdx, trainIdx)) + distVal = (typename Dist::result_type)dist; - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); + allDist.ptr(queryIdx)[trainIdx] = distVal; } +} - /////////////////////////////////////////////////////////////////////////////// - // Calc Distance dispatcher +template +void calcDistance(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); - template - void calcDistanceDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Df& allDist, - int cc, cudaStream_t stream) - { - if (query.cols <= 64) - { - calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream); - } - else if (query.cols <= 128) - { - calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream); - } - /*else if (query.cols <= 256) - { - calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream); - } - else if (query.cols <= 512) - { - calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream); - } - else if (query.cols <= 1024) - { - calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream); - }*/ - else - { - calcDistance<16, Dist>(query, train, mask, allDist, stream); - } - } + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - /////////////////////////////////////////////////////////////////////////////// - // find knn match kernel + calcDistance<<>>(query, train, mask, allDist); + cudaSafeCall( cudaGetLastError() ); - template - __global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance) - { - const int SMEM_SIZE = BLOCK_SIZE > 64 ? 
BLOCK_SIZE : 64; - __shared__ float s_dist[SMEM_SIZE]; - __shared__ int s_trainIdx[SMEM_SIZE]; + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - const int queryIdx = blockIdx.x; +/////////////////////////////////////////////////////////////////////////////// +// Calc Distance dispatcher - float* allDistRow = allDist.ptr(queryIdx); +template +void calcDistanceDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Df& allDist, + int cc, cudaStream_t stream) +{ + if (query.cols <= 64) + { + calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream); + } + else if (query.cols <= 128) + { + calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream); + } + /*else if (query.cols <= 256) + { + calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream); + } + else if (query.cols <= 512) + { + calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream); + } + else if (query.cols <= 1024) + { + calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream); + }*/ + else + { + calcDistance<16, Dist>(query, train, mask, allDist, stream); + } +} - float dist = numeric_limits::max(); - int bestIdx = -1; - - for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE) - { - float reg = allDistRow[i]; - if (reg < dist) - { - dist = reg; - bestIdx = i; - } - } +/////////////////////////////////////////////////////////////////////////////// +// find knn match kernel - s_dist[threadIdx.x] = dist; - s_trainIdx[threadIdx.x] = bestIdx; - __syncthreads(); +template +__global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance) +{ + const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64; + __shared__ float s_dist[SMEM_SIZE]; + __shared__ int s_trainIdx[SMEM_SIZE]; - reducePredVal(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less()); + const int queryIdx = blockIdx.x; - if (threadIdx.x == 0) - { - if (dist < numeric_limits::max()) - { - allDistRow[bestIdx] = numeric_limits::max(); - trainIdx.ptr(queryIdx)[i] = bestIdx; - distance.ptr(queryIdx)[i] = dist; - } - } - } + float* allDistRow = allDist.ptr(queryIdx); - template - void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) + float dist = numeric_limits::max(); + int bestIdx = -1; + + for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE) { - const dim3 block(BLOCK_SIZE, 1, 1); - const dim3 grid(trainIdx.rows, 1, 1); - - for (int i = 0; i < k; ++i) + float reg = allDistRow[i]; + if (reg < dist) { - findBestMatch<<>>(allDist, i, trainIdx, distance); - cudaSafeCall( cudaGetLastError() ); + dist = reg; + bestIdx = i; } - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); } - void findKnnMatchDispatcher(int k, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream) - { - findKnnMatch<256>(k, static_cast(trainIdx), static_cast(distance), allDist, stream); - } + s_dist[threadIdx.x] = dist; + s_trainIdx[threadIdx.x] = bestIdx; + __syncthreads(); - /////////////////////////////////////////////////////////////////////////////// - // knn match Dispatcher + reducePredVal(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less()); - template - void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, int k, const Mask& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, - int cc, cudaStream_t stream) + if (threadIdx.x == 0) { - if (k == 2) 
- { - match2Dispatcher(query, train, mask, trainIdx, distance, cc, stream); - } - else + if (dist < numeric_limits::max()) { - calcDistanceDispatcher(query, train, mask, allDist, cc, stream); - findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream); + allDistRow[bestIdx] = numeric_limits::max(); + trainIdx.ptr(queryIdx)[i] = bestIdx; + distance.ptr(queryIdx)[i] = dist; } - } - - /////////////////////////////////////////////////////////////////////////////// - // knn match caller - - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, - int cc, cudaStream_t stream) - { - if (mask.data) - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); - else - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); } +} - template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - //template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); +template +void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, 1, 1); + const dim3 grid(trainIdx.rows, 1, 1); - template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, - int cc, cudaStream_t stream) + for (int i = 0; i < k; ++i) { - if (mask.data) - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); - else - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); + findBestMatch<<>>(allDist, i, trainIdx, distance); + cudaSafeCall( cudaGetLastError() ); } - //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - //template void 
matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, - const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, - int cc, cudaStream_t stream) - { - if (mask.data) - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); - else - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); - } +void findKnnMatchDispatcher(int k, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream) +{ + findKnnMatch<256>(k, static_cast(trainIdx), static_cast(distance), allDist, stream); +} - template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); - template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); +/////////////////////////////////////////////////////////////////////////////// +// knn match Dispatcher - template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, - int cc, cudaStream_t stream) +template +void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, int k, const Mask& mask, 
+ const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, + int cc, cudaStream_t stream) +{ + if (k == 2) { - if (masks.data) - match2Dispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); - else - match2Dispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + match2Dispatcher(query, train, mask, trainIdx, distance, cc, stream); } - - template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - //template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - - template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, - int cc, cudaStream_t stream) + else { - if (masks.data) - match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); - else - match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); + calcDistanceDispatcher(query, train, mask, allDist, cc, stream); + findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream); } - - //template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - //template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - //template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - //template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - //template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Di& 
imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - - template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, - int cc, cudaStream_t stream) - { - if (masks.data) - match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream); - else - match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream); - } - - template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - //template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - //template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); - template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream); -}}} +} + +/////////////////////////////////////////////////////////////////////////////// +// knn match caller + +template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, + const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, + int cc, cudaStream_t stream) +{ + if (mask.data) + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + else + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); +} + +template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); +//template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); +template void matchL1_gpu(const 
DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + +template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, + const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, + int cc, cudaStream_t stream) +{ + if (mask.data) + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + else + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); +} + +//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); +template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); + +template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask, + const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, + int cc, cudaStream_t stream) +{ + if (mask.data) + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream); + else + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream); +} + +template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); +//template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream); +template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t 
stream);
+//template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
+template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, int k, const DevMem2Db& mask, const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
+
+template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks,
+    const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
+    int cc, cudaStream_t stream)
+{
+    if (masks.data)
+        match2Dispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
+    else
+        match2Dispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
+}
+
+template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+//template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+template void match2L1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+
+template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks,
+    const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
+    int cc, cudaStream_t stream)
+{
+    if (masks.data)
+        match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
+    else
+        match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
+}
+
+//template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+//template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+//template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+//template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+//template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Di& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+template void match2L2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+
+template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks,
+    const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
+    int cc, cudaStream_t stream)
+{
+    if (masks.data)
+        match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
+    else
+        match2Dispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
+}
+
+template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+//template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+//template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+template void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance, int cc, cudaStream_t stream);
+
+} // namespace bf_knnmatch
+
+END_OPENCV_DEVICE_NAMESPACE
diff --git a/modules/gpu/src/cuda/bf_match.cu b/modules/gpu/src/cuda/bf_match.cu
index e46939fa9f032900e639474d992f8865d5c132e8..0ab56be630a2d3b981163fb8f08ee50b6fd6517b 100644
--- a/modules/gpu/src/cuda/bf_match.cu
+++ b/modules/gpu/src/cuda/bf_match.cu
@@ -45,734 +45,736 @@
 #include "opencv2/gpu/device/vec_distance.hpp"
 #include "opencv2/gpu/device/datamov_utils.hpp"
 
-using namespace cv::gpu;
-using namespace cv::gpu::device;
+BEGIN_OPENCV_DEVICE_NAMESPACE
 
-namespace cv { namespace gpu { namespace bf_match
+namespace bf_match {
+
+///////////////////////////////////////////////////////////////////////////////
+// Reduction
+
+template
+__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)
 {
-    ///////////////////////////////////////////////////////////////////////////////
-    // Reduction
+    s_distance += threadIdx.y * BLOCK_SIZE;
+    s_trainIdx += threadIdx.y * BLOCK_SIZE;
 
-    template
-    __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)
-    {
-        s_distance += threadIdx.y * BLOCK_SIZE;
-        s_trainIdx += threadIdx.y * BLOCK_SIZE;
+    s_distance[threadIdx.x] = bestDistance;
+    s_trainIdx[threadIdx.x] = bestTrainIdx;
- s_distance[threadIdx.x] = bestDistance; - s_trainIdx[threadIdx.x] = bestTrainIdx; + __syncthreads(); - __syncthreads(); + reducePredVal(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less()); +} - reducePredVal(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less()); - } +template +__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx) +{ + s_distance += threadIdx.y * BLOCK_SIZE; + s_trainIdx += threadIdx.y * BLOCK_SIZE; + s_imgIdx += threadIdx.y * BLOCK_SIZE; - template - __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx) - { - s_distance += threadIdx.y * BLOCK_SIZE; - s_trainIdx += threadIdx.y * BLOCK_SIZE; - s_imgIdx += threadIdx.y * BLOCK_SIZE; + s_distance[threadIdx.x] = bestDistance; + s_trainIdx[threadIdx.x] = bestTrainIdx; + s_imgIdx [threadIdx.x] = bestImgIdx; - s_distance[threadIdx.x] = bestDistance; - s_trainIdx[threadIdx.x] = bestTrainIdx; - s_imgIdx [threadIdx.x] = bestImgIdx; + __syncthreads(); - __syncthreads(); + reducePredVal2(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less()); +} - reducePredVal2(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less()); - } +/////////////////////////////////////////////////////////////////////////////// +// Match Unrolled Cached - /////////////////////////////////////////////////////////////////////////////// - // Match Unrolled Cached +template +__device__ void loadQueryToSmem(int queryIdx, const DevMem2D_& query, U* s_query) +{ + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; + s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0; + } +} - template - __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_& query, U* s_query) +template +__device__ void loopUnrolledCached(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance, int& bestTrainIdx, int& bestImgIdx) +{ + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) { + Dist dist; + #pragma unroll for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) { const int loadX = threadIdx.x + i * BLOCK_SIZE; - s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? 
query.ptr(min(queryIdx, query.rows - 1))[loadX] : 0; - } - } - template - __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance, int& bestTrainIdx, int& bestImgIdx) - { - for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) - { - Dist dist; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + if (loadX < train.cols) { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + T val; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - - if (loadX < train.cols) - { - T val; - - ForceGlob::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; - } + ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } - __syncthreads(); + __syncthreads(); - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - __syncthreads(); - } + __syncthreads(); + } - typename Dist::result_type distVal = dist; + typename Dist::result_type distVal = dist; - const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; - if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) - { - bestImgIdx = imgIdx; - bestDistance = distVal; - bestTrainIdx = trainIdx; - } + if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) + { + bestImgIdx = imgIdx; + bestDistance = distVal; + bestTrainIdx = trainIdx; } } +} - template - __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) - { - extern __shared__ int smem[]; +template +__global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) +{ + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); - loadQueryToSmem(queryIdx, query, s_query); + loadQueryToSmem(queryIdx, query, s_query); - float myBestDistance = numeric_limits::max(); - int myBestTrainIdx = -1; + float myBestDistance = numeric_limits::max(); + int myBestTrainIdx = -1; - loopUnrolledCached(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); + loopUnrolledCached(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); - __syncthreads(); + __syncthreads(); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + float* 
s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); + findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestDistance[queryIdx] = myBestDistance; - } + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestDistance[queryIdx] = myBestDistance; } +} - template - void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - cudaStream_t stream) - { - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); +template +void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - matchUnrolledCached<<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + matchUnrolledCached<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - template - __global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, - int* bestTrainIdx, int* bestImgIdx, float* bestDistance) - { - extern __shared__ int smem[]; +template +__global__ void matchUnrolledCached(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, + int* bestTrainIdx, int* bestImgIdx, float* bestDistance) +{ + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN); - loadQueryToSmem(queryIdx, query, s_query); + loadQueryToSmem(queryIdx, query, s_query); - float myBestDistance = numeric_limits::max(); - int myBestTrainIdx = -1; - int myBestImgIdx = -1; + float myBestDistance = numeric_limits::max(); + int myBestTrainIdx = -1; + int myBestImgIdx = -1; - Mask m = mask; + Mask m = mask; - for (int imgIdx = 0; imgIdx < n; ++imgIdx) - { - const DevMem2D_ train = trains[imgIdx]; - m.next(); - loopUnrolledCached(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); - } + for (int imgIdx = 0; imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loopUnrolledCached(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); + } - __syncthreads(); + __syncthreads(); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - int* 
s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx); + findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestImgIdx[queryIdx] = myBestImgIdx; - bestDistance[queryIdx] = myBestDistance; - } + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestImgIdx[queryIdx] = myBestImgIdx; + bestDistance[queryIdx] = myBestDistance; } +} - template - void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - cudaStream_t stream) - { - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); +template +void matchUnrolledCached(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - matchUnrolledCached<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + matchUnrolledCached<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - /////////////////////////////////////////////////////////////////////////////// - // Match Unrolled +/////////////////////////////////////////////////////////////////////////////// +// Match Unrolled - template - __device__ void loopUnrolled(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance, int& bestTrainIdx, int& bestImgIdx) +template +__device__ void loopUnrolled(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance, int& bestTrainIdx, int& bestImgIdx) +{ + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) { - for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) - { - Dist dist; + Dist dist; - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - if (loadX < query.cols) - { - T val; + if (loadX < 
query.cols) + { + T val; - ForceGlob::Load(query.ptr(min(queryIdx, query.rows - 1)), loadX, val); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; - ForceGlob::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; - } + ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } - __syncthreads(); + __syncthreads(); - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - __syncthreads(); - } + __syncthreads(); + } - typename Dist::result_type distVal = dist; + typename Dist::result_type distVal = dist; - const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; - if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) - { - bestImgIdx = imgIdx; - bestDistance = distVal; - bestTrainIdx = trainIdx; - } + if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) + { + bestImgIdx = imgIdx; + bestDistance = distVal; + bestTrainIdx = trainIdx; } } +} - template - __global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) - { - extern __shared__ int smem[]; +template +__global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) +{ + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - float myBestDistance = numeric_limits::max(); - int myBestTrainIdx = -1; + float myBestDistance = numeric_limits::max(); + int myBestTrainIdx = -1; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - - loopUnrolled(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + + loopUnrolled(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); - __syncthreads(); + __syncthreads(); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); + findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestDistance[queryIdx] = myBestDistance; - } + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestDistance[queryIdx] = myBestDistance; } +} - template - void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& 
mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - cudaStream_t stream) - { - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); +template +void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - matchUnrolled<<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + matchUnrolled<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - template - __global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, - int* bestTrainIdx, int* bestImgIdx, float* bestDistance) - { - extern __shared__ int smem[]; +template +__global__ void matchUnrolled(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, + int* bestTrainIdx, int* bestImgIdx, float* bestDistance) +{ + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - float myBestDistance = numeric_limits::max(); - int myBestTrainIdx = -1; - int myBestImgIdx = -1; + float myBestDistance = numeric_limits::max(); + int myBestTrainIdx = -1; + int myBestImgIdx = -1; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - Mask m = mask; - - for (int imgIdx = 0; imgIdx < n; ++imgIdx) - { - const DevMem2D_ train = trains[imgIdx]; - m.next(); - loopUnrolled(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); - } + Mask m = mask; + + for (int imgIdx = 0; imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loopUnrolled(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); + } - __syncthreads(); + __syncthreads(); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx); + findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestImgIdx[queryIdx] = myBestImgIdx; - bestDistance[queryIdx] = myBestDistance; - } + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestImgIdx[queryIdx] = myBestImgIdx; + bestDistance[queryIdx] = myBestDistance; } +} - template - void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& 
mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - cudaStream_t stream) - { - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); +template +void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - matchUnrolled<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + matchUnrolled<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - /////////////////////////////////////////////////////////////////////////////// - // Match +/////////////////////////////////////////////////////////////////////////////// +// Match - template - __device__ void loop(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, - typename Dist::value_type* s_query, typename Dist::value_type* s_train, - float& bestDistance, int& bestTrainIdx, int& bestImgIdx) +template +__device__ void loop(int queryIdx, const DevMem2D_& query, int imgIdx, const DevMem2D_& train, const Mask& mask, + typename Dist::value_type* s_query, typename Dist::value_type* s_train, + float& bestDistance, int& bestTrainIdx, int& bestImgIdx) +{ + for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) { - for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t) - { - Dist dist; + Dist dist; - for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - if (loadX < query.cols) - { - T val; + if (loadX < query.cols) + { + T val; - ForceGlob::Load(query.ptr(min(queryIdx, query.rows - 1)), loadX, val); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; - ForceGlob::Load(train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; - } + ForceGlob::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } - __syncthreads(); + __syncthreads(); - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - __syncthreads(); - } + __syncthreads(); + } - typename Dist::result_type distVal = dist; + typename Dist::result_type 
distVal = dist; - const int trainIdx = t * BLOCK_SIZE + threadIdx.x; + const int trainIdx = t * BLOCK_SIZE + threadIdx.x; - if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) - { - bestImgIdx = imgIdx; - bestDistance = distVal; - bestTrainIdx = trainIdx; - } + if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx)) + { + bestImgIdx = imgIdx; + bestDistance = distVal; + bestTrainIdx = trainIdx; } } +} - template - __global__ void match(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) - { - extern __shared__ int smem[]; +template +__global__ void match(const DevMem2D_ query, const DevMem2D_ train, const Mask mask, int* bestTrainIdx, float* bestDistance) +{ + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - float myBestDistance = numeric_limits::max(); - int myBestTrainIdx = -1; + float myBestDistance = numeric_limits::max(); + int myBestTrainIdx = -1; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - - loop(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + + loop(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx); - __syncthreads(); + __syncthreads(); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); + findBestMatch(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestDistance[queryIdx] = myBestDistance; - } + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestDistance[queryIdx] = myBestDistance; } +} - template - void match(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - cudaStream_t stream) - { - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); +template +void match(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - match<<>>(query, train, mask, trainIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + match<<>>(query, train, mask, trainIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - template - __global__ void match(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, - int* bestTrainIdx, int* bestImgIdx, float* bestDistance) - { - extern 
__shared__ int smem[]; +template +__global__ void match(const DevMem2D_ query, const DevMem2D_* trains, int n, const Mask mask, + int* bestTrainIdx, int* bestImgIdx, float* bestDistance) +{ + extern __shared__ int smem[]; - const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; + const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y; - float myBestDistance = numeric_limits::max(); - int myBestTrainIdx = -1; - int myBestImgIdx = -1; + float myBestDistance = numeric_limits::max(); + int myBestTrainIdx = -1; + int myBestImgIdx = -1; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - Mask m = mask; - for (int imgIdx = 0; imgIdx < n; ++imgIdx) - { - const DevMem2D_ train = trains[imgIdx]; - m.next(); - loop(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); - } + Mask m = mask; + for (int imgIdx = 0; imgIdx < n; ++imgIdx) + { + const DevMem2D_ train = trains[imgIdx]; + m.next(); + loop(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx); + } - __syncthreads(); + __syncthreads(); - float* s_distance = (float*)(smem); - int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); - int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); + float* s_distance = (float*)(smem); + int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE); + int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE); - findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx); + findBestMatch(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx); - if (queryIdx < query.rows && threadIdx.x == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestImgIdx[queryIdx] = myBestImgIdx; - bestDistance[queryIdx] = myBestDistance; - } + if (queryIdx < query.rows && threadIdx.x == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestImgIdx[queryIdx] = myBestImgIdx; + bestDistance[queryIdx] = myBestDistance; } +} - template - void match(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - cudaStream_t stream) - { - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(query.rows, BLOCK_SIZE)); +template +void match(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(query.rows, BLOCK_SIZE)); - const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - match<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); - cudaSafeCall( cudaGetLastError() ); + match<<>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - /////////////////////////////////////////////////////////////////////////////// - // Match dispatcher 
+/////////////////////////////////////////////////////////////////////////////// +// Match dispatcher - template - void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) +template +void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) +{ + if (query.cols <= 64) { - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream); - }*/ - else - { - match<16, Dist>(query, train, mask, trainIdx, distance, stream); - } + matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream); } - - template - void matchDispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) + else if (query.cols <= 128) { - if (query.cols <= 64) - { - matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } - else if (query.cols <= 128) - { - matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - }*/ - else - { - match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); - } + matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream); } - - /////////////////////////////////////////////////////////////////////////////// - // Match caller - - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) + /*else if (query.cols <= 256) { - if (mask.data) - { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), - trainIdx, distance, - cc, stream); - } - else - { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), - trainIdx, distance, - cc, stream); - } + matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream); } - - template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - //template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& 
queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - - template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream); + }*/ + else { - if (mask.data) - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), - trainIdx, distance, - cc, stream); - } - else - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), - trainIdx, distance, - cc, stream); - } + match<16, Dist>(query, train, mask, trainIdx, distance, stream); } +} - //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - - template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) +template +void matchDispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) +{ + if (query.cols <= 64) { - if (mask.data) - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), - trainIdx, distance, - cc, stream); - } - else - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), - trainIdx, distance, - cc, stream); - } + matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, 
distance, stream); } - - template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) + else if (query.cols <= 128) { - if (masks.data) - { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), - trainIdx, imgIdx, distance, - cc, stream); - } - else - { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), - trainIdx, imgIdx, distance, - cc, stream); - } + matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); + } + /*else if (query.cols <= 256) + { + matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); + } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); + }*/ + else + { + match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream); + } +} - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - //template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); 
+/////////////////////////////////////////////////////////////////////////////// +// Match caller - template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) +template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) +{ + if (mask.data) { - if (masks.data) - { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), - trainIdx, imgIdx, distance, - cc, stream); - } - else - { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), - trainIdx, imgIdx, distance, - cc, stream); - } + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), + trainIdx, distance, + cc, stream); } + else + { + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), + trainIdx, distance, + cc, stream); + } +} + +template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +//template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + +template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) +{ + if (mask.data) + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), + trainIdx, distance, + cc, stream); + } + else + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), + trainIdx, distance, + cc, stream); + } +} + +//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& 
mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + +template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) +{ + if (mask.data) + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), SingleMask(mask), + trainIdx, distance, + cc, stream); + } + else + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), WithOutMask(), + trainIdx, distance, + cc, stream); + } +} - //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +//template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +//template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); - template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, - int cc, cudaStream_t stream) +template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, 
const DevMem2D_& masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) +{ + if (masks.data) { - if (masks.data) - { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), - trainIdx, imgIdx, distance, - cc, stream); - } - else - { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), - trainIdx, imgIdx, distance, - cc, stream); - } + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), + trainIdx, imgIdx, distance, + cc, stream); + } + else + { + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), + trainIdx, imgIdx, distance, + cc, stream); } +} + +template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +//template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); + +template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, + int cc, cudaStream_t stream) +{ + if (masks.data) + { + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, MaskCollection(masks.data), + trainIdx, imgIdx, distance, + cc, stream); + } + else + { + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains.ptr(), trains.cols, WithOutMask(), + trainIdx, imgIdx, distance, + cc, stream); + } +} + +//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int 
cc, cudaStream_t stream);
+//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+
+template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks,
+    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+    int cc, cudaStream_t stream)
+{
+    if (masks.data)
+    {
+        matchDispatcher(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
+            trainIdx, imgIdx, distance,
+            cc, stream);
+    }
+    else
+    {
+        matchDispatcher(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(),
+            trainIdx, imgIdx, distance,
+            cc, stream);
+    }
+}
+
+template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+//template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+//template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
+
+} // namespace bf_match
-    template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-    //template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-    template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-    //template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-    template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
-}}}
+END_OPENCV_DEVICE_NAMESPACE
diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu
index e3500758dd48e89f00dd2a5ec878c9a1d5018b95..519ed7fc0f98a09168e7c779a5e0d9d600d593fe 100644
--- a/modules/gpu/src/cuda/bf_radius_match.cu
+++ b/modules/gpu/src/cuda/bf_radius_match.cu
@@
-45,421 +45,423 @@ #include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/datamov_utils.hpp" -using namespace cv::gpu; -using namespace cv::gpu::device; +BEGIN_OPENCV_DEVICE_NAMESPACE -namespace cv { namespace gpu { namespace bf_radius_match +namespace bf_radius_match { + +/////////////////////////////////////////////////////////////////////////////// +// Match Unrolled + +template +__global__ void matchUnrolled(const DevMem2D_ query, int imgIdx, const DevMem2D_ train, float maxDistance, const Mask mask, + PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) { - /////////////////////////////////////////////////////////////////////////////// - // Match Unrolled + #if __CUDA_ARCH__ >= 110 - template - __global__ void matchUnrolled(const DevMem2D_ query, int imgIdx, const DevMem2D_ train, float maxDistance, const Mask mask, - PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) - { - #if __CUDA_ARCH__ >= 110 + extern __shared__ int smem[]; + + const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; + const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; - extern __shared__ int smem[]; + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; - const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + Dist dist; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + #pragma unroll + for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; - Dist dist; + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - #pragma unroll - for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i) + if (loadX < query.cols) { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + T val; - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; - if (loadX < query.cols) - { - T val; + ForceGlob::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } - ForceGlob::Load(query.ptr(min(queryIdx, query.rows - 1)), loadX, val); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + __syncthreads(); - ForceGlob::Load(train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; - } + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - __syncthreads(); + __syncthreads(); + } - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + float distVal = (typename Dist::result_type)dist; - __syncthreads(); + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance) + { + unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1); + if (ind < maxCount) + { + bestTrainIdx.ptr(queryIdx)[ind] = 
trainIdx; + if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx; + bestDistance.ptr(queryIdx)[ind] = distVal; } + } - float distVal = (typename Dist::result_type)dist; + #endif +} - if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance) - { - unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1); - if (ind < maxCount) - { - bestTrainIdx.ptr(queryIdx)[ind] = trainIdx; - if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx; - bestDistance.ptr(queryIdx)[ind] = distVal; - } - } +template +void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); - #endif - } + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - template - void matchUnrolled(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, cudaStream_t stream) - { - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + matchUnrolled<<>>(query, 0, train, maxDistance, mask, + trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols); + cudaSafeCall( cudaGetLastError() ); - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - matchUnrolled<<>>(query, 0, train, maxDistance, mask, - trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols); - cudaSafeCall( cudaGetLastError() ); +template +void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - template - void matchUnrolled(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - cudaStream_t stream) + for (int i = 0; i < n; ++i) { - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const DevMem2D_ train = trains[i]; - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); - for (int i = 0; i < n; ++i) + if (masks != 0 && masks[i].data) { - const DevMem2D_ train = trains[i]; - - const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); - - if (masks != 0 && masks[i].data) - { - matchUnrolled<<>>(query, i, train, maxDistance, SingleMask(masks[i]), - trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); - } - else - { - matchUnrolled<<>>(query, i, train, maxDistance, WithOutMask(), - trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); - } - cudaSafeCall( cudaGetLastError() ); + matchUnrolled<<>>(query, i, train, maxDistance, SingleMask(masks[i]), + trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); } - - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); + else + { + matchUnrolled<<>>(query, i, train, 
maxDistance, WithOutMask(), + trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); + } + cudaSafeCall( cudaGetLastError() ); } - /////////////////////////////////////////////////////////////////////////////// - // Match + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - template - __global__ void match(const DevMem2D_ query, int imgIdx, const DevMem2D_ train, float maxDistance, const Mask mask, - PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) - { - #if __CUDA_ARCH__ >= 110 +/////////////////////////////////////////////////////////////////////////////// +// Match - extern __shared__ int smem[]; +template +__global__ void match(const DevMem2D_ query, int imgIdx, const DevMem2D_ train, float maxDistance, const Mask mask, + PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) +{ + #if __CUDA_ARCH__ >= 110 - const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; - const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; + extern __shared__ int smem[]; - typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); - typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); + const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; + const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x; - Dist dist; + typename Dist::value_type* s_query = (typename Dist::value_type*)(smem); + typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE); - for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) - { - const int loadX = threadIdx.x + i * BLOCK_SIZE; + Dist dist; + + for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i) + { + const int loadX = threadIdx.x + i * BLOCK_SIZE; - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0; + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0; - if (loadX < query.cols) - { - T val; + if (loadX < query.cols) + { + T val; - ForceGlob::Load(query.ptr(min(queryIdx, query.rows - 1)), loadX, val); - s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; + ForceGlob::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val); + s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val; - ForceGlob::Load(train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); - s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; - } + ForceGlob::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val); + s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val; + } - __syncthreads(); + __syncthreads(); - #pragma unroll - for (int j = 0; j < BLOCK_SIZE; ++j) - dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); + #pragma unroll + for (int j = 0; j < BLOCK_SIZE; ++j) + dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]); - __syncthreads(); - } + __syncthreads(); + } - float distVal = (typename Dist::result_type)dist; + float distVal = (typename Dist::result_type)dist; - if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance) + if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance) + { + unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1); + 
if (ind < maxCount) { - unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1); - if (ind < maxCount) - { - bestTrainIdx.ptr(queryIdx)[ind] = trainIdx; - if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx; - bestDistance.ptr(queryIdx)[ind] = distVal; - } + bestTrainIdx.ptr(queryIdx)[ind] = trainIdx; + if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx; + bestDistance.ptr(queryIdx)[ind] = distVal; } - - #endif } - template - void match(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - cudaStream_t stream) - { - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); + #endif +} - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); +template +void match(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); - match<<>>(query, 0, train, maxDistance, mask, - trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols); - cudaSafeCall( cudaGetLastError() ); + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } + match<<>>(query, 0, train, maxDistance, mask, + trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols); + cudaSafeCall( cudaGetLastError() ); - template - void match(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - cudaStream_t stream) - { - const dim3 block(BLOCK_SIZE, BLOCK_SIZE); + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); +template +void match(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + cudaStream_t stream) +{ + const dim3 block(BLOCK_SIZE, BLOCK_SIZE); - for (int i = 0; i < n; ++i) - { - const DevMem2D_ train = trains[i]; - - const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); - - if (masks != 0 && masks[i].data) - { - match<<>>(query, i, train, maxDistance, SingleMask(masks[i]), - trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); - } - else - { - match<<>>(query, i, train, maxDistance, WithOutMask(), - trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); - } - cudaSafeCall( cudaGetLastError() ); - } + const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } + for (int i = 0; i < n; ++i) + { + const DevMem2D_ train = trains[i]; - /////////////////////////////////////////////////////////////////////////////// - // Match dispatcher + const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE)); - template - void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) - { - if (query.cols <= 64) - { - 
matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - } - else if (query.cols <= 128) - { - matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - } - /*else if (query.cols <= 256) + if (masks != 0 && masks[i].data) { - matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + match<<>>(query, i, train, maxDistance, SingleMask(masks[i]), + trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); - }*/ else { - match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + match<<>>(query, i, train, maxDistance, WithOutMask(), + trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols); } + cudaSafeCall( cudaGetLastError() ); } - template - void matchDispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} + +/////////////////////////////////////////////////////////////////////////////// +// Match dispatcher + +template +void matchDispatcher(const DevMem2D_& query, const DevMem2D_& train, float maxDistance, const Mask& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) +{ + if (query.cols <= 64) { - if (query.cols <= 64) - { - matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - else if (query.cols <= 128) - { - matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - /*else if (query.cols <= 256) - { - matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - else if (query.cols <= 512) - { - matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - else if (query.cols <= 1024) - { - matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - }*/ - else - { - match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); - } - } - - /////////////////////////////////////////////////////////////////////////////// - // Radius Match caller - - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) + matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + } + else if (query.cols <= 128) { - if (mask.data) - { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, distance, nMatches, - cc, stream); - } - else - { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, distance, nMatches, - cc, stream); - } + 
matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); } - - template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - //template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - - template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) + /*else if (query.cols <= 256) { - if (mask.data) - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, distance, nMatches, - cc, stream); - } - else - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, distance, nMatches, - cc, stream); - } + matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + }*/ + else + { + match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream); + } +} - //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const 
DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - - template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, - const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) +template +void matchDispatcher(const DevMem2D_& query, const DevMem2D_* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) +{ + if (query.cols <= 64) { - if (mask.data) - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), - trainIdx, distance, nMatches, - cc, stream); - } - else - { - matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), - trainIdx, distance, nMatches, - cc, stream); - } + matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); } + else if (query.cols <= 128) + { + matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); + } + /*else if (query.cols <= 256) + { + matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); + } + else if (query.cols <= 512) + { + matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); + } + else if (query.cols <= 1024) + { + matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); + }*/ + else + { + match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream); + } +} - template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - //template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); 
+/////////////////////////////////////////////////////////////////////////////// +// Radius Match caller - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) +template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) +{ + if (mask.data) { - matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, - trainIdx, imgIdx, distance, nMatches, + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, distance, nMatches, cc, stream); } - - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - //template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - - template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) + else { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, - trainIdx, imgIdx, distance, nMatches, + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, distance, nMatches, cc, stream); } - - //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& 
imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - //template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); - - template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, - const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, - int cc, cudaStream_t stream) +} + +template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +//template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + +template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) +{ + if (mask.data) + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, distance, nMatches, + cc, stream); + } + else + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, distance, nMatches, + cc, stream); + } +} + +//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float 
maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +template void matchL2_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + +template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, + const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) +{ + if (mask.data) + { + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, SingleMask(mask), + trainIdx, distance, nMatches, + cc, stream); + } + else { - matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, - trainIdx, imgIdx, distance, nMatches, + matchDispatcher(static_cast< DevMem2D_ >(query), static_cast< DevMem2D_ >(train), maxDistance, WithOutMask(), + trainIdx, distance, nMatches, cc, stream); } +} + +template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +//template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +//template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +template void matchHamming_gpu(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + +template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, 
cudaStream_t stream) +{ + matchDispatcher< L1Dist >(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, + trainIdx, imgIdx, distance, nMatches, + cc, stream); +} + +template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +//template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +template void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); + +template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, + const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, + int cc, cudaStream_t stream) +{ + matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks, + trainIdx, imgIdx, distance, nMatches, + cc, stream); +} + +//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +//template void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream); +template void matchL2_gpu(const 
DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream);
+
+template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
+ const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches,
+ int cc, cudaStream_t stream)
+{
+ matchDispatcher(static_cast< DevMem2D_ >(query), (const DevMem2D_*)trains, n, maxDistance, masks,
+ trainIdx, imgIdx, distance, nMatches,
+ cc, stream);
+}
+
+template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream);
+//template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream);
+template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream);
+//template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream);
+template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream);
+
+} // namespace bf_radius_match
- template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream);
- //template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream);
- template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream);
- //template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream);
- template void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_& nMatches, int cc, cudaStream_t stream);
-}}}
+END_OPENCV_DEVICE_NAMESPACE
diff --git a/modules/gpu/src/cuda/bilateral_filter.cu b/modules/gpu/src/cuda/bilateral_filter.cu
index 
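Throughout the remaining hunks, the explicit namespace cv { namespace gpu { namespace xxx { ... }}} wrappers are replaced by the BEGIN_OPENCV_DEVICE_NAMESPACE / END_OPENCV_DEVICE_NAMESPACE macros plus one plain per-file namespace (bf_radius_match above; bilateral_filter, blend, canny and others below). The macro definitions themselves are not part of this excerpt; the sketch below is an assumption about their expansion, inferred only from the way OPENCV_DEVICE_NAMESPACE_ is later used as a name qualifier in the calib3d hunks, and is given purely to make the hunks readable.

// Assumed expansions (not taken from this patch; presumably defined in a shared
// header such as internal_shared.hpp, which is not shown here):
#define OPENCV_DEVICE_NAMESPACE        cv::gpu::device
#define OPENCV_DEVICE_NAMESPACE_       cv::gpu::device::
#define BEGIN_OPENCV_DEVICE_NAMESPACE  namespace cv { namespace gpu { namespace device {
#define END_OPENCV_DEVICE_NAMESPACE    }}}

Under that assumption, a file that previously opened with namespace cv { namespace gpu { namespace bf { now opens with BEGIN_OPENCV_DEVICE_NAMESPACE followed by its own namespace (for example namespace bilateral_filter {), and the old closing }}} becomes a single } for the per-file namespace plus END_OPENCV_DEVICE_NAMESPACE.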
173c156cf42b4170295ab3dabc7849f2346142eb..4d3d9bcf7789a4f9d6d21e129d670e9d3c68a5cc 100644 --- a/modules/gpu/src/cuda/bilateral_filter.cu +++ b/modules/gpu/src/cuda/bilateral_filter.cu @@ -43,191 +43,186 @@ #include "internal_shared.hpp" #include "opencv2/gpu/device/limits.hpp" -using namespace cv::gpu; -using namespace cv::gpu::device; +BEGIN_OPENCV_DEVICE_NAMESPACE -namespace bf_krnls +namespace bilateral_filter { + +__constant__ float* ctable_color; +__constant__ float* ctable_space; +__constant__ size_t ctable_space_step; + +__constant__ int cndisp; +__constant__ int cradius; + +__constant__ short cedge_disc; +__constant__ short cmax_disc; + +void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc) { - __constant__ float* ctable_color; - __constant__ float* ctable_space; - __constant__ size_t ctable_space_step; + cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) ); + cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) ); + size_t table_space_step = table_space.step / sizeof(float); + cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) ); - __constant__ int cndisp; - __constant__ int cradius; + cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) ); + cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) ); - __constant__ short cedge_disc; - __constant__ short cmax_disc; + cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) ); + cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) ); } -namespace cv { namespace gpu { namespace bf +template +struct DistRgbMax { - void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc) + static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) { - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_color, &table_color, sizeof(table_color)) ); - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space, &table_space.data, sizeof(table_space.data)) ); - size_t table_space_step = table_space.step / sizeof(float); - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::ctable_space_step, &table_space_step, sizeof(size_t)) ); - - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cndisp, &ndisp, sizeof(int)) ); - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cradius, &radius, sizeof(int)) ); - - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cedge_disc, &edge_disc, sizeof(short)) ); - cudaSafeCall( cudaMemcpyToSymbol(bf_krnls::cmax_disc, &max_disc, sizeof(short)) ); + uchar x = ::abs(a[0] - b[0]); + uchar y = ::abs(a[1] - b[1]); + uchar z = ::abs(a[2] - b[2]); + return (::max(::max(x, y), z)); } -}}} +}; -namespace bf_krnls +template <> +struct DistRgbMax<1> { - template - struct DistRgbMax + static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) { - static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) - { - uchar x = abs(a[0] - b[0]); - uchar y = abs(a[1] - b[1]); - uchar z = abs(a[2] - b[2]); - return (max(max(x, y), z)); - } - }; + return ::abs(a[0] - b[0]); + } +}; - template <> - struct DistRgbMax<1> - { - static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) - { - return abs(a[0] - b[0]); - } - }; +template +__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w) +{ + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = ((blockIdx.x 
* blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); - template - __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w) - { - const int y = blockIdx.y * blockDim.y + threadIdx.y; - const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); + T dp[5]; - T dp[5]; + if (y > 0 && y < h - 1 && x > 0 && x < w - 1) + { + dp[0] = *(disp + (y ) * disp_step + x + 0); + dp[1] = *(disp + (y-1) * disp_step + x + 0); + dp[2] = *(disp + (y ) * disp_step + x - 1); + dp[3] = *(disp + (y+1) * disp_step + x + 0); + dp[4] = *(disp + (y ) * disp_step + x + 1); - if (y > 0 && y < h - 1 && x > 0 && x < w - 1) + if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc) { - dp[0] = *(disp + (y ) * disp_step + x + 0); - dp[1] = *(disp + (y-1) * disp_step + x + 0); - dp[2] = *(disp + (y ) * disp_step + x - 1); - dp[3] = *(disp + (y+1) * disp_step + x + 0); - dp[4] = *(disp + (y ) * disp_step + x + 1); + const int ymin = ::max(0, y - cradius); + const int xmin = ::max(0, x - cradius); + const int ymax = ::min(h - 1, y + cradius); + const int xmax = ::min(w - 1, x + cradius); - if(abs(dp[1] - dp[0]) >= cedge_disc || abs(dp[2] - dp[0]) >= cedge_disc || abs(dp[3] - dp[0]) >= cedge_disc || abs(dp[4] - dp[0]) >= cedge_disc) - { - const int ymin = max(0, y - cradius); - const int xmin = max(0, x - cradius); - const int ymax = min(h - 1, y + cradius); - const int xmax = min(w - 1, x + cradius); + float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + const uchar* ic = img + y * img_step + channels * x; - const uchar* ic = img + y * img_step + channels * x; + for(int yi = ymin; yi <= ymax; yi++) + { + const T* disp_y = disp + yi * disp_step; - for(int yi = ymin; yi <= ymax; yi++) + for(int xi = xmin; xi <= xmax; xi++) { - const T* disp_y = disp + yi * disp_step; - - for(int xi = xmin; xi <= xmax; xi++) - { - const uchar* in = img + yi * img_step + channels * xi; + const uchar* in = img + yi * img_step + channels * xi; - uchar dist_rgb = DistRgbMax::calc(in, ic); + uchar dist_rgb = DistRgbMax::calc(in, ic); - const float weight = ctable_color[dist_rgb] * (ctable_space + abs(y-yi)* ctable_space_step)[abs(x-xi)]; + const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)]; - const T disp_reg = disp_y[xi]; + const T disp_reg = disp_y[xi]; - cost[0] += min(cmax_disc, abs(disp_reg - dp[0])) * weight; - cost[1] += min(cmax_disc, abs(disp_reg - dp[1])) * weight; - cost[2] += min(cmax_disc, abs(disp_reg - dp[2])) * weight; - cost[3] += min(cmax_disc, abs(disp_reg - dp[3])) * weight; - cost[4] += min(cmax_disc, abs(disp_reg - dp[4])) * weight; - } + cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight; + cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight; + cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight; + cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight; + cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight; } + } - float minimum = numeric_limits::max(); - int id = 0; + float minimum = numeric_limits::max(); + int id = 0; - if (cost[0] < minimum) - { - minimum = cost[0]; - id = 0; - } - if (cost[1] < minimum) - { - minimum = cost[1]; - id = 1; - } - if (cost[2] < minimum) - { - minimum = cost[2]; - id = 2; - } - if (cost[3] < minimum) - { - minimum = cost[3]; - id = 3; - } - if (cost[4] < minimum) - { - minimum = 
cost[4]; - id = 4; - } - - *(disp + y * disp_step + x) = dp[id]; + if (cost[0] < minimum) + { + minimum = cost[0]; + id = 0; } + if (cost[1] < minimum) + { + minimum = cost[1]; + id = 1; + } + if (cost[2] < minimum) + { + minimum = cost[2]; + id = 2; + } + if (cost[3] < minimum) + { + minimum = cost[3]; + id = 3; + } + if (cost[4] < minimum) + { + minimum = cost[4]; + id = 4; + } + + *(disp + y * disp_step + x) = dp[id]; } } } -namespace cv { namespace gpu { namespace bf +template +void bilateral_filter_caller(DevMem2D_ disp, DevMem2Db img, int channels, int iters, cudaStream_t stream) { - template - void bilateral_filter_caller(const DevMem2D_& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream) - { - dim3 threads(32, 8, 1); - dim3 grid(1, 1, 1); - grid.x = divUp(disp.cols, threads.x << 1); - grid.y = divUp(disp.rows, threads.y); + dim3 threads(32, 8, 1); + dim3 grid(1, 1, 1); + grid.x = divUp(disp.cols, threads.x << 1); + grid.y = divUp(disp.rows, threads.y); - switch (channels) + switch (channels) + { + case 1: + for (int i = 0; i < iters; ++i) { - case 1: - for (int i = 0; i < iters; ++i) - { - bf_krnls::bilateral_filter<1><<>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); - cudaSafeCall( cudaGetLastError() ); - bf_krnls::bilateral_filter<1><<>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); - cudaSafeCall( cudaGetLastError() ); - } - break; - case 3: - for (int i = 0; i < iters; ++i) - { - bf_krnls::bilateral_filter<3><<>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); - cudaSafeCall( cudaGetLastError() ); - bf_krnls::bilateral_filter<3><<>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); - cudaSafeCall( cudaGetLastError() ); - } - break; - default: - cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); + bilateral_filter<1><<>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); + + bilateral_filter<1><<>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); } + break; + case 3: + for (int i = 0; i < iters; ++i) + { + bilateral_filter<3><<>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); - if (stream != 0) - cudaSafeCall( cudaDeviceSynchronize() ); + bilateral_filter<3><<>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols); + cudaSafeCall( cudaGetLastError() ); + } + break; + default: + cv::gpu::error("Unsupported channels count", __FILE__, __LINE__); } - void bilateral_filter_gpu(const DevMem2Db& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream) - { - bilateral_filter_caller(disp, img, channels, iters, stream); - } + if (stream != 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - void bilateral_filter_gpu(const DevMem2D_& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream) - { - bilateral_filter_caller(disp, img, channels, iters, stream); - } -}}} +void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream) +{ + bilateral_filter_caller(disp, img, channels, iters, stream); +} + +void bilateral_filter_gpu(DevMem2D_ disp, DevMem2Db img, int channels, int iters, cudaStream_t stream) +{ + bilateral_filter_caller(disp, img, channels, iters, stream); +} + +} // namespace bilateral_filter + +END_OPENCV_DEVICE_NAMESPACE diff 
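The bilateral_filter_caller above launches the kernel twice per iteration, first with t = 0 and then with t = 1, and sizes grid.x with divUp(disp.cols, threads.x << 1) rather than divUp(disp.cols, threads.x). Combined with the kernel's index computation x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1), this is in effect a red/black (checkerboard) sweep: all pixels written in one pass share the same (x + y) parity, so a pixel's four direct neighbours are only updated in the other pass. The following stand-alone sketch, which is illustrative only and not part of the patch, prints the columns of one row visited by each pass.

#include <cstdio>

// Illustration only: which columns of row y the two launches (t = 0, then t = 1)
// visit, using the same parity rule as the bilateral_filter kernel above.
int main()
{
    const int y = 3, cols = 8;
    for (int t = 0; t < 2; ++t)
    {
        std::printf("pass t=%d:", t);
        for (int i = 0; ; ++i)
        {
            const int x = (i << 1) + ((y + t) & 1); // kernel's column mapping for thread i
            if (x >= cols)
                break;
            std::printf(" %d", x);
        }
        std::printf("\n"); // for y = 3: t=0 prints 1 3 5 7, t=1 prints 0 2 4 6
    }
    return 0;
}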
--git a/modules/gpu/src/cuda/blend.cu b/modules/gpu/src/cuda/blend.cu index 4b29a70509923e91d944768770ccaf0ded7cbfff..fca1b961ed81a2be183736a5e269a329455d39ad 100644 --- a/modules/gpu/src/cuda/blend.cu +++ b/modules/gpu/src/cuda/blend.cu @@ -42,81 +42,77 @@ #include "internal_shared.hpp" -using namespace cv::gpu; +BEGIN_OPENCV_DEVICE_NAMESPACE -namespace cv { namespace gpu +namespace blend { + +template +__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep img1, const PtrStep img2, + const PtrStepf weights1, const PtrStepf weights2, PtrStep result) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; - template - __global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep img1, const PtrStep img2, - const PtrStepf weights1, const PtrStepf weights2, PtrStep result) + if (y < rows && x < cols) { - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; + int x_ = x / cn; + float w1 = weights1.ptr(y)[x_]; + float w2 = weights2.ptr(y)[x_]; + T p1 = img1.ptr(y)[x]; + T p2 = img2.ptr(y)[x]; + result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f); + } +} - if (y < rows && x < cols) - { - int x_ = x / cn; - float w1 = weights1.ptr(y)[x_]; - float w2 = weights2.ptr(y)[x_]; - T p1 = img1.ptr(y)[x]; - T p2 = img2.ptr(y)[x]; - result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f); - } - } +template +void blendLinearCaller(int rows, int cols, int cn, PtrStep img1, PtrStep img2, PtrStepf weights1, PtrStepf weights2, PtrStep result, cudaStream_t stream) +{ + dim3 threads(16, 16); + dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y)); + + blendLinearKernel<<>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result); + cudaSafeCall( cudaGetLastError() ); - template - void blendLinearCaller(int rows, int cols, int cn, const PtrStep& img1, const PtrStep& img2, - const PtrStepf& weights1, const PtrStepf& weights2, PtrStep result, cudaStream_t stream) - { - dim3 threads(16, 16); - dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y)); - - blendLinearKernel<<>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result); - cudaSafeCall( cudaGetLastError() ); + if (stream == 0) + cudaSafeCall(cudaDeviceSynchronize()); +} - if (stream == 0) - cudaSafeCall(cudaDeviceSynchronize()); - } +template void blendLinearCaller(int, int, int, PtrStep, PtrStep, PtrStepf, PtrStepf, PtrStep, cudaStream_t stream); +template void blendLinearCaller(int, int, int, PtrStep, PtrStep, PtrStepf, PtrStepf, PtrStep, cudaStream_t stream); - template void blendLinearCaller(int, int, int, const PtrStep&, const PtrStep&, - const PtrStepf&, const PtrStepf&, PtrStep, cudaStream_t stream); - template void blendLinearCaller(int, int, int, const PtrStep&, const PtrStep&, - const PtrStepf&, const PtrStepf&, PtrStep, cudaStream_t stream); +__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2, + const PtrStepf weights1, const PtrStepf weights2, PtrStepb result) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; - __global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2, - const PtrStepf weights1, const PtrStepf weights2, PtrStepb result) + if (y < rows && x < cols) { - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; - - if (y < rows && x < cols) - { - float w1 = weights1.ptr(y)[x]; - float w2 = 
weights2.ptr(y)[x]; - float sum_inv = 1.f / (w1 + w2 + 1e-5f); - w1 *= sum_inv; - w2 *= sum_inv; - uchar4 p1 = ((const uchar4*)img1.ptr(y))[x]; - uchar4 p2 = ((const uchar4*)img2.ptr(y))[x]; - ((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2, - p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2); - } + float w1 = weights1.ptr(y)[x]; + float w2 = weights2.ptr(y)[x]; + float sum_inv = 1.f / (w1 + w2 + 1e-5f); + w1 *= sum_inv; + w2 *= sum_inv; + uchar4 p1 = ((const uchar4*)img1.ptr(y))[x]; + uchar4 p2 = ((const uchar4*)img2.ptr(y))[x]; + ((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2, + p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2); } +} +void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream) +{ + dim3 threads(16, 16); + dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); + + blendLinearKernel8UC4<<>>(rows, cols, img1, img2, weights1, weights2, result); + cudaSafeCall( cudaGetLastError() ); - void blendLinearCaller8UC4(int rows, int cols, const PtrStepb& img1, const PtrStepb& img2, - const PtrStepf& weights1, const PtrStepf& weights2, PtrStepb result, cudaStream_t stream) - { - dim3 threads(16, 16); - dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y)); - - blendLinearKernel8UC4<<>>(rows, cols, img1, img2, weights1, weights2, result); - cudaSafeCall( cudaGetLastError() ); + if (stream == 0) + cudaSafeCall(cudaDeviceSynchronize()); +} - if (stream == 0) - cudaSafeCall(cudaDeviceSynchronize()); - } +} // namespace blend -}} \ No newline at end of file +END_OPENCV_DEVICE_NAMESPACE diff --git a/modules/gpu/src/cuda/calib3d.cu b/modules/gpu/src/cuda/calib3d.cu index 2a30393817635cd635e00e130520a4f3482a48df..1cdf191b0380c1f7891977e121cee4c8dc77a87f 100644 --- a/modules/gpu/src/cuda/calib3d.cu +++ b/modules/gpu/src/cuda/calib3d.cu @@ -44,153 +44,149 @@ #include "opencv2/gpu/device/transform.hpp" #include "opencv2/gpu/device/functional.hpp" -#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200 +BEGIN_OPENCV_DEVICE_NAMESPACE -using namespace cv::gpu::device; +#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200 -namespace cv { namespace gpu +namespace transform_points { - namespace transform_points - { - __constant__ float3 crot0; - __constant__ float3 crot1; - __constant__ float3 crot2; - __constant__ float3 ctransl; + __constant__ float3 crot0; + __constant__ float3 crot1; + __constant__ float3 crot2; + __constant__ float3 ctransl; - struct TransformOp : unary_function - { - __device__ __forceinline__ float3 operator()(const float3& p) const - { - return make_float3( - crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x, - crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, - crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z); - } - }; - - void call(const DevMem2D_ src, const float* rot, - const float* transl, DevMem2D_ dst, - cudaStream_t stream) + struct TransformOp : unary_function + { + __device__ __forceinline__ float3 operator()(const float3& p) const { - cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3)); - cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3)); - cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3)); - cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3)); - transform(src, dst, TransformOp(), stream); + return make_float3( + crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x, + crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + 
ctransl.y, + crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z); } - } // namespace transform_points - + }; - namespace project_points + void call(const DevMem2D_ src, const float* rot, + const float* transl, DevMem2D_ dst, + cudaStream_t stream) { - __constant__ float3 crot0; - __constant__ float3 crot1; - __constant__ float3 crot2; - __constant__ float3 ctransl; - __constant__ float3 cproj0; - __constant__ float3 cproj1; - - struct ProjectOp : unary_function - { - __device__ __forceinline__ float2 operator()(const float3& p) const - { - // Rotate and translate in 3D - float3 t = make_float3( - crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x, - crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, - crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z); - // Project on 2D plane - return make_float2( - (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z, - (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z); - } - }; - - void call(const DevMem2D_ src, const float* rot, - const float* transl, const float* proj, DevMem2D_ dst, - cudaStream_t stream) + cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3)); + cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3)); + cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3)); + cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3)); + OPENCV_DEVICE_NAMESPACE_ transform(src, dst, TransformOp(), stream); + } +} // namespace transform_points + +namespace project_points +{ + __constant__ float3 crot0; + __constant__ float3 crot1; + __constant__ float3 crot2; + __constant__ float3 ctransl; + __constant__ float3 cproj0; + __constant__ float3 cproj1; + + struct ProjectOp : unary_function + { + __device__ __forceinline__ float2 operator()(const float3& p) const { - cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3)); - cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3)); - cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3)); - cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3)); - cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3)); - cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3)); - transform(src, dst, ProjectOp(), stream); + // Rotate and translate in 3D + float3 t = make_float3( + crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x, + crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y, + crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z); + // Project on 2D plane + return make_float2( + (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z, + (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z); } - } // namespace project_points + }; + void call(const DevMem2D_ src, const float* rot, + const float* transl, const float* proj, DevMem2D_ dst, + cudaStream_t stream) + { + cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3)); + cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3)); + cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3)); + cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3)); + cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3)); + cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3)); + OPENCV_DEVICE_NAMESPACE_ transform(src, dst, ProjectOp(), stream); + } +} // namespace project_points + +namespace solve_pnp_ransac +{ + __constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3]; + __constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS]; - 
namespace solve_pnp_ransac + int maxNumIters() { - __constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3]; - __constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS]; + return SOLVE_PNP_RANSAC_MAX_NUM_ITERS; + } - int maxNumIters() - { - return SOLVE_PNP_RANSAC_MAX_NUM_ITERS; - } + __device__ __forceinline__ float sqr(float x) + { + return x * x; + } + + __global__ void computeHypothesisScoresKernel( + const int num_points, const float3* object, const float2* image, + const float dist_threshold, int* g_num_inliers) + { + const float3* const &rot_mat = crot_matrices + blockIdx.x * 3; + const float3 &transl_vec = ctransl_vectors[blockIdx.x]; + int num_inliers = 0; - __device__ __forceinline__ float sqr(float x) + for (int i = threadIdx.x; i < num_points; i += blockDim.x) { - return x * x; + float3 p = object[i]; + p = make_float3( + rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x, + rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y, + rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z); + p.x /= p.z; + p.y /= p.z; + float2 image_p = image[i]; + if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold) + ++num_inliers; } - __global__ void computeHypothesisScoresKernel( - const int num_points, const float3* object, const float2* image, - const float dist_threshold, int* g_num_inliers) + extern __shared__ float s_num_inliers[]; + s_num_inliers[threadIdx.x] = num_inliers; + __syncthreads(); + + for (int step = blockDim.x / 2; step > 0; step >>= 1) { - const float3* const &rot_mat = crot_matrices + blockIdx.x * 3; - const float3 &transl_vec = ctransl_vectors[blockIdx.x]; - int num_inliers = 0; - - for (int i = threadIdx.x; i < num_points; i += blockDim.x) - { - float3 p = object[i]; - p = make_float3( - rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x, - rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y, - rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z); - p.x /= p.z; - p.y /= p.z; - float2 image_p = image[i]; - if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold) - ++num_inliers; - } - - extern __shared__ float s_num_inliers[]; - s_num_inliers[threadIdx.x] = num_inliers; + if (threadIdx.x < step) + s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step]; __syncthreads(); - - for (int step = blockDim.x / 2; step > 0; step >>= 1) - { - if (threadIdx.x < step) - s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step]; - __syncthreads(); - } - - if (threadIdx.x == 0) - g_num_inliers[blockIdx.x] = s_num_inliers[0]; } - void computeHypothesisScores( - const int num_hypotheses, const int num_points, const float* rot_matrices, - const float3* transl_vectors, const float3* object, const float2* image, - const float dist_threshold, int* hypothesis_scores) - { - cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3))); - cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3))); + if (threadIdx.x == 0) + g_num_inliers[blockIdx.x] = s_num_inliers[0]; + } - dim3 threads(256); - dim3 grid(num_hypotheses); - int smem_size = threads.x * sizeof(float); + void computeHypothesisScores( + const int num_hypotheses, const int num_points, const float* rot_matrices, + const float3* transl_vectors, const float3* object, const float2* image, + const float dist_threshold, int* hypothesis_scores) + { + 
cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3))); + cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3))); - computeHypothesisScoresKernel<<>>( - num_points, object, image, dist_threshold, hypothesis_scores); - cudaSafeCall( cudaGetLastError() ); + dim3 threads(256); + dim3 grid(num_hypotheses); + int smem_size = threads.x * sizeof(float); - cudaSafeCall( cudaDeviceSynchronize() ); - } - } // namespace solvepnp_ransac + computeHypothesisScoresKernel<<>>( + num_points, object, image, dist_threshold, hypothesis_scores); + cudaSafeCall( cudaGetLastError() ); + + cudaSafeCall( cudaDeviceSynchronize() ); + } +} // namespace solvepnp_ransac -}} // namespace cv { namespace gpu +END_OPENCV_DEVICE_NAMESPACE diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu index 4ea26fcb095f42deab26edd48ff7b5b07006919e..0c0f5d9a2c887cc9203a82e224ca1b623cbfdd08 100644 --- a/modules/gpu/src/cuda/canny.cu +++ b/modules/gpu/src/cuda/canny.cu @@ -44,339 +44,370 @@ #include #include "internal_shared.hpp" -using namespace cv::gpu; +BEGIN_OPENCV_DEVICE_NAMESPACE -namespace cv { namespace gpu { namespace canny +namespace canny { + +__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols) { - __global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols) - { - __shared__ int smem[16][18]; + __shared__ int smem[16][18]; - const int j = blockIdx.x * blockDim.x + threadIdx.x; - const int i = blockIdx.y * blockDim.y + threadIdx.y; + const int j = blockIdx.x * blockDim.x + threadIdx.x; + const int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i < rows) + if (i < rows) + { + smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j]; + if (threadIdx.x == 0) { - smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j]; - if (threadIdx.x == 0) - { - smem[threadIdx.y][0] = src.ptr(i)[max(j - 1, 0)]; - smem[threadIdx.y][17] = src.ptr(i)[min(j + 16, cols - 1)]; - } - __syncthreads(); + smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)]; + smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)]; + } + __syncthreads(); - if (j < cols) - { - dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2]; - dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2]; - } + if (j < cols) + { + dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2]; + dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2]; } } +} - void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols) - { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); +void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols) +{ + dim3 block(16, 16, 1); + dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - calcSobelRowPass<<>>(src, dx_buf, dy_buf, rows, cols); - cudaSafeCall( cudaGetLastError() ); + calcSobelRowPass<<>>(src, dx_buf, dy_buf, rows, cols); + cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); - } + cudaSafeCall(cudaThreadSynchronize()); +} - struct L1 +struct L1 +{ + static __device__ __forceinline__ float calc(int x, int y) { - static __device__ __forceinline__ float calc(int x, int y) - { - return abs(x) + abs(y); - } - }; - struct 
L2 + return ::abs(x) + ::abs(y); + } +}; +struct L2 +{ + static __device__ __forceinline__ float calc(int x, int y) { - static __device__ __forceinline__ float calc(int x, int y) - { - return sqrtf(x * x + y * y); - } - }; + return ::sqrtf(x * x + y * y); + } +}; - template __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf, - PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols) - { - __shared__ int sdx[18][16]; - __shared__ int sdy[18][16]; +template __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf, + PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols) +{ + __shared__ int sdx[18][16]; + __shared__ int sdy[18][16]; - const int j = blockIdx.x * blockDim.x + threadIdx.x; - const int i = blockIdx.y * blockDim.y + threadIdx.y; + const int j = blockIdx.x * blockDim.x + threadIdx.x; + const int i = blockIdx.y * blockDim.y + threadIdx.y; - if (j < cols) + if (j < cols) + { + sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j]; + sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j]; + if (threadIdx.y == 0) { - sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j]; - sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j]; - if (threadIdx.y == 0) - { - sdx[0][threadIdx.x] = dx_buf.ptr(max(i - 1, 0))[j]; - sdx[17][threadIdx.x] = dx_buf.ptr(min(i + 16, rows - 1))[j]; + sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j]; + sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j]; - sdy[0][threadIdx.x] = dy_buf.ptr(max(i - 1, 0))[j]; - sdy[17][threadIdx.x] = dy_buf.ptr(min(i + 16, rows - 1))[j]; - } - __syncthreads(); + sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j]; + sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j]; + } + __syncthreads(); - if (i < rows) - { - int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x]; - int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x]; + if (i < rows) + { + int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x]; + int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x]; - dx.ptr(i)[j] = x; - dy.ptr(i)[j] = y; + dx.ptr(i)[j] = x; + dy.ptr(i)[j] = y; - mag.ptr(i + 1)[j + 1] = Norm::calc(x, y); - } + mag.ptr(i + 1)[j + 1] = Norm::calc(x, y); } } +} - void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad) - { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); +void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad) +{ + dim3 block(16, 16, 1); + dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - if (L2Grad) - calcMagnitude<<>>(dx_buf, dy_buf, dx, dy, mag, rows, cols); - else - calcMagnitude<<>>(dx_buf, dy_buf, dx, dy, mag, rows, cols); + if (L2Grad) + calcMagnitude<<>>(dx_buf, dy_buf, dx, dy, mag, rows, cols); + else + calcMagnitude<<>>(dx_buf, dy_buf, dx, dy, mag, rows, cols); - cudaSafeCall( cudaGetLastError() ); + cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); - } + cudaSafeCall(cudaThreadSynchronize()); +} - template __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols) - { - const int j = blockIdx.x * blockDim.x + threadIdx.x; - const int i = blockIdx.y * blockDim.y + threadIdx.y; +template __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, 
int cols) +{ + const int j = blockIdx.x * blockDim.x + threadIdx.x; + const int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i < rows && j < cols) - mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]); - } + if (i < rows && j < cols) + mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]); +} - void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad) - { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); +void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad) +{ + dim3 block(16, 16, 1); + dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - if (L2Grad) - calcMagnitude<<>>(dx, dy, mag, rows, cols); - else - calcMagnitude<<>>(dx, dy, mag, rows, cols); + if (L2Grad) + calcMagnitude<<>>(dx, dy, mag, rows, cols); + else + calcMagnitude<<>>(dx, dy, mag, rows, cols); - cudaSafeCall( cudaGetLastError() ); + cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); - } + cudaSafeCall(cudaThreadSynchronize()); +} ////////////////////////////////////////////////////////////////////////////////////////// - + #define CANNY_SHIFT 15 #define TG22 (int)(0.4142135623730950488016887242097*(1< low_thresh) - { - const int tg22x = x * TG22; - const int tg67x = tg22x + ((x + x) << CANNY_SHIFT); + if (m > low_thresh) + { + const int tg22x = x * TG22; + const int tg67x = tg22x + ((x + x) << CANNY_SHIFT); - y <<= CANNY_SHIFT; + y <<= CANNY_SHIFT; - if (y < tg22x) - { - if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2]) - edge_type = 1 + (int)(m > high_thresh); - } - else if( y > tg67x ) - { - if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1]) - edge_type = 1 + (int)(m > high_thresh); - } - else - { - if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s]) - edge_type = 1 + (int)(m > high_thresh); - } + if (y < tg22x) + { + if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2]) + edge_type = 1 + (int)(m > high_thresh); + } + else if( y > tg67x ) + { + if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1]) + edge_type = 1 + (int)(m > high_thresh); + } + else + { + if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s]) + edge_type = 1 + (int)(m > high_thresh); } - - map.ptr(i + 1)[j + 1] = edge_type; } + + map.ptr(i + 1)[j + 1] = edge_type; } +} #undef CANNY_SHIFT #undef TG22 - void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh) - { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); +void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh) +{ + dim3 block(16, 16, 1); + dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - calcMap<<>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh); - cudaSafeCall( cudaGetLastError() ); + calcMap<<>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh); + cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); - } + cudaSafeCall(cudaThreadSynchronize()); +} ////////////////////////////////////////////////////////////////////////////////////////// - __device__ unsigned int counter = 0; - - __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols) - { - 
#if __CUDA_ARCH__ >= 120 - - __shared__ int smem[18][18]; +__device__ unsigned int counter = 0; - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; +__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols) +{ + #if __CUDA_ARCH__ >= 120 - const int tid = threadIdx.y * 16 + threadIdx.x; - const int lx = tid % 18; - const int ly = tid / 18; + __shared__ int smem[18][18]; - if (ly < 14) - smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; + const int j = blockIdx.x * 16 + threadIdx.x; + const int i = blockIdx.y * 16 + threadIdx.y; - if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols) - smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx]; + const int tid = threadIdx.y * 16 + threadIdx.x; + const int lx = tid % 18; + const int ly = tid / 18; - __syncthreads(); + if (ly < 14) + smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx]; - if (i < rows && j < cols) - { - int n; + if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols) + smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx]; - #pragma unroll - for (int k = 0; k < 16; ++k) - { - n = 0; + __syncthreads(); - if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) - { - n += smem[threadIdx.y ][threadIdx.x ] == 2; - n += smem[threadIdx.y ][threadIdx.x + 1] == 2; - n += smem[threadIdx.y ][threadIdx.x + 2] == 2; - - n += smem[threadIdx.y + 1][threadIdx.x ] == 2; - n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; - - n += smem[threadIdx.y + 2][threadIdx.x ] == 2; - n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; - n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; - } - - if (n > 0) - smem[threadIdx.y + 1][threadIdx.x + 1] = 2; - } - - const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; - - map.ptr(i + 1)[j + 1] = e; + if (i < rows && j < cols) + { + int n; + #pragma unroll + for (int k = 0; k < 16; ++k) + { n = 0; - if (e == 2) + if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) { - n += smem[threadIdx.y ][threadIdx.x ] == 1; - n += smem[threadIdx.y ][threadIdx.x + 1] == 1; - n += smem[threadIdx.y ][threadIdx.x + 2] == 1; + n += smem[threadIdx.y ][threadIdx.x ] == 2; + n += smem[threadIdx.y ][threadIdx.x + 1] == 2; + n += smem[threadIdx.y ][threadIdx.x + 2] == 2; - n += smem[threadIdx.y + 1][threadIdx.x ] == 1; - n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; + n += smem[threadIdx.y + 1][threadIdx.x ] == 2; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; - n += smem[threadIdx.y + 2][threadIdx.x ] == 1; - n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; - n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; + n += smem[threadIdx.y + 2][threadIdx.x ] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; } if (n > 0) - { - const unsigned int ind = atomicInc(&counter, (unsigned int)(-1)); - st[ind] = make_ushort2(j + 1, i + 1); - } + smem[threadIdx.y + 1][threadIdx.x + 1] = 2; } - #endif - } + const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; - void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols) - { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); + map.ptr(i + 1)[j + 1] = e; - edgesHysteresisLocal<<>>(map, st1, rows, cols); - cudaSafeCall( cudaGetLastError() ); + n = 0; - cudaSafeCall(cudaThreadSynchronize()); + if (e == 2) + { + n += smem[threadIdx.y ][threadIdx.x ] == 1; + n += smem[threadIdx.y ][threadIdx.x + 1] == 1; + n += 
smem[threadIdx.y ][threadIdx.x + 2] == 1; + + n += smem[threadIdx.y + 1][threadIdx.x ] == 1; + n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; + + n += smem[threadIdx.y + 2][threadIdx.x ] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; + n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; + } + + if (n > 0) + { + const unsigned int ind = atomicInc(&counter, (unsigned int)(-1)); + st[ind] = make_ushort2(j + 1, i + 1); + } } - __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; - __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; + #endif +} - __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count) - { - #if __CUDA_ARCH__ >= 120 +void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols) +{ + dim3 block(16, 16, 1); + dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - const int stack_size = 512; - - __shared__ unsigned int s_counter; - __shared__ unsigned int s_ind; - __shared__ ushort2 s_st[stack_size]; + edgesHysteresisLocal<<>>(map, st1, rows, cols); + cudaSafeCall( cudaGetLastError() ); - if (threadIdx.x == 0) - s_counter = 0; - __syncthreads(); + cudaSafeCall(cudaThreadSynchronize()); +} + +__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; +__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; + +__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count) +{ + #if __CUDA_ARCH__ >= 120 + + const int stack_size = 512; + + __shared__ unsigned int s_counter; + __shared__ unsigned int s_ind; + __shared__ ushort2 s_st[stack_size]; + + if (threadIdx.x == 0) + s_counter = 0; + __syncthreads(); - int ind = blockIdx.y * gridDim.x + blockIdx.x; + int ind = blockIdx.y * gridDim.x + blockIdx.x; - if (ind < count) + if (ind < count) + { + ushort2 pos = st1[ind]; + + if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) { - ushort2 pos = st1[ind]; + if (threadIdx.x < 8) + { + pos.x += c_dx[threadIdx.x]; + pos.y += c_dy[threadIdx.x]; + + if (map.ptr(pos.y)[pos.x] == 1) + { + map.ptr(pos.y)[pos.x] = 2; + + ind = atomicInc(&s_counter, (unsigned int)(-1)); + + s_st[ind] = pos; + } + } + __syncthreads(); - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) + while (s_counter > 0 && s_counter <= stack_size - blockDim.x) { - if (threadIdx.x < 8) + const int subTaskIdx = threadIdx.x >> 3; + const int portion = ::min(s_counter, blockDim.x >> 3); + + pos.x = pos.y = 0; + + if (subTaskIdx < portion) + pos = s_st[s_counter - 1 - subTaskIdx]; + __syncthreads(); + + if (threadIdx.x == 0) + s_counter -= portion; + __syncthreads(); + + if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) { - pos.x += c_dx[threadIdx.x]; - pos.y += c_dy[threadIdx.x]; + pos.x += c_dx[threadIdx.x & 7]; + pos.y += c_dy[threadIdx.x & 7]; if (map.ptr(pos.y)[pos.x] == 1) { @@ -388,103 +419,75 @@ namespace cv { namespace gpu { namespace canny } } __syncthreads(); + } - while (s_counter > 0 && s_counter <= stack_size - blockDim.x) + if (s_counter > 0) + { + if (threadIdx.x == 0) { - const int subTaskIdx = threadIdx.x >> 3; - const int portion = min(s_counter, blockDim.x >> 3); - - pos.x = pos.y = 0; - - if (subTaskIdx < portion) - pos = s_st[s_counter - 1 - subTaskIdx]; - __syncthreads(); - - if (threadIdx.x == 0) - s_counter -= portion; - __syncthreads(); - - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) - { - pos.x += c_dx[threadIdx.x & 7]; - pos.y += c_dy[threadIdx.x & 7]; - - if (map.ptr(pos.y)[pos.x] == 1) - { - map.ptr(pos.y)[pos.x] = 2; 
- - ind = atomicInc(&s_counter, (unsigned int)(-1)); - - s_st[ind] = pos; - } - } - __syncthreads(); + ind = atomicAdd(&counter, s_counter); + s_ind = ind - s_counter; } + __syncthreads(); - if (s_counter > 0) - { - if (threadIdx.x == 0) - { - ind = atomicAdd(&counter, s_counter); - s_ind = ind - s_counter; - } - __syncthreads(); - - ind = s_ind; + ind = s_ind; - for (int i = threadIdx.x; i < s_counter; i += blockDim.x) - { - st2[ind + i] = s_st[i]; - } + for (int i = threadIdx.x; i < s_counter; i += blockDim.x) + { + st2[ind + i] = s_st[i]; } } } - - #endif } - void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols) - { - void* counter_ptr; - cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, "cv::gpu::canny::counter") ); - - unsigned int count; - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + #endif +} - while (count > 0) - { - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); +void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols) +{ + void* counter_ptr; + cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) ); + + unsigned int count; + cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); - dim3 block(128, 1, 1); - dim3 grid(min(count, 65535u), divUp(count, 65535), 1); - edgesHysteresisGlobal<<>>(map, st1, st2, rows, cols, count); - cudaSafeCall( cudaGetLastError() ); + while (count > 0) + { + cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) ); - cudaSafeCall(cudaThreadSynchronize()); + dim3 block(128, 1, 1); + dim3 grid(min(count, 65535u), divUp(count, 65535), 1); + edgesHysteresisGlobal<<>>(map, st1, st2, rows, cols, count); + cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + cudaSafeCall(cudaThreadSynchronize()); - std::swap(st1, st2); - } + cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) ); + + std::swap(st1, st2); } +} - __global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols) - { - const int j = blockIdx.x * 16 + threadIdx.x; - const int i = blockIdx.y * 16 + threadIdx.y; +__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols) +{ + const int j = blockIdx.x * 16 + threadIdx.x; + const int i = blockIdx.y * 16 + threadIdx.y; - if (i < rows && j < cols) - dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1)); - } + if (i < rows && j < cols) + dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1)); +} - void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols) - { - dim3 block(16, 16, 1); - dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); +void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols) +{ + dim3 block(16, 16, 1); + dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1); - getEdges<<>>(map, dst, rows, cols); - cudaSafeCall( cudaGetLastError() ); + getEdges<<>>(map, dst, rows, cols); + cudaSafeCall( cudaGetLastError() ); - cudaSafeCall(cudaThreadSynchronize()); - } -}}} + cudaSafeCall(cudaThreadSynchronize()); +} + +} // namespace canny + +END_OPENCV_DEVICE_NAMESPACE diff --git a/modules/gpu/src/cuda/color.cu b/modules/gpu/src/cuda/color.cu index dfa74290efae2c38a60a44a8adeedc0eabd6c941..4da3f77c7b40b352a8ce87584e45d2e5ac5274c7 100644 --- a/modules/gpu/src/cuda/color.cu +++ b/modules/gpu/src/cuda/color.cu @@ -44,336 +44,337 @@ #include "opencv2/gpu/device/transform.hpp" #include 
"opencv2/gpu/device/color.hpp" -namespace cv { namespace gpu { namespace device -{ - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_x = 8 }; - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits - { - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits : DefaultTransformFunctorTraits - { - enum { smart_shift = 4 }; - }; - - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits::functor_type> : 
DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - template <> struct TransformFunctorTraits::functor_type> : DefaultTransformFunctorTraits::functor_type> - { - enum { smart_block_dim_y = 8 }; - enum { smart_shift = 4 }; - }; - - #define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \ - void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream) \ - { \ - traits::functor_type functor = traits::create_functor(); \ - typedef typename traits::functor_type::argument_type src_t; \ - typedef typename traits::functor_type::result_type dst_t; \ - transform((DevMem2D_)src, (DevMem2D_)dst, functor, stream); \ - } - - #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \ - OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits) - - #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \ - OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits) \ - OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits) \ - OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits) - - #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \ - OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits) \ - OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits) \ - OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits) \ - OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits) - - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba) - - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555) - 
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565) - - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra) - - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra) - - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565) - - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray) - - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray) - - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4) - - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra) - - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4) - - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra) - - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4) - - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra) - - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv) - 
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4) - - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra) - - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4) - - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr) - OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra) - - #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR - #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE - #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL - #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F -}}} +BEGIN_OPENCV_DEVICE_NAMESPACE + +DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits::functor_type) +{ + enum { smart_block_dim_x = 8 }; + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; + +DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; +DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; +DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; +DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; + +DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; +DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; +DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; +DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; + +DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; + +DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type) +{ + enum { smart_shift = 4 }; +}; +DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type) +{ + enum { smart_shift = 4 }; +}; + +DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { 
smart_shift = 4 }; +}; +DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; + +DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; +DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; + +DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; +DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; + +DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; +DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; + +DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; +DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; + +DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; +DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; + +DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; +DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; + +DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; +DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; + +DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; +DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; + +DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; +DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits::functor_type) +{ + enum { smart_block_dim_y = 8 }; + enum { smart_shift = 4 }; +}; + +#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \ + void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream) \ + { \ + traits::functor_type functor = traits::create_functor(); \ + typedef typename traits::functor_type::argument_type src_t; \ + typedef typename traits::functor_type::result_type dst_t; \ + OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_)src, (DevMem2D_)dst, functor, stream); \ + } + +#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \ + OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits) + +#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \ + OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits) \ + OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits) \ + OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits) + +#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \ + OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits) \ + 
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits) \
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits) \
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits)
+
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
+
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
+
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
+
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
+
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
+
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
+
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
+
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
+
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
+
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
+
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
+
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
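Note for readers of this hunk: the angle-bracket contents inside OPENCV_GPU_IMPLEMENT_CVTCOLOR and its per-depth expansions (the traits arguments and the DevMem2D_ casts) appear stripped in this copy of the patch. A sketch of what one instantiation presumably expands to, with the lost pieces filled back in as assumptions rather than verbatim patch content:

    // Presumed expansion of OPENCV_GPU_IMPLEMENT_CVTCOLOR for the 8-bit BGR->RGB case;
    // the <uchar>, <src_t> and <dst_t> arguments are reconstructions, not quoted from the patch.
    void bgr_to_rgb_8u(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream)
    {
        typedef bgr_to_rgb_traits<uchar> traits;                  // assumed per-depth traits for the _8u variant
        traits::functor_type functor = traits::create_functor();  // device functor that swaps the channels
        typedef traits::functor_type::argument_type src_t;        // source pixel type (e.g. uchar3)
        typedef traits::functor_type::result_type dst_t;          // destination pixel type (e.g. uchar3)
        // the untyped byte views are re-typed before being handed to the generic transform kernel
        OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, functor, stream);
    }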
+OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4) + +OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra) + +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4) + +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra) + +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4) + +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr) +OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra) + +#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR +#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE +#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL +#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F + +END_OPENCV_DEVICE_NAMESPACE diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu index 29b828e7ce19ea13e4be3273c5a183d290d5f0d2..c16ca82d812b5601963362d23edcaa99aad998d8 100644 --- a/modules/gpu/src/cuda/column_filter.cu +++ b/modules/gpu/src/cuda/column_filter.cu @@ -47,8 +47,7 @@ #include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/border_interpolate.hpp" -using namespace cv::gpu; -using namespace cv::gpu::device; +BEGIN_OPENCV_DEVICE_NAMESPACE #define MAX_KERNEL_SIZE 16 #define BLOCK_DIM_X 16 @@ -56,195 +55,195 @@ using namespace cv::gpu::device; #define RESULT_STEPS 8 #define HALO_STEPS 1 -namespace filter_column +namespace column_filter { + +__constant__ float c_kernel[MAX_KERNEL_SIZE]; + +void loadKernel(const float kernel[], int ksize) { - __constant__ float c_kernel[MAX_KERNEL_SIZE]; + cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); +} - void loadKernel(const float kernel[], int ksize) - { - cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); - } +template +__global__ void linearColumnFilter(const DevMem2D_ src, 
PtrStep dst, int anchor, const B b) +{ + typedef typename TypeVec::cn>::vec_type sum_t; - template - __global__ void linearColumnFilter(const DevMem2D_ src, PtrStep dst, int anchor, const B b) - { - typedef typename TypeVec::cn>::vec_type sum_t; + __shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1]; - __shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1]; + //Offset to the upper halo edge + const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; + const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y; - //Offset to the upper halo edge - const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; - const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y; + if (x < src.cols) + { + const T* src_col = src.ptr() + x; - if (x < src.cols) - { - const T* src_col = src.ptr() + x; + //Main data + #pragma unroll + for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step); - //Main data - #pragma unroll - for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) - smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step); + //Upper halo + #pragma unroll + for(int i = 0; i < HALO_STEPS; ++i) + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step); - //Upper halo - #pragma unroll - for(int i = 0; i < HALO_STEPS; ++i) - smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step); + //Lower halo + #pragma unroll + for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i) + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]= b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step); - //Lower halo - #pragma unroll - for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i) - smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]= b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step); + __syncthreads(); - __syncthreads(); + #pragma unroll + for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) + { + sum_t sum = VecTraits::all(0); #pragma unroll - for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) - { - sum_t sum = VecTraits::all(0); - - #pragma unroll - for(int j = 0; j < KERNEL_SIZE; ++j) - sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j]; + for(int j = 0; j < KERNEL_SIZE; ++j) + sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j]; - int dstY = y + i * BLOCK_DIM_Y; + int dstY = y + i * BLOCK_DIM_Y; - if (dstY < src.rows) - dst.ptr(dstY)[x] = saturate_cast(sum); - } + if (dstY < src.rows) + dst.ptr(dstY)[x] = saturate_cast(sum); } } } -namespace cv { namespace gpu { namespace filters -{ - template class B> - void linearColumnFilter_caller(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream) - { - const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); - const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y)); +template class B> +void linearColumnFilter_caller(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream) +{ + const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); + const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y)); - B b(src.rows); + B b(src.rows); - filter_column::linearColumnFilter<<>>(src, dst, anchor, b); - cudaSafeCall( 
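The new linearColumnFilter kernel above stages (RESULT_STEPS + 2 * HALO_STEPS) tiles of BLOCK_DIM_Y rows per block in shared memory (main data plus an upper and a lower halo) and has each thread emit RESULT_STEPS output pixels down its column. A small standalone sketch of the shared-memory footprint and launch grid those constants imply; the choice of T = float and the image size are examples only, not part of the patch:

    // Illustrative arithmetic only, using the constants from this file
    // (BLOCK_DIM_X/Y = 16, RESULT_STEPS = 8, HALO_STEPS = 1).
    #include <cstdio>

    int main()
    {
        const int BLOCK_DIM_X = 16, BLOCK_DIM_Y = 16, RESULT_STEPS = 8, HALO_STEPS = 1;

        // __shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1];
        const int smem_elems = BLOCK_DIM_X * ((RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1);
        std::printf("shared memory per block: %d floats (%zu bytes)\n",
                    smem_elems, (size_t)smem_elems * sizeof(float));

        // grid(divUp(cols, BLOCK_DIM_X), divUp(rows, RESULT_STEPS * BLOCK_DIM_Y)):
        // each thread column produces RESULT_STEPS output rows per block.
        const int cols = 1920, rows = 1080;   // example image size (assumption)
        const int grid_x = (cols + BLOCK_DIM_X - 1) / BLOCK_DIM_X;
        const int grid_y = (rows + RESULT_STEPS * BLOCK_DIM_Y - 1) / (RESULT_STEPS * BLOCK_DIM_Y);
        std::printf("launch grid: %d x %d blocks of %d x %d threads\n",
                    grid_x, grid_y, BLOCK_DIM_X, BLOCK_DIM_Y);
        return 0;
    }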
cudaGetLastError() ); + linearColumnFilter<<>>(src, dst, anchor, b); + cudaSafeCall( cudaGetLastError() ); - if (stream == 0) - cudaSafeCall( cudaDeviceSynchronize() ); - } + if (stream == 0) + cudaSafeCall( cudaDeviceSynchronize() ); +} - template - void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) +template +void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream) +{ + typedef void (*caller_t)(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream); + static const caller_t callers[5][17] = { - typedef void (*caller_t)(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream); - static const caller_t callers[5][17] = { - { - 0, - linearColumnFilter_caller<1 , T, D, BrdColReflect101>, - linearColumnFilter_caller<2 , T, D, BrdColReflect101>, - linearColumnFilter_caller<3 , T, D, BrdColReflect101>, - linearColumnFilter_caller<4 , T, D, BrdColReflect101>, - linearColumnFilter_caller<5 , T, D, BrdColReflect101>, - linearColumnFilter_caller<6 , T, D, BrdColReflect101>, - linearColumnFilter_caller<7 , T, D, BrdColReflect101>, - linearColumnFilter_caller<8 , T, D, BrdColReflect101>, - linearColumnFilter_caller<9 , T, D, BrdColReflect101>, - linearColumnFilter_caller<10, T, D, BrdColReflect101>, - linearColumnFilter_caller<11, T, D, BrdColReflect101>, - linearColumnFilter_caller<12, T, D, BrdColReflect101>, - linearColumnFilter_caller<13, T, D, BrdColReflect101>, - linearColumnFilter_caller<14, T, D, BrdColReflect101>, - linearColumnFilter_caller<15, T, D, BrdColReflect101>, - linearColumnFilter_caller<16, T, D, BrdColReflect101> - }, - { - 0, - linearColumnFilter_caller<1 , T, D, BrdColReplicate>, - linearColumnFilter_caller<2 , T, D, BrdColReplicate>, - linearColumnFilter_caller<3 , T, D, BrdColReplicate>, - linearColumnFilter_caller<4 , T, D, BrdColReplicate>, - linearColumnFilter_caller<5 , T, D, BrdColReplicate>, - linearColumnFilter_caller<6 , T, D, BrdColReplicate>, - linearColumnFilter_caller<7 , T, D, BrdColReplicate>, - linearColumnFilter_caller<8 , T, D, BrdColReplicate>, - linearColumnFilter_caller<9 , T, D, BrdColReplicate>, - linearColumnFilter_caller<10, T, D, BrdColReplicate>, - linearColumnFilter_caller<11, T, D, BrdColReplicate>, - linearColumnFilter_caller<12, T, D, BrdColReplicate>, - linearColumnFilter_caller<13, T, D, BrdColReplicate>, - linearColumnFilter_caller<14, T, D, BrdColReplicate>, - linearColumnFilter_caller<15, T, D, BrdColReplicate>, - linearColumnFilter_caller<16, T, D, BrdColReplicate> - }, - { - 0, - linearColumnFilter_caller<1 , T, D, BrdColConstant>, - linearColumnFilter_caller<2 , T, D, BrdColConstant>, - linearColumnFilter_caller<3 , T, D, BrdColConstant>, - linearColumnFilter_caller<4 , T, D, BrdColConstant>, - linearColumnFilter_caller<5 , T, D, BrdColConstant>, - linearColumnFilter_caller<6 , T, D, BrdColConstant>, - linearColumnFilter_caller<7 , T, D, BrdColConstant>, - linearColumnFilter_caller<8 , T, D, BrdColConstant>, - linearColumnFilter_caller<9 , T, D, BrdColConstant>, - linearColumnFilter_caller<10, T, D, BrdColConstant>, - linearColumnFilter_caller<11, T, D, BrdColConstant>, - linearColumnFilter_caller<12, T, D, BrdColConstant>, - linearColumnFilter_caller<13, T, D, BrdColConstant>, - linearColumnFilter_caller<14, T, D, BrdColConstant>, - linearColumnFilter_caller<15, T, D, BrdColConstant>, - 
linearColumnFilter_caller<16, T, D, BrdColConstant> - }, - { - 0, - linearColumnFilter_caller<1 , T, D, BrdColReflect>, - linearColumnFilter_caller<2 , T, D, BrdColReflect>, - linearColumnFilter_caller<3 , T, D, BrdColReflect>, - linearColumnFilter_caller<4 , T, D, BrdColReflect>, - linearColumnFilter_caller<5 , T, D, BrdColReflect>, - linearColumnFilter_caller<6 , T, D, BrdColReflect>, - linearColumnFilter_caller<7 , T, D, BrdColReflect>, - linearColumnFilter_caller<8 , T, D, BrdColReflect>, - linearColumnFilter_caller<9 , T, D, BrdColReflect>, - linearColumnFilter_caller<10, T, D, BrdColReflect>, - linearColumnFilter_caller<11, T, D, BrdColReflect>, - linearColumnFilter_caller<12, T, D, BrdColReflect>, - linearColumnFilter_caller<13, T, D, BrdColReflect>, - linearColumnFilter_caller<14, T, D, BrdColReflect>, - linearColumnFilter_caller<15, T, D, BrdColReflect>, - linearColumnFilter_caller<16, T, D, BrdColReflect> - }, - { - 0, - linearColumnFilter_caller<1 , T, D, BrdColWrap>, - linearColumnFilter_caller<2 , T, D, BrdColWrap>, - linearColumnFilter_caller<3 , T, D, BrdColWrap>, - linearColumnFilter_caller<4 , T, D, BrdColWrap>, - linearColumnFilter_caller<5 , T, D, BrdColWrap>, - linearColumnFilter_caller<6 , T, D, BrdColWrap>, - linearColumnFilter_caller<7 , T, D, BrdColWrap>, - linearColumnFilter_caller<8 , T, D, BrdColWrap>, - linearColumnFilter_caller<9 , T, D, BrdColWrap>, - linearColumnFilter_caller<10, T, D, BrdColWrap>, - linearColumnFilter_caller<11, T, D, BrdColWrap>, - linearColumnFilter_caller<12, T, D, BrdColWrap>, - linearColumnFilter_caller<13, T, D, BrdColWrap>, - linearColumnFilter_caller<14, T, D, BrdColWrap>, - linearColumnFilter_caller<15, T, D, BrdColWrap>, - linearColumnFilter_caller<16, T, D, BrdColWrap>, - } - }; - - filter_column::loadKernel(kernel, ksize); - - callers[brd_type][ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor, stream); - } + 0, + linearColumnFilter_caller<1 , T, D, BrdColReflect101>, + linearColumnFilter_caller<2 , T, D, BrdColReflect101>, + linearColumnFilter_caller<3 , T, D, BrdColReflect101>, + linearColumnFilter_caller<4 , T, D, BrdColReflect101>, + linearColumnFilter_caller<5 , T, D, BrdColReflect101>, + linearColumnFilter_caller<6 , T, D, BrdColReflect101>, + linearColumnFilter_caller<7 , T, D, BrdColReflect101>, + linearColumnFilter_caller<8 , T, D, BrdColReflect101>, + linearColumnFilter_caller<9 , T, D, BrdColReflect101>, + linearColumnFilter_caller<10, T, D, BrdColReflect101>, + linearColumnFilter_caller<11, T, D, BrdColReflect101>, + linearColumnFilter_caller<12, T, D, BrdColReflect101>, + linearColumnFilter_caller<13, T, D, BrdColReflect101>, + linearColumnFilter_caller<14, T, D, BrdColReflect101>, + linearColumnFilter_caller<15, T, D, BrdColReflect101>, + linearColumnFilter_caller<16, T, D, BrdColReflect101> + }, + { + 0, + linearColumnFilter_caller<1 , T, D, BrdColReplicate>, + linearColumnFilter_caller<2 , T, D, BrdColReplicate>, + linearColumnFilter_caller<3 , T, D, BrdColReplicate>, + linearColumnFilter_caller<4 , T, D, BrdColReplicate>, + linearColumnFilter_caller<5 , T, D, BrdColReplicate>, + linearColumnFilter_caller<6 , T, D, BrdColReplicate>, + linearColumnFilter_caller<7 , T, D, BrdColReplicate>, + linearColumnFilter_caller<8 , T, D, BrdColReplicate>, + linearColumnFilter_caller<9 , T, D, BrdColReplicate>, + linearColumnFilter_caller<10, T, D, BrdColReplicate>, + linearColumnFilter_caller<11, T, D, BrdColReplicate>, + linearColumnFilter_caller<12, T, D, BrdColReplicate>, + linearColumnFilter_caller<13, T, D, BrdColReplicate>, 
+ linearColumnFilter_caller<14, T, D, BrdColReplicate>, + linearColumnFilter_caller<15, T, D, BrdColReplicate>, + linearColumnFilter_caller<16, T, D, BrdColReplicate> + }, + { + 0, + linearColumnFilter_caller<1 , T, D, BrdColConstant>, + linearColumnFilter_caller<2 , T, D, BrdColConstant>, + linearColumnFilter_caller<3 , T, D, BrdColConstant>, + linearColumnFilter_caller<4 , T, D, BrdColConstant>, + linearColumnFilter_caller<5 , T, D, BrdColConstant>, + linearColumnFilter_caller<6 , T, D, BrdColConstant>, + linearColumnFilter_caller<7 , T, D, BrdColConstant>, + linearColumnFilter_caller<8 , T, D, BrdColConstant>, + linearColumnFilter_caller<9 , T, D, BrdColConstant>, + linearColumnFilter_caller<10, T, D, BrdColConstant>, + linearColumnFilter_caller<11, T, D, BrdColConstant>, + linearColumnFilter_caller<12, T, D, BrdColConstant>, + linearColumnFilter_caller<13, T, D, BrdColConstant>, + linearColumnFilter_caller<14, T, D, BrdColConstant>, + linearColumnFilter_caller<15, T, D, BrdColConstant>, + linearColumnFilter_caller<16, T, D, BrdColConstant> + }, + { + 0, + linearColumnFilter_caller<1 , T, D, BrdColReflect>, + linearColumnFilter_caller<2 , T, D, BrdColReflect>, + linearColumnFilter_caller<3 , T, D, BrdColReflect>, + linearColumnFilter_caller<4 , T, D, BrdColReflect>, + linearColumnFilter_caller<5 , T, D, BrdColReflect>, + linearColumnFilter_caller<6 , T, D, BrdColReflect>, + linearColumnFilter_caller<7 , T, D, BrdColReflect>, + linearColumnFilter_caller<8 , T, D, BrdColReflect>, + linearColumnFilter_caller<9 , T, D, BrdColReflect>, + linearColumnFilter_caller<10, T, D, BrdColReflect>, + linearColumnFilter_caller<11, T, D, BrdColReflect>, + linearColumnFilter_caller<12, T, D, BrdColReflect>, + linearColumnFilter_caller<13, T, D, BrdColReflect>, + linearColumnFilter_caller<14, T, D, BrdColReflect>, + linearColumnFilter_caller<15, T, D, BrdColReflect>, + linearColumnFilter_caller<16, T, D, BrdColReflect> + }, + { + 0, + linearColumnFilter_caller<1 , T, D, BrdColWrap>, + linearColumnFilter_caller<2 , T, D, BrdColWrap>, + linearColumnFilter_caller<3 , T, D, BrdColWrap>, + linearColumnFilter_caller<4 , T, D, BrdColWrap>, + linearColumnFilter_caller<5 , T, D, BrdColWrap>, + linearColumnFilter_caller<6 , T, D, BrdColWrap>, + linearColumnFilter_caller<7 , T, D, BrdColWrap>, + linearColumnFilter_caller<8 , T, D, BrdColWrap>, + linearColumnFilter_caller<9 , T, D, BrdColWrap>, + linearColumnFilter_caller<10, T, D, BrdColWrap>, + linearColumnFilter_caller<11, T, D, BrdColWrap>, + linearColumnFilter_caller<12, T, D, BrdColWrap>, + linearColumnFilter_caller<13, T, D, BrdColWrap>, + linearColumnFilter_caller<14, T, D, BrdColWrap>, + linearColumnFilter_caller<15, T, D, BrdColWrap>, + linearColumnFilter_caller<16, T, D, BrdColWrap>, + } + }; + + loadKernel(kernel, ksize); + + callers[brd_type][ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor, stream); +} + +template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); +template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); +//template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); +//template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); 
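The host wrapper keeps the existing dispatch scheme: a static two-dimensional table of caller function pointers, indexed first by border mode (the five rows above) and then by kernel size 1..16, so a runtime (brd_type, ksize) pair resolves to a fully specialized launch through callers[brd_type][ksize](...). A minimal self-contained illustration of that table-of-instantiations pattern (toy functions, not the OpenCV API):

    #include <cstdio>

    typedef void (*caller_t)(int anchor);

    // Each entry is a distinct template instantiation, mirroring linearColumnFilter_caller<KSIZE, ...>.
    template <int KSIZE> void call_reflect101(int anchor) { std::printf("reflect101, ksize=%d, anchor=%d\n", KSIZE, anchor); }
    template <int KSIZE> void call_replicate (int anchor) { std::printf("replicate,  ksize=%d, anchor=%d\n", KSIZE, anchor); }

    int main()
    {
        // Row = border mode, column = kernel size; index 0 is left empty, as in the patch.
        static const caller_t callers[2][4] =
        {
            { 0, call_reflect101<1>, call_reflect101<2>, call_reflect101<3> },
            { 0, call_replicate<1>,  call_replicate<2>,  call_replicate<3>  }
        };

        const int brd_type = 1, ksize = 3, anchor = 1;   // runtime parameters
        callers[brd_type][ksize](anchor);                // resolves to call_replicate<3>
        return 0;
    }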
+template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); +template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); +template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); + +} // namespace column_filter - template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - //template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - //template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - template void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); -}}} +END_OPENCV_DEVICE_NAMESPACE diff --git a/modules/gpu/src/cuda/copy_make_border.cu b/modules/gpu/src/cuda/copy_make_border.cu index 89af4e07e98e04d66f2be94e16e9ea4b1e31cd48..5603742302d8e0ad332890c7675c61af4ea59979 100644 --- a/modules/gpu/src/cuda/copy_make_border.cu +++ b/modules/gpu/src/cuda/copy_make_border.cu @@ -43,85 +43,87 @@ #include "internal_shared.hpp" #include "opencv2/gpu/device/border_interpolate.hpp" -using namespace cv::gpu; -using namespace cv::gpu::device; +BEGIN_OPENCV_DEVICE_NAMESPACE -namespace cv { namespace gpu { namespace imgproc +namespace copy_make_border { + +template __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_ dst, int top, int left) { - template __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_ dst, int top, int left) - { - const int x = blockDim.x * blockIdx.x + threadIdx.x; - const int y = blockDim.y * blockIdx.y + threadIdx.y; + const int x = blockDim.x * blockIdx.x + threadIdx.x; + const int y = blockDim.y * blockIdx.y + threadIdx.y; + + if (x < dst.cols && y < dst.rows) + dst.ptr(y)[x] = src(y - top, x - left); +} + +template
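In the rewritten copyMakeBorder kernel that begins at the end of this hunk, the launch covers the destination image and each thread simply reads src(y - top, x - left); clamping, reflection, or constant fill is delegated entirely to the border-aware source object built from border_interpolate.hpp. A CPU analogue of that split with an illustrative replicate-border policy; the type names are placeholders, not the actual device helpers:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Placeholder border policy: clamps coordinates the way a replicate border would.
    struct ReplicateBorder
    {
        const float* data;
        int rows, cols;

        float operator()(int y, int x) const
        {
            y = std::min(std::max(y, 0), rows - 1);
            x = std::min(std::max(x, 0), cols - 1);
            return data[y * cols + x];
        }
    };

    int main()
    {
        const int rows = 2, cols = 3;
        const int top = 1, bottom = 1, left = 2, right = 2;
        const float src[rows * cols] = { 1, 2, 3,
                                         4, 5, 6 };
        ReplicateBorder bordered = { src, rows, cols };

        const int drows = rows + top + bottom, dcols = cols + left + right;
        std::vector<float> dst(drows * dcols);

        // Same indexing as the kernel: every destination pixel reads src(y - top, x - left);
        // what "outside the source" means is decided solely by the border policy.
        for (int y = 0; y < drows; ++y)
            for (int x = 0; x < dcols; ++x)
                dst[y * dcols + x] = bordered(y - top, x - left);

        for (int y = 0; y < drows; ++y, std::printf("\n"))
            for (int x = 0; x < dcols; ++x)
                std::printf("%4.0f", dst[y * dcols + x]);
        return 0;
    }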