From 8660e048bc12c348ccfc17d42e97ea7af3aa34b0 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov
Date: Fri, 13 Dec 2013 17:28:29 +0400
Subject: [PATCH] Dynamic CUDA support library loading implemented for Linux.
 Logical mistake in macro fixed; DeviceInfo delegate reimplemented; build and
 warning fixes.

---
 modules/core/CMakeLists.txt                  |  64 +++-
 modules/core/cuda/CMakeLists.txt             |   3 +-
 modules/core/cuda/main.cpp                   |  29 +-
 modules/core/include/opencv2/core/gpumat.hpp |   3 +
 modules/core/src/gpumat.cpp                  |  93 ++++-
 modules/core/src/gpumat_cuda.hpp             | 382 +++++++++----------
 6 files changed, 352 insertions(+), 222 deletions(-)

diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 5951982926..a7a997f67b 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -1,29 +1,69 @@
 set(the_description "The Core Functionality")
+macro(ocv_glob_module_sources_no_cuda)
+  file(GLOB_RECURSE lib_srcs "src/*.cpp")
+  file(GLOB_RECURSE lib_int_hdrs "src/*.hpp" "src/*.h")
+  file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
+  file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h")
+
+  set(cuda_objs "")
+  set(lib_cuda_hdrs "")
+  if(HAVE_CUDA)
+    ocv_include_directories(${CUDA_INCLUDE_DIRS})
+    file(GLOB lib_cuda_hdrs "src/cuda/*.hpp")
+  endif()
+
+  source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
+
+  file(GLOB cl_kernels "src/opencl/*.cl")
+  if(HAVE_opencv_ocl AND cl_kernels)
+    ocv_include_directories(${OPENCL_INCLUDE_DIRS})
+    add_custom_command(
+      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp"
+      COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
+      DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake")
+    source_group("OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
+    list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
+  endif()
+
+  source_group("Include" FILES ${lib_hdrs})
+  source_group("Include\\detail" FILES ${lib_hdrs_detail})
+
+  ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail}
+                                 SOURCES ${lib_srcs} ${lib_int_hdrs} ${cuda_objs} ${lib_cuda_hdrs})
+endmacro()
+
+ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
+ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
+
 if(HAVE_WINRT)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"")
 endif()
 
-file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h")
-file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
-
-source_group("Cuda Headers" FILES ${lib_cuda_hdrs})
-source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
-
 if(DYNAMIC_CUDA_SUPPORT)
   add_definitions(-DDYNAMIC_CUDA_SUPPORT)
+else()
+  add_definitions(-DUSE_CUDA)
 endif()
 
-ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
-ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
-
 if(HAVE_CUDA)
   ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
   ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
 endif()
 
-ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
-                        HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
+file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h")
+file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
+
+source_group("Cuda Headers" FILES ${lib_cuda_hdrs})
+source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
+
+if (DYNAMIC_CUDA_SUPPORT)
+  ocv_glob_module_sources_no_cuda(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
+                                  HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
+else()
+  ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
+                          HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
+endif()
 
 ocv_create_module()
 ocv_add_precompiled_headers(${the_module})
@@ -31,6 +71,6 @@ ocv_add_precompiled_headers(${the_module})
 ocv_add_accuracy_tests()
 ocv_add_perf_tests()
 
-if(DYNAMIC_CUDA_SUPPORT)
+if (DYNAMIC_CUDA_SUPPORT)
   add_subdirectory(cuda)
 endif()
diff --git a/modules/core/cuda/CMakeLists.txt b/modules/core/cuda/CMakeLists.txt
index 0b1c9428d3..72ecea7a4c 100644
--- a/modules/core/cuda/CMakeLists.txt
+++ b/modules/core/cuda/CMakeLists.txt
@@ -1,6 +1,5 @@
 project(opencv_core_cuda)
 
-set(HAVE_CUDA FALSE)
-add_definitions("-DHAVE_CUDA")
+add_definitions(-DUSE_CUDA)
 
 include_directories(${CUDA_INCLUDE_DIRS} "../src/" "../include/opencv2/core/"
diff --git a/modules/core/cuda/main.cpp b/modules/core/cuda/main.cpp
index c4b8cbe1db..26d4834201 100644
--- a/modules/core/cuda/main.cpp
+++ b/modules/core/cuda/main.cpp
@@ -1,6 +1,10 @@
+#include "cvconfig.h"
 #include "opencv2/core/core.hpp"
 #include "opencv2/core/gpumat.hpp"
 
+#include
+#include
+
 #ifdef HAVE_CUDA
 #include
 #include
@@ -17,7 +21,30 @@
 #endif
 #endif
 
+using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 
-#include "gpumat_cuda.hpp"
\ No newline at end of file
+#include "gpumat_cuda.hpp"
+
+#ifdef HAVE_CUDA
+static CudaDeviceInfoFuncTable deviceInfoTable;
+static CudaFuncTable gpuTable;
+#else
+static EmptyDeviceInfoFuncTable deviceInfoTable;
+static EmptyFuncTable gpuTable;
+#endif
+
+extern "C" {
+
+DeviceInfoFuncTable* deviceInfoFactory()
+{
+    return (DeviceInfoFuncTable*)&deviceInfoTable;
+}
+
+GpuFuncTable* gpuFactory()
+{
+    return (GpuFuncTable*)&gpuTable;
+}
+
+}
diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp
index b502102139..d62c8749b0 100644
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@@ -137,6 +137,9 @@ namespace cv { namespace gpu
         int deviceID() const { return device_id_; }
 
     private:
+        // The private section is kept only to preserve binary compatibility.
+        // Changes to the private fields here have no effect;
+        // see the delegate code.
void query(); int device_id_; diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index 9a2e36cb62..f438dfd8b6 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -43,8 +43,9 @@ #include "precomp.hpp" #include "opencv2/core/gpumat.hpp" #include +#include -#if defined(HAVE_CUDA) +#if defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT) #include #include @@ -66,15 +67,81 @@ using namespace cv::gpu; #include "gpumat_cuda.hpp" -namespace +typedef GpuFuncTable* (*GpuFactoryType)(); +typedef DeviceInfoFuncTable* (*DeviceInfoFactoryType)(); + +static GpuFactoryType gpuFactory = NULL; +static DeviceInfoFactoryType deviceInfoFactory = NULL; + +static const std::string getCudaSupportLibName() +{ + return "libopencv_core_cuda.so"; +} + +static bool loadCudaSupportLib() { - const GpuFuncTable* gpuFuncTable() + void* handle; + const std::string name = getCudaSupportLibName(); + handle = dlopen(name.c_str(), RTLD_LAZY); + if (!handle) + return false; + + deviceInfoFactory = (DeviceInfoFactoryType)dlsym(handle, "deviceInfoFactory"); + if (!deviceInfoFactory) { - static EmptyFuncTable funcTable; - return &funcTable; + dlclose(handle); + return false; } + + gpuFactory = (GpuFactoryType)dlsym(handle, "gpuFactory"); + if (!gpuFactory) + { + dlclose(handle); + return false; + } + + dlclose(handle); + + return true; } +static GpuFuncTable* gpuFuncTable() +{ +#ifdef DYNAMIC_CUDA_SUPPORT + static EmptyFuncTable stub; + static GpuFuncTable* libFuncTable = loadCudaSupportLib() ? gpuFactory(): (GpuFuncTable*)&stub; + static GpuFuncTable *funcTable = libFuncTable ? libFuncTable : (GpuFuncTable*)&stub; +#else +# ifdef USE_CUDA + static CudaFuncTable impl; + static GpuFuncTable* funcTable = &impl; +#else + static EmptyFuncTable stub; + static GpuFuncTable* funcTable = &stub; +#endif +#endif + return funcTable; +} + +static DeviceInfoFuncTable* deviceInfoFuncTable() +{ +#ifdef DYNAMIC_CUDA_SUPPORT + static EmptyDeviceInfoFuncTable stub; + static DeviceInfoFuncTable* libFuncTable = loadCudaSupportLib() ? deviceInfoFactory(): (DeviceInfoFuncTable*)&stub; + static DeviceInfoFuncTable* funcTable = libFuncTable ? 
libFuncTable : (DeviceInfoFuncTable*)&stub; +#else +# ifdef USE_CUDA + static CudaDeviceInfoFuncTable impl; + static DeviceInfoFuncTable* funcTable = &impl; +#else + static EmptyFuncTable stub; + static DeviceInfoFuncTable* funcTable = &stub; +#endif +#endif + return funcTable; +} + + //////////////////////////////// Initialization & Info //////////////////////// int cv::gpu::getCudaEnabledDeviceCount() { return gpuFuncTable()->getCudaEnabledDeviceCount(); } @@ -95,13 +162,13 @@ bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return gpuF bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterPtx(major, minor); } bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterBin(major, minor); } -size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return gpuFuncTable()->sharedMemPerBlock(); } -void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { gpuFuncTable()->queryMemory(total_memory, free_memory); } -size_t cv::gpu::DeviceInfo::freeMemory() const { return gpuFuncTable()->freeMemory(); } -size_t cv::gpu::DeviceInfo::totalMemory() const { return gpuFuncTable()->totalMemory(); } -bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return gpuFuncTable()->supports(feature_set); } -bool cv::gpu::DeviceInfo::isCompatible() const { return gpuFuncTable()->isCompatible(); } -void cv::gpu::DeviceInfo::query() { gpuFuncTable()->query(); } +size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return deviceInfoFuncTable()->sharedMemPerBlock(); } +void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { deviceInfoFuncTable()->queryMemory(total_memory, free_memory); } +size_t cv::gpu::DeviceInfo::freeMemory() const { return deviceInfoFuncTable()->freeMemory(); } +size_t cv::gpu::DeviceInfo::totalMemory() const { return deviceInfoFuncTable()->totalMemory(); } +bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return deviceInfoFuncTable()->supports(feature_set); } +bool cv::gpu::DeviceInfo::isCompatible() const { return deviceInfoFuncTable()->isCompatible(); } +void cv::gpu::DeviceInfo::query() { deviceInfoFuncTable()->query(); } void cv::gpu::printCudaDeviceInfo(int device) { gpuFuncTable()->printCudaDeviceInfo(device); } void cv::gpu::printShortCudaDeviceInfo(int device) { gpuFuncTable()->printShortCudaDeviceInfo(device); } @@ -556,7 +623,7 @@ namespace cv { namespace gpu void setTo(GpuMat& src, Scalar s, cudaStream_t stream) { - gpuFuncTable()->setTo(src, s, stream); + gpuFuncTable()->setTo(src, s, cv::gpu::GpuMat(), stream); } void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) diff --git a/modules/core/src/gpumat_cuda.hpp b/modules/core/src/gpumat_cuda.hpp index 631d6ea8ca..56d626a5cc 100644 --- a/modules/core/src/gpumat_cuda.hpp +++ b/modules/core/src/gpumat_cuda.hpp @@ -1,30 +1,19 @@ -namespace -{ -#if defined(HAVE_CUDA) && !defined(DYNAMIC_CUDA_SUPPORT) - - #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func) - #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func) +#ifndef __GPUMAT_CUDA_HPP__ +#define __GPUMAT_CUDA_HPP__ - inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") + class DeviceInfoFuncTable { - if (cudaSuccess != err) - cv::gpu::error(cudaGetErrorString(err), file, line, func); - } - - inline void ___nppSafeCall(int err, const char *file, 
const int line, const char *func = "") - { - if (err < 0) - { - std::ostringstream msg; - msg << "NPP API Call Error: " << err; - cv::gpu::error(msg.str().c_str(), file, line, func); - } - } -#endif -} - -namespace -{ + public: + virtual size_t sharedMemPerBlock() const = 0; + virtual void queryMemory(size_t&, size_t&) const = 0; + virtual size_t freeMemory() const = 0; + virtual size_t totalMemory() const = 0; + virtual bool supports(FeatureSet) const = 0; + virtual bool isCompatible() const = 0; + virtual void query() = 0; + virtual ~DeviceInfoFuncTable() {}; + }; + class GpuFuncTable { public: @@ -40,6 +29,7 @@ namespace virtual bool deviceSupports(FeatureSet) const = 0; + // TargetArchs virtual bool builtWith(FeatureSet) const = 0; virtual bool has(int, int) const = 0; virtual bool hasPtx(int, int) const = 0; @@ -49,14 +39,6 @@ namespace virtual bool hasEqualOrGreaterPtx(int, int) const = 0; virtual bool hasEqualOrGreaterBin(int, int) const = 0; - virtual size_t sharedMemPerBlock() const = 0; - virtual void queryMemory(size_t&, size_t&) const = 0; - virtual size_t freeMemory() const = 0; - virtual size_t totalMemory() const = 0; - virtual bool supports(FeatureSet) const = 0; - virtual bool isCompatible() const = 0; - virtual void query() const = 0; - virtual void printCudaDeviceInfo(int) const = 0; virtual void printShortCudaDeviceInfo(int) const = 0; @@ -72,17 +54,24 @@ namespace virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; // for gpu::device::setTo funcs - virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const = 0; virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0; virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; virtual void free(void* devPtr) const = 0; }; -} -#if !defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT) -namespace -{ + class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable + { + public: + size_t sharedMemPerBlock() const { throw_nogpu; return 0; } + void queryMemory(size_t&, size_t&) const { throw_nogpu; } + size_t freeMemory() const { throw_nogpu; return 0; } + size_t totalMemory() const { throw_nogpu; return 0; } + bool supports(FeatureSet) const { throw_nogpu; return false; } + bool isCompatible() const { throw_nogpu; return false; } + void query() { throw_nogpu; } + }; + class EmptyFuncTable : public GpuFuncTable { public: @@ -105,15 +94,7 @@ namespace bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; } bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; } bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; } - - size_t sharedMemPerBlock() const { throw_nogpu; return 0; } - void queryMemory(size_t&, size_t&) const { throw_nogpu; } - size_t freeMemory() const { throw_nogpu; return 0; } - size_t totalMemory() const { throw_nogpu; return 0; } - bool supports(FeatureSet) const { throw_nogpu; return false; } - bool isCompatible() const { throw_nogpu; return false; } - void query() const { throw_nogpu; } - + void printCudaDeviceInfo(int) const { throw_nogpu; } void printShortCudaDeviceInfo(int) const { throw_nogpu; } @@ -126,15 +107,32 @@ namespace void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; } - virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const { throw_nogpu; } virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, 
CUstream_st*) const { throw_nogpu; } void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } void free(void*) const {} }; + +#if defined(USE_CUDA) + +#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func) +#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func) + +inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") +{ + if (cudaSuccess != err) + cv::gpu::error(cudaGetErrorString(err), file, line, func); } -#else +inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") +{ + if (err < 0) + { + std::ostringstream msg; + msg << "NPP API Call Error: " << err; + cv::gpu::error(msg.str().c_str(), file, line, func); + } +} namespace cv { namespace gpu { namespace device { @@ -149,8 +147,6 @@ namespace cv { namespace gpu { namespace device void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream); }}} -namespace -{ template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) { Scalar_ sf = s; @@ -162,10 +158,7 @@ namespace Scalar_ sf = s; cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); } -} -namespace -{ template struct NPPTypeTraits; template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; @@ -208,6 +201,7 @@ namespace cudaSafeCall( cudaDeviceSynchronize() ); } }; + template::func_ptr func> struct NppCvt { typedef typename NPPTypeTraits::npp_type dst_t; @@ -361,9 +355,8 @@ namespace { return reinterpret_cast(ptr) % size == 0; } -} - namespace cv { namespace gpu { namespace devices + namespace cv { namespace gpu { namespace device { void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) { @@ -418,74 +411,52 @@ namespace { setTo(src, s, mask, 0); } - }} + }}} -namespace -{ - class CudaFuncTable : public GpuFuncTable + + class CudaArch { - protected: - - class CudaArch - { - public: - CudaArch(); - - bool builtWith(FeatureSet feature_set) const; - bool hasPtx(int major, int minor) const; - bool hasBin(int major, int minor) const; - bool hasEqualOrLessPtx(int major, int minor) const; - bool hasEqualOrGreaterPtx(int major, int minor) const; - bool hasEqualOrGreaterBin(int major, int minor) const; - - private: - static void fromStr(const string& set_as_str, vector& arr); - - vector bin; - vector ptx; - vector features; - }; - - const CudaArch cudaArch; - - CudaArch::CudaArch() + public: + CudaArch() { fromStr(CUDA_ARCH_BIN, bin); fromStr(CUDA_ARCH_PTX, ptx); fromStr(CUDA_ARCH_FEATURES, features); } - bool CudaArch::builtWith(FeatureSet feature_set) const + bool builtWith(FeatureSet feature_set) const { return !features.empty() && (features.back() >= feature_set); } - bool CudaArch::hasPtx(int major, int minor) const + bool hasPtx(int major, int minor) const { return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); } - bool CudaArch::hasBin(int major, int minor) const + bool hasBin(int major, int minor) const { return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); } - bool CudaArch::hasEqualOrLessPtx(int major, int minor) const + bool hasEqualOrLessPtx(int major, int minor) const { return !ptx.empty() && (ptx.front() <= major * 10 + minor); } - bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const + bool hasEqualOrGreaterPtx(int major, int minor) const { return !ptx.empty() && (ptx.back() >= major * 10 + minor); } - 
bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const + bool hasEqualOrGreaterBin(int major, int minor) const { return !bin.empty() && (bin.back() >= major * 10 + minor); } - void CudaArch::fromStr(const string& set_as_str, vector& arr) + + private: + void fromStr(const string& set_as_str, vector& arr) { if (set_as_str.find_first_not_of(" ") == string::npos) return; @@ -501,25 +472,21 @@ namespace sort(arr.begin(), arr.end()); } - - class DeviceProps - { - public: - DeviceProps(); - ~DeviceProps(); - - cudaDeviceProp* get(int devID); - - private: - std::vector props_; - }; - DeviceProps::DeviceProps() + vector bin; + vector ptx; + vector features; + }; + + class DeviceProps + { + public: + DeviceProps() { props_.resize(10, 0); } - DeviceProps::~DeviceProps() + ~DeviceProps() { for (size_t i = 0; i < props_.size(); ++i) { @@ -529,7 +496,7 @@ namespace props_.clear(); } - cudaDeviceProp* DeviceProps::get(int devID) + cudaDeviceProp* get(int devID) { if (devID >= (int) props_.size()) props_.resize(devID + 5, 0); @@ -542,10 +509,92 @@ namespace return props_[devID]; } + private: + std::vector props_; + }; + + DeviceProps deviceProps; + + class CudaDeviceInfoFuncTable: DeviceInfoFuncTable + { + public: + size_t sharedMemPerBlock() const + { + return deviceProps.get(device_id_)->sharedMemPerBlock; + } + + void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const + { + int prevDeviceID = getDevice(); + if (prevDeviceID != device_id_) + setDevice(device_id_); + + cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); + + if (prevDeviceID != device_id_) + setDevice(prevDeviceID); + } - DeviceProps deviceProps; + size_t freeMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _freeMemory; + } + + size_t totalMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _totalMemory; + } + + bool supports(FeatureSet feature_set) const + { + int version = majorVersion_ * 10 + minorVersion_; + return version >= feature_set; + } + + bool isCompatible() const + { + // Check PTX compatibility + if (TargetArchs::hasEqualOrLessPtx(majorVersion_, minorVersion_)) + return true; + + // Check BIN compatibility + for (int i = minorVersion_; i >= 0; --i) + if (TargetArchs::hasBin(majorVersion_, i)) + return true; + + return false; + } + + void query() + { + const cudaDeviceProp* prop = deviceProps.get(device_id_); + + name_ = prop->name; + multi_processor_count_ = prop->multiProcessorCount; + majorVersion_ = prop->major; + minorVersion_ = prop->minor; + } + + private: + int device_id_; + + std::string name_; + int multi_processor_count_; + int majorVersion_; + int minorVersion_; + }; + + class CudaFuncTable : public GpuFuncTable + { + protected: + + const CudaArch cudaArch; - int convertSMVer2Cores(int major, int minor) + int convertSMVer2Cores(int major, int minor) const { // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM typedef struct { @@ -600,42 +649,42 @@ namespace cudaSafeCall( cudaDeviceReset() ); } - bool TargetArchs::builtWith(FeatureSet feature_set) const + bool builtWith(FeatureSet feature_set) const { return cudaArch.builtWith(feature_set); } - bool TargetArchs::has(int major, int minor) const + bool has(int major, int minor) const { return hasPtx(major, minor) || hasBin(major, minor); } - bool TargetArchs::hasPtx(int major, int minor) const + bool hasPtx(int major, int minor) const { return cudaArch.hasPtx(major, minor); } - bool 
TargetArchs::hasBin(int major, int minor) const + bool hasBin(int major, int minor) const { return cudaArch.hasBin(major, minor); } - bool TargetArchs::hasEqualOrLessPtx(int major, int minor) const + bool hasEqualOrLessPtx(int major, int minor) const { return cudaArch.hasEqualOrLessPtx(major, minor); } - bool TargetArchs::hasEqualOrGreater(int major, int minor) const + bool hasEqualOrGreater(int major, int minor) const { return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); } - bool TargetArchs::hasEqualOrGreaterPtx(int major, int minor) const + bool hasEqualOrGreaterPtx(int major, int minor) const { return cudaArch.hasEqualOrGreaterPtx(major, minor); } - bool TargetArchs::hasEqualOrGreaterBin(int major, int minor) const + bool hasEqualOrGreaterBin(int major, int minor) const { return cudaArch.hasEqualOrGreaterBin(major, minor); } @@ -664,68 +713,7 @@ namespace return TargetArchs::builtWith(feature_set) && (version >= feature_set); } - - size_t sharedMemPerBlock() const - { - return deviceProps.get(device_id_)->sharedMemPerBlock; - } - - void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const - { - int prevDeviceID = getDevice(); - if (prevDeviceID != device_id_) - setDevice(device_id_); - - cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); - - if (prevDeviceID != device_id_) - setDevice(prevDeviceID); - } - - size_t freeMemory() const - { - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _freeMemory; - } - - size_t totalMemory() const - { - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _totalMemory; - } - - bool supports(FeatureSet feature_set) const - { - int version = majorVersion() * 10 + minorVersion(); - return version >= feature_set; - } - - bool isCompatible() const - { - // Check PTX compatibility - if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion())) - return true; - - // Check BIN compatibility - for (int i = minorVersion(); i >= 0; --i) - if (TargetArchs::hasBin(majorVersion(), i)) - return true; - - return false; - } - - void query() const - { - const cudaDeviceProp* prop = deviceProps.get(device_id_); - - name_ = prop->name; - multi_processor_count_ = prop->multiProcessorCount; - majorVersion_ = prop->major; - minorVersion_ = prop->minor; - } - + void printCudaDeviceInfo(int device) const { int count = getCudaEnabledDeviceCount(); @@ -864,16 +852,16 @@ namespace typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); static const func_t funcs[7][4] = { - /* 8U */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 8S */ {cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask }, - /* 16U */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 16S */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32S */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32F */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 64F */ {cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask } + /* 8U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 8S */ 
{cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask }, + /* 16U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 16S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32F */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 64F */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask } }; - const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::details::copyWithMask; + const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask; func(src, dst, mask, 0); } @@ -971,7 +959,7 @@ namespace func(src, dst); } - void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const + void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const { CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); CV_Assert(dst.depth() <= CV_64F); @@ -982,10 +970,10 @@ namespace CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - cv::gpu::device::convertTo(src, dst, alpha, beta); + cv::gpu::device::convertTo(src, dst, alpha, beta, stream); } - void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const + void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const { if (mask.empty()) { @@ -1016,7 +1004,7 @@ namespace {NppSet::call, NppSet::call, cv::gpu::device::setTo , NppSet::call}, {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, - {cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo } + {cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo } }; CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); @@ -1027,7 +1015,10 @@ namespace CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - funcs[m.depth()][m.channels() - 1](m, s); + if (stream) + cv::gpu::device::setTo(m, s, stream); + else + funcs[m.depth()][m.channels() - 1](m, s); } else { @@ -1051,7 +1042,10 @@ namespace CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - funcs[m.depth()][m.channels() - 1](m, s, mask); + if (stream) + cv::gpu::device::setTo(m, s, mask, stream); + else + funcs[m.depth()][m.channels() - 1](m, s, mask); } } @@ -1065,5 +1059,5 @@ namespace cudaFree(devPtr); } }; -} +#endif #endif \ No newline at end of file -- GitLab
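
Note: the following is an illustrative, self-contained sketch of the runtime-dispatch pattern this patch introduces in gpumat.cpp: dlopen() an optional support library, resolve an extern "C" factory with dlsym(), and fall back to a stub function table when the library or symbol is unavailable. The library name (libexample_backend.so), the symbol name (tableFactory), and the FuncTable/EmptyTable types below are hypothetical stand-ins used only for illustration; they are not the OpenCV identifiers, and the sketch is not the patch's actual implementation.

// sketch.cpp : build with "g++ sketch.cpp -ldl"
// Minimal example of loading an optional backend at runtime and falling back
// to a stub implementation when it is missing.
#include <dlfcn.h>
#include <iostream>

struct FuncTable                       // analogous to GpuFuncTable in the patch
{
    virtual int deviceCount() const = 0;
    virtual ~FuncTable() {}
};

struct EmptyTable : FuncTable          // analogous to EmptyFuncTable
{
    int deviceCount() const { return 0; }   // stub: report no devices
};

typedef FuncTable* (*FactoryType)();   // signature of the extern "C" factory

static FuncTable* loadTable()
{
    static EmptyTable stub;

    // Hypothetical library/symbol names; the patch itself loads
    // libopencv_core_cuda.so and resolves gpuFactory/deviceInfoFactory.
    void* handle = dlopen("libexample_backend.so", RTLD_LAZY);
    if (!handle)
        return &stub;                  // library not installed: use the stub

    FactoryType factory = (FactoryType)dlsym(handle, "tableFactory");
    if (!factory)
    {
        dlclose(handle);
        return &stub;                  // symbol missing: unload and use the stub
    }

    // The handle is intentionally kept open in this sketch: calling through
    // pointers obtained from an unloaded library is undefined behavior.
    return factory();
}

int main()
{
    static FuncTable* table = loadTable();   // resolved once, like the static locals in gpumat.cpp
    std::cout << "devices: " << table->deviceCount() << std::endl;
    return 0;
}

A note on the design: resolving the table once into a function-local static (as gpuFuncTable() and deviceInfoFuncTable() do in the patch) keeps every public entry point a one-line delegation, and the stub table lets builds without the support library fail gracefully at call time rather than at link time.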