From 8660e048bc12c348ccfc17d42e97ea7af3aa34b0 Mon Sep 17 00:00:00 2001
From: Alexander Smorkalov
Date: Fri, 13 Dec 2013 17:28:29 +0400
Subject: [PATCH] Dynamic CUDA support library loading implemented for Linux.
 Logical mistake in macro fixed; DeviceInfo delegate reimplemented; build and
 warning fixes.

---
 modules/core/CMakeLists.txt                  |  64 +++-
 modules/core/cuda/CMakeLists.txt             |   3 +-
 modules/core/cuda/main.cpp                   |  29 +-
 modules/core/include/opencv2/core/gpumat.hpp |   3 +
 modules/core/src/gpumat.cpp                  |  93 ++++-
 modules/core/src/gpumat_cuda.hpp             | 382 +++++++++----------
 6 files changed, 352 insertions(+), 222 deletions(-)

diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 5951982926..a7a997f67b 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -1,29 +1,69 @@
 set(the_description "The Core Functionality")
+macro(ocv_glob_module_sources_no_cuda)
+  file(GLOB_RECURSE lib_srcs "src/*.cpp")
+  file(GLOB_RECURSE lib_int_hdrs "src/*.hpp" "src/*.h")
+  file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
+  file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h")
+
+  set(cuda_objs "")
+  set(lib_cuda_hdrs "")
+  if(HAVE_CUDA)
+    ocv_include_directories(${CUDA_INCLUDE_DIRS})
+    file(GLOB lib_cuda_hdrs "src/cuda/*.hpp")
+  endif()
+
+  source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
+
+  file(GLOB cl_kernels "src/opencl/*.cl")
+  if(HAVE_opencv_ocl AND cl_kernels)
+    ocv_include_directories(${OPENCL_INCLUDE_DIRS})
+    add_custom_command(
+      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp"
+      COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
+      DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake")
+    source_group("OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
+    list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
+  endif()
+
+  source_group("Include" FILES ${lib_hdrs})
+  source_group("Include\\detail" FILES ${lib_hdrs_detail})
+
+  ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail}
+                                 SOURCES ${lib_srcs} ${lib_int_hdrs} ${cuda_objs} ${lib_cuda_hdrs})
+endmacro()
+
+ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
+ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
+
 if(HAVE_WINRT)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /ZW /GS /Gm- /AI\"${WINDOWS_SDK_PATH}/References/CommonConfiguration/Neutral\" /AI\"${VISUAL_STUDIO_PATH}/vcpackages\"")
 endif()
 
-file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h")
-file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
-
-source_group("Cuda Headers" FILES ${lib_cuda_hdrs})
-source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
-
 if(DYNAMIC_CUDA_SUPPORT)
   add_definitions(-DDYNAMIC_CUDA_SUPPORT)
+else()
+  add_definitions(-DUSE_CUDA)
 endif()
 
-ocv_add_module(core PRIVATE_REQUIRED ${ZLIB_LIBRARIES})
-ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
-
 if(HAVE_CUDA)
   ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/include")
   ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
 endif()
 
-ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
-                        HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
+file(GLOB lib_cuda_hdrs "include/opencv2/${name}/cuda/*.hpp" "include/opencv2/${name}/cuda/*.h")
+file(GLOB lib_cuda_hdrs_detail "include/opencv2/${name}/cuda/detail/*.hpp" "include/opencv2/${name}/cuda/detail/*.h")
+
+source_group("Cuda Headers" FILES ${lib_cuda_hdrs})
+source_group("Cuda Headers\\Detail" FILES ${lib_cuda_hdrs_detail})
+
+if (DYNAMIC_CUDA_SUPPORT)
+  ocv_glob_module_sources_no_cuda(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
+                                  HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
+else()
+  ocv_glob_module_sources(SOURCES "${opencv_core_BINARY_DIR}/version_string.inc"
+                          HEADERS ${lib_cuda_hdrs} ${lib_cuda_hdrs_detail})
+endif()
 
 ocv_create_module()
 ocv_add_precompiled_headers(${the_module})
@@ -31,6 +71,6 @@ ocv_add_precompiled_headers(${the_module})
 ocv_add_accuracy_tests()
 ocv_add_perf_tests()
 
-if(DYNAMIC_CUDA_SUPPORT)
+if (DYNAMIC_CUDA_SUPPORT)
   add_subdirectory(cuda)
 endif()
diff --git a/modules/core/cuda/CMakeLists.txt b/modules/core/cuda/CMakeLists.txt
index 0b1c9428d3..72ecea7a4c 100644
--- a/modules/core/cuda/CMakeLists.txt
+++ b/modules/core/cuda/CMakeLists.txt
@@ -1,6 +1,5 @@
 project(opencv_core_cuda)
 
-set(HAVE_CUDA FALSE)
-add_definitions("-DHAVE_CUDA")
+add_definitions(-DUSE_CUDA)
 
 include_directories(${CUDA_INCLUDE_DIRS} "../src/" "../include/opencv2/core/"
diff --git a/modules/core/cuda/main.cpp b/modules/core/cuda/main.cpp
index c4b8cbe1db..26d4834201 100644
--- a/modules/core/cuda/main.cpp
+++ b/modules/core/cuda/main.cpp
@@ -1,6 +1,10 @@
+#include "cvconfig.h"
 #include "opencv2/core/core.hpp"
 #include "opencv2/core/gpumat.hpp"
 
+#include
+#include
+
 #ifdef HAVE_CUDA
 #include
 #include
@@ -17,7 +21,30 @@
 #endif
 #endif
 
+using namespace std;
 using namespace cv;
 using namespace cv::gpu;
 
-#include "gpumat_cuda.hpp"
\ No newline at end of file
+#include "gpumat_cuda.hpp"
+
+#ifdef HAVE_CUDA
+static CudaDeviceInfoFuncTable deviceInfoTable;
+static CudaFuncTable gpuTable;
+#else
+static EmptyDeviceInfoFuncTable deviceInfoTable;
+static EmptyFuncTable gpuTable;
+#endif
+
+extern "C" {
+
+DeviceInfoFuncTable* deviceInfoFactory()
+{
+    return (DeviceInfoFuncTable*)&deviceInfoTable;
+}
+
+GpuFuncTable* gpuFactory()
+{
+    return (GpuFuncTable*)&gpuTable;
+}
+
+}
diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp
index b502102139..d62c8749b0 100644
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@@ -137,6 +137,9 @@ namespace cv { namespace gpu
         int deviceID() const { return device_id_; }
 
     private:
+        // The private section is kept only to preserve binary compatibility.
+        // Changes to the private fields here have no effect;
+        // see the delegate code.
void query(); int device_id_; diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp index 9a2e36cb62..f438dfd8b6 100644 --- a/modules/core/src/gpumat.cpp +++ b/modules/core/src/gpumat.cpp @@ -43,8 +43,9 @@ #include "precomp.hpp" #include "opencv2/core/gpumat.hpp" #include +#include -#if defined(HAVE_CUDA) +#if defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT) #include #include @@ -66,15 +67,81 @@ using namespace cv::gpu; #include "gpumat_cuda.hpp" -namespace +typedef GpuFuncTable* (*GpuFactoryType)(); +typedef DeviceInfoFuncTable* (*DeviceInfoFactoryType)(); + +static GpuFactoryType gpuFactory = NULL; +static DeviceInfoFactoryType deviceInfoFactory = NULL; + +static const std::string getCudaSupportLibName() +{ + return "libopencv_core_cuda.so"; +} + +static bool loadCudaSupportLib() { - const GpuFuncTable* gpuFuncTable() + void* handle; + const std::string name = getCudaSupportLibName(); + handle = dlopen(name.c_str(), RTLD_LAZY); + if (!handle) + return false; + + deviceInfoFactory = (DeviceInfoFactoryType)dlsym(handle, "deviceInfoFactory"); + if (!deviceInfoFactory) { - static EmptyFuncTable funcTable; - return &funcTable; + dlclose(handle); + return false; } + + gpuFactory = (GpuFactoryType)dlsym(handle, "gpuFactory"); + if (!gpuFactory) + { + dlclose(handle); + return false; + } + + dlclose(handle); + + return true; } +static GpuFuncTable* gpuFuncTable() +{ +#ifdef DYNAMIC_CUDA_SUPPORT + static EmptyFuncTable stub; + static GpuFuncTable* libFuncTable = loadCudaSupportLib() ? gpuFactory(): (GpuFuncTable*)&stub; + static GpuFuncTable *funcTable = libFuncTable ? libFuncTable : (GpuFuncTable*)&stub; +#else +# ifdef USE_CUDA + static CudaFuncTable impl; + static GpuFuncTable* funcTable = &impl; +#else + static EmptyFuncTable stub; + static GpuFuncTable* funcTable = &stub; +#endif +#endif + return funcTable; +} + +static DeviceInfoFuncTable* deviceInfoFuncTable() +{ +#ifdef DYNAMIC_CUDA_SUPPORT + static EmptyDeviceInfoFuncTable stub; + static DeviceInfoFuncTable* libFuncTable = loadCudaSupportLib() ? deviceInfoFactory(): (DeviceInfoFuncTable*)&stub; + static DeviceInfoFuncTable* funcTable = libFuncTable ? 
libFuncTable : (DeviceInfoFuncTable*)&stub; +#else +# ifdef USE_CUDA + static CudaDeviceInfoFuncTable impl; + static DeviceInfoFuncTable* funcTable = &impl; +#else + static EmptyFuncTable stub; + static DeviceInfoFuncTable* funcTable = &stub; +#endif +#endif + return funcTable; +} + + //////////////////////////////// Initialization & Info //////////////////////// int cv::gpu::getCudaEnabledDeviceCount() { return gpuFuncTable()->getCudaEnabledDeviceCount(); } @@ -95,13 +162,13 @@ bool cv::gpu::TargetArchs::hasEqualOrGreater(int major, int minor) { return gpuF bool cv::gpu::TargetArchs::hasEqualOrGreaterPtx(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterPtx(major, minor); } bool cv::gpu::TargetArchs::hasEqualOrGreaterBin(int major, int minor) { return gpuFuncTable()->hasEqualOrGreaterBin(major, minor); } -size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return gpuFuncTable()->sharedMemPerBlock(); } -void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { gpuFuncTable()->queryMemory(total_memory, free_memory); } -size_t cv::gpu::DeviceInfo::freeMemory() const { return gpuFuncTable()->freeMemory(); } -size_t cv::gpu::DeviceInfo::totalMemory() const { return gpuFuncTable()->totalMemory(); } -bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return gpuFuncTable()->supports(feature_set); } -bool cv::gpu::DeviceInfo::isCompatible() const { return gpuFuncTable()->isCompatible(); } -void cv::gpu::DeviceInfo::query() { gpuFuncTable()->query(); } +size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const { return deviceInfoFuncTable()->sharedMemPerBlock(); } +void cv::gpu::DeviceInfo::queryMemory(size_t& total_memory, size_t& free_memory) const { deviceInfoFuncTable()->queryMemory(total_memory, free_memory); } +size_t cv::gpu::DeviceInfo::freeMemory() const { return deviceInfoFuncTable()->freeMemory(); } +size_t cv::gpu::DeviceInfo::totalMemory() const { return deviceInfoFuncTable()->totalMemory(); } +bool cv::gpu::DeviceInfo::supports(FeatureSet feature_set) const { return deviceInfoFuncTable()->supports(feature_set); } +bool cv::gpu::DeviceInfo::isCompatible() const { return deviceInfoFuncTable()->isCompatible(); } +void cv::gpu::DeviceInfo::query() { deviceInfoFuncTable()->query(); } void cv::gpu::printCudaDeviceInfo(int device) { gpuFuncTable()->printCudaDeviceInfo(device); } void cv::gpu::printShortCudaDeviceInfo(int device) { gpuFuncTable()->printShortCudaDeviceInfo(device); } @@ -556,7 +623,7 @@ namespace cv { namespace gpu void setTo(GpuMat& src, Scalar s, cudaStream_t stream) { - gpuFuncTable()->setTo(src, s, stream); + gpuFuncTable()->setTo(src, s, cv::gpu::GpuMat(), stream); } void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream) diff --git a/modules/core/src/gpumat_cuda.hpp b/modules/core/src/gpumat_cuda.hpp index 631d6ea8ca..56d626a5cc 100644 --- a/modules/core/src/gpumat_cuda.hpp +++ b/modules/core/src/gpumat_cuda.hpp @@ -1,30 +1,19 @@ -namespace -{ -#if defined(HAVE_CUDA) && !defined(DYNAMIC_CUDA_SUPPORT) - - #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func) - #define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func) +#ifndef __GPUMAT_CUDA_HPP__ +#define __GPUMAT_CUDA_HPP__ - inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") + class DeviceInfoFuncTable { - if (cudaSuccess != err) - cv::gpu::error(cudaGetErrorString(err), file, line, func); - } - - inline void ___nppSafeCall(int err, const char *file, 
const int line, const char *func = "") - { - if (err < 0) - { - std::ostringstream msg; - msg << "NPP API Call Error: " << err; - cv::gpu::error(msg.str().c_str(), file, line, func); - } - } -#endif -} - -namespace -{ + public: + virtual size_t sharedMemPerBlock() const = 0; + virtual void queryMemory(size_t&, size_t&) const = 0; + virtual size_t freeMemory() const = 0; + virtual size_t totalMemory() const = 0; + virtual bool supports(FeatureSet) const = 0; + virtual bool isCompatible() const = 0; + virtual void query() = 0; + virtual ~DeviceInfoFuncTable() {}; + }; + class GpuFuncTable { public: @@ -40,6 +29,7 @@ namespace virtual bool deviceSupports(FeatureSet) const = 0; + // TargetArchs virtual bool builtWith(FeatureSet) const = 0; virtual bool has(int, int) const = 0; virtual bool hasPtx(int, int) const = 0; @@ -49,14 +39,6 @@ namespace virtual bool hasEqualOrGreaterPtx(int, int) const = 0; virtual bool hasEqualOrGreaterBin(int, int) const = 0; - virtual size_t sharedMemPerBlock() const = 0; - virtual void queryMemory(size_t&, size_t&) const = 0; - virtual size_t freeMemory() const = 0; - virtual size_t totalMemory() const = 0; - virtual bool supports(FeatureSet) const = 0; - virtual bool isCompatible() const = 0; - virtual void query() const = 0; - virtual void printCudaDeviceInfo(int) const = 0; virtual void printShortCudaDeviceInfo(int) const = 0; @@ -72,17 +54,24 @@ namespace virtual void convert(const GpuMat& src, GpuMat& dst) const = 0; // for gpu::device::setTo funcs - virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const = 0; virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*) const = 0; virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0; virtual void free(void* devPtr) const = 0; }; -} -#if !defined(HAVE_CUDA) || defined(DYNAMIC_CUDA_SUPPORT) -namespace -{ + class EmptyDeviceInfoFuncTable: public DeviceInfoFuncTable + { + public: + size_t sharedMemPerBlock() const { throw_nogpu; return 0; } + void queryMemory(size_t&, size_t&) const { throw_nogpu; } + size_t freeMemory() const { throw_nogpu; return 0; } + size_t totalMemory() const { throw_nogpu; return 0; } + bool supports(FeatureSet) const { throw_nogpu; return false; } + bool isCompatible() const { throw_nogpu; return false; } + void query() { throw_nogpu; } + }; + class EmptyFuncTable : public GpuFuncTable { public: @@ -105,15 +94,7 @@ namespace bool hasEqualOrGreater(int, int) const { throw_nogpu; return false; } bool hasEqualOrGreaterPtx(int, int) const { throw_nogpu; return false; } bool hasEqualOrGreaterBin(int, int) const { throw_nogpu; return false; } - - size_t sharedMemPerBlock() const { throw_nogpu; return 0; } - void queryMemory(size_t&, size_t&) const { throw_nogpu; } - size_t freeMemory() const { throw_nogpu; return 0; } - size_t totalMemory() const { throw_nogpu; return 0; } - bool supports(FeatureSet) const { throw_nogpu; return false; } - bool isCompatible() const { throw_nogpu; return false; } - void query() const { throw_nogpu; } - + void printCudaDeviceInfo(int) const { throw_nogpu; } void printShortCudaDeviceInfo(int) const { throw_nogpu; } @@ -126,15 +107,32 @@ namespace void convert(const GpuMat&, GpuMat&) const { throw_nogpu; } void convert(const GpuMat&, GpuMat&, double, double, cudaStream_t stream = 0) const { (void)stream; throw_nogpu; } - virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*) const { throw_nogpu; } virtual void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, 
CUstream_st*) const { throw_nogpu; } void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu; } void free(void*) const {} }; + +#if defined(USE_CUDA) + +#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, CV_Func) +#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, CV_Func) + +inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") +{ + if (cudaSuccess != err) + cv::gpu::error(cudaGetErrorString(err), file, line, func); } -#else +inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "") +{ + if (err < 0) + { + std::ostringstream msg; + msg << "NPP API Call Error: " << err; + cv::gpu::error(msg.str().c_str(), file, line, func); + } +} namespace cv { namespace gpu { namespace device { @@ -149,8 +147,6 @@ namespace cv { namespace gpu { namespace device void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream); }}} -namespace -{ template void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream) { Scalar_ sf = s; @@ -162,10 +158,7 @@ namespace Scalar_ sf = s; cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream); } -} -namespace -{ template struct NPPTypeTraits; template<> struct NPPTypeTraits { typedef Npp8u npp_type; }; template<> struct NPPTypeTraits { typedef Npp8s npp_type; }; @@ -208,6 +201,7 @@ namespace cudaSafeCall( cudaDeviceSynchronize() ); } }; + template::func_ptr func> struct NppCvt { typedef typename NPPTypeTraits::npp_type dst_t; @@ -361,9 +355,8 @@ namespace { return reinterpret_cast(ptr) % size == 0; } -} - namespace cv { namespace gpu { namespace devices + namespace cv { namespace gpu { namespace device { void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0) { @@ -418,74 +411,52 @@ namespace { setTo(src, s, mask, 0); } - }} + }}} -namespace -{ - class CudaFuncTable : public GpuFuncTable + + class CudaArch { - protected: - - class CudaArch - { - public: - CudaArch(); - - bool builtWith(FeatureSet feature_set) const; - bool hasPtx(int major, int minor) const; - bool hasBin(int major, int minor) const; - bool hasEqualOrLessPtx(int major, int minor) const; - bool hasEqualOrGreaterPtx(int major, int minor) const; - bool hasEqualOrGreaterBin(int major, int minor) const; - - private: - static void fromStr(const string& set_as_str, vector& arr); - - vector bin; - vector ptx; - vector features; - }; - - const CudaArch cudaArch; - - CudaArch::CudaArch() + public: + CudaArch() { fromStr(CUDA_ARCH_BIN, bin); fromStr(CUDA_ARCH_PTX, ptx); fromStr(CUDA_ARCH_FEATURES, features); } - bool CudaArch::builtWith(FeatureSet feature_set) const + bool builtWith(FeatureSet feature_set) const { return !features.empty() && (features.back() >= feature_set); } - bool CudaArch::hasPtx(int major, int minor) const + bool hasPtx(int major, int minor) const { return find(ptx.begin(), ptx.end(), major * 10 + minor) != ptx.end(); } - bool CudaArch::hasBin(int major, int minor) const + bool hasBin(int major, int minor) const { return find(bin.begin(), bin.end(), major * 10 + minor) != bin.end(); } - bool CudaArch::hasEqualOrLessPtx(int major, int minor) const + bool hasEqualOrLessPtx(int major, int minor) const { return !ptx.empty() && (ptx.front() <= major * 10 + minor); } - bool CudaArch::hasEqualOrGreaterPtx(int major, int minor) const + bool hasEqualOrGreaterPtx(int major, int minor) const { return !ptx.empty() && (ptx.back() >= major * 10 + minor); } - 
bool CudaArch::hasEqualOrGreaterBin(int major, int minor) const + bool hasEqualOrGreaterBin(int major, int minor) const { return !bin.empty() && (bin.back() >= major * 10 + minor); } - void CudaArch::fromStr(const string& set_as_str, vector& arr) + + private: + void fromStr(const string& set_as_str, vector& arr) { if (set_as_str.find_first_not_of(" ") == string::npos) return; @@ -501,25 +472,21 @@ namespace sort(arr.begin(), arr.end()); } - - class DeviceProps - { - public: - DeviceProps(); - ~DeviceProps(); - - cudaDeviceProp* get(int devID); - - private: - std::vector props_; - }; - DeviceProps::DeviceProps() + vector bin; + vector ptx; + vector features; + }; + + class DeviceProps + { + public: + DeviceProps() { props_.resize(10, 0); } - DeviceProps::~DeviceProps() + ~DeviceProps() { for (size_t i = 0; i < props_.size(); ++i) { @@ -529,7 +496,7 @@ namespace props_.clear(); } - cudaDeviceProp* DeviceProps::get(int devID) + cudaDeviceProp* get(int devID) { if (devID >= (int) props_.size()) props_.resize(devID + 5, 0); @@ -542,10 +509,92 @@ namespace return props_[devID]; } + private: + std::vector props_; + }; + + DeviceProps deviceProps; + + class CudaDeviceInfoFuncTable: DeviceInfoFuncTable + { + public: + size_t sharedMemPerBlock() const + { + return deviceProps.get(device_id_)->sharedMemPerBlock; + } + + void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const + { + int prevDeviceID = getDevice(); + if (prevDeviceID != device_id_) + setDevice(device_id_); + + cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); + + if (prevDeviceID != device_id_) + setDevice(prevDeviceID); + } - DeviceProps deviceProps; + size_t freeMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _freeMemory; + } + + size_t totalMemory() const + { + size_t _totalMemory, _freeMemory; + queryMemory(_totalMemory, _freeMemory); + return _totalMemory; + } + + bool supports(FeatureSet feature_set) const + { + int version = majorVersion_ * 10 + minorVersion_; + return version >= feature_set; + } + + bool isCompatible() const + { + // Check PTX compatibility + if (TargetArchs::hasEqualOrLessPtx(majorVersion_, minorVersion_)) + return true; + + // Check BIN compatibility + for (int i = minorVersion_; i >= 0; --i) + if (TargetArchs::hasBin(majorVersion_, i)) + return true; + + return false; + } + + void query() + { + const cudaDeviceProp* prop = deviceProps.get(device_id_); + + name_ = prop->name; + multi_processor_count_ = prop->multiProcessorCount; + majorVersion_ = prop->major; + minorVersion_ = prop->minor; + } + + private: + int device_id_; + + std::string name_; + int multi_processor_count_; + int majorVersion_; + int minorVersion_; + }; + + class CudaFuncTable : public GpuFuncTable + { + protected: + + const CudaArch cudaArch; - int convertSMVer2Cores(int major, int minor) + int convertSMVer2Cores(int major, int minor) const { // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM typedef struct { @@ -600,42 +649,42 @@ namespace cudaSafeCall( cudaDeviceReset() ); } - bool TargetArchs::builtWith(FeatureSet feature_set) const + bool builtWith(FeatureSet feature_set) const { return cudaArch.builtWith(feature_set); } - bool TargetArchs::has(int major, int minor) const + bool has(int major, int minor) const { return hasPtx(major, minor) || hasBin(major, minor); } - bool TargetArchs::hasPtx(int major, int minor) const + bool hasPtx(int major, int minor) const { return cudaArch.hasPtx(major, minor); } - bool 
TargetArchs::hasBin(int major, int minor) const + bool hasBin(int major, int minor) const { return cudaArch.hasBin(major, minor); } - bool TargetArchs::hasEqualOrLessPtx(int major, int minor) const + bool hasEqualOrLessPtx(int major, int minor) const { return cudaArch.hasEqualOrLessPtx(major, minor); } - bool TargetArchs::hasEqualOrGreater(int major, int minor) const + bool hasEqualOrGreater(int major, int minor) const { return hasEqualOrGreaterPtx(major, minor) || hasEqualOrGreaterBin(major, minor); } - bool TargetArchs::hasEqualOrGreaterPtx(int major, int minor) const + bool hasEqualOrGreaterPtx(int major, int minor) const { return cudaArch.hasEqualOrGreaterPtx(major, minor); } - bool TargetArchs::hasEqualOrGreaterBin(int major, int minor) const + bool hasEqualOrGreaterBin(int major, int minor) const { return cudaArch.hasEqualOrGreaterBin(major, minor); } @@ -664,68 +713,7 @@ namespace return TargetArchs::builtWith(feature_set) && (version >= feature_set); } - - size_t sharedMemPerBlock() const - { - return deviceProps.get(device_id_)->sharedMemPerBlock; - } - - void queryMemory(size_t& _totalMemory, size_t& _freeMemory) const - { - int prevDeviceID = getDevice(); - if (prevDeviceID != device_id_) - setDevice(device_id_); - - cudaSafeCall( cudaMemGetInfo(&_freeMemory, &_totalMemory) ); - - if (prevDeviceID != device_id_) - setDevice(prevDeviceID); - } - - size_t freeMemory() const - { - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _freeMemory; - } - - size_t totalMemory() const - { - size_t _totalMemory, _freeMemory; - queryMemory(_totalMemory, _freeMemory); - return _totalMemory; - } - - bool supports(FeatureSet feature_set) const - { - int version = majorVersion() * 10 + minorVersion(); - return version >= feature_set; - } - - bool isCompatible() const - { - // Check PTX compatibility - if (TargetArchs::hasEqualOrLessPtx(majorVersion(), minorVersion())) - return true; - - // Check BIN compatibility - for (int i = minorVersion(); i >= 0; --i) - if (TargetArchs::hasBin(majorVersion(), i)) - return true; - - return false; - } - - void query() const - { - const cudaDeviceProp* prop = deviceProps.get(device_id_); - - name_ = prop->name; - multi_processor_count_ = prop->multiProcessorCount; - majorVersion_ = prop->major; - minorVersion_ = prop->minor; - } - + void printCudaDeviceInfo(int device) const { int count = getCudaEnabledDeviceCount(); @@ -864,16 +852,16 @@ namespace typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream); static const func_t funcs[7][4] = { - /* 8U */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 8S */ {cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask }, - /* 16U */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 16S */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32S */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 32F */ {NppCopyMasked::call, cv::gpu::details::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, - /* 64F */ {cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask, cv::gpu::details::copyWithMask , cv::gpu::details::copyWithMask } + /* 8U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 8S */ 
{cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask }, + /* 16U */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 16S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32S */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 32F */ {NppCopyMasked::call, cv::gpu::device::copyWithMask, NppCopyMasked::call, NppCopyMasked::call}, + /* 64F */ {cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask, cv::gpu::device::copyWithMask , cv::gpu::device::copyWithMask } }; - const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::details::copyWithMask; + const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cv::gpu::device::copyWithMask; func(src, dst, mask, 0); } @@ -971,7 +959,7 @@ namespace func(src, dst); } - void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const + void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream) const { CV_Assert(src.depth() <= CV_64F && src.channels() <= 4); CV_Assert(dst.depth() <= CV_64F); @@ -982,10 +970,10 @@ namespace CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - cv::gpu::device::convertTo(src, dst, alpha, beta); + cv::gpu::device::convertTo(src, dst, alpha, beta, stream); } - void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const + void setTo(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream) const { if (mask.empty()) { @@ -1016,7 +1004,7 @@ namespace {NppSet::call, NppSet::call, cv::gpu::device::setTo , NppSet::call}, {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, {NppSet::call, cv::gpu::device::setTo , cv::gpu::device::setTo , NppSet::call}, - {cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo } + {cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo , cv::gpu::device::setTo } }; CV_Assert(m.depth() <= CV_64F && m.channels() <= 4); @@ -1027,7 +1015,10 @@ namespace CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - funcs[m.depth()][m.channels() - 1](m, s); + if (stream) + cv::gpu::device::setTo(m, s, stream); + else + funcs[m.depth()][m.channels() - 1](m, s); } else { @@ -1051,7 +1042,10 @@ namespace CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double"); } - funcs[m.depth()][m.channels() - 1](m, s, mask); + if (stream) + cv::gpu::device::setTo(m, s, mask, stream); + else + funcs[m.depth()][m.channels() - 1](m, s, mask); } } @@ -1065,5 +1059,5 @@ namespace cudaFree(devPtr); } }; -} +#endif #endif \ No newline at end of file -- GitLab
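
Note: the following is an illustrative, self-contained sketch of the runtime-dispatch pattern this patch introduces in gpumat.cpp: dlopen() an optional support library, resolve an extern "C" factory with dlsym(), and fall back to a stub function table when the library or symbol is unavailable. The library name (libexample_backend.so), the symbol name (tableFactory), and the FuncTable/EmptyTable types below are hypothetical stand-ins used only for illustration; they are not the OpenCV identifiers, and the sketch is not the patch's actual implementation.

// sketch.cpp : build with "g++ sketch.cpp -ldl"
// Minimal example of loading an optional backend at runtime and falling back
// to a stub implementation when it is missing.
#include <dlfcn.h>
#include <iostream>

struct FuncTable                       // analogous to GpuFuncTable in the patch
{
    virtual int deviceCount() const = 0;
    virtual ~FuncTable() {}
};

struct EmptyTable : FuncTable          // analogous to EmptyFuncTable
{
    int deviceCount() const { return 0; }   // stub: report no devices
};

typedef FuncTable* (*FactoryType)();   // signature of the extern "C" factory

static FuncTable* loadTable()
{
    static EmptyTable stub;

    // Hypothetical library/symbol names; the patch itself loads
    // libopencv_core_cuda.so and resolves gpuFactory/deviceInfoFactory.
    void* handle = dlopen("libexample_backend.so", RTLD_LAZY);
    if (!handle)
        return &stub;                  // library not installed: use the stub

    FactoryType factory = (FactoryType)dlsym(handle, "tableFactory");
    if (!factory)
    {
        dlclose(handle);
        return &stub;                  // symbol missing: unload and use the stub
    }

    // The handle is intentionally kept open in this sketch: calling through
    // pointers obtained from an unloaded library is undefined behavior.
    return factory();
}

int main()
{
    static FuncTable* table = loadTable();   // resolved once, like the static locals in gpumat.cpp
    std::cout << "devices: " << table->deviceCount() << std::endl;
    return 0;
}

A note on the design: resolving the table once into a function-local static (as gpuFuncTable() and deviceInfoFuncTable() do in the patch) keeps every public entry point a one-line delegation, and the stub table lets builds without the support library fail gracefully at call time rather than at link time.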