/*M/////////////////////////////////////////////////////////////////////////////////////// // // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. // // By downloading, copying, installing or using the software you agree to this license. // If you do not agree to this license, do not download, install, // copy or use the software. // // // License Agreement // For Open Source Computer Vision Library // // Copyright (C) 2013, OpenCV Foundation, all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // // * Redistribution's of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. // // This software is provided by the copyright holders and contributors "as is" and // any express or implied warranties, including, but not limited to, the implied // warranties of merchantability and fitness for a particular purpose are disclaimed. 
// In no event shall the OpenCV Foundation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"

#ifndef HAVE_OPENCL
#include "ocl_disabled.impl.hpp"
#else // HAVE_OPENCL

// NOTE(review): the angle-bracket parts of the includes below were lost in
// extraction; names reconstructed from usage — verify against upstream ocl.cpp.
#include <list>
#include <map>
#include <deque>
#include <set>
#include <string>
#include <sstream>
#include <iostream> // std::cerr
#include <fstream>
#if !(defined _MSC_VER) || (defined _MSC_VER && _MSC_VER > 1700)
#include <inttypes.h>
#endif

#include <opencv2/core/utils/configuration.private.hpp>

#include <opencv2/core/utils/logger.defines.hpp>
#undef CV_LOG_STRIP_LEVEL
#define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1
#include <opencv2/core/utils/logger.hpp>

#include "opencv2/core/ocl_genbase.hpp"
#include "opencl_kernels_core.hpp"

#include "opencv2/core/utils/lock.private.hpp"
#include "opencv2/core/utils/filesystem.hpp"
#include "opencv2/core/utils/filesystem.private.hpp"

#define CV__ALLOCATOR_STATS_LOG(...) CV_LOG_VERBOSE(NULL, 0, "OpenCL allocator: " << __VA_ARGS__)
#include "opencv2/core/utils/allocator_stats.impl.hpp"
#undef CV__ALLOCATOR_STATS_LOG

#define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0
#define CV_OPENCL_SHOW_RUN_KERNELS 0
#define CV_OPENCL_TRACE_CHECK 0

#define CV_OPENCL_VALIDATE_BINARY_PROGRAMS 1

#define CV_OPENCL_SHOW_SVM_ERROR_LOG 1
#define CV_OPENCL_SHOW_SVM_LOG 0

#include "opencv2/core/bufferpool.hpp"
#ifndef LOG_BUFFER_POOL
# if 0
#   define LOG_BUFFER_POOL printf
# else
#   define LOG_BUFFER_POOL(...)
# endif
#endif

#if CV_OPENCL_SHOW_SVM_LOG
// TODO add timestamp logging
#define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
#else
#define CV_OPENCL_SVM_TRACE_P(...)
#endif #if CV_OPENCL_SHOW_SVM_ERROR_LOG // TODO add timestamp logging #define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf #else #define CV_OPENCL_SVM_TRACE_ERROR_P(...) #endif #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp" #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp" #include "opencv2/core/opencl/runtime/opencl_core.hpp" #ifdef HAVE_OPENCL_SVM #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp" #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp" #include "opencv2/core/opencl/opencl_svm.hpp" #endif #include "umatrix.hpp" namespace cv { namespace ocl { #define IMPLEMENT_REFCOUNTABLE() \ void addref() { CV_XADD(&refcount, 1); } \ void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \ int refcount static cv::utils::AllocatorStatistics opencl_allocator_stats; CV_EXPORTS cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics(); cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics() { return opencl_allocator_stats; } #ifndef _DEBUG static bool isRaiseError() { static bool initialized = false; static bool value = false; if (!initialized) { value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR", false); initialized = true; } return value; } #endif #if CV_OPENCL_TRACE_CHECK static inline void traceOpenCLCheck(cl_int status, const char* message) { std::cout << "OpenCV(OpenCL:" << status << "): " << message << std::endl << std::flush; } #define CV_OCL_TRACE_CHECK_RESULT(status, message) traceOpenCLCheck(status, message) #else #define CV_OCL_TRACE_CHECK_RESULT(status, message) /* nothing */ #endif #define CV_OCL_API_ERROR_MSG(check_result, msg) \ cv::format("OpenCL error %s (%d) during call: %s", getOpenCLErrorString(check_result), check_result, msg) #define CV_OCL_CHECK_RESULT(check_result, msg) \ do { \ CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \ if (check_result != CL_SUCCESS) \ { \ 
static_assert(std::is_convertible::value, "msg of CV_OCL_CHECK_RESULT must be const char*"); \ cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \ CV_Error(Error::OpenCLApiCallError, error_msg); \ } \ } while (0) #define CV_OCL_CHECK_(expr, check_result) do { expr; CV_OCL_CHECK_RESULT(check_result, #expr); } while (0) #define CV_OCL_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_CHECK_RESULT(__cl_result, #expr); } while (0) #ifdef _DEBUG #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) CV_OCL_CHECK_RESULT(check_result, msg) #define CV_OCL_DBG_CHECK(expr) CV_OCL_CHECK(expr) #define CV_OCL_DBG_CHECK_(expr, check_result) CV_OCL_CHECK_(expr, check_result) #else #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) \ do { \ CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \ if (check_result != CL_SUCCESS && isRaiseError()) \ { \ static_assert(std::is_convertible::value, "msg of CV_OCL_DBG_CHECK_RESULT must be const char*"); \ cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \ CV_Error(Error::OpenCLApiCallError, error_msg); \ } \ } while (0) #define CV_OCL_DBG_CHECK_(expr, check_result) do { expr; CV_OCL_DBG_CHECK_RESULT(check_result, #expr); } while (0) #define CV_OCL_DBG_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_DBG_CHECK_RESULT(__cl_result, #expr); } while (0) #endif static const bool CV_OPENCL_CACHE_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_ENABLE", true); static const bool CV_OPENCL_CACHE_WRITE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_WRITE", true); static const bool CV_OPENCL_CACHE_LOCK_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_LOCK_ENABLE", true); static const bool CV_OPENCL_CACHE_CLEANUP = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_CLEANUP", true); #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS static const bool CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE = utils::getConfigurationParameterBool("OPENCV_OPENCL_VALIDATE_BINARY_PROGRAMS", false); #endif 
// Option to disable calls clEnqueueReadBufferRect / clEnqueueWriteBufferRect / clEnqueueCopyBufferRect static const bool CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS = utils::getConfigurationParameterBool("OPENCV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS", #ifdef __APPLE__ true #else false #endif ); static const String getBuildExtraOptions() { static String param_buildExtraOptions; static bool initialized = false; if (!initialized) { param_buildExtraOptions = utils::getConfigurationParameterString("OPENCV_OPENCL_BUILD_EXTRA_OPTIONS", ""); initialized = true; if (!param_buildExtraOptions.empty()) CV_LOG_WARNING(NULL, "OpenCL: using extra build options: '" << param_buildExtraOptions << "'"); } return param_buildExtraOptions; } static const bool CV_OPENCL_ENABLE_MEM_USE_HOST_PTR = utils::getConfigurationParameterBool("OPENCV_OPENCL_ENABLE_MEM_USE_HOST_PTR", true); static const size_t CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR", 4); struct UMat2D { UMat2D(const UMat& m) { offset = (int)m.offset; step = (int)m.step; rows = m.rows; cols = m.cols; } int offset; int step; int rows; int cols; }; struct UMat3D { UMat3D(const UMat& m) { offset = (int)m.offset; step = (int)m.step.p[1]; slicestep = (int)m.step.p[0]; slices = (int)m.size.p[0]; rows = m.size.p[1]; cols = m.size.p[2]; } int offset; int slicestep; int step; int slices; int rows; int cols; }; // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 ) { static uint64 table[256]; static bool initialized = false; if( !initialized ) { for( int i = 0; i < 256; i++ ) { uint64 c = i; for( int j = 0; j < 8; j++ ) c = ((c & 1) ? 
CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1); table[i] = c; } initialized = true; } uint64 crc = ~crc0; for( size_t idx = 0; idx < size; idx++ ) crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8); return ~crc; } #if OPENCV_HAVE_FILESYSTEM_SUPPORT struct OpenCLBinaryCacheConfigurator { cv::String cache_path_; cv::String cache_lock_filename_; cv::Ptr cache_lock_; typedef std::map ContextCacheType; ContextCacheType prepared_contexts_; Mutex mutex_prepared_contexts_; OpenCLBinaryCacheConfigurator() { CV_LOG_DEBUG(NULL, "Initializing OpenCL cache configuration..."); if (!CV_OPENCL_CACHE_ENABLE) { CV_LOG_INFO(NULL, "OpenCL cache is disabled"); return; } cache_path_ = utils::fs::getCacheDirectory("opencl_cache", "OPENCV_OPENCL_CACHE_DIR"); if (cache_path_.empty()) { CV_LOG_INFO(NULL, "Specify OPENCV_OPENCL_CACHE_DIR configuration parameter to enable OpenCL cache"); } do { try { if (cache_path_.empty()) break; if (cache_path_ == "disabled") break; if (!utils::fs::createDirectories(cache_path_)) { CV_LOG_DEBUG(NULL, "Can't use OpenCL cache directory: " << cache_path_); clear(); break; } if (CV_OPENCL_CACHE_LOCK_ENABLE) { cache_lock_filename_ = cache_path_ + ".lock"; if (!utils::fs::exists(cache_lock_filename_)) { CV_LOG_DEBUG(NULL, "Creating lock file... (" << cache_lock_filename_ << ")"); std::ofstream lock_filename(cache_lock_filename_.c_str(), std::ios::out); if (!lock_filename.is_open()) { CV_LOG_WARNING(NULL, "Can't create lock file for OpenCL program cache: " << cache_lock_filename_); break; } } try { cache_lock_ = makePtr(cache_lock_filename_.c_str()); CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... (" << cache_lock_filename_ << ")"); { utils::shared_lock_guard lock(*cache_lock_); } CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... Done!"); } catch (const cv::Exception& e) { CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_ << std::endl << e.what()); } catch (...) 
{ CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_); } } else { if (CV_OPENCL_CACHE_WRITE) { CV_LOG_WARNING(NULL, "OpenCL cache lock is disabled while cache write is allowed " "(not safe for multiprocess environment)"); } else { CV_LOG_INFO(NULL, "OpenCL cache lock is disabled"); } } } catch (const cv::Exception& e) { CV_LOG_WARNING(NULL, "Can't prepare OpenCL program cache: " << cache_path_ << std::endl << e.what()); clear(); } } while (0); if (!cache_path_.empty()) { if (cache_lock_.empty() && CV_OPENCL_CACHE_LOCK_ENABLE) { CV_LOG_WARNING(NULL, "Initialized OpenCL cache directory, but interprocess synchronization lock is not available. " "Consider to disable OpenCL cache: OPENCV_OPENCL_CACHE_DIR=disabled"); } else { CV_LOG_INFO(NULL, "Successfully initialized OpenCL cache directory: " << cache_path_); } } } void clear() { cache_path_.clear(); cache_lock_filename_.clear(); cache_lock_.release(); } std::string prepareCacheDirectoryForContext(const std::string& ctx_prefix, const std::string& cleanup_prefix) { if (cache_path_.empty()) return std::string(); AutoLock lock(mutex_prepared_contexts_); ContextCacheType::iterator found_it = prepared_contexts_.find(ctx_prefix); if (found_it != prepared_contexts_.end()) return found_it->second; CV_LOG_INFO(NULL, "Preparing OpenCL cache configuration for context: " << ctx_prefix); std::string target_directory = cache_path_ + ctx_prefix + "/"; bool result = utils::fs::isDirectory(target_directory); if (!result) { try { CV_LOG_VERBOSE(NULL, 0, "Creating directory: " << target_directory); if (utils::fs::createDirectories(target_directory)) { result = true; } else { CV_LOG_WARNING(NULL, "Can't create directory: " << target_directory); } } catch (const cv::Exception& e) { CV_LOG_ERROR(NULL, "Can't create OpenCL program cache directory for context: " << target_directory << std::endl << e.what()); } } target_directory = result ? 
target_directory : std::string(); prepared_contexts_.insert(std::pair(ctx_prefix, target_directory)); if (result && CV_OPENCL_CACHE_CLEANUP && CV_OPENCL_CACHE_WRITE && !cleanup_prefix.empty()) { try { std::vector entries; utils::fs::glob_relative(cache_path_, cleanup_prefix + "*", entries, false, true); std::vector remove_entries; for (size_t i = 0; i < entries.size(); i++) { const String& name = entries[i]; if (0 == name.find(cleanup_prefix)) { if (0 == name.find(ctx_prefix)) continue; // skip current remove_entries.push_back(name); } } if (!remove_entries.empty()) { CV_LOG_WARNING(NULL, (remove_entries.size() == 1 ? "Detected OpenCL cache directory for other version of OpenCL device." : "Detected OpenCL cache directories for other versions of OpenCL device.") << " We assume that these directories are obsolete after OpenCL runtime/drivers upgrade."); CV_LOG_WARNING(NULL, "Trying to remove these directories..."); for (size_t i = 0; i < remove_entries.size(); i++) { CV_LOG_WARNING(NULL, "- " << remove_entries[i]); } CV_LOG_WARNING(NULL, "Note: You can disable this behavior via this option: OPENCV_OPENCL_CACHE_CLEANUP=0"); for (size_t i = 0; i < remove_entries.size(); i++) { const String& name = remove_entries[i]; cv::String path = utils::fs::join(cache_path_, name); try { utils::fs::remove_all(path); CV_LOG_WARNING(NULL, "Removed: " << path); } catch (const cv::Exception& e) { CV_LOG_ERROR(NULL, "Exception during removal of obsolete OpenCL cache directory: " << path << std::endl << e.what()); } } } } catch (...) { CV_LOG_WARNING(NULL, "Can't check for obsolete OpenCL cache directories"); } } CV_LOG_VERBOSE(NULL, 1, " Result: " << (target_directory.empty() ? 
std::string("Failed") : target_directory)); return target_directory; } static OpenCLBinaryCacheConfigurator& getSingletonInstance() { CV_SINGLETON_LAZY_INIT_REF(OpenCLBinaryCacheConfigurator, new OpenCLBinaryCacheConfigurator()); } }; class BinaryProgramFile { enum { MAX_ENTRIES = 64 }; typedef unsigned int uint32_t; struct CV_DECL_ALIGNED(4) FileHeader { uint32_t sourceSignatureSize; //char sourceSignature[]; }; struct CV_DECL_ALIGNED(4) FileTable { uint32_t numberOfEntries; //uint32_t firstEntryOffset[]; }; struct CV_DECL_ALIGNED(4) FileEntry { uint32_t nextEntryFileOffset; // 0 for the last entry in chain uint32_t keySize; uint32_t dataSize; //char key[]; //char data[]; }; const std::string fileName_; const char* const sourceSignature_; const size_t sourceSignatureSize_; std::fstream f; uint32_t entryOffsets[MAX_ENTRIES]; uint32_t getHash(const std::string& options) { uint64 hash = crc64((const uchar*)options.c_str(), options.size(), 0); return hash & (MAX_ENTRIES - 1); } inline size_t getFileSize() { size_t pos = (size_t)f.tellg(); f.seekg(0, std::fstream::end); size_t fileSize = (size_t)f.tellg(); f.seekg(pos, std::fstream::beg); return fileSize; } inline uint32_t readUInt32() { uint32_t res = 0; f.read((char*)&res, sizeof(uint32_t)); CV_Assert(!f.fail()); return res; } inline void writeUInt32(const uint32_t value) { uint32_t v = value; f.write((char*)&v, sizeof(uint32_t)); CV_Assert(!f.fail()); } inline void seekReadAbsolute(size_t pos) { f.seekg(pos, std::fstream::beg); CV_Assert(!f.fail()); } inline void seekReadRelative(size_t pos) { f.seekg(pos, std::fstream::cur); CV_Assert(!f.fail()); } inline void seekWriteAbsolute(size_t pos) { f.seekp(pos, std::fstream::beg); CV_Assert(!f.fail()); } void clearFile() { f.close(); if (0 != remove(fileName_.c_str())) CV_LOG_ERROR(NULL, "Can't remove: " << fileName_); return; } public: BinaryProgramFile(const std::string& fileName, const char* sourceSignature) : fileName_(fileName), sourceSignature_(sourceSignature), 
sourceSignatureSize_(sourceSignature_ ? strlen(sourceSignature_) : 0) { CV_StaticAssert(sizeof(uint32_t) == 4, ""); CV_Assert(sourceSignature_ != NULL); CV_Assert(sourceSignatureSize_ > 0); memset(entryOffsets, 0, sizeof(entryOffsets)); f.rdbuf()->pubsetbuf(0, 0); // disable buffering f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary); if(f.is_open() && getFileSize() > 0) { bool isValid = false; try { uint32_t fileSourceSignatureSize = readUInt32(); if (fileSourceSignatureSize == sourceSignatureSize_) { cv::AutoBuffer fileSourceSignature(fileSourceSignatureSize + 1); f.read(fileSourceSignature.data(), fileSourceSignatureSize); if (f.eof()) { CV_LOG_ERROR(NULL, "Unexpected EOF"); } else if (memcmp(sourceSignature, fileSourceSignature.data(), fileSourceSignatureSize) == 0) { isValid = true; } } if (!isValid) { CV_LOG_ERROR(NULL, "Source code signature/hash mismatch (program source code has been changed/updated)"); } } catch (const cv::Exception& e) { CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : " << e.what()); } catch (...) 
{ CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : Unknown error"); } if (!isValid) { clearFile(); } else { seekReadAbsolute(0); } } } bool read(const std::string& key, std::vector& buf) { if (!f.is_open()) return false; size_t fileSize = getFileSize(); if (fileSize == 0) { CV_LOG_ERROR(NULL, "Invalid file (empty): " << fileName_); clearFile(); return false; } seekReadAbsolute(0); // bypass FileHeader uint32_t fileSourceSignatureSize = readUInt32(); CV_Assert(fileSourceSignatureSize > 0); seekReadRelative(fileSourceSignatureSize); uint32_t numberOfEntries = readUInt32(); CV_Assert(numberOfEntries > 0); if (numberOfEntries != MAX_ENTRIES) { CV_LOG_ERROR(NULL, "Invalid file: " << fileName_); clearFile(); return false; } f.read((char*)&entryOffsets[0], sizeof(entryOffsets)); CV_Assert(!f.fail()); uint32_t entryNum = getHash(key); uint32_t entryOffset = entryOffsets[entryNum]; FileEntry entry; while (entryOffset > 0) { seekReadAbsolute(entryOffset); //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, ""); f.read((char*)&entry, sizeof(entry)); CV_Assert(!f.fail()); cv::AutoBuffer fileKey(entry.keySize + 1); if (key.size() == entry.keySize) { if (entry.keySize > 0) { f.read(fileKey.data(), entry.keySize); CV_Assert(!f.fail()); } if (memcmp(fileKey.data(), key.c_str(), entry.keySize) == 0) { buf.resize(entry.dataSize); f.read(&buf[0], entry.dataSize); CV_Assert(!f.fail()); seekReadAbsolute(0); CV_LOG_VERBOSE(NULL, 0, "Read..."); return true; } } if (entry.nextEntryFileOffset == 0) break; entryOffset = entry.nextEntryFileOffset; } return false; } bool write(const std::string& key, std::vector& buf) { if (!f.is_open()) { f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary); if (!f.is_open()) { f.open(fileName_.c_str(), std::ios::out|std::ios::binary); if (!f.is_open()) { CV_LOG_ERROR(NULL, "Can't create file: " << fileName_); return false; } } } size_t fileSize = getFileSize(); if (fileSize == 0) { // Write header 
seekWriteAbsolute(0); writeUInt32((uint32_t)sourceSignatureSize_); f.write(sourceSignature_, sourceSignatureSize_); CV_Assert(!f.fail()); writeUInt32(MAX_ENTRIES); memset(entryOffsets, 0, sizeof(entryOffsets)); f.write((char*)entryOffsets, sizeof(entryOffsets)); CV_Assert(!f.fail()); f.flush(); CV_Assert(!f.fail()); f.close(); f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary); CV_Assert(f.is_open()); fileSize = getFileSize(); } seekReadAbsolute(0); // bypass FileHeader uint32_t fileSourceSignatureSize = readUInt32(); CV_Assert(fileSourceSignatureSize == sourceSignatureSize_); seekReadRelative(fileSourceSignatureSize); uint32_t numberOfEntries = readUInt32(); CV_Assert(numberOfEntries > 0); if (numberOfEntries != MAX_ENTRIES) { CV_LOG_ERROR(NULL, "Invalid file: " << fileName_); clearFile(); return false; } size_t tableEntriesOffset = (size_t)f.tellg(); f.read((char*)&entryOffsets[0], sizeof(entryOffsets)); CV_Assert(!f.fail()); uint32_t entryNum = getHash(key); uint32_t entryOffset = entryOffsets[entryNum]; FileEntry entry; while (entryOffset > 0) { seekReadAbsolute(entryOffset); //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, ""); f.read((char*)&entry, sizeof(entry)); CV_Assert(!f.fail()); cv::AutoBuffer fileKey(entry.keySize + 1); if (key.size() == entry.keySize) { if (entry.keySize > 0) { f.read(fileKey.data(), entry.keySize); CV_Assert(!f.fail()); } if (0 == memcmp(fileKey.data(), key.c_str(), entry.keySize)) { // duplicate CV_LOG_VERBOSE(NULL, 0, "Duplicate key ignored: " << fileName_); return false; } } if (entry.nextEntryFileOffset == 0) break; entryOffset = entry.nextEntryFileOffset; } seekReadAbsolute(0); if (entryOffset > 0) { seekWriteAbsolute(entryOffset); entry.nextEntryFileOffset = (uint32_t)fileSize; f.write((char*)&entry, sizeof(entry)); CV_Assert(!f.fail()); } else { entryOffsets[entryNum] = (uint32_t)fileSize; seekWriteAbsolute(tableEntriesOffset); f.write((char*)entryOffsets, sizeof(entryOffsets)); 
CV_Assert(!f.fail()); } seekWriteAbsolute(fileSize); entry.nextEntryFileOffset = 0; entry.dataSize = (uint32_t)buf.size(); entry.keySize = (uint32_t)key.size(); f.write((char*)&entry, sizeof(entry)); CV_Assert(!f.fail()); f.write(key.c_str(), entry.keySize); CV_Assert(!f.fail()); f.write(&buf[0], entry.dataSize); CV_Assert(!f.fail()); f.flush(); CV_Assert(!f.fail()); CV_LOG_VERBOSE(NULL, 0, "Write... (" << buf.size() << " bytes)"); return true; } }; #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT struct OpenCLExecutionContext::Impl { ocl::Context context_; int device_; // device index in context ocl::Queue queue_; int useOpenCL_; protected: Impl() = delete; void _init_device(cl_device_id deviceID) { CV_Assert(deviceID); int ndevices = (int)context_.ndevices(); CV_Assert(ndevices > 0); bool found = false; for (int i = 0; i < ndevices; i++) { ocl::Device d = context_.device(i); cl_device_id dhandle = (cl_device_id)d.ptr(); if (dhandle == deviceID) { device_ = i; found = true; break; } } CV_Assert(found && "OpenCL device can't work with passed OpenCL context"); } void _init_device(const ocl::Device& device) { CV_Assert(device.ptr()); int ndevices = (int)context_.ndevices(); CV_Assert(ndevices > 0); bool found = false; for (int i = 0; i < ndevices; i++) { ocl::Device d = context_.device(i); if (d.getImpl() == device.getImpl()) { device_ = i; found = true; break; } } CV_Assert(found && "OpenCL device can't work with passed OpenCL context"); } public: Impl(cl_platform_id platformID, cl_context context, cl_device_id deviceID) : device_(0), useOpenCL_(-1) { CV_UNUSED(platformID); CV_Assert(context); CV_Assert(deviceID); context_ = Context::fromHandle(context); _init_device(deviceID); queue_ = Queue(context_, context_.device(device_)); } Impl(const ocl::Context& context, const ocl::Device& device, const ocl::Queue& queue) : device_(0), useOpenCL_(-1) { CV_Assert(context.ptr()); CV_Assert(device.ptr()); context_ = context; _init_device(device); queue_ = queue; } Impl(const 
ocl::Context& context, const ocl::Device& device) : device_(0), useOpenCL_(-1) { CV_Assert(context.ptr()); CV_Assert(device.ptr()); context_ = context; _init_device(device); queue_ = Queue(context_, context_.device(device_)); } Impl(const ocl::Context& context, const int device, const ocl::Queue& queue) : context_(context) , device_(device) , queue_(queue) , useOpenCL_(-1) { // nothing } Impl(const Impl& other) : context_(other.context_) , device_(other.device_) , queue_(other.queue_) , useOpenCL_(-1) { // nothing } inline bool useOpenCL() const { return const_cast(this)->useOpenCL(); } bool useOpenCL() { if (useOpenCL_ < 0) { try { useOpenCL_ = 0; if (!context_.empty() && context_.ndevices() > 0) { const Device& d = context_.device(device_); useOpenCL_ = d.available(); } } catch (const cv::Exception&) { // nothing } if (!useOpenCL_) CV_LOG_INFO(NULL, "OpenCL: can't use OpenCL execution context"); } return useOpenCL_ > 0; } void setUseOpenCL(bool flag) { if (!flag) useOpenCL_ = 0; else useOpenCL_ = -1; } static const std::shared_ptr& getInitializedExecutionContext() { CV_TRACE_FUNCTION(); CV_LOG_INFO(NULL, "OpenCL: initializing thread execution context"); static bool initialized = false; static std::shared_ptr g_primaryExecutionContext; if (!initialized) { cv::AutoLock lock(getInitializationMutex()); if (!initialized) { CV_LOG_INFO(NULL, "OpenCL: creating new execution context..."); try { Context c = ocl::Context::create(std::string()); if (c.ndevices()) { int deviceId = 0; auto& d = c.device(deviceId); if (d.available()) { auto q = ocl::Queue(c, d); if (!q.ptr()) { CV_LOG_ERROR(NULL, "OpenCL: Can't create default OpenCL queue"); } else { g_primaryExecutionContext = std::make_shared(c, deviceId, q); CV_LOG_INFO(NULL, "OpenCL: device=" << d.name()); } } else { CV_LOG_ERROR(NULL, "OpenCL: OpenCL device is not available (CL_DEVICE_AVAILABLE returns false)"); } } else { CV_LOG_INFO(NULL, "OpenCL: context is not available/disabled"); } } catch (const std::exception& e) 
{ CV_LOG_INFO(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: " << e.what()); } catch (...) { CV_LOG_WARNING(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: unknown C++ exception"); } initialized = true; } } return g_primaryExecutionContext; } }; Context& OpenCLExecutionContext::getContext() const { CV_Assert(p); return p->context_; } Device& OpenCLExecutionContext::getDevice() const { CV_Assert(p); return p->context_.device(p->device_); } Queue& OpenCLExecutionContext::getQueue() const { CV_Assert(p); return p->queue_; } bool OpenCLExecutionContext::useOpenCL() const { if (p) return p->useOpenCL(); return false; } void OpenCLExecutionContext::setUseOpenCL(bool flag) { CV_Assert(p); p->setUseOpenCL(flag); } /* static */ OpenCLExecutionContext& OpenCLExecutionContext::getCurrent() { CV_TRACE_FUNCTION(); CoreTLSData& data = getCoreTlsData(); OpenCLExecutionContext& c = data.oclExecutionContext; if (!data.oclExecutionContextInitialized) { data.oclExecutionContextInitialized = true; if (c.empty() && haveOpenCL()) c.p = Impl::getInitializedExecutionContext(); } return c; } /* static */ OpenCLExecutionContext& OpenCLExecutionContext::getCurrentRef() { CV_TRACE_FUNCTION(); CoreTLSData& data = getCoreTlsData(); OpenCLExecutionContext& c = data.oclExecutionContext; return c; } void OpenCLExecutionContext::bind() const { CV_TRACE_FUNCTION(); CV_Assert(p); CoreTLSData& data = getCoreTlsData(); data.oclExecutionContext = *this; data.oclExecutionContextInitialized = true; data.useOpenCL = p->useOpenCL_; // propagate "-1", avoid call useOpenCL() } OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue() const { CV_TRACE_FUNCTION(); CV_Assert(p); const Queue q(getContext(), getDevice()); return cloneWithNewQueue(q); } OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue(const ocl::Queue& q) const { CV_TRACE_FUNCTION(); CV_Assert(p); CV_Assert(q.ptr() != NULL); OpenCLExecutionContext c; c.p = std::make_shared(p->context_, p->device_, 
q); return c; } /* static */ OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device, const ocl::Queue& queue) { CV_TRACE_FUNCTION(); if (!haveOpenCL()) CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!"); CV_Assert(!context.empty()); CV_Assert(context.ptr()); CV_Assert(!device.empty()); CV_Assert(device.ptr()); OpenCLExecutionContext ctx; ctx.p = std::make_shared(context, device, queue); return ctx; } /* static */ OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device) { CV_TRACE_FUNCTION(); if (!haveOpenCL()) CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!"); CV_Assert(!context.empty()); CV_Assert(context.ptr()); CV_Assert(!device.empty()); CV_Assert(device.ptr()); OpenCLExecutionContext ctx; ctx.p = std::make_shared(context, device); return ctx; } void OpenCLExecutionContext::release() { CV_TRACE_FUNCTION(); p.reset(); } // true if we have initialized OpenCL subsystem with available platforms static bool g_isOpenCLInitialized = false; static bool g_isOpenCLAvailable = false; bool haveOpenCL() { CV_TRACE_FUNCTION(); if (!g_isOpenCLInitialized) { CV_TRACE_REGION("Init_OpenCL_Runtime"); const char* envPath = getenv("OPENCV_OPENCL_RUNTIME"); if (envPath) { if (cv::String(envPath) == "disabled") { g_isOpenCLAvailable = false; g_isOpenCLInitialized = true; return false; } } cv::AutoLock lock(getInitializationMutex()); CV_LOG_INFO(NULL, "Initialize OpenCL runtime..."); try { cl_uint n = 0; g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS; g_isOpenCLAvailable &= n > 0; CV_LOG_INFO(NULL, "OpenCL: found " << n << " platforms"); } catch (...) 
{ g_isOpenCLAvailable = false; } g_isOpenCLInitialized = true; } return g_isOpenCLAvailable; } bool useOpenCL() { CoreTLSData& data = getCoreTlsData(); if (data.useOpenCL < 0) { try { data.useOpenCL = 0; if (haveOpenCL()) { auto c = OpenCLExecutionContext::getCurrent(); data.useOpenCL = c.useOpenCL(); } } catch (...) { CV_LOG_INFO(NULL, "OpenCL: can't initialize thread OpenCL execution context"); } } return data.useOpenCL > 0; } bool isOpenCLActivated() { if (!g_isOpenCLAvailable) return false; // prevent unnecessary OpenCL activation via useOpenCL()->haveOpenCL() calls return useOpenCL(); } void setUseOpenCL(bool flag) { CV_TRACE_FUNCTION(); CoreTLSData& data = getCoreTlsData(); auto& c = OpenCLExecutionContext::getCurrentRef(); if (!c.empty()) { c.setUseOpenCL(flag); data.useOpenCL = c.useOpenCL(); } else { if (!flag) data.useOpenCL = 0; else data.useOpenCL = -1; // enabled by default (if context is not initialized) } } #ifdef HAVE_CLAMDBLAS class AmdBlasHelper { public: static AmdBlasHelper & getInstance() { CV_SINGLETON_LAZY_INIT_REF(AmdBlasHelper, new AmdBlasHelper()) } bool isAvailable() const { return g_isAmdBlasAvailable; } ~AmdBlasHelper() { try { clAmdBlasTeardown(); } catch (...) { } } protected: AmdBlasHelper() { if (!g_isAmdBlasInitialized) { AutoLock lock(getInitializationMutex()); if (!g_isAmdBlasInitialized) { if (haveOpenCL()) { try { g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess; } catch (...) 
{ g_isAmdBlasAvailable = false; } } else g_isAmdBlasAvailable = false; g_isAmdBlasInitialized = true; } } } private: static bool g_isAmdBlasInitialized; static bool g_isAmdBlasAvailable; }; bool AmdBlasHelper::g_isAmdBlasAvailable = false; bool AmdBlasHelper::g_isAmdBlasInitialized = false; bool haveAmdBlas() { return AmdBlasHelper::getInstance().isAvailable(); } #else bool haveAmdBlas() { return false; } #endif #ifdef HAVE_CLAMDFFT class AmdFftHelper { public: static AmdFftHelper & getInstance() { CV_SINGLETON_LAZY_INIT_REF(AmdFftHelper, new AmdFftHelper()) } bool isAvailable() const { return g_isAmdFftAvailable; } ~AmdFftHelper() { try { // clAmdFftTeardown(); } catch (...) { } } protected: AmdFftHelper() { if (!g_isAmdFftInitialized) { AutoLock lock(getInitializationMutex()); if (!g_isAmdFftInitialized) { if (haveOpenCL()) { try { cl_uint major, minor, patch; CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS); // it throws exception in case AmdFft binaries are not found CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS); g_isAmdFftAvailable = true; } catch (const Exception &) { g_isAmdFftAvailable = false; } } else g_isAmdFftAvailable = false; g_isAmdFftInitialized = true; } } } private: static clAmdFftSetupData setupData; static bool g_isAmdFftInitialized; static bool g_isAmdFftAvailable; }; clAmdFftSetupData AmdFftHelper::setupData; bool AmdFftHelper::g_isAmdFftAvailable = false; bool AmdFftHelper::g_isAmdFftInitialized = false; bool haveAmdFft() { return AmdFftHelper::getInstance().isAvailable(); } #else bool haveAmdFft() { return false; } #endif bool haveSVM() { #ifdef HAVE_OPENCL_SVM return true; #else return false; #endif } void finish() { Queue::getDefault().finish(); } /////////////////////////////////////////// Platform ///////////////////////////////////////////// struct Platform::Impl { Impl() { refcount = 1; handle = 0; initialized = false; } ~Impl() {} void init() { if( !initialized ) { //cl_uint num_entries cl_uint n 
/* NOTE(review): Platform::Impl::init() queries the first OpenCL platform id (clGetPlatformIDs with 1 entry) and caches its CL_PLATFORM_VENDOR string; below are Platform's refcounted copy/move special members (raw Impl* with manual addref/release) and the deprecated Platform::getDefault(). Collapsed formatting: the warning string literal at the end of this physical line is split across the line break into the next one — reflow before compiling. */
= 0; if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 ) handle = 0; if( handle != 0 ) { char buf[1000]; size_t len = 0; CV_OCL_DBG_CHECK(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len)); buf[len] = '\0'; vendor = String(buf); } initialized = true; } } IMPLEMENT_REFCOUNTABLE(); cl_platform_id handle; String vendor; bool initialized; }; Platform::Platform() CV_NOEXCEPT { p = 0; } Platform::~Platform() { if(p) p->release(); } Platform::Platform(const Platform& pl) { p = (Impl*)pl.p; if(p) p->addref(); } Platform& Platform::operator = (const Platform& pl) { Impl* newp = (Impl*)pl.p; if(newp) newp->addref(); if(p) p->release(); p = newp; return *this; } Platform::Platform(Platform&& pl) CV_NOEXCEPT { p = pl.p; pl.p = nullptr; } Platform& Platform::operator = (Platform&& pl) CV_NOEXCEPT { if (this != &pl) { if(p) p->release(); p = pl.p; pl.p = nullptr; } return *this; } void* Platform::ptr() const { return p ? p->handle : 0; } Platform& Platform::getDefault() { CV_LOG_ONCE_WARNING(NULL, "OpenCL: Platform::getDefault() is deprecated and will be removed. 
Use cv::ocl::getPlatfomsInfo() for enumeration of available platforms"); static Platform p; if( !p.p ) { p.p = new Impl; p.p->init(); } return p; } /* NOTE(review): parseOpenCLVersion() below expects a version string of the form "OpenCL <major>.<minor> ..." and leaves major=minor=0 on any parse failure (too short, wrong prefix, or no '.'); Device::Impl's ctor calls clRetainDevice only after _init succeeds, so the reference count is incremented on success only. Everything after the first '//' on this collapsed physical line is currently commented out — reflow required before compiling. */ /////////////////////////////////////// Device //////////////////////////////////////////// // Version has format: // OpenCL // by specification // http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html // http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html // https://www.khronos.org/registry/OpenCL/sdk/1.1/docs/man/xhtml/clGetPlatformInfo.html // https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clGetPlatformInfo.html static void parseOpenCLVersion(const String &version, int &major, int &minor) { major = minor = 0; if (10 >= version.length()) return; const char *pstr = version.c_str(); if (0 != strncmp(pstr, "OpenCL ", 7)) return; size_t ppos = version.find('.', 7); if (String::npos == ppos) return; String temp = version.substr(7, ppos - 7); major = atoi(temp.c_str()); temp = version.substr(ppos + 1); minor = atoi(temp.c_str()); } struct Device::Impl { Impl(void* d) : refcount(1) , handle(0) { try { cl_device_id device = (cl_device_id)d; _init(device); CV_OCL_CHECK(clRetainDevice(device)); // increment reference counter on success only } catch (...) 
/* NOTE(review): Device::Impl::_init caches commonly used device properties (name, version, extensions, FP config, work-group limits, driver version, address bits) and tokenizes the space-separated CL_DEVICE_EXTENSIONS string into extensions_set_ for O(log n) lookup. The getProp() calls appear to have lost their explicit <template> argument lists during extraction — restore from upstream before compiling. */
{ throw; } } void _init(cl_device_id d) { handle = (cl_device_id)d; name_ = getStrProp(CL_DEVICE_NAME); version_ = getStrProp(CL_DEVICE_VERSION); extensions_ = getStrProp(CL_DEVICE_EXTENSIONS); doubleFPConfig_ = getProp(CL_DEVICE_DOUBLE_FP_CONFIG); hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY); maxComputeUnits_ = getProp(CL_DEVICE_MAX_COMPUTE_UNITS); maxWorkGroupSize_ = getProp(CL_DEVICE_MAX_WORK_GROUP_SIZE); type_ = getProp(CL_DEVICE_TYPE); driverVersion_ = getStrProp(CL_DRIVER_VERSION); addressBits_ = getProp(CL_DEVICE_ADDRESS_BITS); String deviceVersion_ = getStrProp(CL_DEVICE_VERSION); parseOpenCLVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_); size_t pos = 0; while (pos < extensions_.size()) { size_t pos2 = extensions_.find(' ', pos); if (pos2 == String::npos) pos2 = extensions_.size(); if (pos2 > pos) { std::string extensionName = extensions_.substr(pos, pos2 - pos); extensions_set_.insert(extensionName); } pos = pos2 + 1; } intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups"); vendorName_ = getStrProp(CL_DEVICE_VENDOR); if (vendorName_ == "Advanced Micro Devices, Inc." 
/* NOTE(review): vendor-ID classification (AMD/Intel/NVIDIA; an "Iris" substring in the device name also maps to Intel), an optional clamp of maxWorkGroupSize_ from the OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE config knob (warning logged when it actually shrinks), the Impl dtor releasing the device handle (skipped during Windows process termination), and the clGetDeviceInfo property helpers. 'template _TpOut getProp' lost its template parameter list during extraction (presumably <typename _TpCL, typename _TpOut> — confirm against upstream); getStrProp uses a fixed 4 KB buffer and returns an empty String on failure or overflow. */
|| vendorName_ == "AMD") vendorID_ = VENDOR_AMD; else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0) vendorID_ = VENDOR_INTEL; else if (vendorName_ == "NVIDIA Corporation") vendorID_ = VENDOR_NVIDIA; else vendorID_ = UNKNOWN_VENDOR; const size_t CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE", 0); if (CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE > 0) { const size_t new_maxWorkGroupSize = std::min(maxWorkGroupSize_, CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE); if (new_maxWorkGroupSize != maxWorkGroupSize_) CV_LOG_WARNING(NULL, "OpenCL: using workgroup size: " << new_maxWorkGroupSize << " (was " << maxWorkGroupSize_ << ")"); maxWorkGroupSize_ = new_maxWorkGroupSize; } #if 0 if (isExtensionSupported("cl_khr_spir")) { #ifndef CL_DEVICE_SPIR_VERSIONS #define CL_DEVICE_SPIR_VERSIONS 0x40E0 #endif cv::String spir_versions = getStrProp(CL_DEVICE_SPIR_VERSIONS); std::cout << spir_versions << std::endl; } #endif } ~Impl() { #ifdef _WIN32 if (!cv::__termination) #endif { if (handle) { CV_OCL_CHECK(clReleaseDevice(handle)); handle = 0; } } } template _TpOut getProp(cl_device_info prop) const { _TpCL temp=_TpCL(); size_t sz = 0; return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS && sz == sizeof(temp) ? _TpOut(temp) : _TpOut(); } bool getBoolProp(cl_device_info prop) const { cl_bool temp = CL_FALSE; size_t sz = 0; return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS && sz == sizeof(temp) ? temp != 0 : false; } String getStrProp(cl_device_info prop) const { char buf[4096]; size_t sz=0; return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS && sz < sizeof(buf) ? 
/* NOTE(review): Device::Impl data members ('std::set extensions_set_' lost its element-type argument during extraction, presumably std::set<std::string>) followed by Device's refcounted special members. Device::set() builds a new Impl (which retains the handle itself) and then releases the caller's reference via clReleaseDevice, so fromHandle() takes ownership of the incoming reference. All simple accessors return a default-constructed value when p is NULL. */
String(buf) : String(); } bool isExtensionSupported(const std::string& extensionName) const { return extensions_set_.count(extensionName) > 0; } IMPLEMENT_REFCOUNTABLE(); cl_device_id handle; String name_; String version_; std::string extensions_; int doubleFPConfig_; bool hostUnifiedMemory_; int maxComputeUnits_; size_t maxWorkGroupSize_; int type_; int addressBits_; int deviceVersionMajor_; int deviceVersionMinor_; String driverVersion_; String vendorName_; int vendorID_; bool intelSubgroupsSupport_; std::set extensions_set_; }; Device::Device() CV_NOEXCEPT { p = 0; } Device::Device(void* d) { p = 0; set(d); } Device::Device(const Device& d) { p = d.p; if(p) p->addref(); } Device& Device::operator = (const Device& d) { Impl* newp = (Impl*)d.p; if(newp) newp->addref(); if(p) p->release(); p = newp; return *this; } Device::Device(Device&& d) CV_NOEXCEPT { p = d.p; d.p = nullptr; } Device& Device::operator = (Device&& d) CV_NOEXCEPT { if (this != &d) { if(p) p->release(); p = d.p; d.p = nullptr; } return *this; } Device::~Device() { if(p) p->release(); } void Device::set(void* d) { if(p) p->release(); p = new Impl(d); if (p->handle) { CV_OCL_CHECK(clReleaseDevice((cl_device_id)d)); } } Device Device::fromHandle(void* d) { Device device(d); return device; } void* Device::ptr() const { return p ? p->handle : 0; } String Device::name() const { return p ? p->name_ : String(); } String Device::extensions() const { return p ? String(p->extensions_) : String(); } bool Device::isExtensionSupported(const String& extensionName) const { return p ? p->isExtensionSupported(extensionName) : false; } String Device::version() const { return p ? p->version_ : String(); } String Device::vendorName() const { return p ? p->vendorName_ : String(); } int Device::vendorID() const { return p ? p->vendorID_ : 0; } String Device::OpenCL_C_Version() const { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); } String Device::OpenCLVersion() const { return p ? 
/* NOTE(review): thin Device accessors forwarding to cached Impl fields or to clGetDeviceInfo via getProp/getBoolProp; the OpenCL-1.2-only queries (linkerAvailable, halfFPConfig) raise CV_REQUIRE_OPENCL_1_2_ERROR when built against pre-1.2 headers. The getProp calls lost their <template> argument lists during extraction. */
p->getStrProp(CL_DEVICE_VERSION) : String(); } int Device::deviceVersionMajor() const { return p ? p->deviceVersionMajor_ : 0; } int Device::deviceVersionMinor() const { return p ? p->deviceVersionMinor_ : 0; } String Device::driverVersion() const { return p ? p->driverVersion_ : String(); } int Device::type() const { return p ? p->type_ : 0; } int Device::addressBits() const { return p ? p->addressBits_ : 0; } bool Device::available() const { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; } bool Device::compilerAvailable() const { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; } bool Device::linkerAvailable() const #ifdef CL_VERSION_1_2 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; } #else { CV_REQUIRE_OPENCL_1_2_ERROR; } #endif int Device::doubleFPConfig() const { return p ? p->doubleFPConfig_ : 0; } int Device::singleFPConfig() const { return p ? p->getProp(CL_DEVICE_SINGLE_FP_CONFIG) : 0; } int Device::halfFPConfig() const #ifdef CL_VERSION_1_2 { return p ? p->getProp(CL_DEVICE_HALF_FP_CONFIG) : 0; } #else { CV_REQUIRE_OPENCL_1_2_ERROR; } #endif bool Device::endianLittle() const { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; } bool Device::errorCorrectionSupport() const { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; } int Device::executionCapabilities() const { return p ? p->getProp(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; } size_t Device::globalMemCacheSize() const { return p ? p->getProp(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; } int Device::globalMemCacheType() const { return p ? p->getProp(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; } int Device::globalMemCacheLineSize() const { return p ? p->getProp(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; } size_t Device::globalMemSize() const { return p ? p->getProp(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; } size_t Device::localMemSize() const { return p ? p->getProp(CL_DEVICE_LOCAL_MEM_SIZE) : 0; } int Device::localMemType() const { return p ? 
/* NOTE(review): image-capability accessors; imageFromBufferSupport() is extension-based (cl_khr_image2d_from_buffer), and imagePitchAlignment()/imageBaseAddressAlignment() return 0 when the corresponding CL_DEVICE_* defines are absent from the headers at build time. OpenCL-1.2-only queries raise CV_REQUIRE_OPENCL_1_2_ERROR on older headers. */
p->getProp(CL_DEVICE_LOCAL_MEM_TYPE) : 0; } bool Device::hostUnifiedMemory() const { return p ? p->hostUnifiedMemory_ : false; } bool Device::imageSupport() const { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; } bool Device::imageFromBufferSupport() const { return p ? p->isExtensionSupported("cl_khr_image2d_from_buffer") : false; } uint Device::imagePitchAlignment() const { #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT return p ? p->getProp(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0; #else return 0; #endif } uint Device::imageBaseAddressAlignment() const { #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT return p ? p->getProp(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0; #else return 0; #endif } size_t Device::image2DMaxWidth() const { return p ? p->getProp(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; } size_t Device::image2DMaxHeight() const { return p ? p->getProp(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; } size_t Device::image3DMaxWidth() const { return p ? p->getProp(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; } size_t Device::image3DMaxHeight() const { return p ? p->getProp(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; } size_t Device::image3DMaxDepth() const { return p ? p->getProp(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; } size_t Device::imageMaxBufferSize() const #ifdef CL_VERSION_1_2 { return p ? p->getProp(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; } #else { CV_REQUIRE_OPENCL_1_2_ERROR; } #endif size_t Device::imageMaxArraySize() const #ifdef CL_VERSION_1_2 { return p ? p->getProp(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; } #else { CV_REQUIRE_OPENCL_1_2_ERROR; } #endif bool Device::intelSubgroupsSupport() const { return p ? p->intelSubgroupsSupport_ : false; } int Device::maxClockFrequency() const { return p ? p->getProp(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; } int Device::maxComputeUnits() const { return p ? p->maxComputeUnits_ : 0; } int Device::maxConstantArgs() const { return p ? p->getProp(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; } size_t Device::maxConstantBufferSize() const { return p ? 
/* NOTE(review): more forwarding accessors; maxWorkItemSizes() writes up to 32 size_t entries (MAX_DIMS) into the caller-supplied buffer via a single clGetDeviceInfo call — the caller must provide adequate storage for CL_DEVICE_MAX_WORK_ITEM_SIZES. maxWorkGroupSize() returns the possibly-clamped cached value from _init, not a fresh query. */
p->getProp(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; } size_t Device::maxMemAllocSize() const { return p ? p->getProp(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; } size_t Device::maxParameterSize() const { return p ? p->getProp(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; } int Device::maxReadImageArgs() const { return p ? p->getProp(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; } int Device::maxWriteImageArgs() const { return p ? p->getProp(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; } int Device::maxSamplers() const { return p ? p->getProp(CL_DEVICE_MAX_SAMPLERS) : 0; } size_t Device::maxWorkGroupSize() const { return p ? p->maxWorkGroupSize_ : 0; } int Device::maxWorkItemDims() const { return p ? p->getProp(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; } void Device::maxWorkItemSizes(size_t* sizes) const { if(p) { const int MAX_DIMS = 32; size_t retsz = 0; CV_OCL_DBG_CHECK(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES, MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz)); } } int Device::memBaseAddrAlign() const { return p ? p->getProp(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; } int Device::nativeVectorWidthChar() const { return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; } int Device::nativeVectorWidthShort() const { return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; } int Device::nativeVectorWidthInt() const { return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; } int Device::nativeVectorWidthLong() const { return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; } int Device::nativeVectorWidthFloat() const { return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; } int Device::nativeVectorWidthDouble() const { return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; } int Device::nativeVectorWidthHalf() const { return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; } int Device::preferredVectorWidthChar() const { return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; } int Device::preferredVectorWidthShort() const { return p ? 
/* NOTE(review): remaining vector-width accessors; Device::getDefault() resolves through the current OpenCLExecutionContext and falls back to a static empty Device. getStringInfo() (its template parameter list was lost in extraction) performs the standard two-call size-then-fetch OpenCL string query into an AutoBuffer; split() tokenizes on a single delimiter; parseOpenCLDeviceConfiguration() (continuing on the next physical line) parses the "platform:type1|type2:name" layout described in the collapsed comment. */
p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; } int Device::preferredVectorWidthInt() const { return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; } int Device::preferredVectorWidthLong() const { return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; } int Device::preferredVectorWidthFloat() const { return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; } int Device::preferredVectorWidthDouble() const { return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; } int Device::preferredVectorWidthHalf() const { return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; } size_t Device::printfBufferSize() const #ifdef CL_VERSION_1_2 { return p ? p->getProp(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; } #else { CV_REQUIRE_OPENCL_1_2_ERROR; } #endif size_t Device::profilingTimerResolution() const { return p ? p->getProp(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; } const Device& Device::getDefault() { auto& c = OpenCLExecutionContext::getCurrent(); if (!c.empty()) { return c.getDevice(); } static Device dummy; return dummy; } ////////////////////////////////////// Context /////////////////////////////////////////////////// template inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param) { ::size_t required; cl_int err = f(obj, name, 0, NULL, &required); if (err != CL_SUCCESS) return err; param.clear(); if (required > 0) { AutoBuffer buf(required + 1); char* ptr = buf.data(); // cleanup is not needed err = f(obj, name, required, ptr, NULL); if (err != CL_SUCCESS) return err; param = ptr; } return CL_SUCCESS; } static void split(const std::string &s, char delim, std::vector &elems) { elems.clear(); if (s.size() == 0) return; std::istringstream ss(s); std::string item; while (!ss.eof()) { std::getline(ss, item, delim); elems.push_back(item); } } // Layout: :: // Sample: AMD:GPU: // Sample: AMD:GPU:Tahiti // Sample: :GPU|CPU: = '' = ':' = '::' static bool parseOpenCLDeviceConfiguration(const 
/* NOTE(review): parseOpenCLDeviceConfiguration() accepts at most three ':'-separated fields (platform, '|'-separated device types, device name-or-index) and logs an error on more. selectOpenCLDevice() honors the OPENCV_OPENCL_DEVICE env var when no explicit configuration is given; "disabled" (or a parse failure) short-circuits to NULL. A single decimal digit as the device name is treated as a device index (deliberately limited to 0..9 so names like "2500"/"8350"/"650" still match by substring). Platform ids are enumerated with the usual count-then-fetch pair. Several std:: template arguments were stripped during extraction. */
std::string& configurationStr, std::string& platform, std::vector& deviceTypes, std::string& deviceNameOrID) { std::vector parts; split(configurationStr, ':', parts); if (parts.size() > 3) { CV_LOG_ERROR(NULL, "OpenCL: Invalid configuration string for OpenCL device: " << configurationStr); return false; } if (parts.size() > 2) deviceNameOrID = parts[2]; if (parts.size() > 1) { split(parts[1], '|', deviceTypes); } if (parts.size() > 0) { platform = parts[0]; } return true; } #if defined WINRT || defined _WIN32_WCE static cl_device_id selectOpenCLDevice(const char* configuration = NULL) { CV_UNUSED(configuration) return NULL; } #else static cl_device_id selectOpenCLDevice(const char* configuration = NULL) { std::string platform, deviceName; std::vector deviceTypes; if (!configuration) configuration = getenv("OPENCV_OPENCL_DEVICE"); if (configuration && (strcmp(configuration, "disabled") == 0 || !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName) )) return NULL; bool isID = false; int deviceID = -1; if (deviceName.length() == 1) // We limit ID range to 0..9, because we want to write: // - '2500' to mean i5-2500 // - '8350' to mean AMD FX-8350 // - '650' to mean GeForce 650 // To extend ID range change condition to '> 0' { isID = true; for (size_t i = 0; i < deviceName.length(); i++) { if (!isdigit(deviceName[i])) { isID = false; break; } } if (isID) { deviceID = atoi(deviceName.c_str()); if (deviceID < 0) return NULL; } } std::vector platforms; { cl_uint numPlatforms = 0; CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms)); if (numPlatforms == 0) return NULL; platforms.resize((size_t)numPlatforms); CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms)); platforms.resize(numPlatforms); } int selectedPlatform = -1; if (platform.length() > 0) { for (size_t i = 0; i < platforms.size(); i++) { std::string name; CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name)); 
/* NOTE(review): platform matching is substring-based; when no device types were specified, the default preference is GPU then CPU (or ALL when a numeric device index was given). Type strings are lowercased and mapped to CL device-type flags — "dgpu"/"igpu" both map to TYPE_GPU and are discriminated later. Devices are collected across the selected platform (or all platforms), tolerating CL_DEVICE_NOT_FOUND from clGetDeviceIDs; the TODO about clReleaseDevice cleanup is upstream's. */
if (name.find(platform) != std::string::npos) { selectedPlatform = (int)i; break; } } if (selectedPlatform == -1) { CV_LOG_ERROR(NULL, "OpenCL: Can't find OpenCL platform by name: " << platform); goto not_found; } } if (deviceTypes.size() == 0) { if (!isID) { deviceTypes.push_back("GPU"); if (configuration) deviceTypes.push_back("CPU"); } else deviceTypes.push_back("ALL"); } for (size_t t = 0; t < deviceTypes.size(); t++) { int deviceType = 0; std::string tempStrDeviceType = deviceTypes[t]; std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), details::char_tolower); if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu") deviceType = Device::TYPE_GPU; else if (tempStrDeviceType == "cpu") deviceType = Device::TYPE_CPU; else if (tempStrDeviceType == "accelerator") deviceType = Device::TYPE_ACCELERATOR; else if (tempStrDeviceType == "all") deviceType = Device::TYPE_ALL; else { CV_LOG_ERROR(NULL, "OpenCL: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t]); goto not_found; } std::vector devices; // TODO Use clReleaseDevice to cleanup for (int i = selectedPlatform >= 0 ? selectedPlatform : 0; (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size()); i++) { cl_uint count = 0; cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count); if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND)) { CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count"); } if (count == 0) continue; size_t base = devices.size(); devices.resize(base + count); status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count); if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND)) { CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs"); } } for (size_t i = (isID ? deviceID : 0); (isID ? 
/* NOTE(review): final device selection — for "dgpu"/"igpu" the CL_DEVICE_HOST_UNIFIED_MEMORY flag is used as the integrated-GPU heuristic (dgpu wants it false, igpu true); otherwise a substring match on the device name suffices (or the numeric index when isID). The not-found report is emitted via CV_LOG_ERROR only when an explicit configuration string was supplied, to keep the no-config path quiet. */
(i == (size_t)deviceID) : true) && (i < devices.size()); i++) { std::string name; CV_OCL_DBG_CHECK(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name)); cl_bool useGPU = true; if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu") { cl_bool isIGPU = CL_FALSE; CV_OCL_DBG_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL)); useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU; } if ( (isID || name.find(deviceName) != std::string::npos) && useGPU) { // TODO check for OpenCL 1.1 return devices[i]; } } } not_found: if (!configuration) return NULL; // suppress messages on stderr std::ostringstream msg; msg << "ERROR: Requested OpenCL device not found, check configuration: '" << configuration << "'" << std::endl << " Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl << " Device types:"; for (size_t t = 0; t < deviceTypes.size(); t++) msg << ' ' << deviceTypes[t]; msg << std::endl << " Device name: " << (deviceName.length() == 0 ? 
"any" : deviceName); CV_LOG_ERROR(NULL, msg.str()); return NULL; } #endif #ifdef HAVE_OPENCL_SVM namespace svm { enum AllocatorFlags { // don't use first 16 bits OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access OPENCL_SVM_BUFFER_MASK = 3 << 16, OPENCL_SVM_BUFFER_MAP = 4 << 16 }; static bool checkForceSVMUmatUsage() { static bool initialized = false; static bool force = false; if (!initialized) { force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false); initialized = true; } return force; } static bool checkDisableSVMUMatUsage() { static bool initialized = false; static bool force = false; if (!initialized) { force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false); initialized = true; } return force; } static bool checkDisableSVM() { static bool initialized = false; static bool force = false; if (!initialized) { force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE", false); initialized = true; } return force; } // see SVMCapabilities static unsigned int getSVMCapabilitiesMask() { static bool initialized = false; static unsigned int mask = 0; if (!initialized) { const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK"); if (envValue == NULL) { return ~0U; // all bits 1 } mask = atoi(envValue); initialized = true; } return mask; } } // namespace #endif static size_t getProgramCountLimit() { static bool initialized = false; static size_t count = 0; if (!initialized) { count = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_PROGRAM_CACHE", 0); initialized = true; } return count; } static int g_contextId = 0; class OpenCLBufferPoolImpl; class OpenCLSVMBufferPoolImpl; struct Context::Impl { static Context::Impl* get(Context& context) { return context.p; } typedef std::deque container_t; static container_t& getGlobalContainer() { // never 
/* NOTE(review): Context::Impl registers itself in a global, intentionally-leaked deque slot indexed by its CV_XADD-allocated contextId (registration guarded by the init mutex); the dtor releases the cl_context (skipped during Windows process termination), clears devices and user contexts, and NULLs its global slot. init_device_list() mirrors CL_CONTEXT_DEVICES into the devices vector with size validation; buffer pools are created lazily with double-checked locking under the init mutex. This physical line starts mid-'// never delete' comment split by the collapsed formatting. */
delete this container (Impl lifetime is greater due to TLS storage) static container_t* g_contexts = new container_t(); return *g_contexts; } protected: Impl(const std::string& configuration_) : refcount(1) , contextId(CV_XADD(&g_contextId, 1)) , configuration(configuration_) , handle(0) #ifdef HAVE_OPENCL_SVM , svmInitialized(false) #endif { if (!haveOpenCL()) CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!"); cv::AutoLock lock(cv::getInitializationMutex()); auto& container = getGlobalContainer(); container.resize(std::max(container.size(), (size_t)contextId + 1)); container[contextId] = this; } ~Impl() { #ifdef _WIN32 if (!cv::__termination) #endif { if (handle) { CV_OCL_DBG_CHECK(clReleaseContext(handle)); handle = NULL; } devices.clear(); } userContextStorage.clear(); { cv::AutoLock lock(cv::getInitializationMutex()); auto& container = getGlobalContainer(); CV_CheckLT((size_t)contextId, container.size(), ""); container[contextId] = NULL; } } void init_device_list() { CV_Assert(handle); cl_uint ndevices = 0; CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_NUM_DEVICES, sizeof(ndevices), &ndevices, NULL)); CV_Assert(ndevices > 0); cv::AutoBuffer cl_devices(ndevices); size_t devices_ret_size = 0; CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_DEVICES, cl_devices.size() * sizeof(cl_device_id), &cl_devices[0], &devices_ret_size)); CV_CheckEQ(devices_ret_size, cl_devices.size() * sizeof(cl_device_id), ""); devices.clear(); for (unsigned i = 0; i < ndevices; i++) { devices.emplace_back(Device::fromHandle(cl_devices[i])); } } void __init_buffer_pools(); // w/o synchronization void _init_buffer_pools() const { if (!bufferPool_) { cv::AutoLock lock(cv::getInitializationMutex()); if (!bufferPool_) { const_cast(this)->__init_buffer_pools(); } } } public: static Impl* findContext(const std::string& configuration) { CV_TRACE_FUNCTION(); cv::AutoLock lock(cv::getInitializationMutex()); auto& container = getGlobalContainer(); if 
/* NOTE(review): findContext() treats an empty configuration as "first registered context"; the three findOrCreateContext overloads reuse a cached Impl by configuration key (synthetic "@ctx-%p"/"@dev-%p" keys for wrapped raw handles/devices) and addref it on a hit; on a miss they create a new Impl, initialize it (createFromDevice or clRetainContext + init_device_list), and delete it on any failure path so no half-initialized Impl stays registered. */
(configuration.empty() && !container.empty()) return container[0]; for (auto it = container.begin(); it != container.end(); ++it) { Impl* i = *it; if (i && i->configuration == configuration) { return i; } } return NULL; } static Impl* findOrCreateContext(const std::string& configuration_) { CV_TRACE_FUNCTION(); std::string configuration = configuration_; if (configuration_.empty()) { const char* c = getenv("OPENCV_OPENCL_DEVICE"); if (c) configuration = c; } Impl* impl = findContext(configuration); if (impl) { CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration) impl->addref(); return impl; } cl_device_id d = selectOpenCLDevice(configuration.empty() ? NULL : configuration.c_str()); if (d == NULL) return NULL; impl = new Impl(configuration); try { impl->createFromDevice(d); if (impl->handle) return impl; delete impl; return NULL; } catch (...) { delete impl; throw; } } static Impl* findOrCreateContext(cl_context h) { CV_TRACE_FUNCTION(); CV_Assert(h); std::string configuration = cv::format("@ctx-%p", (void*)h); Impl* impl = findContext(configuration); if (impl) { CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration) impl->addref(); return impl; } impl = new Impl(configuration); try { CV_OCL_CHECK(clRetainContext(h)); impl->handle = h; impl->init_device_list(); return impl; } catch (...) { delete impl; throw; } } static Impl* findOrCreateContext(const ocl::Device& device) { CV_TRACE_FUNCTION(); CV_Assert(!device.empty()); cl_device_id d = (cl_device_id)device.ptr(); CV_Assert(d); std::string configuration = cv::format("@dev-%p", (void*)d); Impl* impl = findContext(configuration); if (impl) { CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration) impl->addref(); return impl; } impl = new Impl(configuration); try { impl->createFromDevice(d); CV_Assert(impl->handle); return impl; } catch (...) 
/* NOTE(review): createFromDevice() builds a single-device context on the device's own platform (device count deliberately forced to 1 per the upstream comment); on failure the handle is reset to NULL. unloadProg() erases a program from both the hash map and the eviction list under the program-cache mutex. getPrefixString() lazily builds a sanitized "[bits--]vendor--name--driverVersion" cache-key prefix (non-alphanumeric chars replaced by '_'), double-checked under program_cache_mutex; getPrefixBase() is the same without the driver version. */
{ delete impl; throw; } } void setDefault() { CV_TRACE_FUNCTION(); cl_device_id d = selectOpenCLDevice(); if (d == NULL) return; createFromDevice(d); } void createFromDevice(cl_device_id d) { CV_TRACE_FUNCTION(); CV_Assert(handle == NULL); cl_platform_id pl = NULL; CV_OCL_DBG_CHECK(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL)); cl_context_properties prop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)pl, 0 }; // !!! in the current implementation force the number of devices to 1 !!! cl_uint nd = 1; cl_int status; handle = clCreateContext(prop, nd, &d, 0, 0, &status); CV_OCL_DBG_CHECK_RESULT(status, "clCreateContext"); bool ok = handle != 0 && status == CL_SUCCESS; if( ok ) { devices.resize(nd); devices[0].set(d); } else handle = NULL; } Program getProg(const ProgramSource& src, const String& buildflags, String& errmsg); void unloadProg(Program& prog) { cv::AutoLock lock(program_cache_mutex); for (CacheList::iterator i = cacheList.begin(); i != cacheList.end(); ++i) { phash_t::iterator it = phash.find(*i); if (it != phash.end()) { if (it->second.ptr() == prog.ptr()) { phash.erase(*i); cacheList.erase(i); return; } } } } std::string& getPrefixString() { if (prefix.empty()) { cv::AutoLock lock(program_cache_mutex); if (prefix.empty()) { CV_Assert(!devices.empty()); const Device& d = devices[0]; int bits = d.addressBits(); if (bits > 0 && bits != 64) prefix = cv::format("%d-bit--", bits); prefix += d.vendorName() + "--" + d.name() + "--" + d.driverVersion(); // sanitize chars for (size_t i = 0; i < prefix.size(); i++) { char c = prefix[i]; if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-')) { prefix[i] = '_'; } } } } return prefix; } std::string& getPrefixBase() { if (prefix_base.empty()) { cv::AutoLock lock(program_cache_mutex); if (prefix_base.empty()) { const Device& d = devices[0]; int bits = d.addressBits(); if (bits > 0 && bits != 64) prefix_base = cv::format("%d-bit--", bits); 
/* NOTE(review): Context::Impl data members follow — several std:: templates lost their <...> argument lists during extraction (std::map phash, std::list CacheList, std::shared_ptr pools, std::map userContextStorage) and must be restored from upstream. Per-type user contexts are stored/fetched under userContextMutex. svmInit() probes CL_DEVICE_SVM_CAPABILITIES on the first device, relying on the asserted numeric equality of the Khronos and AMD capability enums; its error string at the end of this physical line is split across the line break. */
prefix_base += d.vendorName() + "--" + d.name() + "--"; // sanitize chars for (size_t i = 0; i < prefix_base.size(); i++) { char c = prefix_base[i]; if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-')) { prefix_base[i] = '_'; } } } } return prefix_base; } IMPLEMENT_REFCOUNTABLE(); const int contextId; // global unique ID const std::string configuration; cl_context handle; std::vector devices; std::string prefix; std::string prefix_base; cv::Mutex program_cache_mutex; typedef std::map phash_t; phash_t phash; typedef std::list CacheList; CacheList cacheList; std::shared_ptr bufferPool_; std::shared_ptr bufferPoolHostPtr_; OpenCLBufferPoolImpl& getBufferPool() const { _init_buffer_pools(); CV_DbgAssert(bufferPool_); return *bufferPool_.get(); } OpenCLBufferPoolImpl& getBufferPoolHostPtr() const { _init_buffer_pools(); CV_DbgAssert(bufferPoolHostPtr_); return *bufferPoolHostPtr_.get(); } std::map> userContextStorage; cv::Mutex userContextMutex; void setUserContext(std::type_index typeId, const std::shared_ptr& userContext) { cv::AutoLock lock(userContextMutex); userContextStorage[typeId] = userContext; } std::shared_ptr getUserContext(std::type_index typeId) { cv::AutoLock lock(userContextMutex); auto it = userContextStorage.find(typeId); if (it != userContextStorage.end()) return it->second; else return nullptr; } #ifdef HAVE_OPENCL_SVM bool svmInitialized; bool svmAvailable; bool svmEnabled; svm::SVMCapabilities svmCapabilities; svm::SVMFunctions svmFunctions; void svmInit() { CV_Assert(handle != NULL); const Device& device = devices[0]; cl_device_svm_capabilities deviceCaps = 0; CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL); if (status != CL_SUCCESS) { CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo 
failed: %d\n", status); goto noSVM; } /* NOTE(review): capability bits are masked by OPENCV_OPENCL_SVM_CAPABILITIES_MASK before use; the OpenCL 2.0 path then validates SVM end-to-end with a 100-byte probe buffer: clSVMAlloc, blocking clEnqueueSVMMap(CL_MAP_WRITE), a host write through the mapped pointer, clEnqueueSVMUnmap, with clFinish after each enqueue. Any failure raises CV_Error and falls through to the catch that tries the AMD cl_amd_svm extension instead. */ CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps); CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption svmCapabilities.value_ = ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) | ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) | ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) | ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0); svmCapabilities.value_ &= svm::getSVMCapabilitiesMask(); if (svmCapabilities.value_ == 0) { CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n"); goto noSVM; } try { // Try OpenCL 2.0 CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n"); void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0); if (!ptr) { CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n"); CV_Error(Error::StsBadArg, "clSVMAlloc returned NULL"); } try { bool error = false; cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr(); if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL)) { CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n"); CV_Error(Error::StsBadArg, "clEnqueueSVMMap FAILED"); } clFinish(q); try { ((int*)ptr)[0] = 100; } catch (...) { CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n"); error = true; } if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL)) { CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n"); CV_Error(Error::StsBadArg, "clEnqueueSVMUnmap FAILED"); } clFinish(q); if (error) { CV_Error(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED"); } } catch (...) 
/* NOTE(review): on probe failure the SVM buffer is clSVMFree'd before rethrow (and again on success — both paths free it); on success the core OpenCL 2.0 SVM entry points are recorded into svmFunctions (clSetKernelExecInfo/clEnqueueSVMFree deliberately left commented out, as upstream). The fallback path requires the cl_amd_svm device extension and resolves the *AMD function pointers via clGetExtensionFunctionAddressForPlatform on the device's platform. */
{ CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n"); clSVMFree(handle, ptr); throw; } clSVMFree(handle, ptr); svmFunctions.fn_clSVMAlloc = clSVMAlloc; svmFunctions.fn_clSVMFree = clSVMFree; svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer; //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo; //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree; svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy; svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill; svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap; svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap; } catch (...) { CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n"); try { // Try HSA extension String extensions = device.extensions(); if (extensions.find("cl_amd_svm") == String::npos) { CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str()); goto noSVM; } cl_platform_id p = NULL; CV_OCL_CHECK(status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL)); svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD"); svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD"); svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD"); //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD"); //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD"); svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD"); svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD"); 
svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD"); svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD"); CV_Assert(svmFunctions.isValid()); } catch (...) { CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n"); goto noSVM; } } svmAvailable = true; svmEnabled = !svm::checkDisableSVM(); svmInitialized = true; CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n"); return; noSVM: CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n"); svmAvailable = false; svmEnabled = false; svmCapabilities.value_ = 0; svmInitialized = true; svmFunctions.fn_clSVMAlloc = NULL; return; } std::shared_ptr bufferPoolSVM_; OpenCLSVMBufferPoolImpl& getBufferPoolSVM() const { _init_buffer_pools(); CV_DbgAssert(bufferPoolSVM_); return *bufferPoolSVM_.get(); } #endif friend class Program; }; Context::Context() CV_NOEXCEPT { p = 0; } Context::~Context() { release(); } // deprecated Context::Context(int dtype) { p = 0; create(dtype); } void Context::release() { if (p) { p->release(); p = NULL; } } bool Context::create() { release(); if (!haveOpenCL()) return false; p = Impl::findOrCreateContext(std::string()); if (p && p->handle) return true; release(); return false; } // deprecated bool Context::create(int dtype) { if( !haveOpenCL() ) return false; release(); if (dtype == CL_DEVICE_TYPE_DEFAULT || (unsigned)dtype == (unsigned)CL_DEVICE_TYPE_ALL) { p = Impl::findOrCreateContext(""); } else if (dtype == CL_DEVICE_TYPE_GPU) { p = Impl::findOrCreateContext(":GPU:"); } else if (dtype == CL_DEVICE_TYPE_CPU) { p = Impl::findOrCreateContext(":CPU:"); } else { CV_LOG_ERROR(NULL, "OpenCL: Can't recognize OpenCV device type=" << dtype); } if (p && !p->handle) { release(); } return p != 0; } Context::Context(const Context& c) { p = (Impl*)c.p; if(p) p->addref(); } Context& Context::operator = (const Context& c) { Impl* newp = (Impl*)c.p; if(newp) 
newp->addref(); if(p) p->release(); p = newp; return *this; } Context::Context(Context&& c) CV_NOEXCEPT { p = c.p; c.p = nullptr; } Context& Context::operator = (Context&& c) CV_NOEXCEPT { if (this != &c) { if(p) p->release(); p = c.p; c.p = nullptr; } return *this; } void* Context::ptr() const { return p == NULL ? NULL : p->handle; } size_t Context::ndevices() const { return p ? p->devices.size() : 0; } Device& Context::device(size_t idx) const { static Device dummy; return !p || idx >= p->devices.size() ? dummy : p->devices[idx]; } Context& Context::getDefault(bool initialize) { auto& c = OpenCLExecutionContext::getCurrent(); if (!c.empty()) { auto& ctx = c.getContext(); return ctx; } CV_UNUSED(initialize); static Context dummy; return dummy; } Program Context::getProg(const ProgramSource& prog, const String& buildopts, String& errmsg) { return p ? p->getProg(prog, buildopts, errmsg) : Program(); } void Context::unloadProg(Program& prog) { if (p) p->unloadProg(prog); } /* static */ Context Context::fromHandle(void* context) { Context ctx; ctx.p = Impl::findOrCreateContext((cl_context)context); return ctx; } /* static */ Context Context::fromDevice(const ocl::Device& device) { Context ctx; ctx.p = Impl::findOrCreateContext(device); return ctx; } /* static */ Context Context::create(const std::string& configuration) { Context ctx; ctx.p = Impl::findOrCreateContext(configuration); return ctx; } void* Context::getOpenCLContextProperty(int propertyId) const { if (p == NULL) return nullptr; ::size_t size = 0; CV_OCL_CHECK(clGetContextInfo(p->handle, CL_CONTEXT_PROPERTIES, 0, NULL, &size)); std::vector prop(size / sizeof(cl_context_properties), (cl_context_properties)0); CV_OCL_CHECK(clGetContextInfo(p->handle, CL_CONTEXT_PROPERTIES, size, prop.data(), NULL)); for (size_t i = 0; i < prop.size(); i += 2) { if (prop[i] == (cl_context_properties)propertyId) { CV_LOG_DEBUG(NULL, "OpenCL: found context property=" << propertyId << ") => " << (void*)prop[i + 1]); return 
(void*)prop[i + 1]; } } return nullptr; } #ifdef HAVE_OPENCL_SVM bool Context::useSVM() const { Context::Impl* i = p; CV_Assert(i); if (!i->svmInitialized) i->svmInit(); return i->svmEnabled; } void Context::setUseSVM(bool enabled) { Context::Impl* i = p; CV_Assert(i); if (!i->svmInitialized) i->svmInit(); if (enabled && !i->svmAvailable) { CV_Error(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device"); } i->svmEnabled = enabled; } #else bool Context::useSVM() const { return false; } void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); } #endif #ifdef HAVE_OPENCL_SVM namespace svm { const SVMCapabilities getSVMCapabilitites(const ocl::Context& context) { Context::Impl* i = context.p; CV_Assert(i); if (!i->svmInitialized) i->svmInit(); return i->svmCapabilities; } CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context) { Context::Impl* i = context.p; CV_Assert(i); CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL); return &i->svmFunctions; } CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags) { if (checkForceSVMUmatUsage()) return true; if (checkDisableSVMUMatUsage()) return false; if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0) return true; return false; // don't use SVM by default } } // namespace cv::ocl::svm #endif // HAVE_OPENCL_SVM Context::UserContext::~UserContext() { } void Context::setUserContext(std::type_index typeId, const std::shared_ptr& userContext) { CV_Assert(p); p->setUserContext(typeId, userContext); } std::shared_ptr Context::getUserContext(std::type_index typeId) { CV_Assert(p); return p->getUserContext(typeId); } static void get_platform_name(cl_platform_id id, String& name) { // get platform name string length size_t sz = 0; CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, 0, 0, &sz)); // get platform name string AutoBuffer buf(sz + 1); CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, sz, 
buf.data(), 0)); // just in case, ensure trailing zero for ASCIIZ string buf[sz] = 0; name = buf.data(); } /* // Attaches OpenCL context to OpenCV */ void attachContext(const String& platformName, void* platformID, void* context, void* deviceID) { auto ctx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID); ctx.bind(); } /* static */ OpenCLExecutionContext OpenCLExecutionContext::create( const std::string& platformName, void* platformID, void* context, void* deviceID ) { if (!haveOpenCL()) CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!"); cl_uint cnt = 0; CV_OCL_CHECK(clGetPlatformIDs(0, 0, &cnt)); if (cnt == 0) CV_Error(cv::Error::OpenCLApiCallError, "No OpenCL platform available!"); std::vector platforms(cnt); CV_OCL_CHECK(clGetPlatformIDs(cnt, &platforms[0], 0)); bool platformAvailable = false; // check if external platformName contained in list of available platforms in OpenCV for (unsigned int i = 0; i < cnt; i++) { String availablePlatformName; get_platform_name(platforms[i], availablePlatformName); // external platform is found in the list of available platforms if (platformName == availablePlatformName) { platformAvailable = true; break; } } if (!platformAvailable) CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!"); // check if platformID corresponds to platformName String actualPlatformName; get_platform_name((cl_platform_id)platformID, actualPlatformName); if (platformName != actualPlatformName) CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!"); OpenCLExecutionContext ctx; ctx.p = std::make_shared((cl_platform_id)platformID, (cl_context)context, (cl_device_id)deviceID); CV_OCL_CHECK(clReleaseContext((cl_context)context)); CV_OCL_CHECK(clReleaseDevice((cl_device_id)deviceID)); return ctx; } void initializeContextFromHandle(Context& ctx, void* _platform, void* _context, void* _device) { // internal call, less checks cl_platform_id platformID = 
(cl_platform_id)_platform; cl_context context = (cl_context)_context; cl_device_id deviceID = (cl_device_id)_device; std::string platformName = PlatformInfo(&platformID).name(); auto clExecCtx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID); CV_Assert(!clExecCtx.empty()); ctx = clExecCtx.getContext(); } /////////////////////////////////////////// Queue ///////////////////////////////////////////// struct Queue::Impl { inline void __init() { refcount = 1; handle = 0; isProfilingQueue_ = false; } Impl(cl_command_queue q) { __init(); handle = q; cl_command_queue_properties props = 0; CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL)); isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE); } Impl(cl_command_queue q, bool isProfilingQueue) { __init(); handle = q; isProfilingQueue_ = isProfilingQueue; } Impl(const Context& c, const Device& d, bool withProfiling = false) { __init(); const Context* pc = &c; cl_context ch = (cl_context)pc->ptr(); if( !ch ) { pc = &Context::getDefault(); ch = (cl_context)pc->ptr(); } cl_device_id dh = (cl_device_id)d.ptr(); if( !dh ) dh = (cl_device_id)pc->device(0).ptr(); cl_int retval = 0; cl_command_queue_properties props = withProfiling ? 
CL_QUEUE_PROFILING_ENABLE : 0; CV_OCL_DBG_CHECK_(handle = clCreateCommandQueue(ch, dh, props, &retval), retval); isProfilingQueue_ = withProfiling; } ~Impl() { #ifdef _WIN32 if (!cv::__termination) #endif { if(handle) { CV_OCL_DBG_CHECK(clFinish(handle)); CV_OCL_DBG_CHECK(clReleaseCommandQueue(handle)); handle = NULL; } } } const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self) { if (isProfilingQueue_) return self; if (profiling_queue_.ptr()) return profiling_queue_; cl_context ctx = 0; CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL)); cl_device_id device = 0; CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL)); cl_int result = CL_SUCCESS; cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE; cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result); CV_OCL_DBG_CHECK_RESULT(result, "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)"); Queue queue; queue.p = new Impl(q, true); profiling_queue_ = queue; return profiling_queue_; } IMPLEMENT_REFCOUNTABLE(); cl_command_queue handle; bool isProfilingQueue_; cv::ocl::Queue profiling_queue_; }; Queue::Queue() CV_NOEXCEPT { p = 0; } Queue::Queue(const Context& c, const Device& d) { p = 0; create(c, d); } Queue::Queue(const Queue& q) { p = q.p; if(p) p->addref(); } Queue& Queue::operator = (const Queue& q) { Impl* newp = (Impl*)q.p; if(newp) newp->addref(); if(p) p->release(); p = newp; return *this; } Queue::Queue(Queue&& q) CV_NOEXCEPT { p = q.p; q.p = nullptr; } Queue& Queue::operator = (Queue&& q) CV_NOEXCEPT { if (this != &q) { if(p) p->release(); p = q.p; q.p = nullptr; } return *this; } Queue::~Queue() { if(p) p->release(); } bool Queue::create(const Context& c, const Device& d) { if(p) p->release(); p = new Impl(c, d); return p->handle != 0; } void Queue::finish() { if(p && p->handle) { CV_OCL_DBG_CHECK(clFinish(p->handle)); } } const Queue& Queue::getProfilingQueue() const { CV_Assert(p); 
return p->getProfilingQueue(*this); } void* Queue::ptr() const { return p ? p->handle : 0; } Queue& Queue::getDefault() { auto& c = OpenCLExecutionContext::getCurrent(); if (!c.empty()) { auto& q = c.getQueue(); return q; } static Queue dummy; return dummy; } static cl_command_queue getQueue(const Queue& q) { cl_command_queue qq = (cl_command_queue)q.ptr(); if(!qq) qq = (cl_command_queue)Queue::getDefault().ptr(); return qq; } /////////////////////////////////////////// KernelArg ///////////////////////////////////////////// KernelArg::KernelArg() CV_NOEXCEPT : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1) { } KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz) : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale) { CV_Assert(_flags == LOCAL || _flags == CONSTANT || _m != NULL); } KernelArg KernelArg::Constant(const Mat& m) { CV_Assert(m.isContinuous()); return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize()); } /////////////////////////////////////////// Kernel ///////////////////////////////////////////// struct Kernel::Impl { Impl(const char* kname, const Program& prog) : refcount(1), handle(NULL), isInProgress(false), isAsyncRun(false), nu(0) { cl_program ph = (cl_program)prog.ptr(); cl_int retval = 0; name = kname; if (ph) { handle = clCreateKernel(ph, kname, &retval); CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateKernel('%s')", kname).c_str()); } for( int i = 0; i < MAX_ARRS; i++ ) u[i] = 0; haveTempDstUMats = false; haveTempSrcUMats = false; } void cleanupUMats() { for( int i = 0; i < MAX_ARRS; i++ ) if( u[i] ) { if( CV_XADD(&u[i]->urefcount, -1) == 1 ) { u[i]->flags |= UMatData::ASYNC_CLEANUP; u[i]->currAllocator->deallocate(u[i]); } u[i] = 0; } nu = 0; haveTempDstUMats = false; haveTempSrcUMats = false; } void addUMat(const UMat& m, bool dst) { CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0); u[nu] = m.u; CV_XADD(&m.u->urefcount, 1); nu++; if(dst && 
m.u->tempUMat()) haveTempDstUMats = true; if(m.u->originalUMatData == NULL && m.u->tempUMat()) haveTempSrcUMats = true; // UMat is created on RAW memory (without proper lifetime management, even from Mat) } /// Preserve image lifetime (while it is specified as Kernel argument) void registerImageArgument(int arg, const Image2D& image) { CV_CheckGE(arg, 0, ""); if (arg < (int)shadow_images.size() && shadow_images[arg].ptr() != image.ptr()) // TODO future: replace ptr => impl (more strong check) { CV_Check(arg, !isInProgress, "ocl::Kernel: clearing of pending Image2D arguments is not allowed"); } shadow_images.reserve(MAX_ARRS); shadow_images.resize(std::max(shadow_images.size(), (size_t)arg + 1)); shadow_images[arg] = image; } void finit(cl_event e) { CV_UNUSED(e); cleanupUMats(); isInProgress = false; release(); } bool run(int dims, size_t _globalsize[], size_t _localsize[], bool sync, int64* timeNS, const Queue& q); ~Impl() { if(handle) { CV_OCL_DBG_CHECK(clReleaseKernel(handle)); } } IMPLEMENT_REFCOUNTABLE(); cv::String name; cl_kernel handle; enum { MAX_ARRS = 16 }; UMatData* u[MAX_ARRS]; bool isInProgress; bool isAsyncRun; // true if kernel was scheduled in async mode int nu; std::vector shadow_images; bool haveTempDstUMats; bool haveTempSrcUMats; }; }} // namespace cv::ocl extern "C" { static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p) { try { ((cv::ocl::Kernel::Impl*)p)->finit(e); } catch (const cv::Exception& exc) { CV_LOG_ERROR(NULL, "OCL: Unexpected OpenCV exception in OpenCL callback: " << exc.what()); } catch (const std::exception& exc) { CV_LOG_ERROR(NULL, "OCL: Unexpected C++ exception in OpenCL callback: " << exc.what()); } catch (...) 
{ CV_LOG_ERROR(NULL, "OCL: Unexpected unknown C++ exception in OpenCL callback"); } } } namespace cv { namespace ocl { Kernel::Kernel() CV_NOEXCEPT { p = 0; } Kernel::Kernel(const char* kname, const Program& prog) { p = 0; create(kname, prog); } Kernel::Kernel(const char* kname, const ProgramSource& src, const String& buildopts, String* errmsg) { p = 0; create(kname, src, buildopts, errmsg); } Kernel::Kernel(const Kernel& k) { p = k.p; if(p) p->addref(); } Kernel& Kernel::operator = (const Kernel& k) { Impl* newp = (Impl*)k.p; if(newp) newp->addref(); if(p) p->release(); p = newp; return *this; } Kernel::Kernel(Kernel&& k) CV_NOEXCEPT { p = k.p; k.p = nullptr; } Kernel& Kernel::operator = (Kernel&& k) CV_NOEXCEPT { if (this != &k) { if(p) p->release(); p = k.p; k.p = nullptr; } return *this; } Kernel::~Kernel() { if(p) p->release(); } bool Kernel::create(const char* kname, const Program& prog) { if(p) p->release(); p = new Impl(kname, prog); if(p->handle == 0) { p->release(); p = 0; } #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails CV_Assert(p); #endif return p != 0; } bool Kernel::create(const char* kname, const ProgramSource& src, const String& buildopts, String* errmsg) { if(p) { p->release(); p = 0; } String tempmsg; if( !errmsg ) errmsg = &tempmsg; const Program prog = Context::getDefault().getProg(src, buildopts, *errmsg); return create(kname, prog); } void* Kernel::ptr() const { return p ? 
p->handle : 0; } bool Kernel::empty() const { return ptr() == 0; } static cv::String dumpValue(size_t sz, const void* p) { if (sz == 4) return cv::format("%d / %uu / 0x%08x / %g", *(int*)p, *(int*)p, *(int*)p, *(float*)p); if (sz == 8) return cv::format("%lld / %lluu / 0x%16llx / %g", *(long long*)p, *(long long*)p, *(long long*)p, *(double*)p); return cv::format("%p", p); } int Kernel::set(int i, const void* value, size_t sz) { if (!p || !p->handle) return -1; if (i < 0) return i; if( i == 0 ) p->cleanupUMats(); cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value); CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, value=%s)", p->name.c_str(), (int)i, (int)sz, dumpValue(sz, value).c_str()).c_str()); if (retval != CL_SUCCESS) return -1; return i+1; } int Kernel::set(int i, const Image2D& image2D) { cl_mem h = (cl_mem)image2D.ptr(); int res = set(i, &h, sizeof(h)); if (res >= 0) p->registerImageArgument(i, image2D); return res; } int Kernel::set(int i, const UMat& m) { return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m)); } int Kernel::set(int i, const KernelArg& arg) { if( !p || !p->handle ) return -1; if (i < 0) { CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d): negative arg_index", p->name.c_str(), (int)i)); return i; } if( i == 0 ) p->cleanupUMats(); cl_int status = 0; if( arg.m ) { AccessFlag accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : static_cast(0)) | ((arg.flags & KernelArg::WRITE_ONLY) ? 
ACCESS_WRITE : static_cast(0)); bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0; if (ptronly && arg.m->empty()) { cl_mem h_null = (cl_mem)NULL; status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h_null), &h_null); CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=NULL)", p->name.c_str(), (int)i).c_str()); return i + 1; } cl_mem h = (cl_mem)arg.m->handle(accessFlags); if (!h) { CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d, flags=%d): can't create cl_mem handle for passed UMat buffer (addr=%p)", p->name.c_str(), (int)i, (int)arg.flags, arg.m)); p->release(); p = 0; return -1; } #ifdef HAVE_OPENCL_SVM if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0) { const Context& ctx = Context::getDefault(); const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx); uchar*& svmDataPtr = (uchar*&)arg.m->u->handle; CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr); #if 1 // TODO status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr); #else status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr); #endif CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArgSVMPointer('%s', arg_index=%d, ptr=%p)", p->name.c_str(), (int)i, (void*)svmDataPtr).c_str()); } else #endif { status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h); CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=%p)", p->name.c_str(), (int)i, (void*)h).c_str()); } if (ptronly) { i++; } else if( arg.m->dims <= 2 ) { UMat2D u2d(*arg.m); status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step); CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+1), (int)u2d.step).c_str()); status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset); CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, 
offset_value=%d)", p->name.c_str(), (int)(i+2), (int)u2d.offset).c_str()); i += 3; if( !(arg.flags & KernelArg::NO_SIZE) ) { int cols = u2d.cols*arg.wscale/arg.iwscale; status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows); CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)i, (int)u2d.rows).c_str()); status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols); CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+1), (int)cols).c_str()); i += 2; } } else { UMat3D u3d(*arg.m); status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep); CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slicestep_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.slicestep).c_str()); status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step); CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+2), (int)u3d.step).c_str()); status = clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset); CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+3), (int)u3d.offset).c_str()); i += 4; if( !(arg.flags & KernelArg::NO_SIZE) ) { int cols = u3d.cols*arg.wscale/arg.iwscale; status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices); CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slices_value=%d)", p->name.c_str(), (int)i, (int)u3d.slices).c_str()); status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows); CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.rows).c_str()); status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), 
&cols); CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+2), (int)cols).c_str()); i += 3; } } p->addUMat(*arg.m, !!(accessFlags & ACCESS_WRITE)); return i; } status = clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj); CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, obj=%p)", p->name.c_str(), (int)i, (int)arg.sz, (void*)arg.obj).c_str()); return i+1; } bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[], bool sync, const Queue& q) { if (!p) return false; size_t globalsize[CV_MAX_DIM] = {1,1,1}; size_t total = 1; CV_Assert(_globalsize != NULL); for (int i = 0; i < dims; i++) { size_t val = _localsize ? _localsize[i] : dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1; CV_Assert( val > 0 ); total *= _globalsize[i]; if (_globalsize[i] == 1 && !_localsize) val = 1; globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val; } CV_Assert(total > 0); return p->run(dims, globalsize, _localsize, sync, NULL, q); } static bool isRaiseErrorOnReuseAsyncKernel() { static bool initialized = false; static bool value = false; if (!initialized) { value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR_REUSE_ASYNC_KERNEL", false); initialized = true; } return value; } bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[], bool sync, int64* timeNS, const Queue& q) { CV_INSTRUMENT_REGION_OPENCL_RUN(name.c_str()); if (!handle) { CV_LOG_ERROR(NULL, "OpenCL kernel has zero handle: " << name); return false; } if (isAsyncRun) { CV_LOG_ERROR(NULL, "OpenCL kernel can't be reused in async mode: " << name); if (isRaiseErrorOnReuseAsyncKernel()) CV_Assert(0); return false; // OpenCV 5.0: raise error } isAsyncRun = !sync; if (isInProgress) { CV_LOG_ERROR(NULL, "Previous OpenCL kernel launch is not finished: " << name); if (isRaiseErrorOnReuseAsyncKernel()) CV_Assert(0); return false; // 
OpenCV 5.0: raise error } cl_command_queue qq = getQueue(q); if (haveTempDstUMats) sync = true; if (haveTempSrcUMats) sync = true; if (timeNS) sync = true; cl_event asyncEvent = 0; cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims, NULL, globalsize, localsize, 0, 0, (sync && !timeNS) ? 0 : &asyncEvent); #if !CV_OPENCL_SHOW_RUN_KERNELS if (retval != CL_SUCCESS) #endif { cv::String msg = cv::format("clEnqueueNDRangeKernel('%s', dims=%d, globalsize=%zux%zux%zu, localsize=%s) sync=%s", name.c_str(), (int)dims, globalsize[0], (dims > 1 ? globalsize[1] : 1), (dims > 2 ? globalsize[2] : 1), (localsize ? cv::format("%zux%zux%zu", localsize[0], (dims > 1 ? localsize[1] : 1), (dims > 2 ? localsize[2] : 1)) : cv::String("NULL")).c_str(), sync ? "true" : "false" ); if (retval != CL_SUCCESS) { msg = CV_OCL_API_ERROR_MSG(retval, msg.c_str()); } #if CV_OPENCL_TRACE_CHECK CV_OCL_TRACE_CHECK_RESULT(retval, msg.c_str()); #else printf("%s\n", msg.c_str()); fflush(stdout); #endif } if (sync || retval != CL_SUCCESS) { CV_OCL_DBG_CHECK(clFinish(qq)); if (timeNS) { if (retval == CL_SUCCESS) { CV_OCL_DBG_CHECK(clWaitForEvents(1, &asyncEvent)); cl_ulong startTime, stopTime; CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL)); CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL)); *timeNS = (int64)(stopTime - startTime); } else { *timeNS = -1; } } cleanupUMats(); } else { addref(); isInProgress = true; CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this)); } if (asyncEvent) CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent)); return retval == CL_SUCCESS; } bool Kernel::runTask(bool sync, const Queue& q) { if(!p || !p->handle || p->isInProgress) return false; cl_command_queue qq = getQueue(q); cl_event asyncEvent = 0; cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 
0 : &asyncEvent); CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueTask('%s') sync=%s", p->name.c_str(), sync ? "true" : "false").c_str()); if (sync || retval != CL_SUCCESS) { CV_OCL_DBG_CHECK(clFinish(qq)); p->cleanupUMats(); } else { p->addref(); p->isInProgress = true; CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p)); } if (asyncEvent) CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent)); return retval == CL_SUCCESS; } int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_) { CV_Assert(p && p->handle && !p->isInProgress); Queue q = q_.ptr() ? q_ : Queue::getDefault(); CV_Assert(q.ptr()); q.finish(); // call clFinish() on base queue Queue profilingQueue = q.getProfilingQueue(); int64 timeNs = -1; bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue); return res ? timeNs : -1; } size_t Kernel::workGroupSize() const { if(!p || !p->handle) return 0; size_t val = 0, retsz = 0; cl_device_id dev = (cl_device_id)Device::getDefault().ptr(); cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE, sizeof(val), &val, &retsz); CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE)"); return status == CL_SUCCESS ? val : 0; } size_t Kernel::preferedWorkGroupSizeMultiple() const { if(!p || !p->handle) return 0; size_t val = 0, retsz = 0; cl_device_id dev = (cl_device_id)Device::getDefault().ptr(); cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(val), &val, &retsz); CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE)"); return status == CL_SUCCESS ? 
val : 0; } bool Kernel::compileWorkGroupSize(size_t wsz[]) const { if(!p || !p->handle || !wsz) return 0; size_t retsz = 0; cl_device_id dev = (cl_device_id)Device::getDefault().ptr(); cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof(wsz[0])*3, wsz, &retsz); CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_COMPILE_WORK_GROUP_SIZE)"); return status == CL_SUCCESS; } size_t Kernel::localMemSize() const { if(!p || !p->handle) return 0; size_t retsz = 0; cl_ulong val = 0; cl_device_id dev = (cl_device_id)Device::getDefault().ptr(); cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(val), &val, &retsz); CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE)"); return status == CL_SUCCESS ? (size_t)val : 0; } ///////////////////////////////////////// ProgramSource /////////////////////////////////////////////// struct ProgramSource::Impl { IMPLEMENT_REFCOUNTABLE(); enum KIND { PROGRAM_SOURCE_CODE = 0, PROGRAM_BINARIES, PROGRAM_SPIR, PROGRAM_SPIRV } kind_; Impl(const String& src) { init(PROGRAM_SOURCE_CODE, cv::String(), cv::String()); initFromSource(src, cv::String()); } Impl(const String& module, const String& name, const String& codeStr, const String& codeHash) { init(PROGRAM_SOURCE_CODE, module, name); initFromSource(codeStr, codeHash); } /// reset fields void init(enum KIND kind, const String& module, const String& name) { refcount = 1; kind_ = kind; module_ = module; name_ = name; sourceAddr_ = NULL; sourceSize_ = 0; isHashUpdated = false; } void initFromSource(const String& codeStr, const String& codeHash) { codeStr_ = codeStr; sourceHash_ = codeHash; if (sourceHash_.empty()) { updateHash(); } else { isHashUpdated = true; } } void updateHash(const char* hashStr = NULL) { if (hashStr) { sourceHash_ = cv::String(hashStr); isHashUpdated = true; return; } uint64 hash = 0; switch (kind_) { case PROGRAM_SOURCE_CODE: if (sourceAddr_) { 
CV_Assert(codeStr_.empty()); hash = crc64(sourceAddr_, sourceSize_); // static storage } else { CV_Assert(!codeStr_.empty()); hash = crc64((uchar*)codeStr_.c_str(), codeStr_.size()); } break; case PROGRAM_BINARIES: case PROGRAM_SPIR: case PROGRAM_SPIRV: hash = crc64(sourceAddr_, sourceSize_); break; default: CV_Error(Error::StsInternal, "Internal error"); } sourceHash_ = cv::format("%08jx", (uintmax_t)hash); isHashUpdated = true; } Impl(enum KIND kind, const String& module, const String& name, const unsigned char* binary, const size_t size, const cv::String& buildOptions = cv::String()) { init(kind, module, name); sourceAddr_ = binary; sourceSize_ = size; buildOptions_ = buildOptions; } static ProgramSource fromSourceWithStaticLifetime(const String& module, const String& name, const char* sourceCodeStaticStr, const char* hashStaticStr, const cv::String& buildOptions) { ProgramSource result; result.p = new Impl(PROGRAM_SOURCE_CODE, module, name, (const unsigned char*)sourceCodeStaticStr, strlen(sourceCodeStaticStr), buildOptions); result.p->updateHash(hashStaticStr); return result; } static ProgramSource fromBinary(const String& module, const String& name, const unsigned char* binary, const size_t size, const cv::String& buildOptions) { ProgramSource result; result.p = new Impl(PROGRAM_BINARIES, module, name, binary, size, buildOptions); return result; } static ProgramSource fromSPIR(const String& module, const String& name, const unsigned char* binary, const size_t size, const cv::String& buildOptions) { ProgramSource result; result.p = new Impl(PROGRAM_SPIR, module, name, binary, size, buildOptions); return result; } String module_; String name_; // TODO std::vector includes_; String codeStr_; // PROGRAM_SOURCE_CODE only const unsigned char* sourceAddr_; size_t sourceSize_; cv::String buildOptions_; String sourceHash_; bool isHashUpdated; friend struct Program::Impl; friend struct internal::ProgramEntry; friend struct Context::Impl; }; 
ProgramSource::ProgramSource() CV_NOEXCEPT { p = 0; } ProgramSource::ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash) { p = new Impl(module, name, codeStr, codeHash); } ProgramSource::ProgramSource(const char* prog) { p = new Impl(prog); } ProgramSource::ProgramSource(const String& prog) { p = new Impl(prog); } ProgramSource::~ProgramSource() { if(p) p->release(); } ProgramSource::ProgramSource(const ProgramSource& prog) { p = prog.p; if(p) p->addref(); } ProgramSource& ProgramSource::operator = (const ProgramSource& prog) { Impl* newp = (Impl*)prog.p; if(newp) newp->addref(); if(p) p->release(); p = newp; return *this; } ProgramSource::ProgramSource(ProgramSource&& prog) CV_NOEXCEPT { p = prog.p; prog.p = nullptr; } ProgramSource& ProgramSource::operator = (ProgramSource&& prog) CV_NOEXCEPT { if (this != &prog) { if(p) p->release(); p = prog.p; prog.p = nullptr; } return *this; } const String& ProgramSource::source() const { CV_Assert(p); CV_Assert(p->kind_ == Impl::PROGRAM_SOURCE_CODE); CV_Assert(p->sourceAddr_ == NULL); // method returns reference - can't construct temporary object return p->codeStr_; } ProgramSource::hash_t ProgramSource::hash() const { CV_Error(Error::StsNotImplemented, "Removed method: ProgramSource::hash()"); } ProgramSource ProgramSource::fromBinary(const String& module, const String& name, const unsigned char* binary, const size_t size, const cv::String& buildOptions) { CV_Assert(binary); CV_Assert(size > 0); return Impl::fromBinary(module, name, binary, size, buildOptions); } ProgramSource ProgramSource::fromSPIR(const String& module, const String& name, const unsigned char* binary, const size_t size, const cv::String& buildOptions) { CV_Assert(binary); CV_Assert(size > 0); return Impl::fromBinary(module, name, binary, size, buildOptions); } internal::ProgramEntry::operator ProgramSource&() const { if (this->pProgramSource == NULL) { cv::AutoLock lock(cv::getInitializationMutex()); if 
(this->pProgramSource == NULL) { ProgramSource ps = ProgramSource::Impl::fromSourceWithStaticLifetime(this->module, this->name, this->programCode, this->programHash, cv::String()); ProgramSource* ptr = new ProgramSource(ps); const_cast(this)->pProgramSource = ptr; } } return *this->pProgramSource; } /////////////////////////////////////////// Program ///////////////////////////////////////////// static cv::String joinBuildOptions(const cv::String& a, const cv::String& b) { if (b.empty()) return a; if (a.empty()) return b; if (b[0] == ' ') return a + b; return a + (cv::String(" ") + b); } struct Program::Impl { IMPLEMENT_REFCOUNTABLE(); Impl(const ProgramSource& src, const String& _buildflags, String& errmsg) : refcount(1), handle(NULL), buildflags(_buildflags) { const ProgramSource::Impl* src_ = src.getImpl(); CV_Assert(src_); sourceModule_ = src_->module_; sourceName_ = src_->name_; const Context ctx = Context::getDefault(); Device device = ctx.device(0); if (ctx.ptr() == NULL || device.ptr() == NULL) return; buildflags = joinBuildOptions(buildflags, src_->buildOptions_); if (src.getImpl()->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE) { if (device.isAMD()) buildflags = joinBuildOptions(buildflags, " -D AMD_DEVICE"); else if (device.isIntel()) buildflags = joinBuildOptions(buildflags, " -D INTEL_DEVICE"); const String param_buildExtraOptions = getBuildExtraOptions(); if (!param_buildExtraOptions.empty()) buildflags = joinBuildOptions(buildflags, param_buildExtraOptions); } compile(ctx, src_, errmsg); } bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg) { CV_Assert(ctx.getImpl()); CV_Assert(src_); // We don't cache OpenCL binaries if (src_->kind_ == ProgramSource::Impl::PROGRAM_BINARIES) { CV_LOG_VERBOSE(NULL, 0, "Load program binary... 
" << src_->module_.c_str() << "/" << src_->name_.c_str()); bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg); return isLoaded; } return compileWithCache(ctx, src_, errmsg); } bool compileWithCache(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg) { CV_Assert(ctx.getImpl()); CV_Assert(src_); CV_Assert(src_->kind_ != ProgramSource::Impl::PROGRAM_BINARIES); #if OPENCV_HAVE_FILESYSTEM_SUPPORT OpenCLBinaryCacheConfigurator& config = OpenCLBinaryCacheConfigurator::getSingletonInstance(); const std::string base_dir = config.prepareCacheDirectoryForContext( ctx.getImpl()->getPrefixString(), ctx.getImpl()->getPrefixBase() ); const String& hash_str = src_->sourceHash_; cv::String fname; if (!base_dir.empty() && !src_->module_.empty() && !src_->name_.empty()) { CV_Assert(!hash_str.empty()); fname = src_->module_ + "--" + src_->name_ + "_" + hash_str + ".bin"; fname = utils::fs::join(base_dir, fname); } const cv::Ptr fileLock = config.cache_lock_; // can be empty if (!fname.empty() && CV_OPENCL_CACHE_ENABLE) { try { std::vector binaryBuf; bool res = false; { cv::utils::optional_shared_lock_guard lock_fs(fileLock.get()); BinaryProgramFile file(fname, hash_str.c_str()); res = file.read(buildflags, binaryBuf); } if (res) { CV_Assert(!binaryBuf.empty()); CV_LOG_VERBOSE(NULL, 0, "Load program binary from cache: " << src_->module_.c_str() << "/" << src_->name_.c_str()); bool isLoaded = createFromBinary(ctx, binaryBuf, errmsg); if (isLoaded) return true; } } catch (const cv::Exception& e) { CV_UNUSED(e); CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname << std::endl << e.what()); } catch (...) 
{ CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname); } } #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT CV_Assert(handle == NULL); if (src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE) { if (!buildFromSources(ctx, src_, errmsg)) { return false; } } else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIR) { buildflags = joinBuildOptions(buildflags, " -x spir"); if ((cv::String(" ") + buildflags).find(" -spir-std=") == cv::String::npos) { buildflags = joinBuildOptions(buildflags, " -spir-std=1.2"); } CV_LOG_VERBOSE(NULL, 0, "Load program SPIR binary... " << src_->module_.c_str() << "/" << src_->name_.c_str()); bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg); if (!isLoaded) return false; } else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIRV) { CV_Error(Error::StsNotImplemented, "OpenCL: SPIR-V is not supported"); } else { CV_Error(Error::StsInternal, "Internal error"); } CV_Assert(handle != NULL); #if OPENCV_HAVE_FILESYSTEM_SUPPORT if (!fname.empty() && CV_OPENCL_CACHE_WRITE) { try { std::vector binaryBuf; getProgramBinary(binaryBuf); { cv::utils::optional_lock_guard lock_fs(fileLock.get()); BinaryProgramFile file(fname, hash_str.c_str()); file.write(buildflags, binaryBuf); } } catch (const cv::Exception& e) { CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname << std::endl << e.what()); } catch (...) 
{ CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname); } } #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS if (CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE) { std::vector binaryBuf; getProgramBinary(binaryBuf); if (!binaryBuf.empty()) { CV_OCL_DBG_CHECK(clReleaseProgram(handle)); handle = NULL; createFromBinary(ctx, binaryBuf, errmsg); } } #endif return handle != NULL; } void dumpBuildLog_(cl_int result, const cl_device_id* deviceList, String& errmsg) { AutoBuffer buffer; buffer[0] = 0; size_t retsz = 0; cl_int log_retval = clGetProgramBuildInfo(handle, deviceList[0], CL_PROGRAM_BUILD_LOG, 0, 0, &retsz); if (log_retval == CL_SUCCESS && retsz > 1) { buffer.resize(retsz + 16); log_retval = clGetProgramBuildInfo(handle, deviceList[0], CL_PROGRAM_BUILD_LOG, retsz+1, buffer.data(), &retsz); if (log_retval == CL_SUCCESS) { if (retsz < buffer.size()) buffer[retsz] = 0; else buffer[buffer.size() - 1] = 0; } else { buffer[0] = 0; } } errmsg = String(buffer.data()); printf("OpenCL program build log: %s/%s\nStatus %d: %s\n%s\n%s\n", sourceModule_.c_str(), sourceName_.c_str(), result, getOpenCLErrorString(result), buildflags.c_str(), errmsg.c_str()); fflush(stdout); } bool buildFromSources(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg) { CV_Assert(src_); CV_Assert(src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE); CV_Assert(handle == NULL); CV_INSTRUMENT_REGION_OPENCL_COMPILE(cv::format("Build OpenCL program: %s/%s %s options: %s", sourceModule_.c_str(), sourceName_.c_str(), src_->sourceHash_.c_str(), buildflags.c_str()).c_str()); CV_LOG_VERBOSE(NULL, 0, "Compile... " << sourceModule_.c_str() << "/" << sourceName_.c_str()); const char* srcptr = src_->sourceAddr_ ? ((const char*)src_->sourceAddr_) : src_->codeStr_.c_str(); size_t srclen = src_->sourceAddr_ ? 
src_->sourceSize_ : src_->codeStr_.size(); CV_Assert(srcptr != NULL); CV_Assert(srclen > 0); cl_int retval = 0; handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval); CV_OCL_DBG_CHECK_RESULT(retval, "clCreateProgramWithSource"); CV_Assert(handle || retval != CL_SUCCESS); if (handle && retval == CL_SUCCESS) { size_t n = ctx.ndevices(); AutoBuffer deviceListBuf(n + 1); cl_device_id* deviceList = deviceListBuf.data(); for (size_t i = 0; i < n; i++) { deviceList[i] = (cl_device_id)(ctx.device(i).ptr()); } retval = clBuildProgram(handle, (cl_uint)n, deviceList, buildflags.c_str(), 0, 0); CV_OCL_TRACE_CHECK_RESULT(/*don't throw: retval*/CL_SUCCESS, cv::format("clBuildProgram(source: %s)", buildflags.c_str()).c_str()); #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG if (retval != CL_SUCCESS) #endif { dumpBuildLog_(retval, deviceList, errmsg); // don't remove "retval != CL_SUCCESS" condition here: // it would break CV_OPENCL_ALWAYS_SHOW_BUILD_LOG mode if (retval != CL_SUCCESS && handle) { CV_OCL_DBG_CHECK(clReleaseProgram(handle)); handle = NULL; } } #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE) { CV_LOG_INFO(NULL, "OpenCL: query kernel names (build from sources)..."); size_t retsz = 0; char kernels_buffer[4096] = {0}; cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz); if (retsz < sizeof(kernels_buffer)) kernels_buffer[retsz] = 0; else kernels_buffer[0] = 0; CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'"); } #endif } return handle != NULL; } void getProgramBinary(std::vector& buf) { CV_Assert(handle); size_t sz = 0; CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(sz), &sz, NULL)); buf.resize(sz); uchar* ptr = (uchar*)&buf[0]; CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(ptr), &ptr, NULL)); } bool createFromBinary(const Context& ctx, const std::vector& buf, String& errmsg) { 
return createFromBinary(ctx, (const unsigned char*)&buf[0], buf.size(), errmsg); } bool createFromBinary(const Context& ctx, const unsigned char* binaryAddr, const size_t binarySize, String& errmsg) { CV_Assert(handle == NULL); CV_INSTRUMENT_REGION_OPENCL_COMPILE("Load OpenCL program"); CV_LOG_VERBOSE(NULL, 0, "Load from binary... (" << binarySize << " bytes)"); CV_Assert(binarySize > 0); size_t ndevices = (int)ctx.ndevices(); AutoBuffer devices_(ndevices); AutoBuffer binaryPtrs_(ndevices); AutoBuffer binarySizes_(ndevices); cl_device_id* devices = devices_.data(); const uchar** binaryPtrs = binaryPtrs_.data(); size_t* binarySizes = binarySizes_.data(); for (size_t i = 0; i < ndevices; i++) { devices[i] = (cl_device_id)ctx.device(i).ptr(); binaryPtrs[i] = binaryAddr; binarySizes[i] = binarySize; } cl_int result = 0; handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), (cl_uint)ndevices, devices_.data(), binarySizes, binaryPtrs, NULL, &result); if (result != CL_SUCCESS) { CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clCreateProgramWithBinary")); if (handle) { CV_OCL_DBG_CHECK(clReleaseProgram(handle)); handle = NULL; } } if (!handle) { return false; } // call clBuildProgram() { result = clBuildProgram(handle, (cl_uint)ndevices, devices_.data(), buildflags.c_str(), 0, 0); CV_OCL_DBG_CHECK_RESULT(result, cv::format("clBuildProgram(binary: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str()); if (result != CL_SUCCESS) { dumpBuildLog_(result, devices, errmsg); if (handle) { CV_OCL_DBG_CHECK(clReleaseProgram(handle)); handle = NULL; } return false; } } // check build status { cl_build_status build_status = CL_BUILD_NONE; size_t retsz = 0; CV_OCL_DBG_CHECK(result = clGetProgramBuildInfo(handle, devices[0], CL_PROGRAM_BUILD_STATUS, sizeof(build_status), &build_status, &retsz)); if (result == CL_SUCCESS) { if (build_status == CL_BUILD_SUCCESS) { return true; } else { CV_LOG_WARNING(NULL, "clGetProgramBuildInfo() returns " << build_status); return false; } 
} else { CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clGetProgramBuildInfo()")); if (handle) { CV_OCL_DBG_CHECK(clReleaseProgram(handle)); handle = NULL; } } } #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE) { CV_LOG_INFO(NULL, "OpenCL: query kernel names (binary)..."); size_t retsz = 0; char kernels_buffer[4096] = {0}; result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz); if (retsz < sizeof(kernels_buffer)) kernels_buffer[retsz] = 0; else kernels_buffer[0] = 0; CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'"); } #endif return handle != NULL; } ~Impl() { if( handle ) { #ifdef _WIN32 if (!cv::__termination) #endif { clReleaseProgram(handle); } handle = NULL; } } cl_program handle; String buildflags; String sourceModule_; String sourceName_; }; Program::Program() CV_NOEXCEPT { p = 0; } Program::Program(const ProgramSource& src, const String& buildflags, String& errmsg) { p = 0; create(src, buildflags, errmsg); } Program::Program(const Program& prog) { p = prog.p; if(p) p->addref(); } Program& Program::operator = (const Program& prog) { Impl* newp = (Impl*)prog.p; if(newp) newp->addref(); if(p) p->release(); p = newp; return *this; } Program::Program(Program&& prog) CV_NOEXCEPT { p = prog.p; prog.p = nullptr; } Program& Program::operator = (Program&& prog) CV_NOEXCEPT { if (this != &prog) { if(p) p->release(); p = prog.p; prog.p = nullptr; } return *this; } Program::~Program() { if(p) p->release(); } bool Program::create(const ProgramSource& src, const String& buildflags, String& errmsg) { if(p) { p->release(); p = NULL; } p = new Impl(src, buildflags, errmsg); if(!p->handle) { p->release(); p = 0; } return p != 0; } void* Program::ptr() const { return p ? 
p->handle : 0; } #ifndef OPENCV_REMOVE_DEPRECATED_API const ProgramSource& Program::source() const { CV_Error(Error::StsNotImplemented, "Removed API"); } bool Program::read(const String& bin, const String& buildflags) { CV_UNUSED(bin); CV_UNUSED(buildflags); CV_Error(Error::StsNotImplemented, "Removed API"); } bool Program::write(String& bin) const { CV_UNUSED(bin); CV_Error(Error::StsNotImplemented, "Removed API"); } String Program::getPrefix() const { if(!p) return String(); Context::Impl* ctx_ = Context::getDefault().getImpl(); CV_Assert(ctx_); return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), p->buildflags.c_str()); } String Program::getPrefix(const String& buildflags) { Context::Impl* ctx_ = Context::getDefault().getImpl(); CV_Assert(ctx_); return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), buildflags.c_str()); } #endif // OPENCV_REMOVE_DEPRECATED_API void Program::getBinary(std::vector& binary) const { CV_Assert(p && "Empty program"); p->getProgramBinary(binary); } Program Context::Impl::getProg(const ProgramSource& src, const String& buildflags, String& errmsg) { size_t limit = getProgramCountLimit(); const ProgramSource::Impl* src_ = src.getImpl(); CV_Assert(src_); String key = cv::format("module=%s name=%s codehash=%s\nopencl=%s\nbuildflags=%s", src_->module_.c_str(), src_->name_.c_str(), src_->sourceHash_.c_str(), getPrefixString().c_str(), buildflags.c_str()); { cv::AutoLock lock(program_cache_mutex); phash_t::iterator it = phash.find(key); if (it != phash.end()) { // TODO LRU cache CacheList::iterator i = std::find(cacheList.begin(), cacheList.end(), key); if (i != cacheList.end() && i != cacheList.begin()) { cacheList.erase(i); cacheList.push_front(key); } return it->second; } { // cleanup program cache size_t sz = phash.size(); if (limit > 0 && sz >= limit) { static bool warningFlag = false; if (!warningFlag) { printf("\nWARNING: OpenCV-OpenCL:\n" " In-memory cache for OpenCL programs is full, 
older programs will be unloaded.\n" " You can change cache size via OPENCV_OPENCL_PROGRAM_CACHE environment variable\n\n"); warningFlag = true; } while (!cacheList.empty()) { size_t c = phash.erase(cacheList.back()); cacheList.pop_back(); if (c != 0) break; } } } } Program prog(src, buildflags, errmsg); // Cache result of build failures too (to prevent unnecessary compiler invocations) { cv::AutoLock lock(program_cache_mutex); phash.insert(std::pair(key, prog)); cacheList.push_front(key); } return prog; } //////////////////////////////////////////// OpenCLAllocator ////////////////////////////////////////////////// template class OpenCLBufferPool { protected: ~OpenCLBufferPool() { } public: virtual T allocate(size_t size) = 0; virtual void release(T buffer) = 0; }; template class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool { private: inline Derived& derived() { return *static_cast(this); } protected: Mutex mutex_; size_t currentReservedSize; size_t maxReservedSize; std::list allocatedEntries_; // Allocated and used entries std::list reservedEntries_; // LRU order. 
Allocated, but not used entries // synchronized bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer) { typename std::list::iterator i = allocatedEntries_.begin(); for (; i != allocatedEntries_.end(); ++i) { BufferEntry& e = *i; if (e.clBuffer_ == buffer) { entry = e; allocatedEntries_.erase(i); return true; } } return false; } // synchronized bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size) { if (reservedEntries_.empty()) return false; typename std::list::iterator i = reservedEntries_.begin(); typename std::list::iterator result_pos = reservedEntries_.end(); BufferEntry result; size_t minDiff = (size_t)(-1); for (; i != reservedEntries_.end(); ++i) { BufferEntry& e = *i; if (e.capacity_ >= size) { size_t diff = e.capacity_ - size; if (diff < std::max((size_t)4096, size / 8) && (result_pos == reservedEntries_.end() || diff < minDiff)) { minDiff = diff; result_pos = i; result = e; if (diff == 0) break; } } } if (result_pos != reservedEntries_.end()) { //CV_DbgAssert(result == *result_pos); reservedEntries_.erase(result_pos); entry = result; currentReservedSize -= entry.capacity_; allocatedEntries_.push_back(entry); return true; } return false; } // synchronized void _checkSizeOfReservedEntries() { while (currentReservedSize > maxReservedSize) { CV_DbgAssert(!reservedEntries_.empty()); const BufferEntry& entry = reservedEntries_.back(); CV_DbgAssert(currentReservedSize >= entry.capacity_); currentReservedSize -= entry.capacity_; derived()._releaseBufferEntry(entry); reservedEntries_.pop_back(); } } inline size_t _allocationGranularity(size_t size) { // heuristic values if (size < 1024*1024) return 4096; // don't work with buffers smaller than 4Kb (hidden allocation overhead issue) else if (size < 16*1024*1024) return 64*1024; else return 1024*1024; } public: OpenCLBufferPoolBaseImpl() : currentReservedSize(0), maxReservedSize(0) { // nothing } virtual ~OpenCLBufferPoolBaseImpl() { 
freeAllReservedBuffers(); CV_Assert(reservedEntries_.empty()); } public: virtual T allocate(size_t size) CV_OVERRIDE { AutoLock locker(mutex_); BufferEntry entry; if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size)) { CV_DbgAssert(size <= entry.capacity_); LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_); } else { derived()._allocateBufferEntry(entry, size); } return entry.clBuffer_; } virtual void release(T buffer) CV_OVERRIDE { AutoLock locker(mutex_); BufferEntry entry; CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer)); if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8) { derived()._releaseBufferEntry(entry); } else { reservedEntries_.push_front(entry); currentReservedSize += entry.capacity_; _checkSizeOfReservedEntries(); } } virtual size_t getReservedSize() const CV_OVERRIDE { return currentReservedSize; } virtual size_t getMaxReservedSize() const CV_OVERRIDE { return maxReservedSize; } virtual void setMaxReservedSize(size_t size) CV_OVERRIDE { AutoLock locker(mutex_); size_t oldMaxReservedSize = maxReservedSize; maxReservedSize = size; if (maxReservedSize < oldMaxReservedSize) { typename std::list::iterator i = reservedEntries_.begin(); for (; i != reservedEntries_.end();) { const BufferEntry& entry = *i; if (entry.capacity_ > maxReservedSize / 8) { CV_DbgAssert(currentReservedSize >= entry.capacity_); currentReservedSize -= entry.capacity_; derived()._releaseBufferEntry(entry); i = reservedEntries_.erase(i); continue; } ++i; } _checkSizeOfReservedEntries(); } } virtual void freeAllReservedBuffers() CV_OVERRIDE { AutoLock locker(mutex_); typename std::list::const_iterator i = reservedEntries_.begin(); for (; i != reservedEntries_.end(); ++i) { const BufferEntry& entry = *i; derived()._releaseBufferEntry(entry); } reservedEntries_.clear(); currentReservedSize = 0; } }; struct CLBufferEntry { cl_mem clBuffer_; size_t capacity_; CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { } }; 
class OpenCLBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl { public: typedef struct CLBufferEntry BufferEntry; protected: int createFlags_; public: OpenCLBufferPoolImpl(int createFlags = 0) : createFlags_(createFlags) { } void _allocateBufferEntry(BufferEntry& entry, size_t size) { CV_DbgAssert(entry.clBuffer_ == NULL); entry.capacity_ = alignSize(size, (int)_allocationGranularity(size)); Context& ctx = Context::getDefault(); cl_int retval = CL_SUCCESS; entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval); CV_OCL_CHECK_RESULT(retval, cv::format("clCreateBuffer(capacity=%lld) => %p", (long long int)entry.capacity_, (void*)entry.clBuffer_).c_str()); CV_Assert(entry.clBuffer_ != NULL); if(retval == CL_SUCCESS) { CV_IMPL_ADD(CV_IMPL_OCL); } LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n", (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_); allocatedEntries_.push_back(entry); } void _releaseBufferEntry(const BufferEntry& entry) { CV_Assert(entry.capacity_ != 0); CV_Assert(entry.clBuffer_ != NULL); LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n", entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_); CV_OCL_DBG_CHECK(clReleaseMemObject(entry.clBuffer_)); } }; #ifdef HAVE_OPENCL_SVM struct CLSVMBufferEntry { void* clBuffer_; size_t capacity_; CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { } }; class OpenCLSVMBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl { public: typedef struct CLSVMBufferEntry BufferEntry; public: OpenCLSVMBufferPoolImpl() { } void _allocateBufferEntry(BufferEntry& entry, size_t size) { CV_DbgAssert(entry.clBuffer_ == NULL); entry.capacity_ = alignSize(size, (int)_allocationGranularity(size)); Context& ctx = Context::getDefault(); const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx); bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer(); cl_svm_mem_flags memFlags = 
CL_MEM_READ_WRITE | (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0); const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx); CV_DbgAssert(svmFns->isValid()); CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_); void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0); CV_Assert(buf); entry.clBuffer_ = buf; { CV_IMPL_ADD(CV_IMPL_OCL); } LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n", (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_); allocatedEntries_.push_back(entry); } void _releaseBufferEntry(const BufferEntry& entry) { CV_Assert(entry.capacity_ != 0); CV_Assert(entry.clBuffer_ != NULL); LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n", entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_); Context& ctx = Context::getDefault(); const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx); CV_DbgAssert(svmFns->isValid()); CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", entry.clBuffer_); svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_); } }; #endif template class AlignedDataPtr { protected: const size_t size_; uchar* const originPtr_; const size_t alignment_; uchar* ptr_; uchar* allocatedPtr_; public: AlignedDataPtr(uchar* ptr, size_t size, size_t alignment) : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL) { CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n CV_DbgAssert(!readAccess || ptr); if (((size_t)ptr_ & (alignment - 1)) != 0) { allocatedPtr_ = new uchar[size_ + alignment - 1]; ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1)); if (readAccess) { memcpy(ptr_, originPtr_, size_); } } } uchar* getAlignedPtr() const { CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0); return ptr_; } ~AlignedDataPtr() { if (allocatedPtr_) { if (writeAccess) { memcpy(originPtr_, ptr_, size_); } delete[] allocatedPtr_; allocatedPtr_ = NULL; } ptr_ = NULL; } 
private: AlignedDataPtr(const AlignedDataPtr&); // disabled AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled }; template class AlignedDataPtr2D { protected: const size_t size_; uchar* const originPtr_; const size_t alignment_; uchar* ptr_; uchar* allocatedPtr_; size_t rows_; size_t cols_; size_t step_; public: AlignedDataPtr2D(uchar* ptr, size_t rows, size_t cols, size_t step, size_t alignment, size_t extrabytes=0) : size_(rows*step), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL), rows_(rows), cols_(cols), step_(step) { CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n CV_DbgAssert(!readAccess || ptr != NULL); if (ptr == 0 || ((size_t)ptr_ & (alignment - 1)) != 0) { allocatedPtr_ = new uchar[size_ + extrabytes + alignment - 1]; ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1)); if (readAccess) { for (size_t i = 0; i < rows_; i++) memcpy(ptr_ + i*step_, originPtr_ + i*step_, cols_); } } } uchar* getAlignedPtr() const { CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0); return ptr_; } ~AlignedDataPtr2D() { if (allocatedPtr_) { if (writeAccess) { for (size_t i = 0; i < rows_; i++) memcpy(originPtr_ + i*step_, ptr_ + i*step_, cols_); } delete[] allocatedPtr_; allocatedPtr_ = NULL; } ptr_ = NULL; } private: AlignedDataPtr2D(const AlignedDataPtr2D&); // disabled AlignedDataPtr2D& operator=(const AlignedDataPtr2D&); // disabled }; #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT #define CV_OPENCL_DATA_PTR_ALIGNMENT 16 #endif void Context::Impl::__init_buffer_pools() { bufferPool_ = std::make_shared(0); OpenCLBufferPoolImpl& bufferPool = *bufferPool_.get(); bufferPoolHostPtr_ = std::make_shared(CL_MEM_ALLOC_HOST_PTR); OpenCLBufferPoolImpl& bufferPoolHostPtr = *bufferPoolHostPtr_.get(); size_t defaultPoolSize = ocl::Device::getDefault().isIntel() ? 
1 << 27 : 0; size_t poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize); bufferPool.setMaxReservedSize(poolSize); size_t poolSizeHostPtr = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize); bufferPoolHostPtr.setMaxReservedSize(poolSizeHostPtr); #ifdef HAVE_OPENCL_SVM bufferPoolSVM_ = std::make_shared(); OpenCLSVMBufferPoolImpl& bufferPoolSVM = *bufferPoolSVM_.get(); size_t poolSizeSVM = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize); bufferPoolSVM.setMaxReservedSize(poolSizeSVM); #endif CV_LOG_INFO(NULL, "OpenCL: Initializing buffer pool for context@" << contextId << " with max capacity: poolSize=" << poolSize << " poolSizeHostPtr=" << poolSizeHostPtr); } class OpenCLAllocator CV_FINAL : public MatAllocator { public: enum AllocatorFlags { ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0, ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1, #ifdef HAVE_OPENCL_SVM ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2, #endif ALLOCATOR_FLAGS_EXTERNAL_BUFFER = 1 << 3 // convertFromBuffer() }; OpenCLAllocator() { matStdAllocator = Mat::getDefaultAllocator(); } ~OpenCLAllocator() { flushCleanupQueue(); } UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step, AccessFlag flags, UMatUsageFlags usageFlags) const { UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags); return u; } static bool isOpenCLMapForced() // force clEnqueueMapBuffer / clEnqueueUnmapMemObject OpenCL API { static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_MAPPING", false); return value; } static bool isOpenCLCopyingForced() // force clEnqueueReadBuffer[Rect] / clEnqueueWriteBuffer[Rect] OpenCL API { static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_COPYING", false); return value; } void getBestFlags(const Context& ctx, AccessFlag 
/*flags*/, UMatUsageFlags usageFlags, int& createFlags, UMatData::MemoryFlag& flags0) const { const Device& dev = ctx.device(0); createFlags = 0; if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0) createFlags |= CL_MEM_ALLOC_HOST_PTR; if (!isOpenCLCopyingForced() && (isOpenCLMapForced() || (dev.hostUnifiedMemory() #ifndef __APPLE__ || dev.isIntel() #endif ) ) ) flags0 = static_cast(0); else flags0 = UMatData::COPY_ON_MAP; } UMatData* allocate(int dims, const int* sizes, int type, void* data, size_t* step, AccessFlag flags, UMatUsageFlags usageFlags) const CV_OVERRIDE { if(!useOpenCL()) return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags); flushCleanupQueue(); CV_Assert(data == 0); size_t total = CV_ELEM_SIZE(type); for( int i = dims-1; i >= 0; i-- ) { if( step ) step[i] = total; total *= sizes[i]; } Context& ctx = Context::getDefault(); if (!ctx.getImpl()) return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags); Context::Impl& ctxImpl = *ctx.getImpl(); int createFlags = 0; UMatData::MemoryFlag flags0 = static_cast(0); getBestFlags(ctx, flags, usageFlags, createFlags, flags0); void* handle = NULL; int allocatorFlags = 0; #ifdef HAVE_OPENCL_SVM const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx); if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport()) { allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED; handle = ctxImpl.getBufferPoolSVM().allocate(total); // this property is constant, so single buffer pool can be used here bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer(); allocatorFlags |= isFineGrainBuffer ? 
svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
        }
        else
#endif
        if (createFlags == 0)
        {
            // Default usage: take the device buffer from the context's buffer pool.
            allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
            handle = ctxImpl.getBufferPool().allocate(total);
        }
        else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
        {
            // USAGE_ALLOCATE_HOST_MEMORY: use the dedicated host-pointer buffer pool.
            allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
            handle = ctxImpl.getBufferPoolHostPtr().allocate(total);
        }
        else
        {
            CV_Assert(handle != NULL); // Unsupported, throw
        }
        if (!handle)
            return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
        UMatData* u = new UMatData(this);
        u->data = 0;
        u->size = total;
        u->handle = handle;
        u->flags = flags0;
        u->allocatorFlags_ = allocatorFlags;
        // Keep the owning context alive for as long as this allocation exists,
        // so deallocate() can return the buffer to the right pool.
        u->allocatorContext = std::static_pointer_cast(std::make_shared(ctx));
        CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
        u->markHostCopyObsolete(true);
        opencl_allocator_stats.onAllocate(u->size);
        return u;
    }

    // Lazily attaches an OpenCL buffer to a UMatData that already owns host memory
    // (u->origdata), e.g. when a Mat is used through the UMat/OpenCL path for the
    // first time. Returns false if no OpenCL buffer could be created (the caller
    // then stays on the CPU path).
    bool allocate(UMatData* u, AccessFlag accessFlags, UMatUsageFlags usageFlags) const CV_OVERRIDE
    {
        if(!u)
            return false;
        flushCleanupQueue();
        UMatDataAutoLock lock(u);
        if(u->handle == 0)
        {
            CV_Assert(u->origdata != 0);
            Context& ctx = Context::getDefault();
            int createFlags = 0;
            UMatData::MemoryFlag flags0 = static_cast(0);
            getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);

            bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0;

            cl_context ctx_handle = (cl_context)ctx.ptr();
            int allocatorFlags = 0;
            UMatData::MemoryFlag tempUMatFlags = static_cast(0);
            void* handle = NULL;
            cl_int retval = CL_SUCCESS;

#ifdef HAVE_OPENCL_SVM
            svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
            bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
            if (useSVM && svmCaps.isSupportFineGrainSystem())
            {
                // Fine-grain system SVM: the host pointer is directly usable by the
                // device, no extra allocation or copy is needed.
                allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
                tempUMatFlags = UMatData::TEMP_UMAT;
                handle = u->origdata;
                CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
            }
            else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
            {
                if (!(accessFlags & ACCESS_FAST)) // memcpy used
                {
                    // Allocate an SVM buffer and copy the host data into it.
                    // Coarse-grain buffers must be mapped for host writes.
                    bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();

                    cl_svm_mem_flags memFlags = createFlags |
                            (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);

                    const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
                    CV_DbgAssert(svmFns->isValid());

                    CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
                    handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
                    CV_Assert(handle);

                    cl_command_queue q = NULL;
                    if (!isFineGrainBuffer)
                    {
                        q = (cl_command_queue)Queue::getDefault().ptr();
                        CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
                        cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
                                handle, u->size,
                                0, NULL, NULL);
                        CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
                    }
                    memcpy(handle, u->origdata, u->size);
                    if (!isFineGrainBuffer)
                    {
                        CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
                        cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
                        CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
                    }

                    tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
                    allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
                                                : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
                }
            }
            else
#endif
            {
                if( copyOnMap )
                    accessFlags &= ~ACCESS_FAST;

                tempUMatFlags = UMatData::TEMP_UMAT;
                if (
                #ifdef __APPLE__
                    !copyOnMap &&
                #endif
                    CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
                    // There are OpenCL runtime issues for less aligned data
                    && (CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR != 0
                        && u->origdata == cv::alignPtr(u->origdata, (int)CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR))
                    // Avoid sharing of host memory between OpenCL buffers
                    && !(u->originalUMatData && u->originalUMatData->handle)
                )
                {
                    // Change the host-side origdata[size] to "pinned memory" that enables fast
                    // DMA-transfers over PCIe to the device. Often used with clEnqueueMapBuffer/clEnqueueUnmapMemObject
                    handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|(createFlags & ~CL_MEM_ALLOC_HOST_PTR),
                                            u->size, u->origdata, &retval);
                    CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_USE_HOST_PTR|(createFlags & ~CL_MEM_ALLOC_HOST_PTR), sz=%lld, origdata=%p) => %p",
                            (long long int)u->size, u->origdata, (void*)handle).c_str());
                }
                if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
                {
                    // Allocate device-side memory and immediately copy data from the host-side pointer origdata[size].
                    // If createFlags=CL_MEM_ALLOC_HOST_PTR (aka cv::USAGE_ALLOCATE_HOST_MEMORY), then
                    // additionally allocate a host-side "pinned" duplicate of the origdata that is
                    // managed by OpenCL. This is potentially faster in unaligned/unmanaged scenarios.
                    handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
                                            u->size, u->origdata, &retval);
                    CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags, sz=%lld, origdata=%p) => %p",
                            (long long int)u->size, u->origdata, (void*)handle).c_str());
                    tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
                }
            }
            CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer() => %p", (void*)handle).c_str());
            if(!handle || retval != CL_SUCCESS)
                return false;
            u->handle = handle;
            // Remember the previous (CPU) allocator so deallocate_() can hand the
            // UMatData back once the OpenCL buffer is released.
            u->prevAllocator = u->currAllocator;
            u->currAllocator = this;
            u->flags |= tempUMatFlags | flags0;
            u->allocatorFlags_ = allocatorFlags;
        }
        if (!!(accessFlags & ACCESS_WRITE))
            u->markHostCopyObsolete(true);
        opencl_allocator_stats.onAllocate(u->size);
        return true;
    }

    /*void sync(UMatData* u) const
    {
        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
        UMatDataAutoLock lock(u);

        if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
        {
            if( u->tempCopiedUMat() )
            {
                clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
                                    u->size, u->origdata, 0, 0, 0);
            }
            else
            {
                cl_int retval = 0;
                void* data = clEnqueueMapBuffer(q,
(cl_mem)u->handle, CL_TRUE,
                                                (CL_MAP_READ | CL_MAP_WRITE),
                                                0, u->size, 0, 0, 0, &retval);
                clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
                clFinish(q);
            }
            u->markHostCopyObsolete(false);
        }
        else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
        {
            clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
                                 u->size, u->data, 0, 0, 0);
        }
    }*/

    // Entry point for releasing a UMatData owned by this allocator.
    // Either frees immediately or defers to the cleanup queue when the
    // ASYNC_CLEANUP flag is set (see flushCleanupQueue()).
    void deallocate(UMatData* u) const CV_OVERRIDE
    {
        if(!u)
            return;

        CV_Assert(u->urefcount == 0);
        CV_Assert(u->refcount == 0 && "UMat deallocation error: some derived Mat is still alive");

        CV_Assert(u->handle != 0);
        CV_Assert(u->mapcount == 0);

        if (!!(u->flags & UMatData::ASYNC_CLEANUP))
            addToCleanupQueue(u);
        else
            deallocate_(u);
    }

    // Actually releases the OpenCL resources behind a UMatData.
    // For "temporary" UMats (device buffer attached to caller-owned host memory)
    // the still-valid device content is written back to origdata first, then the
    // buffer is released and the UMatData is handed back to the previous (CPU)
    // allocator. For pool-backed UMats the buffer is returned to its pool and the
    // UMatData is destroyed.
    void deallocate_(UMatData* u) const
    {
        CV_Assert(u);
        CV_Assert(u->handle);
        if ((u->allocatorFlags_ & ALLOCATOR_FLAGS_EXTERNAL_BUFFER) == 0)
        {
            opencl_allocator_stats.onFree(u->size);
        }

#ifdef _WIN32
        if (cv::__termination)  // process is not in consistent state (after ExitProcess call) and terminating
            return;  // avoid any OpenCL calls
#endif
        if(u->tempUMat())
        {
            CV_Assert(u->origdata);
//            UMatDataAutoLock lock(u);

            if (u->hostCopyObsolete())
            {
#ifdef HAVE_OPENCL_SVM
                if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
                {
                    Context& ctx = Context::getDefault();
                    const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
                    CV_DbgAssert(svmFns->isValid());

                    if( u->tempCopiedUMat() )
                    {
                        // SVM buffer was a copy of origdata: copy the device data back
                        // (mapping first for coarse-grain buffers).
                        CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
                                (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
                        bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
                        cl_command_queue q = NULL;
                        if (!isFineGrainBuffer)
                        {
                            CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
                            q = (cl_command_queue)Queue::getDefault().ptr();
                            CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
                            cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
                                    u->handle, u->size,
                                    0, NULL, NULL);
                            CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
                        }
                        clFinish(q);
                        memcpy(u->origdata, u->handle, u->size);
                        if (!isFineGrainBuffer)
                        {
                            CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
                            cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
                            CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
                        }
                    }
                    else
                    {
                        CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
                        // nothing
                    }
                }
                else
#endif
                {
                    cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
                    if( u->tempCopiedUMat() )
                    {
                        // Device buffer was a copy: read it back into origdata.
                        AlignedDataPtr alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
                        CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
                                            u->size, alignedPtr.getAlignedPtr(), 0, 0, 0));
                    }
                    else
                    {
                        cl_int retval = 0;
                        if (u->tempUMat())
                        {
                            // CL_MEM_USE_HOST_PTR case: a map/unmap round-trip makes the
                            // runtime synchronize the host pointer with device content.
                            CV_Assert(u->mapcount == 0);
                            flushCleanupQueue(); // workaround for CL_OUT_OF_RESOURCES problem (#9960)
                            void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
                                    (CL_MAP_READ | CL_MAP_WRITE),
                                    0, u->size, 0, 0, 0, &retval);
                            CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, data).c_str());
                            CV_Assert(u->origdata == data && "Details: https://github.com/opencv/opencv/issues/6293");
                            if (u->originalUMatData)
                            {
                                CV_Assert(u->originalUMatData->data == data);
                            }
                            retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
                            CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, data, (long long int)u->size).c_str());
                            CV_OCL_DBG_CHECK(clFinish(q));
                        }
                    }
                }
                u->markHostCopyObsolete(false);
            }
            else
            {
                // nothing
            }
#ifdef HAVE_OPENCL_SVM
            if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
            {
                if( u->tempCopiedUMat() )
                {
                    Context& ctx = Context::getDefault();
                    const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
                    CV_DbgAssert(svmFns->isValid());
                    CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
                    svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
                }
            }
            else
#endif
            {
                cl_int retval = clReleaseMemObject((cl_mem)u->handle);
                CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clReleaseMemObject(ptr=%p)", (void*)u->handle).c_str());
            }
            u->handle = 0;
            u->markDeviceCopyObsolete(true);
            // Hand the UMatData back to the CPU allocator it originally came from.
            u->currAllocator = u->prevAllocator;
            u->prevAllocator = NULL;
            if(u->data && u->copyOnMap() && u->data != u->origdata)
                fastFree(u->data);
            u->data = u->origdata;
            u->currAllocator->deallocate(u);
            u = NULL;
        }
        else
        {
            CV_Assert(u->origdata == NULL);
            if(u->data && u->copyOnMap() && u->data != u->origdata)
            {
                fastFree(u->data);
                u->data = 0;
                u->markHostCopyObsolete(true);
            }
            if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
            {
                std::shared_ptr pCtx = std::static_pointer_cast(u->allocatorContext);
                CV_Assert(pCtx);
                ocl::Context& ctx = *pCtx.get();
                CV_Assert(ctx.getImpl());
                ctx.getImpl()->getBufferPool().release((cl_mem)u->handle);
            }
            else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
            {
                std::shared_ptr pCtx = std::static_pointer_cast(u->allocatorContext);
                CV_Assert(pCtx);
                ocl::Context& ctx = *pCtx.get();
                CV_Assert(ctx.getImpl());
                ctx.getImpl()->getBufferPoolHostPtr().release((cl_mem)u->handle);
            }
#ifdef HAVE_OPENCL_SVM
            else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
            {
                std::shared_ptr pCtx = std::static_pointer_cast(u->allocatorContext);
                CV_Assert(pCtx);
                ocl::Context& ctx = *pCtx.get();
                if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
                {
                    //nothing
                }
                else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
                        (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
                {
                    // Unmap a still-mapped coarse/fine grain buffer before returning it
                    // to the SVM pool.
                    const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
                    CV_DbgAssert(svmFns->isValid());
                    cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
                    if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
                    {
                        CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
                        cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
                        CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
                    }
                }
                CV_Assert(ctx.getImpl());
                ctx.getImpl()->getBufferPoolSVM().release((void*)u->handle);
            }
#endif
            else
            {
                CV_OCL_DBG_CHECK(clReleaseMemObject((cl_mem)u->handle));
            }
            u->handle = 0;
            u->markDeviceCopyObsolete(true);
            delete u;
            u = NULL;
        }
        CV_Assert(u == NULL);
    }

    // synchronized call (external UMatDataAutoLock, see UMat::getMat)
    // Makes device data visible on the host: maps the buffer (or the SVM
    // allocation) into u->data, falling back to a host-side copy in
    // copy-on-map mode.
    void map(UMatData* u, AccessFlag accessFlags) const CV_OVERRIDE
    {
        CV_Assert(u && u->handle);

        if (!!(accessFlags & ACCESS_WRITE))
            u->markDeviceCopyObsolete(true);

        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();

        {
            if( !u->copyOnMap() )
            {
                // TODO
                // because there can be other map requests for the same UMat with different access flags,
                // we use the universal (read-write) access mode.
#ifdef HAVE_OPENCL_SVM
                if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
                {
                    if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
                    {
                        Context& ctx = Context::getDefault();
                        const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
                        CV_DbgAssert(svmFns->isValid());

                        if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
                        {
                            CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
                            cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
                                    u->handle, u->size,
                                    0, NULL, NULL);
                            CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
                            u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
                        }
                    }
                    clFinish(q);
                    u->data = (uchar*)u->handle;
                    u->markHostCopyObsolete(false);
                    u->markDeviceMemMapped(true);
                    return;
                }
#endif
                cl_int retval = CL_SUCCESS;
                if (!u->deviceMemMapped())
                {
                    CV_Assert(u->refcount == 1);
                    CV_Assert(u->mapcount++ == 0);
                    u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
                            (CL_MAP_READ | CL_MAP_WRITE),
                            0, u->size, 0, 0, 0, &retval);
                    CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, u->data).c_str());
                }
                if (u->data && retval == CL_SUCCESS)
                {
                    u->markHostCopyObsolete(false);
                    u->markDeviceMemMapped(true);
                    return;
                }

                // TODO Is it really a good idea and was it tested well?
                // if map failed, switch to copy-on-map mode for the particular buffer
                u->flags |= UMatData::COPY_ON_MAP;
            }

            if(!u->data)
            {
                u->data = (uchar*)fastMalloc(u->size);
                u->markHostCopyObsolete(true);
            }
        }

        if (!!(accessFlags & ACCESS_READ) && u->hostCopyObsolete())
        {
            AlignedDataPtr alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
#ifdef HAVE_OPENCL_SVM
            CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
#endif
            cl_int retval = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
                    0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
            CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueReadBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)", (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
            u->markHostCopyObsolete(false);
        }
    }

    // Counterpart of map(): releases the host mapping (or, in copy-on-map mode,
    // writes the host copy back to the device buffer).
    void unmap(UMatData* u) const CV_OVERRIDE
    {
        if(!u)
            return;

        CV_Assert(u->handle != 0);

        UMatDataAutoLock autolock(u);

        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
        cl_int retval = 0;
        if( !u->copyOnMap() && u->deviceMemMapped() )
        {
            CV_Assert(u->data != NULL);
#ifdef HAVE_OPENCL_SVM
            if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
            {
                if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
                {
                    Context& ctx = Context::getDefault();
                    const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
                    CV_DbgAssert(svmFns->isValid());

                    CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
                    {
                        CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
                        cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
                                0, NULL, NULL);
                        CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
                        clFinish(q);
                        u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
                    }
                }
                if (u->refcount == 0)
                    u->data = 0;
                u->markDeviceCopyObsolete(false);
                u->markHostCopyObsolete(true);
                return;
            }
#endif
            if (u->refcount == 0)
            {
                CV_Assert(u->mapcount-- == 1);
                retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0);
                CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, u->data, (long long int)u->size).c_str());
                if (Device::getDefault().isAMD())
                {
                    // required for multithreaded applications (see stitching test)
                    CV_OCL_DBG_CHECK(clFinish(q));
                }
                u->markDeviceMemMapped(false);
                u->data = 0;
                u->markDeviceCopyObsolete(false);
                u->markHostCopyObsolete(true);
            }
        }
        else if( u->copyOnMap() && u->deviceCopyObsolete() )
        {
            AlignedDataPtr alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
#ifdef HAVE_OPENCL_SVM
            CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
#endif
            retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
                    0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
            CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)", (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
            u->markDeviceCopyObsolete(false);
            u->markHostCopyObsolete(true);
        }
    }

    // Determines whether an N-dimensional region (with optional src/dst offsets
    // and steps) is a single contiguous byte range. Returns true and fills
    // total/srcrawofs/dstrawofs for the contiguous case; otherwise converts the
    // geometry into the 3-element {x, y, z} arrays expected by the OpenCL
    // rect-transfer APIs (new_sz/new_*ofs/new_*step).
    bool checkContinuous(int dims, const size_t sz[],
            const size_t srcofs[], const size_t srcstep[],
            const size_t dstofs[], const size_t dststep[],
            size_t& total, size_t new_sz[],
            size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
            size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
    {
        bool iscontinuous = true;
        srcrawofs = srcofs ? srcofs[dims-1] : 0;
        dstrawofs = dstofs ? dstofs[dims-1] : 0;
        total = sz[dims-1];
        for( int i = dims-2; i >= 0; i-- )
        {
            if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
                iscontinuous = false;
            total *= sz[i];
            if( srcofs )
                srcrawofs += srcofs[i]*srcstep[i];
            if( dstofs )
                dstrawofs += dstofs[i]*dststep[i];
        }

        if( !iscontinuous )
        {
            // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
            if( dims == 2 )
            {
                new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
                // we assume that new_... arrays are initialized by caller
                // with 0's, so there is no else branch
                if( srcofs )
                {
                    new_srcofs[0] = srcofs[1];
                    new_srcofs[1] = srcofs[0];
                    new_srcofs[2] = 0;
                }

                if( dstofs )
                {
                    new_dstofs[0] = dstofs[1];
                    new_dstofs[1] = dstofs[0];
                    new_dstofs[2] = 0;
                }

                new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
                new_dststep[0] = dststep[0]; new_dststep[1] = 0;
            }
            else
            {
                // we could check for dims == 3 here,
                // but from user perspective this one is more informative
                CV_Assert(dims <= 3);
                new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
                if( srcofs )
                {
                    new_srcofs[0] = srcofs[2];
                    new_srcofs[1] = srcofs[1];
                    new_srcofs[2] = srcofs[0];
                }

                if( dstofs )
                {
                    new_dstofs[0] = dstofs[2];
                    new_dstofs[1] = dstofs[1];
                    new_dstofs[2] = dstofs[0];
                }

                new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
                new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
            }
        }
        return iscontinuous;
    }

    // Copies an (optionally strided) region of the device buffer into host
    // memory at dstptr. Uses the cached host copy when it is still valid,
    // SVM map+memcpy for SVM buffers, and clEnqueueReadBuffer(Rect) otherwise.
    void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
                  const size_t srcofs[], const size_t srcstep[],
                  const size_t dststep[]) const CV_OVERRIDE
    {
        if(!u)
            return;
        UMatDataAutoLock autolock(u);

        if( u->data && !u->hostCopyObsolete() )
        {
            // Host copy is up to date: plain CPU copy is enough.
            Mat::getDefaultAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
            return;
        }
        CV_Assert( u->handle != 0 );
        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();

        size_t total = 0, new_sz[] = {0, 0, 0};
        size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
        size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};

        bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
                                            total, new_sz,
                                            srcrawofs, new_srcofs, new_srcstep,
                                            dstrawofs, new_dstofs, new_dststep);
#ifdef HAVE_OPENCL_SVM
        if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
        {
            CV_DbgAssert(u->data == NULL || u->data == u->handle);
            Context& ctx = Context::getDefault();
            const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
            CV_DbgAssert(svmFns->isValid());

            CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
            if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
            {
                CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
                cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
                        u->handle, u->size,
                        0, NULL, NULL);
                CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
            }
            clFinish(q);
            if( iscontinuous )
            {
                memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
            }
            else
            {
                // This code is from MatAllocator::download()
                int isz[CV_MAX_DIM];
                uchar* srcptr = (uchar*)u->handle;
                for( int i = 0; i < dims; i++ )
                {
                    CV_Assert( sz[i] <= (size_t)INT_MAX );
                    if( sz[i] == 0 )
                        return;
                    if( srcofs )
                        srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
                    isz[i] = (int)sz[i];
                }

                Mat src(dims, isz, CV_8U, srcptr, srcstep);
                Mat dst(dims, isz, CV_8U, dstptr, dststep);

                const Mat* arrays[] = { &src, &dst };
                uchar* ptrs[2];
                NAryMatIterator it(arrays, ptrs, 2);
                size_t j, planesz = it.size;

                for( j = 0; j < it.nplanes; j++, ++it )
                    memcpy(ptrs[1], ptrs[0], planesz);
            }
            if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
            {
                CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
                cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
                        0, NULL, NULL);
                CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
                clFinish(q);
            }
        }
        else
#endif
        {
            if( iscontinuous )
            {
                AlignedDataPtr alignedPtr((uchar*)dstptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
                CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
                    srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0));
            }
            else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
            {
                // Rect operations disabled (runtime workaround): emulate them with a
                // padded linear read plus per-row memcpy.
                const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
                size_t new_srcrawofs = srcrawofs & ~(padding-1);
                size_t membuf_ofs = srcrawofs - new_srcrawofs;
                AlignedDataPtr2D alignedPtr(0, new_sz[1], new_srcstep[0], new_srcstep[0],
                                            CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
                uchar* ptr = alignedPtr.getAlignedPtr();
                CV_Assert(new_srcstep[0] >= new_sz[0]);
                total = alignSize(new_srcstep[0]*new_sz[1] + membuf_ofs, padding);
                total = std::min(total, u->size - new_srcrawofs);
                CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
                                                 new_srcrawofs, total, ptr, 0, 0, 0));
                for( size_t i = 0; i < new_sz[1]; i++ )
                    memcpy( (uchar*)dstptr + i*new_dststep[0], ptr + i*new_srcstep[0] + membuf_ofs, new_sz[0]);
            }
            else
            {
                AlignedDataPtr2D alignedPtr((uchar*)dstptr, new_sz[1], new_sz[0], new_dststep[0],
                    CV_OPENCL_DATA_PTR_ALIGNMENT);
                uchar* ptr = alignedPtr.getAlignedPtr();

                CV_OCL_CHECK(clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
                    new_srcofs, new_dstofs, new_sz,
                    new_srcstep[0], 0,
                    new_dststep[0], 0,
                    ptr, 0, 0, 0));
            }
        }
    }

    // Copies host data at srcptr into an (optionally strided) region of the
    // device buffer. Mirror of download(); see the branch comments there.
    void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
                const size_t dstofs[], const size_t dststep[],
                const size_t srcstep[]) const CV_OVERRIDE
    {
        if(!u)
            return;

        // there should be no user-visible CPU copies of the UMat which we are going to copy to
        CV_Assert(u->refcount == 0 || u->tempUMat());

        size_t total = 0, new_sz[] = {0, 0, 0};
        size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
        size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};

        bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
                                            total, new_sz,
                                            srcrawofs, new_srcofs, new_srcstep,
                                            dstrawofs, new_dstofs, new_dststep);

        UMatDataAutoLock autolock(u);

        // if there is cached CPU copy of the GPU matrix,
        // we could use it as a destination.
        // we can do it in 2 cases:
        //    1. we overwrite the whole content
        //    2.
// we overwrite part of the matrix, but the GPU copy is out-of-date
        if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
        {
            Mat::getDefaultAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
            u->markHostCopyObsolete(false);
            u->markDeviceCopyObsolete(true);
            return;
        }

        CV_Assert( u->handle != 0 );
        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();

#ifdef HAVE_OPENCL_SVM
        if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
        {
            CV_DbgAssert(u->data == NULL || u->data == u->handle);
            Context& ctx = Context::getDefault();
            const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
            CV_DbgAssert(svmFns->isValid());

            CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
            if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
            {
                CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
                cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
                        u->handle, u->size,
                        0, NULL, NULL);
                CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
            }
            clFinish(q);
            if( iscontinuous )
            {
                memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
            }
            else
            {
                // This code is from MatAllocator::upload()
                int isz[CV_MAX_DIM];
                uchar* dstptr = (uchar*)u->handle;
                for( int i = 0; i < dims; i++ )
                {
                    CV_Assert( sz[i] <= (size_t)INT_MAX );
                    if( sz[i] == 0 )
                        return;
                    if( dstofs )
                        dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
                    isz[i] = (int)sz[i];
                }

                Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
                Mat dst(dims, isz, CV_8U, dstptr, dststep);

                const Mat* arrays[] = { &src, &dst };
                uchar* ptrs[2];
                NAryMatIterator it(arrays, ptrs, 2);
                size_t j, planesz = it.size;

                for( j = 0; j < it.nplanes; j++, ++it )
                    memcpy(ptrs[1], ptrs[0], planesz);
            }
            if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
            {
                CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
                cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
                        0, NULL, NULL);
                CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
                clFinish(q);
            }
        }
        else
#endif
        {
            if( iscontinuous )
            {
                AlignedDataPtr alignedPtr((uchar*)srcptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
                cl_int retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
                    dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0);
                CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, offset=%lld, sz=%lld, data=%p, 0, 0, 0)", (void*)u->handle, (long long int)dstrawofs, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
            }
            else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
            {
                // Rect operations disabled: read-modify-write the padded linear span
                // so only the target rows are changed.
                const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
                size_t new_dstrawofs = dstrawofs & ~(padding-1);
                size_t membuf_ofs = dstrawofs - new_dstrawofs;
                AlignedDataPtr2D alignedPtr(0, new_sz[1], new_dststep[0], new_dststep[0],
                                            CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
                uchar* ptr = alignedPtr.getAlignedPtr();
                CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
                total = alignSize(new_dststep[0]*new_sz[1] + membuf_ofs, padding);
                total = std::min(total, u->size - new_dstrawofs);
                /*printf("new_sz0=%d, new_sz1=%d, membuf_ofs=%d, total=%d (%08x), new_dstrawofs=%d (%08x)\n",
                       (int)new_sz[0], (int)new_sz[1], (int)membuf_ofs, (int)total, (int)total, (int)new_dstrawofs, (int)new_dstrawofs);*/
                CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
                                                 new_dstrawofs, total, ptr, 0, 0, 0));
                for( size_t i = 0; i < new_sz[1]; i++ )
                    memcpy( ptr + i*new_dststep[0] + membuf_ofs, (uchar*)srcptr + i*new_srcstep[0], new_sz[0]);
                CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
                                                 new_dstrawofs, total, ptr, 0, 0, 0));
            }
            else
            {
                AlignedDataPtr2D alignedPtr((uchar*)srcptr, new_sz[1], new_sz[0], new_srcstep[0],
                    CV_OPENCL_DATA_PTR_ALIGNMENT);
                uchar* ptr = alignedPtr.getAlignedPtr();

                CV_OCL_CHECK(clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
                    new_dstofs, new_srcofs, new_sz,
                    new_dststep[0], 0,
                    new_srcstep[0], 0,
                    ptr, 0, 0, 0));
            }
        }
        u->markHostCopyObsolete(true);
#ifdef HAVE_OPENCL_SVM
        if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
                (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
        {
            // nothing
        }
        else
#endif
        {
            u->markHostCopyObsolete(true);
        }
        u->markDeviceCopyObsolete(false);
    }

    // Device-to-device copy between two UMatData regions. Falls back to
    // upload()/download() when either side has a newer host copy (or no
    // device buffer at all).
    void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
              const size_t srcofs[], const size_t srcstep[],
              const size_t dstofs[], const size_t dststep[], bool _sync) const CV_OVERRIDE
    {
        if(!src || !dst)
            return;

        size_t total = 0, new_sz[] = {0, 0, 0};
        size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
        size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};

        bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
                                            total, new_sz,
                                            srcrawofs, new_srcofs, new_srcstep,
                                            dstrawofs, new_dstofs, new_dststep);

        UMatDataAutoLock src_autolock(src, dst);

        if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
        {
            upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
            return;
        }
        if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
        {
            download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
            dst->markHostCopyObsolete(false);
#ifdef HAVE_OPENCL_SVM
            if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
                    (dst->allocatorFlags_ &
svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
            {
                // nothing
            }
            else
#endif
            {
                dst->markDeviceCopyObsolete(true);
            }
            return;
        }

        // there should be no user-visible CPU copies of the UMat which we are going to copy to
        CV_Assert(dst->refcount == 0);
        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();

        cl_int retval = CL_SUCCESS;
#ifdef HAVE_OPENCL_SVM
        if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
                (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
        {
            if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
                            (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
            {
                // Both sides are SVM: use clEnqueueSVMMemcpy (contiguous) or a
                // host-side NAryMatIterator copy (strided).
                Context& ctx = Context::getDefault();
                const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
                CV_DbgAssert(svmFns->isValid());

                if( iscontinuous )
                {
                    CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
                            (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
                    cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
                            (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
                            total, 0, NULL, NULL);
                    CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMemcpy()");
                }
                else
                {
                    clFinish(q);
                    // This code is from MatAllocator::download()/upload()
                    int isz[CV_MAX_DIM];
                    uchar* srcptr = (uchar*)src->handle;
                    for( int i = 0; i < dims; i++ )
                    {
                        CV_Assert( sz[i] <= (size_t)INT_MAX );
                        if( sz[i] == 0 )
                            return;
                        if( srcofs )
                            srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
                        isz[i] = (int)sz[i];
                    }
                    Mat m_src(dims, isz, CV_8U, srcptr, srcstep);

                    uchar* dstptr = (uchar*)dst->handle;
                    for( int i = 0; i < dims; i++ )
                    {
                        if( dstofs )
                            dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
                    }
                    Mat m_dst(dims, isz, CV_8U, dstptr, dststep);

                    const Mat* arrays[] = { &m_src, &m_dst };
                    uchar* ptrs[2];
                    NAryMatIterator it(arrays, ptrs, 2);
                    size_t j, planesz = it.size;

                    for( j = 0; j < it.nplanes; j++, ++it )
                        memcpy(ptrs[1], ptrs[0], planesz);
                }
            }
            else
            {
                // Mixed SVM/non-SVM pair: route through map + upload/download.
                if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
                {
                    map(src, ACCESS_READ);
                    upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
                    unmap(src);
                }
                else
                {
                    map(dst, ACCESS_WRITE);
                    download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
                    unmap(dst);
                }
            }
        }
        else
#endif
        {
            if( iscontinuous )
            {
                retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
                                               srcrawofs, dstrawofs, total, 0, 0, 0);
                CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueCopyBuffer(q, src=%p, dst=%p, src_offset=%lld, dst_offset=%lld, sz=%lld, 0, 0, 0)",
                        (void*)src->handle, (void*)dst->handle, (long long int)srcrawofs, (long long int)dstrawofs, (long long int)total).c_str());
            }
            else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
            {
                // Rect operations disabled: read both padded spans, merge the rows on
                // the host, write the destination span back.
                const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
                size_t new_srcrawofs = srcrawofs & ~(padding-1);
                size_t srcmembuf_ofs = srcrawofs - new_srcrawofs;
                size_t new_dstrawofs = dstrawofs & ~(padding-1);
                size_t dstmembuf_ofs = dstrawofs - new_dstrawofs;

                AlignedDataPtr2D srcBuf(0, new_sz[1], new_srcstep[0], new_srcstep[0],
                                        CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
                AlignedDataPtr2D dstBuf(0, new_sz[1], new_dststep[0], new_dststep[0],
                                        CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
                uchar* srcptr = srcBuf.getAlignedPtr();
                uchar* dstptr = dstBuf.getAlignedPtr();

                CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);

                size_t src_total = alignSize(new_srcstep[0]*new_sz[1] + srcmembuf_ofs, padding);
                src_total = std::min(src_total, src->size - new_srcrawofs);
                size_t dst_total = alignSize(new_dststep[0]*new_sz[1] + dstmembuf_ofs, padding);
                dst_total = std::min(dst_total, dst->size - new_dstrawofs);

                CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)src->handle, CL_TRUE,
                                                 new_srcrawofs, src_total, srcptr, 0, 0, 0));
                CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)dst->handle, CL_TRUE,
                                                 new_dstrawofs, dst_total, dstptr, 0, 0, 0));
                for( size_t i = 0; i < new_sz[1]; i++ )
                    memcpy( dstptr + dstmembuf_ofs + i*new_dststep[0],
                            srcptr + srcmembuf_ofs + i*new_srcstep[0], new_sz[0]);
                CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)dst->handle, CL_TRUE,
                                                  new_dstrawofs, dst_total, dstptr, 0, 0, 0));
            }
            else
            {
                CV_OCL_CHECK(retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
                                                   new_srcofs, new_dstofs, new_sz,
                                                   new_srcstep[0], 0,
                                                   new_dststep[0], 0,
                                                   0, 0, 0));
            }
        }
        if (retval == CL_SUCCESS)
        {
            CV_IMPL_ADD(CV_IMPL_OCL)
        }

#ifdef HAVE_OPENCL_SVM
        if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
            (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
        {
            // nothing
        }
        else
#endif
        {
            dst->markHostCopyObsolete(true);
        }
        dst->markDeviceCopyObsolete(false);

        if( _sync )
        {
            CV_OCL_DBG_CHECK(clFinish(q));
        }
    }

    // Returns the pool controller selected by id: "OCL" (default device pool),
    // "HOST_ALLOC" (host-pointer pool), "SVM" (when built with SVM support).
    // Unknown ids raise StsBadArg; NULL id maps to the default pool.
    BufferPoolController* getBufferPoolController(const char* id) const CV_OVERRIDE
    {
        ocl::Context ctx = Context::getDefault();
        if (ctx.empty())
            return NULL;
#ifdef HAVE_OPENCL_SVM
        if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
        {
            return &ctx.getImpl()->getBufferPoolSVM();
        }
#endif
        if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
        {
            return &ctx.getImpl()->getBufferPoolHostPtr();
        }
        if (id != NULL && strcmp(id, "OCL") != 0)
        {
            CV_Error(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
        }
        return &ctx.getImpl()->getBufferPool();
    }

    MatAllocator* matStdAllocator;

    // Queue of UMatData objects whose release was deferred (ASYNC_CLEANUP flag);
    // drained by flushCleanupQueue() from allocate().
    mutable cv::Mutex cleanupQueueMutex;
    mutable std::deque cleanupQueue;

    // Releases all queued UMatData objects. The queue is swapped out under the
    // lock so deallocate_() runs without holding the mutex.
    void flushCleanupQueue() const
    {
        if (!cleanupQueue.empty())
        {
            std::deque q;
            {
                cv::AutoLock lock(cleanupQueueMutex);
                q.swap(cleanupQueue);
            }
            for (std::deque::const_iterator i = q.begin(); i != q.end(); ++i)
            {
                deallocate_(*i);
            }
        }
    }
    void
addToCleanupQueue(UMatData* u) const
    {
        //TODO: Validation check: CV_Assert(!u->tempUMat());
        {
            cv::AutoLock lock(cleanupQueueMutex);
            cleanupQueue.push_back(u);
        }
    }

};

// Process-lifetime singleton; intentionally never destroyed.
static OpenCLAllocator* getOpenCLAllocator_() // call once guarantee
{
    static OpenCLAllocator* g_allocator = new OpenCLAllocator(); // avoid destructor call (using of this object is too wide)
    return g_allocator;
}
MatAllocator* getOpenCLAllocator()
{
    CV_SINGLETON_LAZY_INIT(MatAllocator, getOpenCLAllocator_())
}

}} // namespace cv::ocl


namespace cv {

// these funcs are implemented in umatrix.cpp
void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps,
              bool autoSteps = false );
void finalizeHdr(UMat& m);

} // namespace cv


namespace cv { namespace ocl {

/*
// Convert OpenCL buffer memory to UMat
//
// Wraps an existing cl_mem buffer (retained, not copied) as a rows x cols UMat
// of the given type. The buffer is marked ALLOCATOR_FLAGS_EXTERNAL_BUFFER so
// deallocation releases the reference instead of returning it to a pool.
*/
void convertFromBuffer(void* cl_mem_buffer, size_t step, int rows, int cols, int type, UMat& dst)
{
    int d = 2;
    int sizes[] = { rows, cols };

    CV_Assert(0 <= d && d <= CV_MAX_DIM);

    dst.release();

    dst.flags      = (type & Mat::TYPE_MASK) | Mat::MAGIC_VAL;
    dst.usageFlags = USAGE_DEFAULT;

    setSize(dst, d, sizes, 0, true);
    dst.offset = 0;

    cl_mem             memobj = (cl_mem)cl_mem_buffer;
    cl_mem_object_type mem_type = 0;

    CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));

    CV_Assert(CL_MEM_OBJECT_BUFFER == mem_type);

    size_t total = 0;
    CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &total, 0));

    // Take shared ownership of the caller's buffer.
    CV_OCL_CHECK(clRetainMemObject(memobj));

    CV_Assert((int)step >= cols * CV_ELEM_SIZE(type));
    CV_Assert(total >= rows * step);

    // attach clBuffer to UMatData
    dst.u = new UMatData(getOpenCLAllocator());
    dst.u->data            = 0;
    dst.u->allocatorFlags_ = OpenCLAllocator::ALLOCATOR_FLAGS_EXTERNAL_BUFFER;  // not allocated from any OpenCV buffer pool
    dst.u->flags           = static_cast(0);
    dst.u->handle          = cl_mem_buffer;
    dst.u->origdata        = 0;
    dst.u->prevAllocator   = 0;
    dst.u->size            = total;

    finalizeHdr(dst);
    dst.addref();

    return;
} // convertFromBuffer()


/*
// Convert OpenCL image2d_t
// memory to UMat
//
// Copies the content of an existing cl_mem image2d_t into a newly created
// UMat. The element type is derived from the image's channel data type and
// channel order; unsupported formats raise OpenCLApiCallError.
*/
void convertFromImage(void* cl_mem_image, UMat& dst)
{
    cl_mem             clImage = (cl_mem)cl_mem_image;
    cl_mem_object_type mem_type = 0;

    CV_OCL_CHECK(clGetMemObjectInfo(clImage, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));

    CV_Assert(CL_MEM_OBJECT_IMAGE2D == mem_type);

    cl_image_format fmt = { 0, 0 };
    CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_FORMAT, sizeof(cl_image_format), &fmt, 0));

    // Map image_channel_data_type to an OpenCV depth.
    int depth = CV_8U;
    switch (fmt.image_channel_data_type)
    {
    case CL_UNORM_INT8:
    case CL_UNSIGNED_INT8:
        depth = CV_8U;
        break;

    case CL_SNORM_INT8:
    case CL_SIGNED_INT8:
        depth = CV_8S;
        break;

    case CL_UNORM_INT16:
    case CL_UNSIGNED_INT16:
        depth = CV_16U;
        break;

    case CL_SNORM_INT16:
    case CL_SIGNED_INT16:
        depth = CV_16S;
        break;

    case CL_SIGNED_INT32:
        depth = CV_32S;
        break;

    case CL_FLOAT:
        depth = CV_32F;
        break;

    default:
        CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_data_type");
    }

    // Map image_channel_order to a channel count.
    int type = CV_8UC1;
    switch (fmt.image_channel_order)
    {
    case CL_R:
        type = CV_MAKE_TYPE(depth, 1);
        break;

    case CL_RGBA:
    case CL_BGRA:
    case CL_ARGB:
        type = CV_MAKE_TYPE(depth, 4);
        break;

    default:
        CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_order");
        break;
    }

    size_t step = 0;
    CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(size_t), &step, 0));

    size_t w = 0;
    CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_WIDTH, sizeof(size_t), &w, 0));

    size_t h = 0;
    CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(size_t), &h, 0));

    dst.create((int)h, (int)w, type);

    cl_mem clBuffer = (cl_mem)dst.handle(ACCESS_READ);

    cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();

    size_t offset = 0;
    size_t src_origin[3] = { 0, 0, 0 };
    size_t region[3] = { w, h, 1 };
    CV_OCL_CHECK(clEnqueueCopyImageToBuffer(q, clImage, clBuffer, src_origin, region, offset, 0, NULL, NULL));

    CV_OCL_CHECK(clFinish(q));

    return;
} // convertFromImage()


///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////

static
void getDevices(std::vector& devices, cl_platform_id platform) { cl_uint numDevices = 0; cl_int status = clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, 0, NULL, &numDevices); if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices { CV_OCL_DBG_CHECK_RESULT(status, cv::format("clGetDeviceIDs(platform, Device::TYPE_ALL, num_entries=0, devices=NULL, numDevices=%p)", &numDevices).c_str()); } if (numDevices == 0) { devices.clear(); return; } devices.resize((size_t)numDevices); CV_OCL_DBG_CHECK(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, numDevices, &devices[0], &numDevices)); } struct PlatformInfo::Impl { Impl(void* id) { refcount = 1; handle = *(cl_platform_id*)id; getDevices(devices, handle); version_ = getStrProp(CL_PLATFORM_VERSION); parseOpenCLVersion(version_, versionMajor_, versionMinor_); } String getStrProp(cl_platform_info prop) const { char buf[1024]; size_t sz=0; return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS && sz < sizeof(buf) ? String(buf) : String(); } IMPLEMENT_REFCOUNTABLE(); std::vector devices; cl_platform_id handle; String version_; int versionMajor_; int versionMinor_; }; PlatformInfo::PlatformInfo() CV_NOEXCEPT { p = 0; } PlatformInfo::PlatformInfo(void* platform_id) { p = new Impl(platform_id); } PlatformInfo::~PlatformInfo() { if(p) p->release(); } PlatformInfo::PlatformInfo(const PlatformInfo& i) { if (i.p) i.p->addref(); p = i.p; } PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i) { if (i.p != p) { if (i.p) i.p->addref(); if (p) p->release(); p = i.p; } return *this; } PlatformInfo::PlatformInfo(PlatformInfo&& i) CV_NOEXCEPT { p = i.p; i.p = nullptr; } PlatformInfo& PlatformInfo::operator = (PlatformInfo&& i) CV_NOEXCEPT { if (this != &i) { if(p) p->release(); p = i.p; i.p = nullptr; } return *this; } int PlatformInfo::deviceNumber() const { return p ? 
(int)p->devices.size() : 0; } void PlatformInfo::getDevice(Device& device, int d) const { CV_Assert(p && d < (int)p->devices.size() ); if(p) device.set(p->devices[d]); } String PlatformInfo::name() const { return p ? p->getStrProp(CL_PLATFORM_NAME) : String(); } String PlatformInfo::vendor() const { return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String(); } String PlatformInfo::version() const { return p ? p->version_ : String(); } int PlatformInfo::versionMajor() const { CV_Assert(p); return p->versionMajor_; } int PlatformInfo::versionMinor() const { CV_Assert(p); return p->versionMinor_; } static void getPlatforms(std::vector& platforms) { cl_uint numPlatforms = 0; CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms)); if (numPlatforms == 0) { platforms.clear(); return; } platforms.resize((size_t)numPlatforms); CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms)); } void getPlatfomsInfo(std::vector& platformsInfo) { std::vector platforms; getPlatforms(platforms); for (size_t i = 0; i < platforms.size(); i++) platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) ); } const char* typeToStr(int type) { static const char* tab[]= { "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16", "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16", "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16", "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16", "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16", "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16", "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16", "half", "half2", "half3", "half4", 0, 0, 0, "half8", 0, 0, 0, 0, 0, 0, 0, "half16", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type); 
const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
    // 0 means the (depth, cn) combination has no OpenCL vector type.
    CV_Assert(result);
    return result;
}

// Like typeToStr(), but maps every depth to a same-sized *integer* OpenCL type
// for pure memory operations (copies/sets): CV_32F -> "int", CV_64F -> "ulong",
// CV_16F -> "short". Same tab[depth*16 + cn-1] layout as typeToStr().
const char* memopTypeToStr(int type)
{
    static const char* tab[] =
    {
        "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
        "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
        "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
        "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
        "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
        "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
        "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
        "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    };
    int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
    const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
    CV_Assert(result);
    return result;
}

// Like memopTypeToStr(), but additionally packs several small elements into one
// wider integer lane where possible (e.g. uchar x2 -> "short", uchar x4 -> "int"),
// so vector memory ops can move more data per work-item. Same table layout.
const char* vecopTypeToStr(int type)
{
    static const char* tab[] =
    {
        "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
        "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
        "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
        "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
        "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
        "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
        "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
        "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    };
    int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
    const char* result = cn > 16 ?
0 : tab[depth*16 + cn-1];
    CV_Assert(result);
    return result;
}

// Writes an OpenCL 'convert_<T>...' cast name for sdepth -> ddepth with cn
// channels into buf and returns buf ("noconvert" when depths already match).
// "_sat" is appended whenever the destination range cannot hold every source
// value; "_rte" (round to nearest even) is used when converting from floats.
// NOTE(review): buf is written with unchecked sprintf — callers must pass a
// buffer large enough for the longest "convert_..." name (~50 bytes).
const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
{
    if( sdepth == ddepth )
        return "noconvert";
    const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
    // Widening conversions never lose values, so no saturation/rounding needed.
    if( ddepth >= CV_32F ||
        (ddepth == CV_32S && sdepth < CV_32S) ||
        (ddepth == CV_16S && sdepth <= CV_8S) ||
        (ddepth == CV_16U && sdepth == CV_8U))
    {
        sprintf(buf, "convert_%s", typestr);
    }
    else if( sdepth >= CV_32F )
        sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
    else
        sprintf(buf, "convert_%s_sat", typestr);
    return buf;
}

// Translates an OpenCL error code into its symbolic name (for diagnostics).
// Codes not covered by the headers in use are listed numerically via CV_OCL_CODE_.
const char* getOpenCLErrorString(int errorCode)
{
#define CV_OCL_CODE(id) case id: return #id
#define CV_OCL_CODE_(id, name) case id: return #name
    switch (errorCode)
    {
    CV_OCL_CODE(CL_SUCCESS);
    CV_OCL_CODE(CL_DEVICE_NOT_FOUND);
    CV_OCL_CODE(CL_DEVICE_NOT_AVAILABLE);
    CV_OCL_CODE(CL_COMPILER_NOT_AVAILABLE);
    CV_OCL_CODE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
    CV_OCL_CODE(CL_OUT_OF_RESOURCES);
    CV_OCL_CODE(CL_OUT_OF_HOST_MEMORY);
    CV_OCL_CODE(CL_PROFILING_INFO_NOT_AVAILABLE);
    CV_OCL_CODE(CL_MEM_COPY_OVERLAP);
    CV_OCL_CODE(CL_IMAGE_FORMAT_MISMATCH);
    CV_OCL_CODE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
    CV_OCL_CODE(CL_BUILD_PROGRAM_FAILURE);
    CV_OCL_CODE(CL_MAP_FAILURE);
    CV_OCL_CODE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
    CV_OCL_CODE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
    CV_OCL_CODE(CL_COMPILE_PROGRAM_FAILURE);
    CV_OCL_CODE(CL_LINKER_NOT_AVAILABLE);
    CV_OCL_CODE(CL_LINK_PROGRAM_FAILURE);
    CV_OCL_CODE(CL_DEVICE_PARTITION_FAILED);
    CV_OCL_CODE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
    CV_OCL_CODE(CL_INVALID_VALUE);
    CV_OCL_CODE(CL_INVALID_DEVICE_TYPE);
    CV_OCL_CODE(CL_INVALID_PLATFORM);
    CV_OCL_CODE(CL_INVALID_DEVICE);
    CV_OCL_CODE(CL_INVALID_CONTEXT);
    CV_OCL_CODE(CL_INVALID_QUEUE_PROPERTIES);
    CV_OCL_CODE(CL_INVALID_COMMAND_QUEUE);
    CV_OCL_CODE(CL_INVALID_HOST_PTR);
    CV_OCL_CODE(CL_INVALID_MEM_OBJECT);
    CV_OCL_CODE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
    CV_OCL_CODE(CL_INVALID_IMAGE_SIZE);
    CV_OCL_CODE(CL_INVALID_SAMPLER);
CV_OCL_CODE(CL_INVALID_BINARY); CV_OCL_CODE(CL_INVALID_BUILD_OPTIONS); CV_OCL_CODE(CL_INVALID_PROGRAM); CV_OCL_CODE(CL_INVALID_PROGRAM_EXECUTABLE); CV_OCL_CODE(CL_INVALID_KERNEL_NAME); CV_OCL_CODE(CL_INVALID_KERNEL_DEFINITION); CV_OCL_CODE(CL_INVALID_KERNEL); CV_OCL_CODE(CL_INVALID_ARG_INDEX); CV_OCL_CODE(CL_INVALID_ARG_VALUE); CV_OCL_CODE(CL_INVALID_ARG_SIZE); CV_OCL_CODE(CL_INVALID_KERNEL_ARGS); CV_OCL_CODE(CL_INVALID_WORK_DIMENSION); CV_OCL_CODE(CL_INVALID_WORK_GROUP_SIZE); CV_OCL_CODE(CL_INVALID_WORK_ITEM_SIZE); CV_OCL_CODE(CL_INVALID_GLOBAL_OFFSET); CV_OCL_CODE(CL_INVALID_EVENT_WAIT_LIST); CV_OCL_CODE(CL_INVALID_EVENT); CV_OCL_CODE(CL_INVALID_OPERATION); CV_OCL_CODE(CL_INVALID_GL_OBJECT); CV_OCL_CODE(CL_INVALID_BUFFER_SIZE); CV_OCL_CODE(CL_INVALID_MIP_LEVEL); CV_OCL_CODE(CL_INVALID_GLOBAL_WORK_SIZE); // OpenCL 1.1 CV_OCL_CODE(CL_INVALID_PROPERTY); // OpenCL 1.2 CV_OCL_CODE(CL_INVALID_IMAGE_DESCRIPTOR); CV_OCL_CODE(CL_INVALID_COMPILER_OPTIONS); CV_OCL_CODE(CL_INVALID_LINKER_OPTIONS); CV_OCL_CODE(CL_INVALID_DEVICE_PARTITION_COUNT); // OpenCL 2.0 CV_OCL_CODE_(-69, CL_INVALID_PIPE_SIZE); CV_OCL_CODE_(-70, CL_INVALID_DEVICE_QUEUE); // Extensions CV_OCL_CODE_(-1000, CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR); CV_OCL_CODE_(-1001, CL_PLATFORM_NOT_FOUND_KHR); CV_OCL_CODE_(-1002, CL_INVALID_D3D10_DEVICE_KHR); CV_OCL_CODE_(-1003, CL_INVALID_D3D10_RESOURCE_KHR); CV_OCL_CODE_(-1004, CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR); CV_OCL_CODE_(-1005, CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR); default: return "Unknown OpenCL error"; } #undef CV_OCL_CODE #undef CV_OCL_CODE_ } template static std::string kerToStr(const Mat & k) { int width = k.cols - 1, depth = k.depth(); const T * const data = k.ptr(); std::ostringstream stream; stream.precision(10); if (depth <= CV_8S) { for (int i = 0; i < width; ++i) stream << "DIG(" << (int)data[i] << ")"; stream << "DIG(" << (int)data[width] << ")"; } else if (depth == CV_32F) { stream.setf(std::ios_base::showpoint); for (int i = 0; i < width; ++i) 
stream << "DIG(" << data[i] << "f)"; stream << "DIG(" << data[width] << "f)"; } else { for (int i = 0; i < width; ++i) stream << "DIG(" << data[i] << ")"; stream << "DIG(" << data[width] << ")"; } return stream.str(); } String kernelToStr(InputArray _kernel, int ddepth, const char * name) { Mat kernel = _kernel.getMat().reshape(1, 1); int depth = kernel.depth(); if (ddepth < 0) ddepth = depth; if (ddepth != depth) kernel.convertTo(kernel, ddepth); typedef std::string (* func_t)(const Mat &); static const func_t funcs[] = { kerToStr, kerToStr, kerToStr, kerToStr, kerToStr, kerToStr, kerToStr, 0 }; const func_t func = funcs[ddepth]; CV_Assert(func != 0); return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str()); } #define PROCESS_SRC(src) \ do \ { \ if (!src.empty()) \ { \ CV_Assert(src.isMat() || src.isUMat()); \ Size csize = src.size(); \ int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \ ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \ if (cwidth < ckercn || ckercn <= 0) \ return 1; \ cols.push_back(cwidth); \ if (strat == OCL_VECTOR_OWN && ctype != ref_type) \ return 1; \ offsets.push_back(src.offset()); \ steps.push_back(src.step()); \ dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \ kercns.push_back(ckercn); \ } \ } \ while ((void)0, 0) int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3, InputArray src4, InputArray src5, InputArray src6, InputArray src7, InputArray src8, InputArray src9, OclVectorStrategy strat) { const ocl::Device & d = ocl::Device::getDefault(); int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(), d.preferredVectorWidthShort(), d.preferredVectorWidthShort(), d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(), d.preferredVectorWidthDouble(), -1 }; // if the device says don't use vectors if (vectorWidths[0] == 1) { // it's heuristic vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4; vectorWidths[CV_16U] = 
vectorWidths[CV_16S] = 2; vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1; } return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat); } int checkOptimalVectorWidth(const int *vectorWidths, InputArray src1, InputArray src2, InputArray src3, InputArray src4, InputArray src5, InputArray src6, InputArray src7, InputArray src8, InputArray src9, OclVectorStrategy strat) { CV_Assert(vectorWidths); int ref_type = src1.type(); std::vector offsets, steps, cols; std::vector dividers, kercns; PROCESS_SRC(src1); PROCESS_SRC(src2); PROCESS_SRC(src3); PROCESS_SRC(src4); PROCESS_SRC(src5); PROCESS_SRC(src6); PROCESS_SRC(src7); PROCESS_SRC(src8); PROCESS_SRC(src9); size_t size = offsets.size(); for (size_t i = 0; i < size; ++i) while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0) dividers[i] >>= 1, kercns[i] >>= 1; // default strategy int kercn = *std::min_element(kercns.begin(), kercns.end()); return kercn; } int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3, InputArray src4, InputArray src5, InputArray src6, InputArray src7, InputArray src8, InputArray src9) { return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX); } #undef PROCESS_SRC // TODO Make this as a method of OpenCL "BuildOptions" class void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m) { if (!buildOptions.empty()) buildOptions += " "; int type = _m.type(), depth = CV_MAT_DEPTH(type); buildOptions += format( "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d", name.c_str(), ocl::typeToStr(type), name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)), name.c_str(), (int)CV_MAT_CN(type), name.c_str(), (int)CV_ELEM_SIZE(type), name.c_str(), (int)CV_ELEM_SIZE1(type), name.c_str(), (int)depth ); } struct Image2D::Impl { Impl(const UMat &src, bool norm, bool 
alias)
    {
        handle = 0;
        refcount = 1;
        init(src, norm, alias);
    }

    ~Impl()
    {
        if (handle)
            clReleaseMemObject(handle);
    }

    // Maps (OpenCV depth, channel count, normalized flag) to a cl_image_format.
    // Unsupported combinations produce -1 fields, which isFormatSupported()
    // will then reject against the context's supported-format list.
    static cl_image_format getImageFormat(int depth, int cn, bool norm)
    {
        cl_image_format format;
        // Indexed by OpenCV depth CV_8U..; -1 = no matching OpenCL channel type.
        static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
                                            CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
        // Normalized variants only exist for 8/16-bit integer types.
        static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
                                                CL_SNORM_INT16, -1, -1, -1, -1 };
        // Indexed by channel count 0..4; 3-channel images are not supported.
        static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };

        int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
        int channelOrder = channelOrders[cn];
        format.image_channel_data_type = (cl_channel_type)channelType;
        format.image_channel_order = (cl_channel_order)channelOrder;
        return format;
    }

    // Checks whether the default OpenCL context supports read-write 2D images
    // of the given format.
    static bool isFormatSupported(cl_image_format format)
    {
        if (!haveOpenCL())
            CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");

        cl_context context = (cl_context)Context::getDefault().ptr();
        if (!context)
            return false;
        // Figure out how many formats are supported by this context.
cl_uint numFormats = 0; cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, numFormats, NULL, &numFormats); CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, NULL)"); if (numFormats > 0) { AutoBuffer formats(numFormats); err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, numFormats, formats.data(), NULL); CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, formats)"); for (cl_uint i = 0; i < numFormats; ++i) { if (!memcmp(&formats[i], &format, sizeof(format))) { return true; } } } return false; } void init(const UMat &src, bool norm, bool alias) { if (!haveOpenCL()) CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!"); CV_Assert(!src.empty()); CV_Assert(ocl::Device::getDefault().imageSupport()); int err, depth = src.depth(), cn = src.channels(); CV_Assert(cn <= 4); cl_image_format format = getImageFormat(depth, cn, norm); if (!isFormatSupported(format)) CV_Error(Error::OpenCLApiCallError, "Image format is not supported"); if (alias && !src.handle(ACCESS_RW)) CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null"); cl_context context = (cl_context)Context::getDefault().ptr(); cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr(); #ifdef CL_VERSION_1_2 // this enables backwards portability to // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support const Device & d = ocl::Device::getDefault(); int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor(); CV_Assert(!alias || canCreateAlias(src)); if (1 < major || (1 == major && 2 <= minor)) { cl_image_desc desc; desc.image_type = CL_MEM_OBJECT_IMAGE2D; desc.image_width = src.cols; desc.image_height = src.rows; desc.image_depth = 0; desc.image_array_size = 1; desc.image_row_pitch = alias ? src.step[0] : 0; desc.image_slice_pitch = 0; desc.buffer = alias ? 
(cl_mem)src.handle(ACCESS_RW) : 0; desc.num_mip_levels = 0; desc.num_samples = 0; handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err); } else #endif { CV_SUPPRESS_DEPRECATED_START CV_Assert(!alias); // This is an OpenCL 1.2 extension handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err); CV_SUPPRESS_DEPRECATED_END } CV_OCL_DBG_CHECK_RESULT(err, "clCreateImage()"); size_t origin[] = { 0, 0, 0 }; size_t region[] = { static_cast(src.cols), static_cast(src.rows), 1 }; cl_mem devData; if (!alias && !src.isContinuous()) { devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err); CV_OCL_CHECK_RESULT(err, cv::format("clCreateBuffer(CL_MEM_READ_ONLY, sz=%lld) => %p", (long long int)(src.cols * src.rows * src.elemSize()), (void*)devData ).c_str()); const size_t roi[3] = {static_cast(src.cols) * src.elemSize(), static_cast(src.rows), 1}; CV_OCL_CHECK(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin, roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL)); CV_OCL_DBG_CHECK(clFlush(queue)); } else { devData = (cl_mem)src.handle(ACCESS_READ); } CV_Assert(devData != NULL); if (!alias) { CV_OCL_CHECK(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0)); if (!src.isContinuous()) { CV_OCL_DBG_CHECK(clFlush(queue)); CV_OCL_DBG_CHECK(clReleaseMemObject(devData)); } } } IMPLEMENT_REFCOUNTABLE(); cl_mem handle; }; Image2D::Image2D() CV_NOEXCEPT { p = NULL; } Image2D::Image2D(const UMat &src, bool norm, bool alias) { p = new Impl(src, norm, alias); } bool Image2D::canCreateAlias(const UMat &m) { bool ret = false; const Device & d = ocl::Device::getDefault(); if (d.imageFromBufferSupport() && !m.empty()) { // This is the required pitch alignment in pixels uint pitchAlign = d.imagePitchAlignment(); if (pitchAlign && !(m.step % (pitchAlign * m.elemSize()))) { // We don't currently handle the case where 
the buffer was created
            // with CL_MEM_USE_HOST_PTR
            if (!m.u->tempUMat())
            {
                ret = true;
            }
        }
    }
    return ret;
}

// Public wrapper over Impl::isFormatSupported() for a (depth, cn, norm) triple.
bool Image2D::isFormatSupported(int depth, int cn, bool norm)
{
    cl_image_format format = Impl::getImageFormat(depth, cn, norm);

    return Impl::isFormatSupported(format);
}

Image2D::Image2D(const Image2D & i)
{
    p = i.p;
    if (p)
        p->addref();
}

Image2D & Image2D::operator = (const Image2D & i)
{
    if (i.p != p)
    {
        if (i.p)
            i.p->addref();
        if (p)
            p->release();
        p = i.p;
    }
    return *this;
}

Image2D::Image2D(Image2D&& i) CV_NOEXCEPT
{
    p = i.p;
    i.p = nullptr;
}

Image2D& Image2D::operator = (Image2D&& i) CV_NOEXCEPT
{
    if (this != &i)
    {
        if (p)
            p->release();
        p = i.p;
        i.p = nullptr;
    }
    return *this;
}

Image2D::~Image2D()
{
    if (p)
        p->release();
}

// Raw cl_mem of the underlying image (NULL for an empty Image2D).
void* Image2D::ptr() const
{
    return p ? p->handle : 0;
}

// Reads OPENCV_OPENCL_FORCE once; the result is cached for the process lifetime.
bool internal::isOpenCLForced()
{
    static bool initialized = false;
    static bool value = false;
    if (!initialized)
    {
        value = utils::getConfigurationParameterBool("OPENCV_OPENCL_FORCE", false);
        initialized = true;
    }
    return value;
}

// Reads OPENCV_OPENCL_PERF_CHECK_BYPASS once; cached like isOpenCLForced().
bool internal::isPerformanceCheckBypassed()
{
    static bool initialized = false;
    static bool value = false;
    if (!initialized)
    {
        value = utils::getConfigurationParameterBool("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
        initialized = true;
    }
    return value;
}

// Returns true when the UMat is backed by a plain cl_mem buffer (not SVM).
// A null handle is treated as "buffer" so callers can proceed uniformly.
bool internal::isCLBuffer(UMat& u)
{
    void* h = u.handle(ACCESS_RW);
    if (!h)
        return true;
    CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
#if 1
    // Cheap path: the allocator keeps SVM-related flags in the high half-word.
    if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
        return false;
#else
    // Alternative (disabled): ask the OpenCL runtime for the object type.
    cl_mem_object_type type = 0;
    cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
    if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
        return false;
#endif
    return true;
}

// Host-side timer that brackets each measurement with clFinish() so queued
// OpenCL work on the associated queue is included in the measured interval.
struct Timer::Impl
{
    const Queue queue;

    Impl(const Queue& q)
        : queue(q)
    {
    }

    ~Impl(){}

    void start()
    {
        // Drain the queue first so earlier work is not attributed to this run.
        CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
        timer.start();
    }

    void stop()
    {
        // Wait for the measured work to complete before reading the clock.
        CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
        timer.stop();
}

    // Elapsed time between the last start()/stop() pair, in nanoseconds.
    uint64 durationNS() const
    {
        return (uint64)(timer.getTimeSec() * 1e9);
    }

    TickMeter timer;
};

Timer::Timer(const Queue& q) : p(new Impl(q)) { }

Timer::~Timer() { delete p; }

void Timer::start()
{
    CV_Assert(p);
    p->start();
}

void Timer::stop()
{
    CV_Assert(p);
    p->stop();
}

uint64 Timer::durationNS() const
{
    CV_Assert(p);
    return p->durationNS();
}

}} // namespace

#endif // HAVE_OPENCL