// This file is part of OpenCV project. // It is subject to the license terms in the LICENSE file found in the top-level directory // of this distribution and at http://opencv.org/license.html. #ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP #define OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP #include "error.hpp" #include "pointer.hpp" #include #include #include #include #include #include namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { /* @brief smart device pointer with allocation/deallocation methods * * ManagedPtr is a smart shared device pointer which also handles memory allocation. */ template class ManagedPtr { static_assert(!std::is_const::value && !std::is_volatile::value, "T cannot be cv-qualified"); static_assert(std::is_standard_layout::value, "T must satisfy StandardLayoutType"); public: using element_type = T; using pointer = DevicePtr; using const_pointer = DevicePtr::type>; using size_type = std::size_t; ManagedPtr() noexcept : wrapped{ nullptr }, n{ 0 }, capacity{ 0 } { } ManagedPtr(const ManagedPtr&) noexcept = default; ManagedPtr(ManagedPtr&& other) noexcept : wrapped{ std::move(other.wrapped) }, n{ other.n }, capacity { other.capacity } { other.reset(); } /** allocates device memory for \p count number of element */ ManagedPtr(size_type count) { if (count <= 0) { CV_Error(Error::StsBadArg, "number of elements is zero or negative"); } void* temp = nullptr; CUDA4DNN_CHECK_CUDA(cudaMalloc(&temp, count * sizeof(element_type))); auto ptr = typename pointer::pointer(static_cast(temp)); wrapped.reset(ptr, [](element_type* ptr) { if (ptr != nullptr) { /* contract violation for std::shared_ptr if cudaFree throws */ try { CUDA4DNN_CHECK_CUDA(cudaFree(ptr)); } catch (const CUDAException& ex) { std::ostringstream os; os << "Device memory deallocation failed in deleter.\n"; os << ex.what(); os << "Exception will be ignored.\n"; CV_LOG_WARNING(0, os.str().c_str()); } } }); /* std::shared_ptr::reset invokves the deleter if an exception occurs; hence, we don't * need to have a try-catch block to free the allocated device memory */ n = capacity = count; } ManagedPtr& operator=(ManagedPtr&& other) noexcept { wrapped = std::move(other.wrapped); n = other.n; capacity = other.capacity; other.reset(); return *this; } size_type size() const noexcept { return n; } void reset() noexcept { wrapped.reset(); n = capacity = 0; } /** * deallocates any previously allocated memory and allocates device memory * for \p count number of elements * * @note no reallocation if the previously allocated memory has no owners and the requested memory size fits in it * @note use move constructor to guarantee a deallocation of the previously allocated memory * * Exception Guarantee: Strong */ void reset(size_type count) { /* we need to fully own the memory to perform optimizations */ if (wrapped.use_count() == 1) { /* avoid reallocation if the existing capacity is sufficient */ if (count <= capacity) { n = count; return; } } /* no optimization performed; allocate memory */ ManagedPtr tmp(count); swap(tmp, *this); } pointer get() const noexcept { return pointer(wrapped.get()); } explicit operator bool() const noexcept { return wrapped; } friend bool operator==(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped == rhs.wrapped; } friend bool operator!=(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped != rhs.wrapped; } friend void swap(ManagedPtr& lhs, ManagedPtr& rhs) noexcept { using std::swap; swap(lhs.wrapped, rhs.wrapped); swap(lhs.n, rhs.n); swap(lhs.capacity, rhs.capacity); } private: std::shared_ptr wrapped; size_type n, capacity; }; /** copies entire memory block pointed by \p src to \p dest * * \param[in] src device pointer * \param[out] dest host pointer * * Pre-conditions: * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src * * Exception Guarantee: Basic */ template void memcpy(T *dest, const ManagedPtr& src) { memcpy(dest, src.get(), src.size()); } /** copies data from memory pointed by \p src to fully fill \p dest * * \param[in] src host pointer * \param[out] dest device pointer * * Pre-conditions: * - memory pointed by \p src must be at least as big as the memory block held by \p dest * * Exception Guarantee: Basic */ template void memcpy(const ManagedPtr& dest, const T* src) { memcpy(dest.get(), src, dest.size()); } /** copies data from memory pointed by \p src to \p dest * * if the two \p src and \p dest have different sizes, the number of elements copied is * equal to the size of the smaller memory block * * \param[in] src device pointer * \param[out] dest device pointer * * Exception Guarantee: Basic */ template void memcpy(const ManagedPtr& dest, const ManagedPtr& src) { memcpy(dest.get(), src.get(), std::min(dest.size(), src.size())); } /** sets device memory block to a specific 8-bit value * * \param[in] src device pointer * \param[out] ch 8-bit value to fill the device memory with * * Exception Guarantee: Basic */ template void memset(const ManagedPtr& dest, std::int8_t ch) { memset(dest.get(), ch, dest.size()); } /** copies entire memory block pointed by \p src to \p dest asynchronously * * \param[in] src device pointer * \param[out] dest host pointer * \param stream CUDA stream that has to be used for the memory transfer * * Pre-conditions: * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src * - \p dest points to page-locked memory * * Exception Guarantee: Basic */ template void memcpy(T *dest, const ManagedPtr& src, const Stream& stream) { CV_Assert(stream); memcpy(dest, src.get(), src.size(), stream); } /** copies data from memory pointed by \p src to \p dest asynchronously * * \param[in] src host pointer * \param[out] dest device pointer * \param stream CUDA stream that has to be used for the memory transfer * * Pre-conditions: * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src * - \p src points to page-locked memory * * Exception Guarantee: Basic */ template void memcpy(const ManagedPtr& dest, const T* src, const Stream& stream) { CV_Assert(stream); memcpy(dest.get(), src, dest.size(), stream); } /** copies data from memory pointed by \p src to \p dest asynchronously * * \param[in] src device pointer * \param[out] dest device pointer * \param stream CUDA stream that has to be used for the memory transfer * * if the two \p src and \p dest have different sizes, the number of elements copied is * equal to the size of the smaller memory block * * Exception Guarantee: Basic */ template void memcpy(ManagedPtr& dest, const ManagedPtr& src, const Stream& stream) { CV_Assert(stream); memcpy(dest.get(), src.get(), std::min(dest.size(), src.size()), stream); } /** sets device memory block to a specific 8-bit value asynchronously * * \param[in] src device pointer * \param[out] ch 8-bit value to fill the device memory with * \param stream CUDA stream that has to be used for the memory operation * * Exception Guarantee: Basic */ template void memset(const ManagedPtr& dest, int ch, const Stream& stream) { CV_Assert(stream); memset(dest.get(), ch, dest.size(), stream); } /** @brief registers host memory as page-locked and unregisters on destruction */ class MemoryLockGuard { public: MemoryLockGuard() noexcept : ptr { nullptr } { } MemoryLockGuard(const MemoryLockGuard&) = delete; MemoryLockGuard(MemoryLockGuard&& other) noexcept : ptr{ other.ptr } { other.ptr = nullptr; } /** page-locks \p size_in_bytes bytes of memory starting from \p ptr_ * * Pre-conditions: * - host memory should be unregistered */ MemoryLockGuard(void* ptr_, std::size_t size_in_bytes) { CUDA4DNN_CHECK_CUDA(cudaHostRegister(ptr_, size_in_bytes, cudaHostRegisterPortable)); ptr = ptr_; } MemoryLockGuard& operator=(const MemoryLockGuard&) = delete; MemoryLockGuard& operator=(MemoryLockGuard&& other) noexcept { if (&other != this) { if(ptr != nullptr) { /* cudaHostUnregister does not throw for a valid ptr */ CUDA4DNN_CHECK_CUDA(cudaHostUnregister(ptr)); } ptr = other.ptr; other.ptr = nullptr; } return *this; } ~MemoryLockGuard() { if(ptr != nullptr) { /* cudaHostUnregister does not throw for a valid ptr */ CUDA4DNN_CHECK_CUDA(cudaHostUnregister(ptr)); } } private: void *ptr; }; }}}} /* namespace cv::dnn::cuda4dnn::csl */ #endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP */