diff --git a/modules/dnn/src/cuda4dnn/csl/event.hpp b/modules/dnn/src/cuda4dnn/csl/event.hpp
index 63da75a200c5510666f77a7424865f5fddc9d7de..e1917bedf569d8324567bd016e9be6976152983d 100644
--- a/modules/dnn/src/cuda4dnn/csl/event.hpp
+++ b/modules/dnn/src/cuda4dnn/csl/event.hpp
@@ -33,7 +33,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
         /** if \p create is `true`, a new event will be created; otherwise, an empty event object is created */
         Event(bool create, bool timing_event = false) : event{nullptr} {
             if (create) {
-                unsigned int flags = cudaEventBlockingSync | (timing_event ? 0 : cudaEventDisableTiming);
+                unsigned int flags = (timing_event ? 0 : cudaEventDisableTiming);
                 CUDA4DNN_CHECK_CUDA(cudaEventCreateWithFlags(&event, flags));
             }
         }
@@ -60,6 +60,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
 
         /** mark a point in \p stream */
         void record(const Stream& stream) {
+            CV_Assert(stream);
             CUDA4DNN_CHECK_CUDA(cudaEventRecord(event, stream.get()));
         }
 
@@ -85,12 +86,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
     };
 
     /** makes a stream wait on an event */
-    void StreamWaitOnEvent(const Stream& stream, const Event& event) {
+    inline void StreamWaitOnEvent(const Stream& stream, const Event& event) {
+        CV_Assert(stream);
         CUDA4DNN_CHECK_CUDA(cudaStreamWaitEvent(stream.get(), event.get(), 0));
     }
 
     /** returns the time elapsed between two events in milliseconds */
-    float TimeElapsedBetweenEvents(const Event& start, const Event& end) {
+    inline float TimeElapsedBetweenEvents(const Event& start, const Event& end) {
         float temp;
         CUDA4DNN_CHECK_CUDA(cudaEventElapsedTime(&temp, start.get(), end.get()));
         return temp;
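Aside (not part of the patch): the `csl::Event`/`csl::Stream` wrappers above map one-to-one onto CUDA runtime calls. A minimal standalone sketch of the record/wait/query pattern they expose, assuming `Event::busy()` is backed by `cudaEventQuery` (the rest of the Event class is not shown in this diff):

```cpp
// Raw CUDA runtime sketch of the pattern wrapped by csl::Event / csl::Stream:
// record a point on the inference stream, make a second stream wait on it,
// then poll/synchronize on the event from the host.
#include <cuda_runtime.h>
#include <cstdio>

#define CHECK(call)                                                       \
    do {                                                                  \
        cudaError_t err__ = (call);                                       \
        if (err__ != cudaSuccess) {                                       \
            std::printf("CUDA error: %s\n", cudaGetErrorString(err__));   \
            return 1;                                                     \
        }                                                                 \
    } while (0)

int main() {
    cudaStream_t inference, d2h;
    CHECK(cudaStreamCreate(&inference));
    CHECK(cudaStreamCreate(&d2h));

    // Event(create = true, timing_event = false): no cudaEventBlockingSync,
    // so host-side waits spin instead of yielding the thread.
    cudaEvent_t event;
    CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));

    /* ... enqueue work on `inference` here ... */

    CHECK(cudaEventRecord(event, inference));   // Event::record(stream)
    CHECK(cudaStreamWaitEvent(d2h, event, 0));  // StreamWaitOnEvent(d2h, event)

    /* ... enqueue the device-to-host copy on `d2h` here ... */

    // Presumed Event::busy()/synchronize() equivalent: query first, block only if needed.
    if (cudaEventQuery(event) == cudaErrorNotReady)
        CHECK(cudaEventSynchronize(event));

    CHECK(cudaEventDestroy(event));
    CHECK(cudaStreamDestroy(d2h));
    CHECK(cudaStreamDestroy(inference));
    return 0;
}
```

Dropping `cudaEventBlockingSync` trades CPU time for latency: without it, `cudaEventSynchronize` spin-waits rather than yielding the host thread, which is presumably the point of the change now that the event is polled on the hot path in op_cuda.hpp.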
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index 88ba8ebdd648a04f79a533016956fa7cfb049a40..6734310f6b8f65abeb6326ad1442ad1c7927a165 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -585,6 +585,13 @@ struct LayerData
     std::vector<Ptr<BackendWrapper> > inputBlobsWrappers;
     std::vector<Ptr<BackendWrapper> > internalBlobsWrappers;
 
+#ifdef HAVE_CUDA
+    /* output ids which must be transferred to the host in the background
+     * after the completion of the forward pass of the layer
+     */
+    std::vector<int> cudaD2HBackgroundTransfers;
+#endif
+
     Ptr<Layer> layerInstance;
     std::vector<Mat> outputBlobs;
     std::vector<Mat*> inputBlobs;
@@ -1187,7 +1194,8 @@ struct Net::Impl : public detail::NetImplBase
                 context.cublas_handle = cuda4dnn::csl::cublas::Handle(context.stream);
                 context.cudnn_handle = cuda4dnn::csl::cudnn::Handle(context.stream);
 
-                cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context)));
+                auto d2h_stream = cuda4dnn::csl::Stream(true); // stream for background D2H data transfers
+                cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context), std::move(d2h_stream)));
             }
 #endif
         }
@@ -1215,8 +1223,10 @@ struct Net::Impl : public detail::NetImplBase
 #ifdef HAVE_CUDA
     struct CudaInfo_t
     {
-        CudaInfo_t(cuda4dnn::csl::CSLContext ctxt) : context(std::move(ctxt)) { }
+        CudaInfo_t(cuda4dnn::csl::CSLContext ctxt, cuda4dnn::csl::Stream d2h_stream_)
+            : context(std::move(ctxt)), d2h_stream(std::move(d2h_stream_)) { }
         cuda4dnn::csl::CSLContext context;
+        cuda4dnn::csl::Stream d2h_stream;
         cuda4dnn::csl::Workspace workspace;
     };
 
@@ -1290,7 +1300,7 @@ struct Net::Impl : public detail::NetImplBase
             if (preferableBackend == DNN_BACKEND_CUDA)
             {
                 auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
-                cudaWrapper->setStream(cudaInfo->context.stream);
+                cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
             }
 #endif
             backendWrappers[data] = wrapper;
@@ -1630,7 +1640,7 @@ struct Net::Impl : public detail::NetImplBase
         else if (preferableBackend == DNN_BACKEND_VKCOM)
             initVkComBackend();
         else if (preferableBackend == DNN_BACKEND_CUDA)
-            initCUDABackend();
+            initCUDABackend(blobsToKeep_);
         else
             CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
     }
@@ -2360,7 +2370,7 @@ struct Net::Impl : public detail::NetImplBase
 #endif
     }
 
-    void initCUDABackend() {
+    void initCUDABackend(const std::vector<LayerPin>& blobsToKeep_) {
         CV_Assert(haveCUDA());
 
 #ifdef HAVE_CUDA
@@ -2386,6 +2396,15 @@ struct Net::Impl : public detail::NetImplBase
             auto cudaNode = node.dynamicCast<CUDABackendNode>();
             cudaInfo->workspace.require(cudaNode->get_workspace_memory_in_bytes());
         }
+
+        if (blobsToKeep_.size() > 1)
+        {
+            for (const auto& pin : blobsToKeep_)
+            {
+                LayerData& ld = layers[pin.lid];
+                ld.cudaD2HBackgroundTransfers.push_back(pin.oid);
+            }
+        }
 #endif
     }
 
@@ -3120,6 +3139,12 @@ struct Net::Impl : public detail::NetImplBase
                     CV_Assert(!cudaNode.empty());
 
                     cudaNode->forward(ld.inputBlobsWrappers, ld.outputBlobsWrappers, cudaInfo->workspace);
+
+                    for (auto id : ld.cudaD2HBackgroundTransfers)
+                    {
+                        auto wrapper = ld.outputBlobsWrappers[id].dynamicCast<CUDABackendWrapper>();
+                        wrapper->copyToHostInBackground();
+                    }
 #endif
                 }
                 else if (preferableBackend == DNN_BACKEND_HALIDE)
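Aside (not part of the patch): the background path is only armed when `blobsToKeep_` holds more than one pin, i.e. when several blobs must survive the forward pass, which in practice means a multi-output `forward()` call. A hedged usage sketch of such a call on the CUDA backend; the model file and output names below are placeholders, not part of this change:

```cpp
#include <opencv2/dnn.hpp>
#include <vector>

int main() {
    // Placeholder model; any multi-output network run on the CUDA backend applies.
    cv::dnn::Net net = cv::dnn::readNet("model.onnx");
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);

    // Dummy 1x3x224x224 input blob.
    cv::Mat blob(std::vector<int>{1, 3, 224, 224}, CV_32F, cv::Scalar(0));
    net.setInput(blob);

    // Requesting several outputs puts multiple pins into blobsToKeep_, so each
    // producing layer schedules copyToHostInBackground() right after its forward
    // pass instead of leaving all D2H copies for the end of the network.
    std::vector<cv::Mat> outs;
    net.forward(outs, std::vector<cv::String>{"scores", "boxes"}); // placeholder names
    return 0;
}
```

For the common single-output case, `blobsToKeep_.size() > 1` is false and the behaviour is unchanged.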
diff --git a/modules/dnn/src/op_cuda.hpp b/modules/dnn/src/op_cuda.hpp
index 53ef7739cf53c177656769cd0db6011830912ac8..0ce4d469fc8383e6e57a27ae9a4e386582e069a0 100644
--- a/modules/dnn/src/op_cuda.hpp
+++ b/modules/dnn/src/op_cuda.hpp
@@ -7,6 +7,7 @@
 
 #ifdef HAVE_CUDA
 #include "cuda4dnn/csl/stream.hpp"
+#include "cuda4dnn/csl/event.hpp"
 #include "cuda4dnn/csl/cublas.hpp"
 #include "cuda4dnn/csl/cudnn.hpp"
 #include "cuda4dnn/csl/tensor.hpp"
@@ -206,6 +207,7 @@
         virtual ~CUDABackendWrapper() { }
 
         void copyToHost() override = 0;
+        virtual void copyToHostInBackground() = 0;
         void setHostDirty() override = 0;
 
         virtual void copyToDevice() = 0;
@@ -215,7 +217,7 @@
         virtual std::size_t getRank() const noexcept = 0;
 
         /** @note setting the stream updates the stream for all wrappers which use the same tensor */
-        virtual void setStream(cuda4dnn::csl::Stream stream) noexcept = 0;
+        virtual void setStream(cuda4dnn::csl::Stream stream, cuda4dnn::csl::Stream d2h_stream) noexcept = 0;
 
         virtual void update(const MatShape& shape, std::size_t offset) = 0;
     };
@@ -240,6 +242,36 @@
             cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), view.data(), view.size(), stream);
         }
 
+        template <class T>
+        void convert_D2H_background(const cv::Mat& mat, cuda4dnn::csl::View<T> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream, const cuda4dnn::csl::Stream& d2h_stream, cuda4dnn::csl::Event& d2h_event);
+
+        template <> inline
+        void convert_D2H_background<half>(const cv::Mat& mat, cuda4dnn::csl::View<half> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream, const cuda4dnn::csl::Stream& d2h_stream, cuda4dnn::csl::Event& d2h_event) {
+            if (device_temp.size() < view.size())
+                device_temp.reset(view.size());
+            auto temp_span = cuda4dnn::csl::Span<float>(device_temp.get(), view.size());
+
+            /* The conversion kernel could be executed in the background stream for better
+             * performance. We do it in the inference stream to prevent an unexplained performance
+             * regression on RTX 2080 Ti. Executing the conversion kernel in the background stream causes
+             * everything to slow down (even operations that appear before the background transfer).
+             *
+             * TODO: identify the cause and move the conversion kernel to the background stream
+             */
+            cuda4dnn::kernels::fp16_to_fp32(stream, temp_span, view);
+
+            d2h_event.record(stream); // mark position in the inference stream
+            cuda4dnn::csl::StreamWaitOnEvent(d2h_stream, d2h_event); // don't start the transfer until the data is available
+            cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), temp_span.data(), view.size(), d2h_stream);
+        }
+
+        template <> inline
+        void convert_D2H_background<float>(const cv::Mat& mat, cuda4dnn::csl::View<float> view, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream, const cuda4dnn::csl::Stream& d2h_stream, cuda4dnn::csl::Event& d2h_event) {
+            d2h_event.record(stream);
+            cuda4dnn::csl::StreamWaitOnEvent(d2h_stream, d2h_event);
+            cuda4dnn::csl::memcpy<float>(reinterpret_cast<float*>(mat.data), view.data(), view.size(), d2h_stream);
+        }
+
         template <class T>
         void convert_H2D(cuda4dnn::csl::Span<T> span, const cv::Mat& mat, cuda4dnn::csl::ManagedPtr<float>& device_temp, const cuda4dnn::csl::Stream& stream);
 
@@ -349,6 +381,28 @@
 
                 cuda4dnn::detail::convert_D2H<T>(mat, view, shared_block->device_temp, shared_block->stream);
                 shared_block->stream.synchronize();
+            } else if (shared_block->d2h_event && shared_block->d2h_event.busy()) {
+                /* wait for the background copy to finish */
+                shared_block->d2h_event.synchronize();
+            }
+        }
+
+        void copyToHostInBackground() override {
+            CV_Assert(shared_block->d2h_stream);
+            if (shared_block->device_dirty) {
+                shared_block->host_dirty = false;
+                shared_block->device_dirty = false;
+
+                auto view = tensor_view_type(shared_block->device.get(), std::begin(shape), std::end(shape));
+
+                auto& mat = shared_block->host;
+                CV_Assert(mat.isContinuous());
+                CV_Assert(mat.type() == CV_32F);
+
+                if (!shared_block->d2h_event)
+                    shared_block->d2h_event = cuda4dnn::csl::Event(true);
+                cuda4dnn::detail::convert_D2H_background<T>(mat, view, shared_block->device_temp, shared_block->stream, shared_block->d2h_stream, shared_block->d2h_event);
+                shared_block->d2h_event.record(shared_block->d2h_stream); // record position so that we can check the status later
             }
         }
 
@@ -383,8 +437,9 @@
 
         std::size_t getRank() const noexcept override { return shape.size(); }
 
-        void setStream(cuda4dnn::csl::Stream stream) noexcept override {
+        void setStream(cuda4dnn::csl::Stream stream, cuda4dnn::csl::Stream d2h_stream) noexcept override {
             shared_block->stream = std::move(stream);
+            shared_block->d2h_stream = std::move(d2h_stream);
         }
 
         void update(const MatShape& shape_, std::size_t offset_) override {
@@ -452,6 +507,9 @@
             cuda4dnn::csl::ManagedPtr<T> device;
             cuda4dnn::csl::ManagedPtr<float> device_temp; /* use for conversions */
             cuda4dnn::csl::Stream stream;
+
+            cuda4dnn::csl::Event d2h_event;
+            cuda4dnn::csl::Stream d2h_stream;
         };
 
         std::shared_ptr<shared_block_type> shared_block;
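Aside (not part of the patch): a standalone sketch of what the fp16 path in `convert_D2H_background` boils down to: convert into a float staging buffer on the inference stream, then let the transfer stream copy it out once the event signals the data is ready. The kernel below is an illustrative stand-in for `cuda4dnn::kernels::fp16_to_fp32`, not the actual implementation, and error checking is omitted:

```cuda
#include <cuda_runtime.h>
#include <cuda_fp16.h>

// Illustrative stand-in for cuda4dnn::kernels::fp16_to_fp32 (not the real kernel).
__global__ void half_to_float(float* dst, const __half* src, size_t n) {
    size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = __half2float(src[i]);
}

// Mirrors the shape of convert_D2H_background<half>: the caller owns the streams,
// the event and the buffers (device_temp plays the role of shared_block->device_temp).
void background_d2h_fp16(const __half* device_fp16, float* device_temp, float* host_dst,
                         size_t n, cudaStream_t inference, cudaStream_t d2h, cudaEvent_t ready)
{
    // 1. convert on the inference stream (kept there in the patch to avoid the
    //    regression observed when the kernel runs on the background stream)
    unsigned int blocks = static_cast<unsigned int>((n + 255) / 256);
    half_to_float<<<blocks, 256, 0, inference>>>(device_temp, device_fp16, n);

    // 2. mark the point at which the converted data becomes available
    cudaEventRecord(ready, inference);

    // 3. the transfer stream waits for that point, then copies in the background
    cudaStreamWaitEvent(d2h, ready, 0);
    cudaMemcpyAsync(host_dst, device_temp, n * sizeof(float), cudaMemcpyDeviceToHost, d2h);
}
```

One practical caveat worth keeping in mind when reviewing: `cudaMemcpyAsync` only overlaps with compute when the destination host memory is page-locked, so with a pageable `cv::Mat` buffer the copy may still partially serialize and the gain from the background stream depends on how the host allocation is made.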