diff --git a/modules/cudafeatures2d/src/cuda/orb.cu b/modules/cudafeatures2d/src/cuda/orb.cu index 926c80a9b45df0ead0ab3ffd2c519e05d4072743..182ca4fb867408b721c7fcea483b85d91b9cd2d9 100644 --- a/modules/cudafeatures2d/src/cuda/orb.cu +++ b/modules/cudafeatures2d/src/cuda/orb.cu @@ -51,7 +51,7 @@ #include "opencv2/core/cuda/common.hpp" #include "opencv2/core/cuda/reduce.hpp" #include "opencv2/core/cuda/functional.hpp" - +#include "opencv2/core/cuda/utility.hpp" namespace cv { namespace cuda { namespace device { namespace orb @@ -64,6 +64,16 @@ namespace cv { namespace cuda { namespace device thrust::device_ptr loc_ptr(loc); thrust::device_ptr response_ptr(response); #if THRUST_VERSION >= 100800 +#if THRUST_VERSION >= 100802 + if (stream) + { + thrust::sort_by_key(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater()); + } + else + { + thrust::sort_by_key(thrust::cuda::par(ThrustAllocator::getAllocator()), response_ptr, response_ptr + size, loc_ptr, thrust::greater()); + } +#else if(stream) { thrust::sort_by_key(thrust::cuda::par.on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater()); @@ -71,6 +81,7 @@ namespace cv { namespace cuda { namespace device { thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater()); } +#endif #else thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater()); #endif diff --git a/modules/cudaimgproc/src/cuda/gftt.cu b/modules/cudaimgproc/src/cuda/gftt.cu index 029df41ce84d281ac1d53f8d0652f00022f957f1..ab8713f868a9c6f5ec836dd0ab093afc3e9596af 100644 --- a/modules/cudaimgproc/src/cuda/gftt.cu +++ b/modules/cudaimgproc/src/cuda/gftt.cu @@ -47,7 +47,7 @@ #include "opencv2/core/cuda/common.hpp" #include "opencv2/core/cuda/utility.hpp" - +#include namespace cv { namespace cuda { namespace device { namespace gfft @@ -91,12 +91,12 @@ namespace cv { namespace cuda { namespace device } } - int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count) + int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream) { void* counter_ptr; cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) ); - cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) ); + cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) ); bindTexture(&eigTex, eig); @@ -104,17 +104,18 @@ namespace cv { namespace cuda { namespace device dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y)); if (mask.data) - findCorners<<>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols); + findCorners<<>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols); else - findCorners<<>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols); + findCorners<<>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols); cudaSafeCall( cudaGetLastError() ); - cudaSafeCall( cudaDeviceSynchronize() ); - int count; - cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) ); - + cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) ); + if (stream) + cudaSafeCall(cudaStreamSynchronize(stream)); + else + cudaSafeCall( cudaDeviceSynchronize() ); return std::min(count, max_count); } @@ -128,13 +129,19 @@ namespace cv { namespace cuda { namespace device }; - void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count) + void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count, cudaStream_t stream) { bindTexture(&eigTex, eig); thrust::device_ptr ptr(corners); - +#if THRUST_VERSION >= 100802 + if (stream) + thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater()); + else + thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater()); +#else thrust::sort(ptr, ptr + count, EigGreater()); +#endif } } // namespace optical_flow }}} diff --git a/modules/cudaimgproc/src/gftt.cpp b/modules/cudaimgproc/src/gftt.cpp index 73221c44d1d23dccc2b01e48e24f5f2b9a77802d..bf5d01b1174c798bd465dceac2a7134b7241ab15 100644 --- a/modules/cudaimgproc/src/gftt.cpp +++ b/modules/cudaimgproc/src/gftt.cpp @@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device { namespace gfft { - int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count); - void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count); + int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream); + void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count, cudaStream_t stream); } }}} @@ -97,9 +97,6 @@ namespace void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask, Stream& stream) { - // TODO : implement async version - (void) stream; - using namespace cv::cuda::device::gfft; GpuMat image = _image.getGpuMat(); @@ -108,14 +105,14 @@ namespace CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()) ); ensureSizeIsEnough(image.size(), CV_32FC1, eig_); - cornerCriteria_->compute(image, eig_); + cornerCriteria_->compute(image, eig_, stream); double maxVal = 0; cuda::minMax(eig_, 0, &maxVal); - + cudaStream_t stream_ = StreamAccessor::getStream(stream); ensureSizeIsEnough(1, std::max(1000, static_cast(image.size().area() * 0.05)), CV_32FC2, tmpCorners_); - int total = findCorners_gpu(eig_, static_cast(maxVal * qualityLevel_), mask, tmpCorners_.ptr(), tmpCorners_.cols); + int total = findCorners_gpu(eig_, static_cast(maxVal * qualityLevel_), mask, tmpCorners_.ptr(), tmpCorners_.cols, stream_); if (total == 0) { @@ -123,18 +120,18 @@ namespace return; } - sortCorners_gpu(eig_, tmpCorners_.ptr(), total); + sortCorners_gpu(eig_, tmpCorners_.ptr(), total, stream_); if (minDistance_ < 1) { - tmpCorners_.colRange(0, maxCorners_ > 0 ? std::min(maxCorners_, total) : total).copyTo(_corners); + tmpCorners_.colRange(0, maxCorners_ > 0 ? std::min(maxCorners_, total) : total).copyTo(_corners, stream); } else { std::vector tmp(total); Mat tmpMat(1, total, CV_32FC2, (void*)&tmp[0]); - tmpCorners_.colRange(0, total).download(tmpMat); - + tmpCorners_.colRange(0, total).download(tmpMat, stream); + stream.waitForCompletion(); std::vector tmp2; tmp2.reserve(total); @@ -203,7 +200,7 @@ namespace _corners.create(1, static_cast(tmp2.size()), CV_32FC2); GpuMat corners = _corners.getGpuMat(); - corners.upload(Mat(1, static_cast(tmp2.size()), CV_32FC2, &tmp2[0])); + corners.upload(Mat(1, static_cast(tmp2.size()), CV_32FC2, &tmp2[0]), stream); } } }