Thrust allocator usage.

a21ede94 · Dan · 7a934f9e · a21ede94 · a21ede94 · a21ede94
3 changed file
--- a/modules/cudafeatures2d/src/cuda/orb.cu
+++ b/modules/cudafeatures2d/src/cuda/orb.cu
@@ -51,7 +51,7 @@
 #include "opencv2/core/cuda/common.hpp"
 #include "opencv2/core/cuda/reduce.hpp"
 #include "opencv2/core/cuda/functional.hpp"
-
+#include "opencv2/core/cuda/utility.hpp"
 namespace cv { namespace cuda { namespace device
 {
    namespace orb
@@ -64,6 +64,16 @@ namespace cv { namespace cuda { namespace device
            thrust::device_ptr<int> loc_ptr(loc);
            thrust::device_ptr<float> response_ptr(response);
 #if THRUST_VERSION >= 100800
+#if THRUST_VERSION >= 100802
+            if (stream)
+            {
+                thrust::sort_by_key(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
+            }
+            else
+            {
+                thrust::sort_by_key(thrust::cuda::par(ThrustAllocator::getAllocator()), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
+            }
+#else
            if(stream)
            {
                thrust::sort_by_key(thrust::cuda::par.on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
@@ -71,6 +81,7 @@ namespace cv { namespace cuda { namespace device
            {
                thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
            }
+#endif
 #else
            thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
 #endif

--- a/modules/cudaimgproc/src/cuda/gftt.cu
+++ b/modules/cudaimgproc/src/cuda/gftt.cu
@@ -47,7 +47,7 @@

 #include "opencv2/core/cuda/common.hpp"
 #include "opencv2/core/cuda/utility.hpp"
-
+#include <thrust/execution_policy.h>
 namespace cv { namespace cuda { namespace device
 {
    namespace gfft
@@ -91,12 +91,12 @@ namespace cv { namespace cuda { namespace device
            }
        }

-        int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count)
+        int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream)
        {
            void* counter_ptr;
            cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );

-            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
+            cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );

            bindTexture(&eigTex, eig);

@@ -104,17 +104,18 @@ namespace cv { namespace cuda { namespace device
            dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y));

            if (mask.data)
-                findCorners<<<grid, block>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
+                findCorners<<<grid, block, 0, stream>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
            else
-                findCorners<<<grid, block>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);
+                findCorners<<<grid, block, 0, stream>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);

            cudaSafeCall( cudaGetLastError() );

-            cudaSafeCall( cudaDeviceSynchronize() );
-
            int count;
-            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
-
+            cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
+            if (stream)
+                cudaSafeCall(cudaStreamSynchronize(stream));
+            else
+                cudaSafeCall( cudaDeviceSynchronize() );
            return std::min(count, max_count);
        }

@@ -128,13 +129,19 @@ namespace cv { namespace cuda { namespace device
        };


-        void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count)
+        void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count, cudaStream_t stream)
        {
            bindTexture(&eigTex, eig);

            thrust::device_ptr<float2> ptr(corners);
-
+#if THRUST_VERSION >= 100802
+            if (stream)
+                thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater());
+            else
+                thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater());
+#else
            thrust::sort(ptr, ptr + count, EigGreater());
+#endif
        }
    } // namespace optical_flow
 }}}

--- a/modules/cudaimgproc/src/gftt.cpp
+++ b/modules/cudaimgproc/src/gftt.cpp
@@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device
 {
    namespace gfft
    {
-        int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count);
-        void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count);
+        int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, cudaStream_t stream);
+        void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count, cudaStream_t stream);
    }
 }}}

@@ -97,9 +97,6 @@ namespace

    void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask, Stream& stream)
    {
-        // TODO : implement async version
-        (void) stream;
-
        using namespace cv::cuda::device::gfft;

        GpuMat image = _image.getGpuMat();
@@ -108,14 +105,14 @@ namespace
        CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()) );

        ensureSizeIsEnough(image.size(), CV_32FC1, eig_);
-        cornerCriteria_->compute(image, eig_);
+        cornerCriteria_->compute(image, eig_, stream);

        double maxVal = 0;
        cuda::minMax(eig_, 0, &maxVal);
-
+        cudaStream_t stream_ = StreamAccessor::getStream(stream);
        ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);

-        int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols);
+        int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols, stream_);

        if (total == 0)
        {
@@ -123,18 +120,18 @@ namespace
            return;
        }

-        sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total);
+        sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total, stream_);

        if (minDistance_ < 1)
        {
-            tmpCorners_.colRange(0, maxCorners_ > 0 ? std::min(maxCorners_, total) : total).copyTo(_corners);
+            tmpCorners_.colRange(0, maxCorners_ > 0 ? std::min(maxCorners_, total) : total).copyTo(_corners, stream);
        }
        else
        {
            std::vector<Point2f> tmp(total);
            Mat tmpMat(1, total, CV_32FC2, (void*)&tmp[0]);
-            tmpCorners_.colRange(0, total).download(tmpMat);
-
+            tmpCorners_.colRange(0, total).download(tmpMat, stream);
+            stream.waitForCompletion();
            std::vector<Point2f> tmp2;
            tmp2.reserve(total);

@@ -203,7 +200,7 @@ namespace
            _corners.create(1, static_cast<int>(tmp2.size()), CV_32FC2);
            GpuMat corners = _corners.getGpuMat();

-            corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
+            corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]), stream);
        }
    }
 }