Merge pull request #2303 from jet47:gpu-canny-fix

1d68a899 · Roman Donchenko · OpenCV Buildbot · 48f95d92 · 5dbdadb7 · 1d68a899
Showing with 39 addition and 28 deletion

modules/gpu/perf/perf_imgproc.cpp modules/gpu/perf/perf_imgproc.cpp +2 -2

modules/gpu/src/cuda/canny.cu modules/gpu/src/cuda/canny.cu +29 -20

modules/gpu/src/imgproc.cpp modules/gpu/src/imgproc.cpp +8 -6

未找到文件。
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
@@ -672,7 +672,7 @@ PERF_TEST_P(Sz, ImgProc_ColumnSum,

 DEF_PARAM_TEST(Image_AppertureSz_L2gradient, string, int, bool);

-PERF_TEST_P(Image_AppertureSz_L2gradient, DISABLED_ImgProc_Canny,
+PERF_TEST_P(Image_AppertureSz_L2gradient, ImgProc_Canny,
            Combine(Values("perf/800x600.png", "perf/1280x1024.png", "perf/1680x1050.png"),
                    Values(3, 5),
                    Bool()))
@@ -1777,7 +1777,7 @@ PERF_TEST_P(Image, ImgProc_HoughLinesP,

 DEF_PARAM_TEST(Sz_Dp_MinDist, cv::Size, float, float);

-PERF_TEST_P(Sz_Dp_MinDist, DISABLED_ImgProc_HoughCircles,
+PERF_TEST_P(Sz_Dp_MinDist, ImgProc_HoughCircles,
            Combine(GPU_TYPICAL_MAT_SIZES,
                    Values(1.0f, 2.0f, 4.0f),
                    Values(1.0f)))

--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
@@ -239,30 +239,35 @@ namespace canny
 {
    __device__ int counter = 0;

-    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st)
+    __device__ __forceinline__ bool checkIdx(int y, int x, int rows, int cols)
+    {
+        return (y >= 0) && (y < rows) && (x >= 0) && (x < cols);
+    }
+
+    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st)
    {
        __shared__ volatile int smem[18][18];

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

-        smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
+        smem[threadIdx.y + 1][threadIdx.x + 1] = checkIdx(y, x, map.rows, map.cols) ? map(y, x) : 0;
        if (threadIdx.y == 0)
-            smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
+            smem[0][threadIdx.x + 1] = checkIdx(y - 1, x, map.rows, map.cols) ? map(y - 1, x) : 0;
        if (threadIdx.y == blockDim.y - 1)
-            smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
+            smem[blockDim.y + 1][threadIdx.x + 1] = checkIdx(y + 1, x, map.rows, map.cols) ? map(y + 1, x) : 0;
        if (threadIdx.x == 0)
-            smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
+            smem[threadIdx.y + 1][0] = checkIdx(y, x - 1, map.rows, map.cols) ? map(y, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1)
-            smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
+            smem[threadIdx.y + 1][blockDim.x + 1] = checkIdx(y, x + 1, map.rows, map.cols) ? map(y, x + 1) : 0;
        if (threadIdx.x == 0 && threadIdx.y == 0)
-            smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
+            smem[0][0] = checkIdx(y - 1, x - 1, map.rows, map.cols) ? map(y - 1, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
-            smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
+            smem[0][blockDim.x + 1] = checkIdx(y - 1, x + 1, map.rows, map.cols) ? map(y - 1, x + 1) : 0;
        if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
-            smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
+            smem[blockDim.y + 1][0] = checkIdx(y + 1, x - 1, map.rows, map.cols) ? map(y + 1, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
-            smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
+            smem[blockDim.y + 1][blockDim.x + 1] = checkIdx(y + 1, x + 1, map.rows, map.cols) ? map(y + 1, x + 1) : 0;

        __syncthreads();

@@ -317,11 +322,11 @@ namespace canny
        if (n > 0)
        {
            const int ind =  ::atomicAdd(&counter, 1);
-            st[ind] = make_ushort2(x, y);
+            st[ind] = make_short2(x, y);
        }
    }

-    void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
+    void edgesHysteresisLocal(PtrStepSzi map, short2* st1)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
@@ -345,13 +350,13 @@ namespace canny
    __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
    __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};

-    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
+    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, const int count)
    {
        const int stack_size = 512;

        __shared__ int s_counter;
        __shared__ int s_ind;
-        __shared__ ushort2 s_st[stack_size];
+        __shared__ short2 s_st[stack_size];

        if (threadIdx.x == 0)
            s_counter = 0;
@@ -363,14 +368,14 @@ namespace canny
        if (ind >= count)
            return;

-        ushort2 pos = st1[ind];
+        short2 pos = st1[ind];

        if (threadIdx.x < 8)
        {
            pos.x += c_dx[threadIdx.x];
            pos.y += c_dy[threadIdx.x];

-            if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
+            if (pos.x > 0 && pos.x < map.cols - 1 && pos.y > 0 && pos.y < map.rows - 1 && map(pos.y, pos.x) == 1)
            {
                map(pos.y, pos.x) = 2;

@@ -402,7 +407,7 @@ namespace canny
                pos.x += c_dx[threadIdx.x & 7];
                pos.y += c_dy[threadIdx.x & 7];

-                if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
+                if (pos.x > 0 && pos.x < map.cols - 1 && pos.y > 0 && pos.y < map.rows - 1 && map(pos.y, pos.x) == 1)
                {
                    map(pos.y, pos.x) = 2;

@@ -419,8 +424,10 @@ namespace canny
        {
            if (threadIdx.x == 0)
            {
-                ind = ::atomicAdd(&counter, s_counter);
-                s_ind = ind - s_counter;
+                s_ind = ::atomicAdd(&counter, s_counter);
+
+                if (s_ind + s_counter > map.cols * map.rows)
+                    s_counter = 0;
            }

            __syncthreads();
@@ -432,7 +439,7 @@ namespace canny
        }
    }

-    void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
+    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
@@ -454,6 +461,8 @@ namespace canny

            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );

+            count = min(count, map.cols * map.rows);
+
            std::swap(st1, st2);
        }
    }

--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -1491,6 +1491,8 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,

 void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
 {
+    CV_Assert(image_size.width < std::numeric_limits<short>::max() && image_size.height < std::numeric_limits<short>::max());
+
    if (apperture_size > 0)
    {
        ensureSizeIsEnough(image_size, CV_32SC1, dx);
@@ -1506,8 +1508,8 @@ void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
    ensureSizeIsEnough(image_size, CV_32FC1, mag);
    ensureSizeIsEnough(image_size, CV_32SC1, map);

-    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st1);
-    ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st2);
+    ensureSizeIsEnough(1, image_size.area(), CV_16SC2, st1);
+    ensureSizeIsEnough(1, image_size.area(), CV_16SC2, st2);
 }

 void cv::gpu::CannyBuf::release()
@@ -1527,9 +1529,9 @@ namespace canny

    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh);

-    void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1);
+    void edgesHysteresisLocal(PtrStepSzi map, short2* st1);

-    void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2);
+    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2);

    void getEdges(PtrStepSzi map, PtrStepSzb dst);
 }
@@ -1543,9 +1545,9 @@ namespace
        buf.map.setTo(Scalar::all(0));
        calcMap(dx, dy, buf.mag, buf.map, low_thresh, high_thresh);

-        edgesHysteresisLocal(buf.map, buf.st1.ptr<ushort2>());
+        edgesHysteresisLocal(buf.map, buf.st1.ptr<short2>());

-        edgesHysteresisGlobal(buf.map, buf.st1.ptr<ushort2>(), buf.st2.ptr<ushort2>());
+        edgesHysteresisGlobal(buf.map, buf.st1.ptr<short2>(), buf.st2.ptr<short2>());

        getEdges(buf.map, dst);
    }