Commit 7e2cc1be authored by Alexey Spizhevoy

added first version of gpu::countNonZero for all data types

added first version of gpu::countNonZero for all data types; it doesn't support compute capability 1.0 yet. Also fixed a few small bugs.
Parent e470246a
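For context, a minimal usage sketch of the API this commit adds; the header path and the upload-by-constructor pattern are assumptions based on the OpenCV GPU module of this period, not part of the commit:

#include <opencv2/gpu/gpu.hpp>  // assumed header location for this era

int countOnDevice(const cv::Mat& host)
{
    cv::gpu::GpuMat d_src(host);          // upload to the GPU
    return cv::gpu::countNonZero(d_src);  // allocates its small reduction buffer internally
}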
@@ -434,6 +434,11 @@ namespace cv
CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
GpuMat& valbuf, GpuMat& locbuf);
//! counts non-zero array elements
CV_EXPORTS int countNonZero(const GpuMat& src);
//! counts non-zero array elements (this overload reuses the caller-provided buffer)
CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);
//! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
//! destination array will have the depth type as lut and the same channels number as source
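The second overload above exists so callers can keep the intermediate reduction buffer alive across calls. A sketch of the intended reuse pattern (the function and variable names are illustrative, not from the commit):

#include <vector>
#include <opencv2/gpu/gpu.hpp>

std::vector<int> countPerFrame(const std::vector<cv::gpu::GpuMat>& frames)
{
    cv::gpu::GpuMat buf;  // created and sized by the first call, then reused
    std::vector<int> counts(frames.size());
    for (size_t i = 0; i < frames.size(); ++i)
        counts[i] = cv::gpu::countNonZero(frames[i], buf);
    return counts;
}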
@@ -69,6 +69,8 @@ void cv::gpu::minMax(const GpuMat&, double*, double*) { throw_nogpu(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, GpuMat&) { throw_nogpu(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*) { throw_nogpu(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, GpuMat&, GpuMat&) { throw_nogpu(); }
int cv::gpu::countNonZero(const GpuMat&) { throw_nogpu(); return 0; }
int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_nogpu(); return 0; }
void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::exp(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::log(const GpuMat&, GpuMat&) { throw_nogpu(); }
@@ -527,7 +529,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat&
int major, minor;
getComputeCapability(getDevice(), major, minor);
- if (major >= 1 && minor >= 1)
+ if (major > 1 || (major == 1 && minor >= 1))
{
switch (src_.type())
{
@@ -538,7 +540,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat&
case CV_32S: min_max_caller<int>(src_, minVal, maxVal, buf); break;
case CV_32F: min_max_caller<float>(src_, minVal, maxVal, buf); break;
case CV_64F: min_max_caller<double>(src_, minVal, maxVal, buf); break;
- default: CV_Error(CV_StsBadArg, "Unsupported type");
+ default: CV_Error(CV_StsBadArg, "minMax: unsupported type");
}
}
else
@@ -551,7 +553,7 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat&
case CV_16S: min_max_caller_2steps<signed short>(src_, minVal, maxVal, buf); break;
case CV_32S: min_max_caller_2steps<int>(src_, minVal, maxVal, buf); break;
case CV_32F: min_max_caller_2steps<float>(src_, minVal, maxVal, buf); break;
- default: CV_Error(CV_StsBadArg, "Unsupported type");
+ default: CV_Error(CV_StsBadArg, "minMax: unsupported type");
}
}
}
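The reworked gate matters beyond enabling 1.0 fallbacks: the old test (major >= 1 && minor >= 1) inspects minor alone, so it would also reject any future x.0 device such as CC 2.0. A standalone check of the corrected predicate (names illustrative):

#include <cassert>

static bool atLeast11(int major, int minor) { return major > 1 || (major == 1 && minor >= 1); }

int main()
{
    assert(!atLeast11(1, 0));                                       // CC 1.0 takes the two-step path
    assert(atLeast11(1, 1) && atLeast11(1, 3) && atLeast11(2, 0));  // 1.1+ and 2.0 take the fast path
    return 0;
}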
@@ -601,7 +603,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
int major, minor;
getComputeCapability(getDevice(), major, minor);
- if (major >= 1 && minor >= 1)
+ if (major > 1 || (major == 1 && minor >= 1))
{
switch (src.type())
{
@@ -612,7 +614,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
case CV_32S: min_max_loc_caller<int>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
case CV_32F: min_max_loc_caller<float>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
case CV_64F: min_max_loc_caller<double>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
- default: CV_Error(CV_StsBadArg, "Unsupported type");
+ default: CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
}
}
else
@@ -625,7 +627,7 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
case CV_16S: min_max_loc_caller_2steps<signed short>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
case CV_32S: min_max_loc_caller_2steps<int>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
case CV_32F: min_max_loc_caller_2steps<float>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
- default: CV_Error(CV_StsBadArg, "Unsupported type");
+ default: CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
}
}
@@ -633,6 +635,51 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
if (maxLoc) { maxLoc->x = maxLoc_[0]; maxLoc->y = maxLoc_[1]; }
}
////////////////////////////////////////////////////////////////////////
// Count non zero
namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {
void get_buf_size_required(int& cols, int& rows);
template <typename T>
int count_non_zero_caller(const DevMem2D src, PtrStep buf);
template <typename T>
int count_non_zero_caller_2steps(const DevMem2D src, PtrStep buf);
}}}}
int cv::gpu::countNonZero(const GpuMat& src)
{
GpuMat buf;
return countNonZero(src, buf);
}
int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc::countnonzero;
CV_Assert(src.channels() == 1);
Size buf_size;
get_buf_size_required(buf_size.width, buf_size.height);
buf.create(buf_size, CV_8U);
switch (src.type())
{
case CV_8U: return count_non_zero_caller<unsigned char>(src, buf);
case CV_8S: return count_non_zero_caller<signed char>(src, buf);
case CV_16U: return count_non_zero_caller<unsigned short>(src, buf);
case CV_16S: return count_non_zero_caller<signed short>(src, buf);
case CV_32S: return count_non_zero_caller<int>(src, buf);
case CV_32F: return count_non_zero_caller<float>(src, buf);
case CV_64F: return count_non_zero_caller<double>(src, buf);
}
CV_Error(CV_StsBadArg, "countNonZero: unsupported type");
return 0;
}
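For orientation, the buffer contract implied by get_buf_size_required and the callers below: one 32-bit slot per block in a single CV_8U row, with the final total left in slot 0. A standalone check of the sizing arithmetic, with the grid dimensions copied from estimate_thread_cfg in the .cu file:

#include <cassert>

int main()
{
    const int grid_x = 6, grid_y = 5;                          // fixed launch grid used by the kernel
    const int buf_bytes = grid_x * grid_y * (int)sizeof(int);  // one partial count per block
    assert(buf_bytes == 120);                                  // countNonZero allocates a 1 x 120 CV_8U GpuMat
    return 0;
}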
////////////////////////////////////////////////////////////////////////
// LUT
@@ -615,6 +615,8 @@ namespace cv { namespace gpu { namespace mathfunc
} // namespace minmax
///////////////////////////////////////////////////////////////////////////////
// minMaxLoc
namespace minmaxloc {
@@ -868,4 +870,126 @@ namespace cv { namespace gpu { namespace mathfunc
} // namespace minmaxloc
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// countNonZero
namespace countnonzero
{
__constant__ int ctwidth;
__constant__ int ctheight;
static const unsigned int czero = 0;
__device__ unsigned int blocks_finished;
void estimate_thread_cfg(dim3& threads, dim3& grid)
{
threads = dim3(64, 4);
grid = dim3(6, 5);
}
void get_buf_size_required(int& cols, int& rows)
{
dim3 threads, grid;
estimate_thread_cfg(threads, grid);
cols = grid.x * grid.y * sizeof(int);
rows = 1;
}
void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
}
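estimate_kernel_consts fixes, per thread, how many grid-strided columns (twidth) and rows (theight) it must scan. Worked numbers for a 1000x1000 image under the fixed 6x5 grid of 64x4 blocks: twidth = divUp(divUp(1000, 6), 64) = 3 and theight = divUp(divUp(1000, 5), 4) = 50. A standalone check, with divUp reimplemented locally as a stand-in for the OpenCV helper:

#include <cassert>

static int divUp(int a, int b) { return (a + b - 1) / b; }  // local stand-in

int main()
{
    assert(divUp(divUp(1000, 6), 64) == 3);  // twidth: columns scanned per thread
    assert(divUp(divUp(1000, 5), 4) == 50);  // theight: rows scanned per thread
    return 0;
}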
template <int nthreads, typename T>
__global__ void count_non_zero_kernel(const DevMem2D src, volatile unsigned int* count)
{
__shared__ unsigned int scount[nthreads];
unsigned int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
unsigned int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
unsigned int cnt = 0;
for (unsigned int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
for (unsigned int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
cnt += ptr[x0 + x * blockDim.x] != 0;
}
scount[tid] = cnt;
__syncthreads();
for (unsigned int step = nthreads / 2; step > 0; step >>= 1)
{
if (tid < step) scount[tid] += scount[tid + step];
__syncthreads();
}
__shared__ bool is_last;
if (tid == 0)
{
count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
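// Publish this block's partial count before taking a ticket below; the global
// atomicInc is presumably what rules out compute capability 1.0 devices.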
__threadfence();
unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
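// The last block to take a ticket folds every block's partial count into count[0].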
scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
__syncthreads();  // make every thread's store to scount visible before the reduction reads it
for (unsigned int step = nthreads / 2; step > 0; step >>= 1)
{
if (tid < step) scount[tid] += scount[tid + step];
__syncthreads();
}
if (tid == 0) count[0] = scount[0];
}
}
template <typename T>
int count_non_zero_caller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid);
unsigned int* count_buf = (unsigned int*)buf.ptr(0);
cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
cudaSafeCall(cudaThreadSynchronize());
unsigned int count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
return count;
}
template int count_non_zero_caller<unsigned char>(const DevMem2D, PtrStep);
template int count_non_zero_caller<signed char>(const DevMem2D, PtrStep);
template int count_non_zero_caller<unsigned short>(const DevMem2D, PtrStep);
template int count_non_zero_caller<signed short>(const DevMem2D, PtrStep);
template int count_non_zero_caller<int>(const DevMem2D, PtrStep);
template int count_non_zero_caller<float>(const DevMem2D, PtrStep);
template int count_non_zero_caller<double>(const DevMem2D, PtrStep);
} // namespace countnonzero
}}}
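Taken together, the kernel above implements a single-launch, two-phase reduction: each block publishes a partial count, and the last block to take a ticket (via atomicInc) folds the partials into count[0]. Global atomics arrived with compute capability 1.1, which presumably explains the commit's CC 1.0 caveat. A self-contained CUDA sketch of the same pattern, independent of the OpenCV types (all names illustrative, simplified to a 1-D grid; error checking omitted):

#include <cstdio>
#include <cuda_runtime.h>

// Ticket counter; zero-initialized at module load, so valid for a single launch.
// It would need to be reset (as the caller above does) before reuse.
__device__ unsigned int finished = 0;

__global__ void count_nonzero_sketch(const int* data, int n, unsigned int* partial)
{
    __shared__ unsigned int s[256];
    __shared__ bool last;
    const unsigned int tid = threadIdx.x;
    unsigned int cnt = 0;
    // Phase 1: grid-stride tally, then a per-block tree reduction in shared memory.
    for (int i = blockIdx.x * blockDim.x + tid; i < n; i += gridDim.x * blockDim.x)
        cnt += data[i] != 0;
    s[tid] = cnt;
    __syncthreads();
    for (unsigned int step = blockDim.x / 2; step > 0; step >>= 1)
    {
        if (tid < step) s[tid] += s[tid + step];
        __syncthreads();
    }
    if (tid == 0)
    {
        partial[blockIdx.x] = s[0];
        __threadfence();  // publish the partial before taking a ticket
        last = atomicInc(&finished, gridDim.x) == gridDim.x - 1;
    }
    __syncthreads();
    // Phase 2: the last block folds the partials (assumes gridDim.x <= blockDim.x).
    if (last)
    {
        s[tid] = tid < gridDim.x ? partial[tid] : 0;
        __syncthreads();
        for (unsigned int step = blockDim.x / 2; step > 0; step >>= 1)
        {
            if (tid < step) s[tid] += s[tid + step];
            __syncthreads();
        }
        if (tid == 0) partial[0] = s[0];
    }
}

int main()
{
    const int n = 1 << 20, blocks = 30, threads = 256;  // mirrors the 6x5 / 64x4 config above
    int* d_data; unsigned int* d_partial;
    cudaMalloc(&d_data, n * sizeof(int));
    cudaMemset(d_data, 0xFF, n * sizeof(int));          // every element non-zero
    cudaMalloc(&d_partial, blocks * sizeof(unsigned int));
    count_nonzero_sketch<<<blocks, threads>>>(d_data, n, d_partial);
    unsigned int result = 0;
    cudaMemcpy(&result, d_partial, sizeof(result), cudaMemcpyDeviceToHost);
    printf("non-zero: %u (expected %d)\n", result, n);
    cudaFree(d_data); cudaFree(d_partial);
    return 0;
}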
@@ -689,9 +689,7 @@ struct CV_GpuMinMaxTest: public CvTest
for (int cn = 1; cn <= 4; ++cn)
for (int depth = CV_8U; depth <= depth_end; ++depth)
{
- int rows = 1, cols = 3;
- test(rows, cols, cn, depth);
- for (int i = 0; i < 4; ++i)
+ for (int i = 0; i < 1; ++i)
{
int rows = 1 + rand() % 1000;
int cols = 1 + rand() % 1000;
@@ -821,6 +819,59 @@ struct CV_GpuMinMaxLocTest: public CvTest
}
};
////////////////////////////////////////////////////////////////////////////
// Count non zero
struct CV_GpuCountNonZeroTest: CvTest
{
CV_GpuCountNonZeroTest(): CvTest("GPU-CountNonZeroTest", "countNonZero") {}
void run(int)
{
srand(0);
int major, minor;
cv::gpu::getComputeCapability(getDevice(), major, minor);
// Same capability gate as the implementation: only CC 1.1+ runs the CV_64F cases.
int depth_end = (major > 1 || (major == 1 && minor >= 1)) ? CV_64F : CV_32F;
for (int depth = CV_8U; depth <= depth_end; ++depth)
{
for (int i = 0; i < 4; ++i)
{
int rows = 1 + rand() % 1000;
int cols = 1 + rand() % 1000;
test(rows, cols, depth);
}
}
}
void test(int rows, int cols, int depth)
{
cv::Mat src(rows, cols, depth);
cv::RNG rng;
if (depth == CV_32F)
rng.fill(src, RNG::UNIFORM, Scalar(-1000.f), Scalar(1000.f));
else if (depth == CV_64F)
rng.fill(src, RNG::UNIFORM, Scalar(-1000.), Scalar(1000.));
else
{
// Integer depths: fill each row's raw bytes so one code path covers all of them.
for (int i = 0; i < src.rows; ++i)
{
Mat row(1, src.cols * (int)src.elemSize(), CV_8U, src.ptr(i));
rng.fill(row, RNG::UNIFORM, Scalar(0), Scalar(255));
}
}
int n_gold = cv::countNonZero(src);
int n = cv::gpu::countNonZero(cv::gpu::GpuMat(src));
if (n != n_gold)
{
ts->printf(CvTS::CONSOLE, "n=%d n_gold=%d depth=%d cols=%d rows=%d\n", n, n_gold, depth, cols, rows);
n_gold = cv::countNonZero(src);  // recompute the reference (handy when breaking here in a debugger)
}
CHECK(n == n_gold, CvTS::FAIL_INVALID_OUTPUT);
}
};
/////////////////////////////////////////////////////////////////////////////
/////////////////// tests registration /////////////////////////////////////
@@ -850,3 +901,4 @@ CV_GpuNppImageCartToPolarTest CV_GpuNppImageCartToPolar_test;
CV_GpuNppImagePolarToCartTest CV_GpuNppImagePolarToCart_test;
CV_GpuMinMaxTest CV_GpuMinMaxTest_test;
CV_GpuMinMaxLocTest CV_GpuMinMaxLocTest_test;
CV_GpuCountNonZeroTest CV_GpuCountNonZeroTest_test;