Commit ae529f4b authored by Alexey Spizhevoy

added absSum function

Parent 9b4c6826
......@@ -766,6 +766,14 @@ namespace cv
//! supports only single channel images
CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);
//! computes sum of array elements absolute values
//! supports only single channel images
CV_EXPORTS Scalar absSum(const GpuMat& src);
//! computes sum of array elements absolute values
//! supports only single channel images
CV_EXPORTS Scalar absSum(const GpuMat& src, GpuMat& buf);
//! computes squared sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sqrSum(const GpuMat& src);
......
......@@ -953,6 +953,12 @@ namespace cv { namespace gpu { namespace mathfunc
template <typename R>
struct IdentityOp { static __device__ R call(R x) { return x; } };
template <typename R>
struct AbsOp { static __device__ R call(R x) { return abs(x); } };
template <>
struct AbsOp<uint> { static __device__ uint call(uint x) { return x; } };
        // Element transform for squared sums: accumulates x * x per element.
        // NOTE(review): for narrow R the square can overflow; presumably
        // SumType<T>::R is chosen wide enough -- confirm against SumType.
        template <typename R>
        struct SqrOp { static __device__ R call(R x) { return x * x; } };
......@@ -1509,6 +1515,110 @@ namespace cv { namespace gpu { namespace mathfunc
template void sumCaller<float>(const DevMem2D, PtrStep, double*, int);
    // Two-pass absolute-value sum reduction.
    //
    // Pass 1 launches a channel-count-specific kernel that reduces each thread
    // block's |src| elements (via AbsOp<R>) into one partial sum per block in
    // buf; pass 2 runs a single thread block that folds the grid.x * grid.y
    // partials in buf down to the final per-channel totals.
    //
    // src : source image with element type T, 1..4 channels
    // buf : device scratch area (sized via sums::getBufSizeRequired)
    // sum : host array of 4 doubles; channels beyond cn are set to 0
    // cn  : channel count, 1..4 (any other value launches nothing)
    //
    // NOTE(review): selected by the host wrapper when device atomics are NOT
    // available -- the second pass replaces an atomic fold.
    template <typename T>
    void absSumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn)
    {
        using namespace sums;
        // R: accumulator type wide enough for sums of T (see SumType<T>).
        typedef typename SumType<T>::R R;
        dim3 threads, grid;
        estimateThreadCfg(src.cols, src.rows, threads, grid);
        setKernelConsts(src.cols, src.rows, threads, grid);
        // Kernels are specialized per channel count; each applies AbsOp<R>
        // to every element before accumulating.
        switch (cn)
        {
        case 1:
            sumKernel<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
                    src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
            sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
                    (typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
            break;
        case 2:
            sumKernel_C2<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
                    src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
            sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
                    (typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
            break;
        case 3:
            sumKernel_C3<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
                    src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
            sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
                    (typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
            break;
        case 4:
            sumKernel_C4<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
                    src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
            sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
                    (typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
            break;
        }
        // NOTE(review): there is no cudaGetLastError() after the launches;
        // a bad launch configuration would surface only at this synchronize.
        cudaSafeCall(cudaThreadSynchronize());
        // Copy back the cn folded totals; unused channels stay zero.
        R result[4] = {0, 0, 0, 0};
        cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
        sum[0] = result[0];
        sum[1] = result[1];
        sum[2] = result[2];
        sum[3] = result[3];
    }

    // Explicit instantiations for all supported source depths.
    template void absSumMultipassCaller<uchar>(const DevMem2D, PtrStep, double*, int);
    template void absSumMultipassCaller<char>(const DevMem2D, PtrStep, double*, int);
    template void absSumMultipassCaller<ushort>(const DevMem2D, PtrStep, double*, int);
    template void absSumMultipassCaller<short>(const DevMem2D, PtrStep, double*, int);
    template void absSumMultipassCaller<int>(const DevMem2D, PtrStep, double*, int);
    template void absSumMultipassCaller<float>(const DevMem2D, PtrStep, double*, int);
    // Single-pass absolute-value sum reduction: one kernel launch per call,
    // after which the per-channel totals are read back from the start of buf.
    //
    // Parameters are as in absSumMultipassCaller.
    //
    // NOTE(review): selected by the host wrapper only when device atomics are
    // available, so sumKernel presumably completes the whole reduction in a
    // single launch -- confirm against the kernel definition.
    template <typename T>
    void absSumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn)
    {
        using namespace sums;
        // R: accumulator type wide enough for sums of T (see SumType<T>).
        typedef typename SumType<T>::R R;
        dim3 threads, grid;
        estimateThreadCfg(src.cols, src.rows, threads, grid);
        setKernelConsts(src.cols, src.rows, threads, grid);
        // Dispatch to the channel-count-specific kernel; each applies
        // AbsOp<R> to every element before accumulating.
        switch (cn)
        {
        case 1:
            sumKernel<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
                    src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
            break;
        case 2:
            sumKernel_C2<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
                    src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
            break;
        case 3:
            sumKernel_C3<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
                    src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
            break;
        case 4:
            sumKernel_C4<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
                    src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
            break;
        }
        // NOTE(review): no cudaGetLastError() after the launch; a bad launch
        // configuration would surface only at this synchronize.
        cudaSafeCall(cudaThreadSynchronize());
        // Copy back the cn totals; unused channels stay zero.
        R result[4] = {0, 0, 0, 0};
        cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
        sum[0] = result[0];
        sum[1] = result[1];
        sum[2] = result[2];
        sum[3] = result[3];
    }

    // Explicit instantiations for all supported source depths.
    template void absSumCaller<uchar>(const DevMem2D, PtrStep, double*, int);
    template void absSumCaller<char>(const DevMem2D, PtrStep, double*, int);
    template void absSumCaller<ushort>(const DevMem2D, PtrStep, double*, int);
    template void absSumCaller<short>(const DevMem2D, PtrStep, double*, int);
    template void absSumCaller<int>(const DevMem2D, PtrStep, double*, int);
    template void absSumCaller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sqrSumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
......
......@@ -52,6 +52,8 @@ double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; }
// Stub implementations compiled when CUDA support is unavailable: each entry
// point forwards to throw_nogpu(); the trailing return statements exist only
// to satisfy the compiler.
double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; }
Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::absSum(const GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::absSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); }
......@@ -128,6 +130,12 @@ namespace cv { namespace gpu { namespace mathfunc
    // Forward declarations of the reduction entry points implemented in the
    // CUDA source file.  Each computes per-channel sums of src into sum[0..3]
    // using buf as device scratch space; cn is the channel count.  The
    // *MultipassCaller variants are the fallback selected by the host
    // wrappers when the device lacks atomics.
    template <typename T>
    void sumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);

    template <typename T>
    void absSumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);

    template <typename T>
    void absSumMultipassCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);

    template <typename T>
    void sqrSumCaller(const DevMem2D src, PtrStep buf, double* sum, int cn);
......@@ -166,7 +174,7 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
Size buf_size;
sums::getBufSizeRequired(src.cols, src.rows, src.channels(),
buf_size.width, buf_size.height);
buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
Caller* callers = multipass_callers;
......@@ -182,6 +190,47 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
}
Scalar cv::gpu::absSum(const GpuMat& src)
{
    // Convenience overload: allocate the scratch buffer internally and
    // forward to the buffered version.
    GpuMat scratch;
    return absSum(src, scratch);
}
Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
{
    using namespace mathfunc;

    typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);

    // Per-depth dispatch tables (indexed by src.depth()); the trailing 0
    // marks CV_64F as unsupported.
    static Caller multipass_callers[7] = {
            absSumMultipassCaller<unsigned char>, absSumMultipassCaller<char>,
            absSumMultipassCaller<unsigned short>, absSumMultipassCaller<short>,
            absSumMultipassCaller<int>, absSumMultipassCaller<float>, 0 };

    static Caller singlepass_callers[7] = {
            absSumCaller<unsigned char>, absSumCaller<char>,
            absSumCaller<unsigned short>, absSumCaller<short>,
            absSumCaller<int>, absSumCaller<float>, 0 };

    // Grow the intermediate buffer if it is too small for this image.
    Size required;
    sums::getBufSizeRequired(src.cols, src.rows, src.channels(),
                             required.width, required.height);
    ensureSizeIsEnough(required, CV_8U, buf);

    // The single-pass reduction needs global atomics built in and present
    // on the current device; otherwise fall back to the two-pass variant.
    const bool single_pass =
            TargetArchs::builtWith(ATOMICS) && DeviceInfo().has(ATOMICS);

    Caller caller = (single_pass ? singlepass_callers : multipass_callers)[src.depth()];
    if (!caller) CV_Error(CV_StsBadArg, "absSum: unsupported type");

    double result[4];
    caller(src, buf, result, src.channels());
    return Scalar(result[0], result[1], result[2], result[3]);
}
Scalar cv::gpu::sqrSum(const GpuMat& src)
{
GpuMat buf;
......@@ -222,6 +271,9 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
return Scalar(result[0], result[1], result[2], result[3]);
}
////////////////////////////////////////////////////////////////////////
// Find min or max
......
......@@ -956,6 +956,10 @@ struct CV_GpuSumTest: CvTest
int typemax = CV_32F;
for (int type = CV_8U; type <= typemax; ++type)
{
//
// sum
//
gen(1 + rand() % 500, 1 + rand() % 500, CV_MAKETYPE(type, 2), src);
a = sum(src);
b = sum(GpuMat(src));
......@@ -965,6 +969,7 @@ struct CV_GpuSumTest: CvTest
ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
return;
}
gen(1 + rand() % 500, 1 + rand() % 500, CV_MAKETYPE(type, 3), src);
a = sum(src);
b = sum(GpuMat(src));
......@@ -974,6 +979,7 @@ struct CV_GpuSumTest: CvTest
ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
return;
}
gen(1 + rand() % 500, 1 + rand() % 500, CV_MAKETYPE(type, 4), src);
a = sum(src);
b = sum(GpuMat(src));
......@@ -983,6 +989,7 @@ struct CV_GpuSumTest: CvTest
ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
return;
}
gen(1 + rand() % 500, 1 + rand() % 500, type, src);
a = sum(src);
b = sum(GpuMat(src));
......@@ -992,6 +999,25 @@ struct CV_GpuSumTest: CvTest
ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
return;
}
//
// absSum
//
gen(1 + rand() % 200, 1 + rand() % 200, CV_MAKETYPE(type, 1), src);
b = absSum(GpuMat(src));
a = norm(src, NORM_L1);
if (abs(a[0] - b[0]) > src.size().area() * max_err)
{
ts->printf(CvTS::CONSOLE, "type: %d, cols: %d, rows: %d, expected: %f, actual: %f\n", type, src.cols, src.rows, a[0], b[0]);
ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT);
return;
}
//
// sqrSum
//
if (type != CV_8S)
{
gen(1 + rand() % 200, 1 + rand() % 200, CV_MAKETYPE(type, 1), src);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register