From da5aaab27788d32a9f4118b4a90667000c8282aa Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Wed, 22 Aug 2012 12:11:11 +0400
Subject: [PATCH] optimized gpu::integral for Kepler

---
 modules/core/include/opencv2/core/gpumat.hpp |   4 +-
 modules/gpu/src/cuda/integral_image.cu       | 385 +++++++++++++++++++
 modules/gpu/src/imgproc.cpp                  |  92 ++++-
 3 files changed, 461 insertions(+), 20 deletions(-)
 create mode 100644 modules/gpu/src/cuda/integral_image.cu

diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp
index 9893359253..e09f1bc4c3 100644
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@@ -72,9 +72,11 @@ namespace cv { namespace gpu
         FEATURE_SET_COMPUTE_13 = 13,
         FEATURE_SET_COMPUTE_20 = 20,
         FEATURE_SET_COMPUTE_21 = 21,
+        FEATURE_SET_COMPUTE_30 = 30,
 
         GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11,
         SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,
-        NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13
+        NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13,
+        WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
     };
 
     // Gives information about what GPU archs this OpenCV GPU module was
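The new WARP_SHUFFLE_FUNCTIONS constant lets host code gate the shuffle-based
path on Kepler-class hardware. A minimal sketch of the intended check (the
same pattern integralBuffered uses below; cv::gpu::DeviceInfo is the module's
existing runtime-query class):

    cv::gpu::DeviceInfo info;

    if (info.supports(cv::gpu::WARP_SHUFFLE_FUNCTIONS))
    {
        // sm_30+: the __shfl_* intrinsics are available
    }
    else
    {
        // pre-Kepler device: fall back to the NPP-based path
    }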
diff --git a/modules/gpu/src/cuda/integral_image.cu b/modules/gpu/src/cuda/integral_image.cu
new file mode 100644
index 0000000000..ead0ddefec
--- /dev/null
+++ b/modules/gpu/src/cuda/integral_image.cu
@@ -0,0 +1,385 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "opencv2/gpu/device/common.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+    namespace imgproc
+    {
+        // Utility function to extract unsigned chars from an unsigned integer
+        __device__ uchar4 int_to_uchar4(unsigned int in)
+        {
+            uchar4 bytes;
+            bytes.x = (in & 0x000000ff) >> 0;
+            bytes.y = (in & 0x0000ff00) >> 8;
+            bytes.z = (in & 0x00ff0000) >> 16;
+            bytes.w = (in & 0xff000000) >> 24;
+            return bytes;
+        }
+
+        __global__ void shfl_integral_horizontal(const PtrStep_<uint4> img, PtrStep_<uint4> integral)
+        {
+        #if __CUDA_ARCH__ >= 300
+            __shared__ int sums[128];
+
+            const int id = threadIdx.x;
+            const int lane_id = id % warpSize;
+            const int warp_id = id / warpSize;
+
+            const uint4 data = img(blockIdx.x, id);
+
+            const uchar4 a = int_to_uchar4(data.x);
+            const uchar4 b = int_to_uchar4(data.y);
+            const uchar4 c = int_to_uchar4(data.z);
+            const uchar4 d = int_to_uchar4(data.w);
+
+            int result[16];
+
+            result[0] = a.x;
+            result[1] = result[0] + a.y;
+            result[2] = result[1] + a.z;
+            result[3] = result[2] + a.w;
+
+            result[4] = result[3] + b.x;
+            result[5] = result[4] + b.y;
+            result[6] = result[5] + b.z;
+            result[7] = result[6] + b.w;
+
+            result[8] = result[7] + c.x;
+            result[9] = result[8] + c.y;
+            result[10] = result[9] + c.z;
+            result[11] = result[10] + c.w;
+
+            result[12] = result[11] + d.x;
+            result[13] = result[12] + d.y;
+            result[14] = result[13] + d.z;
+            result[15] = result[14] + d.w;
+
+            int sum = result[15];
+
+            // The prefix sum over each thread's 16 values is computed;
+            // now the final sums (result[15]) need to be shared with the
+            // other threads and added. To do this, the __shfl_up()
+            // instruction is used and a shuffle scan operation is
+            // performed to distribute the sums to the correct threads.
+            #pragma unroll
+            for (int i = 1; i < 32; i *= 2)
+            {
+                const int n = __shfl_up(sum, i, 32);
+
+                if (lane_id >= i)
+                {
+                    #pragma unroll
+                    for (int j = 0; j < 16; ++j)
+                        result[j] += n;
+
+                    sum += n;
+                }
+            }
+
+            // Now the final sum of each warp must be shared between warps.
+            // The last thread of each warp, which holds the warp's total
+            // (result[15]), stores it to shared memory; warp 0 then loads
+            // the values and computes a prefix sum over them, again using
+            // __shfl_up. The results are then uniformly added back to the
+            // warps.
+            if (threadIdx.x % warpSize == warpSize - 1)
+                sums[warp_id] = result[15];
+
+            __syncthreads();
+
+            if (warp_id == 0)
+            {
+                int warp_sum = sums[lane_id];
+
+                #pragma unroll
+                for (int i = 1; i <= 32; i *= 2)
+                {
+                    const int n = __shfl_up(warp_sum, i, 32);
+
+                    if (lane_id >= i)
+                        warp_sum += n;
+                }
+
+                sums[lane_id] = warp_sum;
+            }
+
+            __syncthreads();
+
+            int blockSum = 0;
+
+            // fold in the totals of the preceding warps
+            if (warp_id > 0)
+            {
+                blockSum = sums[warp_id - 1];
+
+                #pragma unroll
+                for (int i = 0; i < 16; ++i)
+                    result[i] += blockSum;
+            }
+
+            // Assemble the result. Each thread has 16 values to write, which
+            // are now integer data (to avoid overflow). Instead of each
+            // thread writing consecutive uint4s, the approach shown here
+            // uses the shuffle command to reformat the data inside the
+            // registers so that each thread holds consecutive data to be
+            // written, and larger contiguous segments can be assembled for
+            // writing.
+
+            /*
+                For example, data that needs to be written as
+
+                    GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
+
+                but is stored in registers (r0..r3), in four threads (0..3) as:
+
+                    threadId   0  1  2  3
+                      r0      x0 y0 z0 w0
+                      r1      x1 y1 z1 w1
+                      r2      x2 y2 z2 w2
+                      r3      x3 y3 z3 w3
+
+                after applying __shfl_xor operations to move data between registers r1..r3:
+
+                    threadId   00 01 10 11
+                               x0 y0 z0 w0
+                     xor(01)-> y1 x1 w1 z1
+                     xor(10)-> z2 w2 x2 y2
+                     xor(11)-> w3 z3 y3 x3
+
+                and now x0..x3 and z0..z3 can be written out in order by all threads.
+
+                In the current code, each register above actually represents
+                four integers to be written as uint4's to GMEM.
+            */
+
+            result[4] = __shfl_xor(result[4], 1, 32);
+            result[5] = __shfl_xor(result[5], 1, 32);
+            result[6] = __shfl_xor(result[6], 1, 32);
+            result[7] = __shfl_xor(result[7], 1, 32);
+
+            result[8] = __shfl_xor(result[8], 2, 32);
+            result[9] = __shfl_xor(result[9], 2, 32);
+            result[10] = __shfl_xor(result[10], 2, 32);
+            result[11] = __shfl_xor(result[11], 2, 32);
+
+            result[12] = __shfl_xor(result[12], 3, 32);
+            result[13] = __shfl_xor(result[13], 3, 32);
+            result[14] = __shfl_xor(result[14], 3, 32);
+            result[15] = __shfl_xor(result[15], 3, 32);
+
+            uint4* integral_row = integral.ptr(blockIdx.x);
+            uint4 output;
+
+            ///////
+
+            if (threadIdx.x % 4 == 0)
+                output = make_uint4(result[0], result[1], result[2], result[3]);
+
+            if (threadIdx.x % 4 == 1)
+                output = make_uint4(result[4], result[5], result[6], result[7]);
+
+            if (threadIdx.x % 4 == 2)
+                output = make_uint4(result[8], result[9], result[10], result[11]);
+
+            if (threadIdx.x % 4 == 3)
+                output = make_uint4(result[12], result[13], result[14], result[15]);
+
+            integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;
+
+            ///////
+
+            if (threadIdx.x % 4 == 2)
+                output = make_uint4(result[0], result[1], result[2], result[3]);
+
+            if (threadIdx.x % 4 == 3)
+                output = make_uint4(result[4], result[5], result[6], result[7]);
+
+            if (threadIdx.x % 4 == 0)
+                output = make_uint4(result[8], result[9], result[10], result[11]);
+
+            if (threadIdx.x % 4 == 1)
+                output = make_uint4(result[12], result[13], result[14], result[15]);
+
+            integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;
+
+            // Continuing from the above example, this use of __shfl_xor()
+            // places the y0..y3 and w0..w3 data in order.
+            #pragma unroll
+            for (int i = 0; i < 16; ++i)
+                result[i] = __shfl_xor(result[i], 1, 32);
+
+            if (threadIdx.x % 4 == 0)
+                output = make_uint4(result[0], result[1], result[2], result[3]);
+
+            if (threadIdx.x % 4 == 1)
+                output = make_uint4(result[4], result[5], result[6], result[7]);
+
+            if (threadIdx.x % 4 == 2)
+                output = make_uint4(result[8], result[9], result[10], result[11]);
+
+            if (threadIdx.x % 4 == 3)
+                output = make_uint4(result[12], result[13], result[14], result[15]);
+
+            integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;
+
+            ///////
+
+            if (threadIdx.x % 4 == 2)
+                output = make_uint4(result[0], result[1], result[2], result[3]);
+
+            if (threadIdx.x % 4 == 3)
+                output = make_uint4(result[4], result[5], result[6], result[7]);
+
+            if (threadIdx.x % 4 == 0)
+                output = make_uint4(result[8], result[9], result[10], result[11]);
+
+            if (threadIdx.x % 4 == 1)
+                output = make_uint4(result[12], result[13], result[14], result[15]);
+
+            integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
+        #endif
+        }
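+
+        // For reference, the warp-wide building block above in isolation:
+        // a minimal inclusive-scan sketch over one value per thread
+        // (illustration only, assuming a full 32-thread warp on sm_30+;
+        // the kernels inline this pattern rather than call a helper).
+        __device__ __forceinline__ int warp_inclusive_scan(int val)
+        {
+        #if __CUDA_ARCH__ >= 300
+            const int lane_id = threadIdx.x % warpSize;
+
+            #pragma unroll
+            for (int i = 1; i < 32; i *= 2)
+            {
+                // fetch the running sum of the lane i positions below
+                const int n = __shfl_up(val, i, 32);
+
+                if (lane_id >= i)
+                    val += n;
+            }
+        #endif
+            return val;
+        }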
+
+        // This kernel computes columnwise prefix sums. When the data input is
+        // the row sums from above, this completes the integral image.
+        // The approach here is to have each block compute a local set of sums.
+        // First, the data covered by the block is loaded into shared memory;
+        // then, instead of performing a sum in shared memory using
+        // __syncthreads between stages, the data is reformatted so that the
+        // necessary sums occur inside warps and the shuffle scan operation
+        // is used. The final set of sums from the block is then propagated,
+        // with the block computing "down" the image and adding the running
+        // sum to the local block sums.
+        __global__ void shfl_integral_vertical(DevMem2D_<unsigned int> integral)
+        {
+        #if __CUDA_ARCH__ >= 300
+            __shared__ unsigned int sums[32][9];
+
+            const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
+            const int lane_id = tidx % 8;
+
+            if (tidx >= integral.cols)
+                return;
+
+            sums[threadIdx.x][threadIdx.y] = 0;
+            __syncthreads();
+
+            unsigned int stepSum = 0;
+
+            for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
+            {
+                unsigned int* p = integral.ptr(y) + tidx;
+
+                unsigned int sum = *p;
+
+                // place into SMEM, reformat so that the column sums are
+                // computed inside a warp by a shuffle scan, then read the
+                // results back out
+                sums[threadIdx.x][threadIdx.y] = sum;
+                __syncthreads();
+
+                const int j = threadIdx.x % 8;
+                const int k = threadIdx.x / 8 + threadIdx.y * 4;
+
+                int partial_sum = sums[k][j];
+
+                for (int i = 1; i <= 8; i *= 2)
+                {
+                    int n = __shfl_up(partial_sum, i, 32);
+
+                    if (lane_id >= i)
+                        partial_sum += n;
+                }
+
+                sums[k][j] = partial_sum;
+                __syncthreads();
+
+                if (threadIdx.y > 0)
+                    sum += sums[threadIdx.x][threadIdx.y - 1];
+
+                sum += stepSum;
+                stepSum += sums[threadIdx.x][blockDim.y - 1];
+
+                __syncthreads();
+
+                *p = sum;
+            }
+        #endif
+        }
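+
+        // Taken together, the two kernels implement the separable recurrence
+        // below (a plain sequential model for clarity, written over a
+        // hypothetical sum(y, x) that starts out equal to the source pixels):
+        //
+        //     for (int y = 0; y < rows; ++y)      // shfl_integral_horizontal
+        //         for (int x = 1; x < cols; ++x)
+        //             sum(y, x) += sum(y, x - 1);
+        //
+        //     for (int x = 0; x < cols; ++x)      // shfl_integral_vertical
+        //         for (int y = 1; y < rows; ++y)
+        //             sum(y, x) += sum(y - 1, x);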
+
+        void shfl_integral_gpu(DevMem2Db img, DevMem2D_<unsigned int> integral, cudaStream_t stream)
+        {
+            {
+                // each thread handles 16 values, use 1 block / row
+                const int block = img.cols / 16;
+
+                // launch 1 block / row
+                const int grid = img.rows;
+
+                cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
+
+                shfl_integral_horizontal<<<grid, block, 0, stream>>>((DevMem2D_<uint4>) img, (DevMem2D_<uint4>) integral);
+                cudaSafeCall( cudaGetLastError() );
+            }
+
+            {
+                const dim3 block(32, 8);
+                const dim3 grid(divUp(integral.cols, block.x), 1);
+
+                shfl_integral_vertical<<<grid, block, 0, stream>>>(integral);
+                cudaSafeCall( cudaGetLastError() );
+            }
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    }
+}}}
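The shuffle path requires the row length to be a multiple of 16:
shfl_integral_horizontal reads one uint4 (16 pixels) per thread and launches
img.cols / 16 threads per row-block, so a whole number of threads must cover
each row. The imgproc.cpp changes below therefore round the width up and
zero-pad through the existing buffer argument; a small sketch of the
arithmetic, using a hypothetical width:

    int cols   = 1000;                      // example source width, not 16-aligned
    int padded = ((cols + 15) / 16) * 16;   // -> 1008
    int block  = padded / 16;               // threads per row-block -> 63

The zero padding is harmless because trailing zeros do not change the prefix
sums of the valid region, and the result is cropped back to src.cols + 1
columns at the end.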
diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp
index fe1ad7bd89..5b8fb9c057 100644
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -223,7 +223,7 @@ void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyz, const Mat& Q,
     using namespace cv::gpu::device::imgproc;
 
     typedef void (*func_t)(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream);
-    static const func_t funcs[2][4] = 
+    static const func_t funcs[2][4] =
     {
         {reprojectImageTo3D_gpu<uchar, float3>, 0, 0, reprojectImageTo3D_gpu<short, float3>},
         {reprojectImageTo3D_gpu<uchar, float4>, 0, 0, reprojectImageTo3D_gpu<short, float4>}
@@ -533,32 +533,86 @@ void cv::gpu::integral(const GpuMat& src, GpuMat& sum, Stream& s)
     integralBuffered(src, sum, buffer, s);
 }
 
+namespace cv { namespace gpu { namespace device
+{
+    namespace imgproc
+    {
+        void shfl_integral_gpu(DevMem2Db img, DevMem2D_<unsigned int> integral, cudaStream_t stream);
+    }
+}}}
+
 void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& s)
 {
     CV_Assert(src.type() == CV_8UC1);
 
-    if (sum.cols != src.cols + 1 && sum.rows != src.rows + 1)
-        sum.create(src.rows + 1, src.cols + 1, CV_32S);
+    cudaStream_t stream = StreamAccessor::getStream(s);
 
-    NcvSize32u roiSize;
-    roiSize.width = src.cols;
-    roiSize.height = src.rows;
+    DeviceInfo info;
 
-    cudaDeviceProp prop;
-    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+    if (info.supports(WARP_SHUFFLE_FUNCTIONS))
+    {
+        GpuMat src16;
 
-    Ncv32u bufSize;
-    ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
-    ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
+        if (src.cols % 16 == 0)
+            src16 = src;
+        else
+        {
+            ensureSizeIsEnough(src.rows, ((src.cols + 15) / 16) * 16, src.type(), buffer);
 
-    cudaStream_t stream = StreamAccessor::getStream(s);
+            GpuMat inner = buffer(Rect(0, 0, src.cols, src.rows));
 
-    NppStStreamHandler h(stream);
+            if (s)
+            {
+                s.enqueueMemSet(buffer, Scalar::all(0));
+                s.enqueueCopy(src, inner);
+            }
+            else
+            {
+                buffer.setTo(Scalar::all(0));
+                src.copyTo(inner);
+            }
+
+            src16 = buffer;
+        }
 
-    ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
-        sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
+        sum.create(src16.rows + 1, src16.cols + 1, CV_32SC1);
 
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (s)
+            s.enqueueMemSet(sum, Scalar::all(0));
+        else
+            sum.setTo(Scalar::all(0));
+
+        GpuMat inner = sum(Rect(1, 1, src16.cols, src16.rows));
+
+        cv::gpu::device::imgproc::shfl_integral_gpu(src16, inner, stream);
+
+        if (src16.cols != src.cols)
+            sum = sum(Rect(0, 0, src.cols + 1, src.rows + 1));
+    }
+    else
+    {
+        sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
+
+        NcvSize32u roiSize;
+        roiSize.width = src.cols;
+        roiSize.height = src.rows;
+
+        cudaDeviceProp prop;
+        cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+
+        Ncv32u bufSize;
+        ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
+        ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
+
+        NppStStreamHandler h(stream);
+
+        ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
+            sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -1340,7 +1394,7 @@ Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size /*templ_size
     int width = (result_size.width + 2) / 3;
     int height = (result_size.height + 2) / 3;
     width = std::min(width, result_size.width);
-    height = std::min(height, result_size.height); 
+    height = std::min(height, result_size.height);
     return Size(width, height);
 }
 
@@ -1380,7 +1434,7 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
 
     cufftHandle planR2C, planC2R;
     cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));
-    cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C)); 
+    cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));
 
     cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) );
     cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) );
-- 
GitLab