From da5aaab27788d32a9f4118b4a90667000c8282aa Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Wed, 22 Aug 2012 12:11:11 +0400
Subject: [PATCH] optimized gpu::integral for Kepler

---
 modules/core/include/opencv2/core/gpumat.hpp |   4 +-
 modules/gpu/src/cuda/integral_image.cu       | 385 +++++++++++++++++++
 modules/gpu/src/imgproc.cpp                  |  92 ++++-
 3 files changed, 461 insertions(+), 20 deletions(-)
 create mode 100644 modules/gpu/src/cuda/integral_image.cu

diff --git a/modules/core/include/opencv2/core/gpumat.hpp b/modules/core/include/opencv2/core/gpumat.hpp
index 9893359253..e09f1bc4c3 100644
--- a/modules/core/include/opencv2/core/gpumat.hpp
+++ b/modules/core/include/opencv2/core/gpumat.hpp
@@ -72,9 +72,11 @@ namespace cv { namespace gpu
         FEATURE_SET_COMPUTE_13 = 13,
         FEATURE_SET_COMPUTE_20 = 20,
         FEATURE_SET_COMPUTE_21 = 21,
+        FEATURE_SET_COMPUTE_30 = 30,
 
         GLOBAL_ATOMICS = FEATURE_SET_COMPUTE_11,
         SHARED_ATOMICS = FEATURE_SET_COMPUTE_12,
-        NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13
+        NATIVE_DOUBLE = FEATURE_SET_COMPUTE_13,
+        WARP_SHUFFLE_FUNCTIONS = FEATURE_SET_COMPUTE_30
     };
 
     // Gives information about what GPU archs this OpenCV GPU module was
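The new WARP_SHUFFLE_FUNCTIONS constant lets host code gate the shuffle-based
path on Kepler-class hardware. A minimal sketch of the intended check (the
same pattern integralBuffered uses below; cv::gpu::DeviceInfo is the module's
existing runtime-query class):

    cv::gpu::DeviceInfo info;

    if (info.supports(cv::gpu::WARP_SHUFFLE_FUNCTIONS))
    {
        // sm_30+: the __shfl_* intrinsics are available
    }
    else
    {
        // pre-Kepler device: fall back to the NPP-based path
    }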
diff --git a/modules/gpu/src/cuda/integral_image.cu b/modules/gpu/src/cuda/integral_image.cu
new file mode 100644
index 0000000000..ead0ddefec
--- /dev/null
+++ b/modules/gpu/src/cuda/integral_image.cu
@@ -0,0 +1,385 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "opencv2/gpu/device/common.hpp"
+
+namespace cv { namespace gpu { namespace device
+{
+    namespace imgproc
+    {
+        // Utility function to extract unsigned chars from an unsigned integer
+        __device__ uchar4 int_to_uchar4(unsigned int in)
+        {
+            uchar4 bytes;
+            bytes.x = (in & 0x000000ff) >> 0;
+            bytes.y = (in & 0x0000ff00) >> 8;
+            bytes.z = (in & 0x00ff0000) >> 16;
+            bytes.w = (in & 0xff000000) >> 24;
+            return bytes;
+        }
+
+        __global__ void shfl_integral_horizontal(const PtrStep_<uint4> img, PtrStep_<uint4> integral)
+        {
+        #if __CUDA_ARCH__ >= 300
+            __shared__ int sums[128];
+
+            const int id = threadIdx.x;
+            const int lane_id = id % warpSize;
+            const int warp_id = id / warpSize;
+
+            const uint4 data = img(blockIdx.x, id);
+
+            const uchar4 a = int_to_uchar4(data.x);
+            const uchar4 b = int_to_uchar4(data.y);
+            const uchar4 c = int_to_uchar4(data.z);
+            const uchar4 d = int_to_uchar4(data.w);
+
+            int result[16];
+
+            result[0] = a.x;
+            result[1] = result[0] + a.y;
+            result[2] = result[1] + a.z;
+            result[3] = result[2] + a.w;
+
+            result[4] = result[3] + b.x;
+            result[5] = result[4] + b.y;
+            result[6] = result[5] + b.z;
+            result[7] = result[6] + b.w;
+
+            result[8] = result[7] + c.x;
+            result[9] = result[8] + c.y;
+            result[10] = result[9] + c.z;
+            result[11] = result[10] + c.w;
+
+            result[12] = result[11] + d.x;
+            result[13] = result[12] + d.y;
+            result[14] = result[13] + d.z;
+            result[15] = result[14] + d.w;
+
+            int sum = result[15];
+
+            // The prefix sum over each thread's 16 values is computed;
+            // now the final sums (result[15]) need to be shared with the
+            // other threads and added. To do this, the __shfl_up()
+            // instruction is used and a shuffle scan operation is
+            // performed to distribute the sums to the correct threads.
+            #pragma unroll
+            for (int i = 1; i < 32; i *= 2)
+            {
+                const int n = __shfl_up(sum, i, 32);
+
+                if (lane_id >= i)
+                {
+                    #pragma unroll
+                    for (int j = 0; j < 16; ++j)
+                        result[j] += n;
+
+                    sum += n;
+                }
+            }
+
+            // Now the final sum of each warp must be shared between warps.
+            // The last thread of each warp, which holds the warp's total
+            // (result[15]), stores it to shared memory; warp 0 then loads
+            // the values and computes a prefix sum over them, again using
+            // __shfl_up. The results are then uniformly added back to the
+            // warps.
+            if (threadIdx.x % warpSize == warpSize - 1)
+                sums[warp_id] = result[15];
+
+            __syncthreads();
+
+            if (warp_id == 0)
+            {
+                int warp_sum = sums[lane_id];
+
+                #pragma unroll
+                for (int i = 1; i <= 32; i *= 2)
+                {
+                    const int n = __shfl_up(warp_sum, i, 32);
+
+                    if (lane_id >= i)
+                        warp_sum += n;
+                }
+
+                sums[lane_id] = warp_sum;
+            }
+
+            __syncthreads();
+
+            int blockSum = 0;
+
+            // fold in the totals of the preceding warps
+            if (warp_id > 0)
+            {
+                blockSum = sums[warp_id - 1];
+
+                #pragma unroll
+                for (int i = 0; i < 16; ++i)
+                    result[i] += blockSum;
+            }
+
+            // Assemble the result. Each thread has 16 values to write, which
+            // are now integer data (to avoid overflow). Instead of each
+            // thread writing consecutive uint4s, the approach shown here
+            // uses the shuffle command to reformat the data inside the
+            // registers so that each thread holds consecutive data to be
+            // written, and larger contiguous segments can be assembled for
+            // writing.
+
+            /*
+                For example, data that needs to be written as
+
+                    GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
+
+                but is stored in registers (r0..r3), in four threads (0..3) as:
+
+                    threadId   0  1  2  3
+                      r0      x0 y0 z0 w0
+                      r1      x1 y1 z1 w1
+                      r2      x2 y2 z2 w2
+                      r3      x3 y3 z3 w3
+
+                after applying __shfl_xor operations to move data between registers r1..r3:
+
+                    threadId   00 01 10 11
+                               x0 y0 z0 w0
+                     xor(01)-> y1 x1 w1 z1
+                     xor(10)-> z2 w2 x2 y2
+                     xor(11)-> w3 z3 y3 x3
+
+                and now x0..x3 and z0..z3 can be written out in order by all threads.
+
+                In the current code, each register above actually represents
+                four integers to be written as uint4's to GMEM.
+            */
+
+            result[4] = __shfl_xor(result[4], 1, 32);
+            result[5] = __shfl_xor(result[5], 1, 32);
+            result[6] = __shfl_xor(result[6], 1, 32);
+            result[7] = __shfl_xor(result[7], 1, 32);
+
+            result[8] = __shfl_xor(result[8], 2, 32);
+            result[9] = __shfl_xor(result[9], 2, 32);
+            result[10] = __shfl_xor(result[10], 2, 32);
+            result[11] = __shfl_xor(result[11], 2, 32);
+
+            result[12] = __shfl_xor(result[12], 3, 32);
+            result[13] = __shfl_xor(result[13], 3, 32);
+            result[14] = __shfl_xor(result[14], 3, 32);
+            result[15] = __shfl_xor(result[15], 3, 32);
+
+            uint4* integral_row = integral.ptr(blockIdx.x);
+            uint4 output;
+
+            ///////
+
+            if (threadIdx.x % 4 == 0)
+                output = make_uint4(result[0], result[1], result[2], result[3]);
+
+            if (threadIdx.x % 4 == 1)
+                output = make_uint4(result[4], result[5], result[6], result[7]);
+
+            if (threadIdx.x % 4 == 2)
+                output = make_uint4(result[8], result[9], result[10], result[11]);
+
+            if (threadIdx.x % 4 == 3)
+                output = make_uint4(result[12], result[13], result[14], result[15]);
+
+            integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;
+
+            ///////
+
+            if (threadIdx.x % 4 == 2)
+                output = make_uint4(result[0], result[1], result[2], result[3]);
+
+            if (threadIdx.x % 4 == 3)
+                output = make_uint4(result[4], result[5], result[6], result[7]);
+
+            if (threadIdx.x % 4 == 0)
+                output = make_uint4(result[8], result[9], result[10], result[11]);
+
+            if (threadIdx.x % 4 == 1)
+                output = make_uint4(result[12], result[13], result[14], result[15]);
+
+            integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;
+
+            // Continuing from the above example, this use of __shfl_xor()
+            // places the y0..y3 and w0..w3 data in order.
+            #pragma unroll
+            for (int i = 0; i < 16; ++i)
+                result[i] = __shfl_xor(result[i], 1, 32);
+
+            if (threadIdx.x % 4 == 0)
+                output = make_uint4(result[0], result[1], result[2], result[3]);
+
+            if (threadIdx.x % 4 == 1)
+                output = make_uint4(result[4], result[5], result[6], result[7]);
+
+            if (threadIdx.x % 4 == 2)
+                output = make_uint4(result[8], result[9], result[10], result[11]);
+
+            if (threadIdx.x % 4 == 3)
+                output = make_uint4(result[12], result[13], result[14], result[15]);
+
+            integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;
+
+            ///////
+
+            if (threadIdx.x % 4 == 2)
+                output = make_uint4(result[0], result[1], result[2], result[3]);
+
+            if (threadIdx.x % 4 == 3)
+                output = make_uint4(result[4], result[5], result[6], result[7]);
+
+            if (threadIdx.x % 4 == 0)
+                output = make_uint4(result[8], result[9], result[10], result[11]);
+
+            if (threadIdx.x % 4 == 1)
+                output = make_uint4(result[12], result[13], result[14], result[15]);
+
+            integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
+        #endif
+        }
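+
+        // For reference, the warp-wide building block above in isolation:
+        // a minimal inclusive-scan sketch over one value per thread
+        // (illustration only, assuming a full 32-thread warp on sm_30+;
+        // the kernels inline this pattern rather than call a helper).
+        __device__ __forceinline__ int warp_inclusive_scan(int val)
+        {
+        #if __CUDA_ARCH__ >= 300
+            const int lane_id = threadIdx.x % warpSize;
+
+            #pragma unroll
+            for (int i = 1; i < 32; i *= 2)
+            {
+                // fetch the running sum of the lane i positions below
+                const int n = __shfl_up(val, i, 32);
+
+                if (lane_id >= i)
+                    val += n;
+            }
+        #endif
+            return val;
+        }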
+
+        // This kernel computes columnwise prefix sums. When the data input is
+        // the row sums from above, this completes the integral image.
+        // The approach here is to have each block compute a local set of sums.
+        // First, the data covered by the block is loaded into shared memory;
+        // then, instead of performing a sum in shared memory using
+        // __syncthreads between stages, the data is reformatted so that the
+        // necessary sums occur inside warps and the shuffle scan operation
+        // is used. The final set of sums from the block is then propagated,
+        // with the block computing "down" the image and adding the running
+        // sum to the local block sums.
+        __global__ void shfl_integral_vertical(DevMem2D_<unsigned int> integral)
+        {
+        #if __CUDA_ARCH__ >= 300
+            __shared__ unsigned int sums[32][9];
+
+            const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
+            const int lane_id = tidx % 8;
+
+            if (tidx >= integral.cols)
+                return;
+
+            sums[threadIdx.x][threadIdx.y] = 0;
+            __syncthreads();
+
+            unsigned int stepSum = 0;
+
+            for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
+            {
+                unsigned int* p = integral.ptr(y) + tidx;
+
+                unsigned int sum = *p;
+
+                // place into SMEM, reformat so that the column sums are
+                // computed inside a warp by a shuffle scan, then read the
+                // results back out
+                sums[threadIdx.x][threadIdx.y] = sum;
+                __syncthreads();
+
+                const int j = threadIdx.x % 8;
+                const int k = threadIdx.x / 8 + threadIdx.y * 4;
+
+                int partial_sum = sums[k][j];
+
+                for (int i = 1; i <= 8; i *= 2)
+                {
+                    int n = __shfl_up(partial_sum, i, 32);
+
+                    if (lane_id >= i)
+                        partial_sum += n;
+                }
+
+                sums[k][j] = partial_sum;
+                __syncthreads();
+
+                if (threadIdx.y > 0)
+                    sum += sums[threadIdx.x][threadIdx.y - 1];
+
+                sum += stepSum;
+                stepSum += sums[threadIdx.x][blockDim.y - 1];
+
+                __syncthreads();
+
+                *p = sum;
+            }
+        #endif
+        }
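+
+        // Taken together, the two kernels implement the separable recurrence
+        // below (a plain sequential model for clarity, written over a
+        // hypothetical sum(y, x) that starts out equal to the source pixels):
+        //
+        //     for (int y = 0; y < rows; ++y)      // shfl_integral_horizontal
+        //         for (int x = 1; x < cols; ++x)
+        //             sum(y, x) += sum(y, x - 1);
+        //
+        //     for (int x = 0; x < cols; ++x)      // shfl_integral_vertical
+        //         for (int y = 1; y < rows; ++y)
+        //             sum(y, x) += sum(y - 1, x);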
+
+        void shfl_integral_gpu(DevMem2Db img, DevMem2D_<unsigned int> integral, cudaStream_t stream)
+        {
+            {
+                // each thread handles 16 values, use 1 block / row
+                const int block = img.cols / 16;
+
+                // launch 1 block / row
+                const int grid = img.rows;
+
+                cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
+
+                shfl_integral_horizontal<<<grid, block, 0, stream>>>((DevMem2D_<uint4>) img, (DevMem2D_<uint4>) integral);
+                cudaSafeCall( cudaGetLastError() );
+            }
+
+            {
+                const dim3 block(32, 8);
+                const dim3 grid(divUp(integral.cols, block.x), 1);
+
+                shfl_integral_vertical<<<grid, block, 0, stream>>>(integral);
+                cudaSafeCall( cudaGetLastError() );
+            }
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    }
+}}}
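The shuffle path requires the row length to be a multiple of 16:
shfl_integral_horizontal reads one uint4 (16 pixels) per thread and launches
img.cols / 16 threads per row-block, so a whole number of threads must cover
each row. The imgproc.cpp changes below therefore round the width up and
zero-pad through the existing buffer argument; a small sketch of the
arithmetic, using a hypothetical width:

    int cols   = 1000;                      // example source width, not 16-aligned
    int padded = ((cols + 15) / 16) * 16;   // -> 1008
    int block  = padded / 16;               // threads per row-block -> 63

The zero padding is harmless because trailing zeros do not change the prefix
sums of the valid region, and the result is cropped back to src.cols + 1
columns at the end.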
diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp
index fe1ad7bd89..5b8fb9c057 100644
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -223,7 +223,7 @@ void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyz, const Mat& Q,
     using namespace cv::gpu::device::imgproc;
 
     typedef void (*func_t)(const DevMem2Db disp, DevMem2Db xyz, const float* q, cudaStream_t stream);
-    static const func_t funcs[2][4] = 
+    static const func_t funcs[2][4] =
     {
         {reprojectImageTo3D_gpu<uchar, float3>, 0, 0, reprojectImageTo3D_gpu<short, float3>},
         {reprojectImageTo3D_gpu<uchar, float4>, 0, 0, reprojectImageTo3D_gpu<short, float4>}
@@ -533,32 +533,86 @@ void cv::gpu::integral(const GpuMat& src, GpuMat& sum, Stream& s)
     integralBuffered(src, sum, buffer, s);
 }
 
+namespace cv { namespace gpu { namespace device
+{
+    namespace imgproc
+    {
+        void shfl_integral_gpu(DevMem2Db img, DevMem2D_<unsigned int> integral, cudaStream_t stream);
+    }
+}}}
+
 void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& s)
 {
     CV_Assert(src.type() == CV_8UC1);
 
-    if (sum.cols != src.cols + 1 && sum.rows != src.rows + 1)
-        sum.create(src.rows + 1, src.cols + 1, CV_32S);
+    cudaStream_t stream = StreamAccessor::getStream(s);
 
-    NcvSize32u roiSize;
-    roiSize.width = src.cols;
-    roiSize.height = src.rows;
+    DeviceInfo info;
 
-    cudaDeviceProp prop;
-    cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+    if (info.supports(WARP_SHUFFLE_FUNCTIONS))
+    {
+        GpuMat src16;
 
-    Ncv32u bufSize;
-    ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
-    ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
+        if (src.cols % 16 == 0)
+            src16 = src;
+        else
+        {
+            ensureSizeIsEnough(src.rows, ((src.cols + 15) / 16) * 16, src.type(), buffer);
 
-    cudaStream_t stream = StreamAccessor::getStream(s);
+            GpuMat inner = buffer(Rect(0, 0, src.cols, src.rows));
 
-    NppStStreamHandler h(stream);
+            if (s)
+            {
+                s.enqueueMemSet(buffer, Scalar::all(0));
+                s.enqueueCopy(src, inner);
+            }
+            else
+            {
+                buffer.setTo(Scalar::all(0));
+                src.copyTo(inner);
+            }
+
+            src16 = buffer;
+        }
 
-    ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
-        sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
+        sum.create(src16.rows + 1, src16.cols + 1, CV_32SC1);
 
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (s)
+            s.enqueueMemSet(sum, Scalar::all(0));
+        else
+            sum.setTo(Scalar::all(0));
+
+        GpuMat inner = sum(Rect(1, 1, src16.cols, src16.rows));
+
+        cv::gpu::device::imgproc::shfl_integral_gpu(src16, inner, stream);
+
+        if (src16.cols != src.cols)
+            sum = sum(Rect(0, 0, src.cols + 1, src.rows + 1));
+    }
+    else
+    {
+        sum.create(src.rows + 1, src.cols + 1, CV_32SC1);
+
+        NcvSize32u roiSize;
+        roiSize.width = src.cols;
+        roiSize.height = src.rows;
+
+        cudaDeviceProp prop;
+        cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
+
+        Ncv32u bufSize;
+        ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
+        ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
+
+        NppStStreamHandler h(stream);
+
+        ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
+            sum.ptr<Ncv32u>(), static_cast<int>(sum.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -1340,7 +1394,7 @@ Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size /*templ_size
     int width = (result_size.width + 2) / 3;
     int height = (result_size.height + 2) / 3;
     width = std::min(width, result_size.width);
-    height = std::min(height, result_size.height); 
+    height = std::min(height, result_size.height);
     return Size(width, height);
 }
 
@@ -1380,7 +1434,7 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
 
     cufftHandle planR2C, planC2R;
     cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));
-    cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C)); 
+    cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));
 
     cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) );
     cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) );
-- 
GitLab