提交 be0c20b7 编写于 作者: M marina.kolpakova

align grid by 4

上级 8579666b
...@@ -357,18 +357,25 @@ namespace cv { namespace gpu { namespace device ...@@ -357,18 +357,25 @@ namespace cv { namespace gpu { namespace device
#endif #endif
} }
void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream) void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
{ {
{ {
// each thread handles 16 values, use 1 block/row // each thread handles 16 values, use 1 block/row
const int block = img.cols / 16; int block = img.cols / 16;
// safe, because the step can't actually be less than 512 bytes
int align = img.cols % 4;
if ( align != 0)
{
block += (4 - align);
}
// launch 1 block / row // launch 1 block / row
const int grid = img.rows; const int grid = img.rows;
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) ); cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral); shfl_integral_horizontal<<<grid, block, 0, stream>>>((const PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
} }
......
...@@ -537,7 +537,7 @@ namespace cv { namespace gpu { namespace device ...@@ -537,7 +537,7 @@ namespace cv { namespace gpu { namespace device
{ {
namespace imgproc namespace imgproc
{ {
void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream); void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
} }
}}} }}}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册