align grid by 4

be0c20b7 · marina.kolpakova · 8579666b · be0c20b7 · be0c20b7
隐藏空白更改
内联并排

Showing 2 changed files with 11 additions and 4 deletions

modules/gpu/src/cuda/integral_image.cu modules/gpu/src/cuda/integral_image.cu +10 -3

modules/gpu/src/imgproc.cpp modules/gpu/src/imgproc.cpp +1 -1

未找到文件。
--- a/modules/gpu/src/cuda/integral_image.cu
+++ b/modules/gpu/src/cuda/integral_image.cu
@@ -357,18 +357,25 @@ namespace cv { namespace gpu { namespace device
        #endif
        }
-        void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
+        void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
        {
            {
                // each thread handles 16 values, use 1 block/row
-                const int block = img.cols / 16;
+                int block = img.cols / 16;
+                // safe, because the step actually can't be less than 512 bytes
+                int align = img.cols % 4;
+                if ( align != 0)
+                {
+                    block += (4 - align);
+                }
                // launch 1 block / row
                const int grid = img.rows;
                cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
-                shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
+                shfl_integral_horizontal<<<grid, block, 0, stream>>>((const PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
                cudaSafeCall( cudaGetLastError() );
            }

--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -537,7 +537,7 @@ namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
-        void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
+        void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
    }
 }}}