From e45f92a9e12fbd1031280eea01ad76fcbb4612eb Mon Sep 17 00:00:00 2001 From: pengxiao Date: Wed, 4 Sep 2013 16:38:36 +0800 Subject: [PATCH] Fix a bug of ocl retina on NVIDIA platform. --- .../bioinspired/src/opencl/retina_kernel.cl | 59 ++++++++++++++----- modules/bioinspired/src/retina_ocl.cpp | 2 +- modules/bioinspired/test/test_retina_ocl.cpp | 2 +- 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/modules/bioinspired/src/opencl/retina_kernel.cl b/modules/bioinspired/src/opencl/retina_kernel.cl index 6da4219d9c..1eac50324a 100644 --- a/modules/bioinspired/src/opencl/retina_kernel.cl +++ b/modules/bioinspired/src/opencl/retina_kernel.cl @@ -114,19 +114,34 @@ kernel void horizontalAnticausalFilter( global float * optr = output + mad24(gid + 1, elements_per_row, - 1 + out_offset / 4); - float4 result = (float4)(0), out_v4; + float4 result_v4 = (float4)(0), out_v4; + float result = 0; // we assume elements_per_row is multple of 4 - for(int i = 0; i < elements_per_row / 4; ++i, optr -= 4) + for(int i = 0; i < 4; ++ i, -- optr) + { + if(i < elements_per_row - cols) + { + *optr = result; + } + else + { + result = *optr + _a * result; + *optr = result; + } + } + result_v4.x = result; + optr -= 3; + for(int i = 1; i < elements_per_row / 4; ++i, optr -= 4) { // shift left, `offset` is type `size_t` so it cannot be negative - out_v4 = vload4(0, optr - 3); + out_v4 = vload4(0, optr); - result.w = out_v4.w + _a * result.x; - result.z = out_v4.z + _a * result.w; - result.y = out_v4.y + _a * result.z; - result.x = out_v4.x + _a * result.y; + result_v4.w = out_v4.w + _a * result_v4.x; + result_v4.z = out_v4.z + _a * result_v4.w; + result_v4.y = out_v4.y + _a * result_v4.z; + result_v4.x = out_v4.x + _a * result_v4.y; - vstore4(result, 0, optr - 3); + vstore4(result_v4, 0, optr); } } @@ -207,18 +222,34 @@ kernel void horizontalAnticausalFilter_Irregular( buffer + mad24(rows - gid, elements_per_row, -1 + buffer_offset / 4); float4 buf_v4, out_v4, res_v4 = (float4)(0); - - for(int i = 0; i < elements_per_row / 4; ++i, optr -= 4, bptr -= 4) - { - buf_v4 = vload4(0, bptr - 3); - out_v4 = vload4(0, optr - 3); + float result = 0; + // we assume elements_per_row is multple of 4 + for(int i = 0; i < 4; ++ i, -- optr, -- bptr) + { + if(i < elements_per_row - cols) + { + *optr = result; + } + else + { + result = *optr + *bptr * result; + *optr = result; + } + } + res_v4.x = result; + optr -= 3; + bptr -= 3; + for(int i = 0; i < elements_per_row / 4 - 1; ++i, optr -= 4, bptr -= 4) + { + buf_v4 = vload4(0, bptr); + out_v4 = vload4(0, optr); res_v4.w = out_v4.w + buf_v4.w * res_v4.x; res_v4.z = out_v4.z + buf_v4.z * res_v4.w; res_v4.y = out_v4.y + buf_v4.y * res_v4.z; res_v4.x = out_v4.x + buf_v4.x * res_v4.y; - vstore4(res_v4, 0, optr - 3); + vstore4(res_v4, 0, optr); } } diff --git a/modules/bioinspired/src/retina_ocl.cpp b/modules/bioinspired/src/retina_ocl.cpp index 8f1f2694ef..ca94bc8fb6 100644 --- a/modules/bioinspired/src/retina_ocl.cpp +++ b/modules/bioinspired/src/retina_ocl.cpp @@ -1149,7 +1149,7 @@ void RetinaColor::_initColorSampling() // computing photoreceptors local density MAKE_OCLMAT_SLICES(_RGBmosaic, 3); MAKE_OCLMAT_SLICES(_colorLocalDensity, 3); - + _colorLocalDensity.setTo(0); _spatiotemporalLPfilter(_RGBmosaic_slices[0], _colorLocalDensity_slices[0]); _spatiotemporalLPfilter(_RGBmosaic_slices[1], _colorLocalDensity_slices[1]); _spatiotemporalLPfilter(_RGBmosaic_slices[2], _colorLocalDensity_slices[2]); diff --git a/modules/bioinspired/test/test_retina_ocl.cpp b/modules/bioinspired/test/test_retina_ocl.cpp index a732d7e37e..b09ce50366 100644 --- a/modules/bioinspired/test/test_retina_ocl.cpp +++ b/modules/bioinspired/test/test_retina_ocl.cpp @@ -49,7 +49,7 @@ #include "opencv2/imgproc.hpp" #include "opencv2/highgui.hpp" -#if defined(HAVE_OPENCV_OCL) && defined(HAVE_OPENCL) +#if defined(HAVE_OPENCV_OCL) #include "opencv2/ocl.hpp" #define RETINA_ITERATIONS 5 -- GitLab