From 324fa26848f1020d125bd45e1fa5459c07fb092a Mon Sep 17 00:00:00 2001 From: Erik Karlsson Date: Fri, 6 Mar 2015 19:07:13 +0100 Subject: [PATCH] Refactoring of OpenCL implementation --- modules/photo/src/denoising.cpp | 6 ++- .../src/fast_nlmeans_denoising_opencl.hpp | 38 ++++++++++++------- modules/photo/src/opencl/nlmeans.cl | 31 ++++++++------- 3 files changed, 44 insertions(+), 31 deletions(-) diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp index 29899f7910..30f638d4c8 100644 --- a/modules/photo/src/denoising.cpp +++ b/modules/photo/src/denoising.cpp @@ -51,7 +51,8 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h, Size src_size = _src.size(); CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) && src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes - ocl_fastNlMeansDenoising(_src, _dst, h, templateWindowSize, searchWindowSize, false)) + ocl_fastNlMeansDenoising(_src, _dst, &h, 1, + templateWindowSize, searchWindowSize, false)) Mat src = _src.getMat(); _dst.create(src_size, src.type()); @@ -95,7 +96,8 @@ void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float h, Size src_size = _src.size(); CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) && src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes - ocl_fastNlMeansDenoising(_src, _dst, h, templateWindowSize, searchWindowSize, true)) + ocl_fastNlMeansDenoising(_src, _dst, &h, 1, + templateWindowSize, searchWindowSize, true)) Mat src = _src.getMat(); _dst.create(src_size, src.type()); diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp index 2fa11a351d..a06dc61922 100644 --- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp @@ -29,7 +29,7 @@ static int divUp(int a, int b) } template -static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT h, int cn, +static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT *h, int hn, int cn, int & almostTemplateWindowSizeSqBinShift, bool abs) { const WT maxEstimateSumValue = searchWindowSize * searchWindowSize * @@ -53,24 +53,32 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow int maxDist = abs ? std::numeric_limits::max() * cn : std::numeric_limits::max() * std::numeric_limits::max() * cn; int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1); - FT den = 1.0f / (h * h * cn); + FT den[4]; + CV_Assert(hn > 0 && hn <= 4); + for (int i=0; i 1 ? format("%d", hn).c_str() : "").c_str(), + depth == CV_8U ? ocl::convertTypeStr(CV_32S, CV_32S, hn, buf[0]) : + format("convert_long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(), depth == CV_8U ? ocl::typeToStr(CV_32SC(cn)) : - (sprintf(buf[0], "long%d", cn), buf[0]), + format("long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(), depth == CV_8U ? ocl::convertTypeStr(depth, CV_32S, cn, buf[1]) : - (sprintf(buf[1], "convert_long%d", cn), buf[1]), + format("convert_long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(), BLOCK_COLS, BLOCK_ROWS, ctaSize, templateWindowHalfWize, searchWindowHalfSize, ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn, @@ -115,13 +127,13 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, if ((depth == CV_8U && !ocl_calcAlmostDist2Weight(almostDist2Weight, searchWindowSize, templateWindowSize, - h, cn, + h, hn, cn, almostTemplateWindowSizeSqBinShift, abs)) || (depth == CV_16U && !ocl_calcAlmostDist2Weight(almostDist2Weight, searchWindowSize, templateWindowSize, - h, cn, + h, hn, cn, almostTemplateWindowSizeSqBinShift, abs))) return false; diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl index 11837a5fcd..936aed6fa8 100644 --- a/modules/photo/src/opencl/nlmeans.cl +++ b/modules/photo/src/opencl/nlmeans.cl @@ -20,9 +20,9 @@ #ifdef OP_CALC_WEIGHTS -__kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almostMaxDist, +__kernel void calcAlmostDist2Weight(__global wlut_t * almostDist2Weight, int almostMaxDist, FT almostDist2ActualDistMultiplier, int fixedPointMult, - FT den, FT WEIGHT_THRESHOLD) + w_t den, FT WEIGHT_THRESHOLD) { int almostDist = get_global_id(0); @@ -30,14 +30,13 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost { FT dist = almostDist * almostDist2ActualDistMultiplier; #ifdef ABS - int weight = convert_int_sat_rte(fixedPointMult * exp(-dist*dist * den)); + w_t w = exp((w_t)(-dist*dist) * den); #else - int weight = convert_int_sat_rte(fixedPointMult * exp(-dist * den)); + w_t w = exp((w_t)(-dist) * den); #endif - if (weight < WEIGHT_THRESHOLD * fixedPointMult) - weight = 0; - - almostDist2Weight[almostDist] = weight; + wlut_t weight = convert_wlut_t(fixedPointMult * (isnan(w) ? (w_t)1.0 : w)); + almostDist2Weight[almostDist] = + weight < WEIGHT_THRESHOLD * fixedPointMult ? (wlut_t)0 : weight; } } @@ -208,14 +207,14 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset } inline void convolveWindow(__global const uchar * src, int src_step, int src_offset, - __local int * dists, __global const int * almostDist2Weight, + __local int * dists, __global const wlut_t * almostDist2Weight, __global uchar * dst, int dst_step, int dst_offset, int y, int x, int id, __local weight_t * weights_local, __local sum_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift) { int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2; - weight_t weights = 0; - sum_t weighted_sum = (sum_t)(0); + weight_t weights = (weight_t)0; + sum_t weighted_sum = (sum_t)0; for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE) { @@ -223,10 +222,10 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off sum_t src_value = convert_sum_t(*(__global const pixel_t *)(src + src_index)); int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift; - int weight = almostDist2Weight[almostAvgDist]; + weight_t weight = convert_weight_t(almostDist2Weight[almostAvgDist]); - weights += (weight_t)weight; - weighted_sum += (sum_t)(weight) * src_value; + weights += weight; + weighted_sum += (sum_t)weight * src_value; } weights_local[id] = weights; @@ -251,13 +250,13 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off weighted_sum_local[2] + weighted_sum_local[3]; weight_t weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3]; - *(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (sum_t)(weights_local_0)); + *(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (sum_t)weights_local_0); } } __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, - __global const int * almostDist2Weight, __global uchar * buffer, + __global const wlut_t * almostDist2Weight, __global uchar * buffer, int almostTemplateWindowSizeSqBinShift) { int block_x = get_group_id(0), nblocks_x = get_num_groups(0); -- GitLab