diff --git a/modules/imgproc/src/opencl/match_template.cl b/modules/imgproc/src/opencl/match_template.cl index 184fcfbb10dfef34f082cb0c3702418cdfe8355d..1919e8edd26d622877bbc3cb3e7ee2219f662b5d 100644 --- a/modules/imgproc/src/opencl/match_template.cl +++ b/modules/imgproc/src/opencl/match_template.cl @@ -29,9 +29,13 @@ // or tort (including negligence or otherwise) arising in any way out of // the use of this software, even if advised of the possibility of such damage. -#define DATA_SIZE ((int)sizeof(type)) -#define ELEM_TYPE elem_type -#define ELEM_SIZE ((int)sizeof(elem_type)) +#if cn != 3 +#define loadpix(addr) *(__global const T *)(addr) +#define TSIZE (int)sizeof(T) +#else +#define loadpix(addr) vload3(0, (__global const T1 *)(addr)) +#define TSIZE ((int)sizeof(T1)*3) +#endif #define SQSUMS_PTR(ox, oy) mad24(y + oy, src_sqsums_step, mad24(x + ox, cn, src_sqsums_offset)) #define SUMS_PTR(ox, oy) mad24(y + oy, src_sums_step, mad24(x + ox, cn, src_sums_offset)) @@ -70,14 +74,6 @@ inline float normAcc_SQDIFF(float num, float denum) #error "cn should be 1-4" #endif -#if cn != 3 -#define loadpix(addr) *(__global const T *)(addr) -#define TSIZE (int)sizeof(T) -#else -#define loadpix(addr) vload3(0, (__global const T1 *)(addr)) -#define TSIZE ((int)sizeof(T1)*3) -#endif - #ifdef CALC_SUM __kernel void calcSum(__global const uchar * srcptr, int src_step, int src_offset, @@ -123,37 +119,102 @@ __kernel void calcSum(__global const uchar * srcptr, int src_step, int src_offse dst[0] = convertToDT(localmem[0]); } +#elif defined FIRST_CHANNEL + +__kernel void extractFirstChannel( const __global uchar* img, int img_step, int img_offset, + __global uchar* res, int res_step, int res_offset, int rows, int cols) +{ + int x = get_global_id(0); + int y = get_global_id(1)*PIX_PER_WI_Y; + + if(x < cols ) + { + #pragma unroll + for (int cy=0; cy < PIX_PER_WI_Y && y < rows; ++cy, ++y) + { + T1 image = *(__global const T1*)(img + mad24(y, img_step, mad24(x, (int)sizeof(T1)*cn, img_offset)));; + int res_idx = mad24(y, res_step, mad24(x, (int)sizeof(float), res_offset)); + *(__global float *)(res + res_idx) = image; + } + } +} + #elif defined CCORR -#if cn==3 +#if cn==1 && PIX_PER_WI_X==4 __kernel void matchTemplate_Naive_CCORR(__global const uchar * srcptr, int src_step, int src_offset, __global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols) { - int x = get_global_id(0); + int x0 = get_global_id(0)*PIX_PER_WI_X; int y = get_global_id(1); - if (x < dst_cols && y < dst_rows) + if (y < dst_rows) { - WT sum = (WT)(0); - - for (int i = 0; i < template_rows; ++i) + if (x0 + PIX_PER_WI_X <= dst_cols) { - for (int j = 0; j < template_cols; ++j) + WT sum = (WT)(0); + + int ind = mad24(y, src_step, mad24(x0, (int)sizeof(T1), src_offset)); + __global const T1 * template = (__global const T1*)(templateptr + template_offset); + + for (int i = 0; i < template_rows; ++i) { - T src = vload3(0, (__global const T1 *)(srcptr + mad24(y+i, src_step, mad24(x+j, (int)sizeof(T1)*cn, src_offset)))); - T template = vload3(0, (__global const T1 *)(templateptr + mad24(i, template_step, mad24(j, (int)sizeof(T1)*cn, template_offset)))); + for (int j = 0; j < template_cols; ++j) + { + T temp = (T)(template[j]); + T src = *(__global const T*)(srcptr + ind + j*(int)sizeof(T1)); #if wdepth == 4 - sum = mad24(convertToWT(src), convertToWT(template), sum); + sum = mad24(convertToWT(src), convertToWT(temp), sum); #else - sum = mad(convertToWT(src), convertToWT(template), sum); + sum = mad(convertToWT(src), convertToWT(temp), sum); #endif + } + ind += src_step; + template = (__global const T1 *)((__global const uchar *)template + template_step); } + + T temp = (T)(template[0]); + int dst_idx = mad24(y, dst_step, mad24(x0, (int)sizeof(float), dst_offset)); + *(__global float4 *)(dst + dst_idx) = convert_float4(sum); } + else + { + WT1 sum [PIX_PER_WI_X]; + #pragma unroll + for (int i=0; i < PIX_PER_WI_X; i++) sum[i] = 0; - int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); - *(__global float *)(dst + dst_idx) = convertToDT(sum); + __global const T1 * src = (__global const T1 *)(srcptr + mad24(y, src_step, mad24(x0, (int)sizeof(T1), src_offset))); + __global const T1 * template = (__global const T1 *)(templateptr + template_offset); + + for (int i = 0; i < template_rows; ++i) + { + for (int j = 0; j < template_cols; ++j) + { + #pragma unroll + for (int cx=0, x = x0; cx < PIX_PER_WI_X && x < dst_cols; ++cx, ++x) + { + +#if wdepth == 4 + sum[cx] = mad24(convertToWT1(src[j+cx]), convertToWT1(template[j]), sum[cx]); +#else + sum[cx] = mad(convertToWT1(src[j+cx]), convertToWT1(template[j]), sum[cx]); +#endif + } + } + + src = (__global const T1 *)((__global const uchar *)src + src_step); + template = (__global const T1 *)((__global const uchar *)template + template_step); + } + + #pragma unroll + for (int cx=0; cx < PIX_PER_WI_X && x0 < dst_cols; ++cx, ++x0) + { + int dst_idx = mad24(y, dst_step, mad24(x0, (int)sizeof(float), dst_offset)); + *(__global float *)(dst + dst_idx) = convertToDT(sum[cx]); + } + } } } @@ -170,20 +231,18 @@ __kernel void matchTemplate_Naive_CCORR(__global const uchar * srcptr, int src_s { WT sum = (WT)(0); - __global const T * src = (__global const T *)(srcptr + mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset))); - __global const T * template = (__global const T *)(templateptr + template_offset); - for (int i = 0; i < template_rows; ++i) { for (int j = 0; j < template_cols; ++j) + { + T src = loadpix(srcptr + mad24(y+i, src_step, mad24(x+j, TSIZE, src_offset))); + T template = loadpix(templateptr + mad24(i, template_step, mad24(j, TSIZE, template_offset))); #if wdepth == 4 - sum = mad24(convertToWT(src[j]), convertToWT(template[j]), sum); + sum = mad24(convertToWT(src), convertToWT(template), sum); #else - sum = mad(convertToWT(src[j]), convertToWT(template[j]), sum); + sum = mad(convertToWT(src), convertToWT(template), sum); #endif - - src = (__global const T *)((__global const uchar *)src + src_step); - template = (__global const T *)((__global const uchar *)template + template_step); + } } int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); @@ -218,8 +277,6 @@ __kernel void matchTemplate_CCORR_NORMED(__global const uchar * src_sqsums, int #elif defined SQDIFF -#if cn==3 - __kernel void matchTemplate_Naive_SQDIFF(__global const uchar * srcptr, int src_step, int src_offset, __global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols) @@ -235,8 +292,8 @@ __kernel void matchTemplate_Naive_SQDIFF(__global const uchar * srcptr, int src_ { for (int j = 0; j < template_cols; ++j) { - T src = vload3(0, (__global const T1 *)(srcptr + mad24(y+i, src_step, mad24(x+j, (int)sizeof(T1)*cn, src_offset)))); - T template = vload3(0, (__global const T1 *)(templateptr + mad24(i, template_step, mad24(j, (int)sizeof(T1)*cn, template_offset)))); + T src = loadpix(srcptr + mad24(y+i, src_step, mad24(x+j, TSIZE, src_offset))); + T template = loadpix(templateptr + mad24(i, template_step, mad24(j, TSIZE, template_offset))); value = convertToWT(src) - convertToWT(template); #if wdepth == 4 @@ -252,45 +309,32 @@ __kernel void matchTemplate_Naive_SQDIFF(__global const uchar * srcptr, int src_ } } -#else +#elif defined SQDIFF_PREPARED -__kernel void matchTemplate_Naive_SQDIFF(__global const uchar * srcptr, int src_step, int src_offset, - __global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols, - __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols) +__kernel void matchTemplate_Prepared_SQDIFF(__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset, + __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, + int template_rows, int template_cols, __global const float * template_sqsum) { int x = get_global_id(0); int y = get_global_id(1); if (x < dst_cols && y < dst_rows) { - __global const T * src = (__global const T *)(srcptr + mad24(y, src_step, mad24(x, (int)sizeof(T), src_offset))); - __global const T * template = (__global const T *)(templateptr + template_offset); - - WT sum = (WT)(0), value; - - for (int i = 0; i < template_rows; ++i) - { - for (int j = 0; j < template_cols; ++j) - { - value = convertToWT(src[j]) - convertToWT(template[j]); -#if wdepth == 4 - sum = mad24(value, value, sum); -#else - sum = mad(value, value, sum); -#endif - } + src_sqsums_step /= sizeof(float); + src_sqsums_offset /= sizeof(float); - src = (__global const T *)((__global const uchar *)src + src_step); - template = (__global const T *)((__global const uchar *)template + template_step); - } + __global const float * sqsum = (__global const float *)(src_sqsums); + float image_sqsum_ = (float)( + (sqsum[SQSUMS_PTR(template_cols, template_rows)] - sqsum[SQSUMS_PTR(template_cols, 0)]) - + (sqsum[SQSUMS_PTR(0, template_rows)] - sqsum[SQSUMS_PTR(0, 0)])); + float template_sqsum_value = template_sqsum[0]; int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); - *(__global float *)(dst + dst_idx) = convertToDT(sum); + __global float * dstult = (__global float *)(dst + dst_idx); + *dstult = image_sqsum_ - 2.0f * dstult[0] + template_sqsum_value; } } -#endif - #elif defined SQDIFF_NORMED __kernel void matchTemplate_SQDIFF_NORMED(__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset, @@ -330,42 +374,18 @@ __kernel void matchTemplate_Prepared_CCOEFF(__global const uchar * src_sums, int if (x < dst_cols && y < dst_rows) { - __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(src_sums); - - src_sums_step /= ELEM_SIZE; - src_sums_offset /= ELEM_SIZE; - float image_sum_ = (float)((sum[SUMS_PTR(template_cols, template_rows)] - sum[SUMS_PTR(template_cols, 0)])- - (sum[SUMS_PTR(0, template_rows)] - sum[SUMS_PTR(0, 0)])) * template_sum; - - int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); - __global float * dstult = (__global float *)(dst + dst_idx); - *dstult -= image_sum_; - } -} - -#elif cn == 2 - -__kernel void matchTemplate_Prepared_CCOEFF(__global const uchar * src_sums, int src_sums_step, int src_sums_offset, - __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, - int template_rows, int template_cols, float template_sum_0, float template_sum_1) -{ - int x = get_global_id(0); - int y = get_global_id(1); + __global const T* sum = (__global const T*)(src_sums + mad24(y, src_sums_step, mad24(x, (int)sizeof(T), src_sums_offset))); - if (x < dst_cols && y < dst_rows) - { - src_sums_step /= ELEM_SIZE; - src_sums_offset /= ELEM_SIZE; + int step = src_sums_step/(int)sizeof(T); - __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(src_sums); + T image_sum = (T)(0), value; - float image_sum_ = template_sum_0 * (float)((sum[SUMS_PTR(template_cols, template_rows)] - sum[SUMS_PTR(template_cols, 0)]) -(sum[SUMS_PTR(0, template_rows)] - sum[SUMS_PTR(0, 0)])); - image_sum_ += template_sum_1 * (float)((sum[SUMS_PTR(template_cols, template_rows)+1] - sum[SUMS_PTR(template_cols, 0)+1])-(sum[SUMS_PTR(0, template_rows)+1] - sum[SUMS_PTR(0, 0)+1])); + value = (T)(sum[mad24(template_rows, step, template_cols)] - sum[mad24(template_rows, step, 0)] - sum[template_cols] + sum[0]); + image_sum = mad(value, template_sum , image_sum); int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); - __global float * dstult = (__global float *)(dst+dst_idx); - *dstult -= image_sum_; + *(__global float *)(dst + dst_idx) -= convertToDT(image_sum); } } @@ -373,62 +393,61 @@ __kernel void matchTemplate_Prepared_CCOEFF(__global const uchar * src_sums, int __kernel void matchTemplate_Prepared_CCOEFF(__global const uchar * src_sums, int src_sums_step, int src_sums_offset, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, - int template_rows, int template_cols, float template_sum_0, float template_sum_1, float template_sum_2) + int template_rows, int template_cols, float4 template_sum) { int x = get_global_id(0); int y = get_global_id(1); if (x < dst_cols && y < dst_rows) { - src_sums_step /= ELEM_SIZE; - src_sums_offset /= ELEM_SIZE; + T image_sum = (T)(0), value, temp_sum; - __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(src_sums); + temp_sum.x = template_sum.x; + temp_sum.y = template_sum.y; + temp_sum.z = template_sum.z; - int c_r = SUMS_PTR(template_cols, template_rows); - int c_o = SUMS_PTR(template_cols, 0); - int o_r = SUMS_PTR(0,template_rows); - int oo = SUMS_PTR(0, 0); + value = vload3(0, (__global const T1 *)(src_sums + SUMS(template_cols, template_rows))); + value -= vload3(0, (__global const T1 *)(src_sums + SUMS(0, template_rows))); + value -= vload3(0, (__global const T1 *)(src_sums + SUMS(template_cols, 0))); + value += vload3(0, (__global const T1 *)(src_sums + SUMS(0, 0))); - float image_sum_ = template_sum_0 * (float)((sum[c_r] - sum[c_o]) -(sum[o_r] - sum[oo])); - image_sum_ += template_sum_1 * (float)((sum[c_r+1] - sum[c_o+1])-(sum[o_r+1] - sum[oo+1])); - image_sum_ += template_sum_2 * (float)((sum[c_r+2] - sum[c_o+2])-(sum[o_r+2] - sum[oo+2])); + image_sum = mad(value, temp_sum , 0); int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); - __global float * dstult = (__global float *)(dst+dst_idx); - *dstult -= image_sum_; + *(__global float *)(dst + dst_idx) -= convertToDT(image_sum); } } -#elif cn == 4 +#elif (cn==2 || cn==4) __kernel void matchTemplate_Prepared_CCOEFF(__global const uchar * src_sums, int src_sums_step, int src_sums_offset, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, - int template_rows, int template_cols, float template_sum_0, float template_sum_1, float template_sum_2, float template_sum_3) + int template_rows, int template_cols, float4 template_sum) { int x = get_global_id(0); int y = get_global_id(1); if (x < dst_cols && y < dst_rows) { - src_sums_step /= ELEM_SIZE; - src_sums_offset /= ELEM_SIZE; + __global const T* sum = (__global const T*)(src_sums + mad24(y, src_sums_step, mad24(x, (int)sizeof(T), src_sums_offset))); + + int step = src_sums_step/(int)sizeof(T); + + T image_sum = (T)(0), value, temp_sum; - __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(src_sums); +#if cn==2 + temp_sum.x = template_sum.x; + temp_sum.y = template_sum.y; +#else + temp_sum = template_sum; +#endif - int c_r = SUMS_PTR(template_cols, template_rows); - int c_o = SUMS_PTR(template_cols, 0); - int o_r = SUMS_PTR(0,template_rows); - int oo = SUMS_PTR(0, 0); + value = (sum[mad24(template_rows, step, template_cols)] - sum[mad24(template_rows, step, 0)] - sum[template_cols] + sum[0]); - float image_sum_ = template_sum_0 * (float)((sum[c_r] - sum[c_o]) -(sum[o_r] - sum[oo])); - image_sum_ += template_sum_1 * (float)((sum[c_r+1] - sum[c_o+1])-(sum[o_r+1] - sum[oo+1])); - image_sum_ += template_sum_2 * (float)((sum[c_r+2] - sum[c_o+2])-(sum[o_r+2] - sum[oo+2])); - image_sum_ += template_sum_3 * (float)((sum[c_r+3] - sum[c_o+3])-(sum[o_r+3] - sum[oo+3])); + image_sum = mad(value, temp_sum , image_sum); int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); - __global float * dstult = (__global float *)(dst+dst_idx); - *dstult -= image_sum_; + *(__global float *)(dst + dst_idx) -= convertToDT(image_sum); } } @@ -448,62 +467,24 @@ __kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int s int x = get_global_id(0); int y = get_global_id(1); - if (x < dst_cols && y < dst_rows) - { - src_sums_offset /= ELEM_SIZE; - src_sums_step /= ELEM_SIZE; - src_sqsums_step /= sizeof(float); - src_sqsums_offset /= sizeof(float); - - __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(src_sums); - __global float * sqsum = (__global float*)(src_sqsums); - - float image_sum_ = (float)((sum[SUMS_PTR(t_cols, t_rows)] - sum[SUMS_PTR(t_cols, 0)]) - - (sum[SUMS_PTR(0, t_rows)] - sum[SUMS_PTR(0, 0)])); - - float image_sqsum_ = (float)((sqsum[SQSUMS_PTR(t_cols, t_rows)] - sqsum[SQSUMS_PTR(t_cols, 0)]) - - (sqsum[SQSUMS_PTR(0, t_rows)] - sqsum[SQSUMS_PTR(0, 0)])); - - int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); - __global float * dstult = (__global float *)(dst+dst_idx); - *dstult = normAcc((*dstult) - image_sum_ * template_sum, - sqrt(template_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_))); - } -} - -#elif cn == 2 - -__kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int src_sums_step, int src_sums_offset, - __global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset, - __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, - int t_rows, int t_cols, float weight, float template_sum_0, float template_sum_1, float template_sqsum) -{ - int x = get_global_id(0); - int y = get_global_id(1); - float sum_[2]; float sqsum_[2]; + if (x < dst_cols && y < dst_rows) { - src_sums_offset /= ELEM_SIZE; - src_sums_step /= ELEM_SIZE; - src_sqsums_step /= sizeof(float); - src_sqsums_offset /= sizeof(float); - - __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(src_sums); - __global float * sqsum = (__global float*)(src_sqsums); + int step = src_sums_step/(int)sizeof(T); - sum_[0] = (float)((sum[SUMS_PTR(t_cols, t_rows)] - sum[SUMS_PTR(t_cols, 0)])-(sum[SUMS_PTR(0, t_rows)] - sum[SUMS_PTR(0, 0)])); - sum_[1] = (float)((sum[SUMS_PTR(t_cols, t_rows)+1] - sum[SUMS_PTR(t_cols, 0)+1])-(sum[SUMS_PTR(0, t_rows)+1] - sum[SUMS_PTR(0, 0)+1])); + __global const T* sum = (__global const T*)(src_sums + mad24(y, src_sums_step, mad24(x, (int)sizeof(T), src_sums_offset))); + __global const T* sqsum = (__global const T*)(src_sqsums + mad24(y, src_sqsums_step, mad24(x, (int)sizeof(T), src_sqsums_offset))); - sqsum_[0] = (float)((sqsum[SQSUMS_PTR(t_cols, t_rows)] - sqsum[SQSUMS_PTR(t_cols, 0)])-(sqsum[SQSUMS_PTR(0, t_rows)] - sqsum[SQSUMS_PTR(0, 0)])); - sqsum_[1] = (float)((sqsum[SQSUMS_PTR(t_cols, t_rows)+1] - sqsum[SQSUMS_PTR(t_cols, 0)+1])-(sqsum[SQSUMS_PTR(0, t_rows)+1] - sqsum[SQSUMS_PTR(0, 0)+1])); + T value_sum = sum[mad24(t_rows, step, t_cols)] - sum[mad24(t_rows, step, 0)] - sum[t_cols] + sum[0]; + T value_sqsum = sqsum[mad24(t_rows, step, t_cols)] - sqsum[mad24(t_rows, step, 0)] - sqsum[t_cols] + sqsum[0]; - float num = sum_[0]*template_sum_0 + sum_[1]*template_sum_1; + float num = convertToDT(mad(value_sum, template_sum, 0)); - float denum = sqrt( template_sqsum * (sqsum_[0] - weight * sum_[0]* sum_[0] + - sqsum_[1] - weight * sum_[1]* sum_[1])); + value_sqsum -= weight * value_sum * value_sum; + float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), 0)); int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); __global float * dstult = (__global float *)(dst+dst_idx); @@ -516,49 +497,35 @@ __kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int s __kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int src_sums_step, int src_sums_offset, __global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, - int t_rows, int t_cols, float weight, float template_sum_0, float template_sum_1, float template_sum_2, - float template_sqsum) + int t_rows, int t_cols, float weight, float4 template_sum, float template_sqsum) { int x = get_global_id(0); int y = get_global_id(1); - float sum_[3]; - float sqsum_[3]; - if (x < dst_cols && y < dst_rows) { - src_sums_offset /= ELEM_SIZE; - src_sums_step /= ELEM_SIZE; - src_sqsums_step /= sizeof(float); - src_sqsums_offset /= sizeof(float); - - __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(src_sums); - __global float * sqsum = (__global float*)(src_sqsums); + int step = src_sums_step/(int)sizeof(T); - int c_r = SUMS_PTR(t_cols, t_rows); - int c_o = SUMS_PTR(t_cols, 0); - int o_r = SUMS_PTR(0, t_rows); - int o_o = SUMS_PTR(0, 0); + T temp_sum, value_sum, value_sqsum; - sum_[0] = (float)((sum[c_r] - sum[c_o]) -(sum[o_r] - sum[o_o ])); - sum_[1] = (float)((sum[c_r+1] - sum[c_o+1])-(sum[o_r+1] - sum[o_o +1])); - sum_[2] = (float)((sum[c_r+2] - sum[c_o+2])-(sum[o_r+2] - sum[o_o +2])); + temp_sum.x = template_sum.x; + temp_sum.y = template_sum.y; + temp_sum.z = template_sum.z; - c_r = SQSUMS_PTR(t_cols, t_rows); - c_o = SQSUMS_PTR(t_cols, 0); - o_r = SQSUMS_PTR(0, t_rows); - o_o = SQSUMS_PTR(0, 0); + value_sum = vload3(0, (__global const T1 *)(src_sums + SUMS(t_cols, t_rows))); + value_sum -= vload3(0, (__global const T1 *)(src_sums + SUMS(0, t_rows))); + value_sum -= vload3(0, (__global const T1 *)(src_sums + SUMS(t_cols, 0))); + value_sum += vload3(0, (__global const T1 *)(src_sums + SUMS(0, 0))); - sqsum_[0] = (float)((sqsum[c_r] - sqsum[c_o]) -(sqsum[o_r] - sqsum[o_o])); - sqsum_[1] = (float)((sqsum[c_r+1] - sqsum[c_o+1])-(sqsum[o_r+1] - sqsum[o_o+1])); - sqsum_[2] = (float)((sqsum[c_r+2] - sqsum[c_o+2])-(sqsum[o_r+2] - sqsum[o_o+2])); + value_sqsum = vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(t_cols, t_rows))); + value_sqsum -= vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(0, t_rows))); + value_sqsum -= vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(t_cols, 0))); + value_sqsum += vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(0, 0))); - float num = sum_[0]*template_sum_0 + sum_[1]*template_sum_1 + sum_[2]*template_sum_2; + float num = convertToDT(mad(value_sum, temp_sum, 0)); - float denum = sqrt( template_sqsum * ( - sqsum_[0] - weight * sum_[0]* sum_[0] + - sqsum_[1] - weight * sum_[1]* sum_[1] + - sqsum_[2] - weight * sum_[2]* sum_[2] )); + value_sqsum -= weight * value_sum * value_sum; + float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), 0)); int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); __global float * dstult = (__global float *)(dst+dst_idx); @@ -566,58 +533,39 @@ __kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int s } } -#elif cn == 4 +#elif (cn==2 || cn==4) __kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int src_sums_step, int src_sums_offset, __global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, - int t_rows, int t_cols, float weight, - float template_sum_0, float template_sum_1, float template_sum_2, float template_sum_3, - float template_sqsum) + int t_rows, int t_cols, float weight, float4 template_sum, float template_sqsum) { int x = get_global_id(0); int y = get_global_id(1); - float sum_[4]; - float sqsum_[4]; - if (x < dst_cols && y < dst_rows) { - src_sums_offset /= ELEM_SIZE; - src_sums_step /= ELEM_SIZE; - src_sqsums_step /= sizeof(float); - src_sqsums_offset /= sizeof(float); - - __global ELEM_TYPE* sum = (__global ELEM_TYPE*)(src_sums); - __global float * sqsum = (__global float*)(src_sqsums); + int step = src_sums_step/(int)sizeof(T); - int c_r = SUMS_PTR(t_cols, t_rows); - int c_o = SUMS_PTR(t_cols, 0); - int o_r = SUMS_PTR(0, t_rows); - int o_o = SUMS_PTR(0, 0); + T temp_sum; - sum_[0] = (float)((sum[c_r] - sum[c_o]) -(sum[o_r] - sum[o_o ])); - sum_[1] = (float)((sum[c_r+1] - sum[c_o+1])-(sum[o_r+1] - sum[o_o +1])); - sum_[2] = (float)((sum[c_r+2] - sum[c_o+2])-(sum[o_r+2] - sum[o_o +2])); - sum_[3] = (float)((sum[c_r+3] - sum[c_o+3])-(sum[o_r+3] - sum[o_o +3])); + __global const T* sum = (__global const T*)(src_sums + mad24(y, src_sums_step, mad24(x, (int)sizeof(T), src_sums_offset))); + __global const T* sqsum = (__global const T*)(src_sqsums + mad24(y, src_sqsums_step, mad24(x, (int)sizeof(T), src_sqsums_offset))); - c_r = SQSUMS_PTR(t_cols, t_rows); - c_o = SQSUMS_PTR(t_cols, 0); - o_r = SQSUMS_PTR(0, t_rows); - o_o = SQSUMS_PTR(0, 0); + T value_sum = sum[mad24(t_rows, step, t_cols)] - sum[mad24(t_rows, step, 0)] - sum[t_cols] + sum[0]; + T value_sqsum = sqsum[mad24(t_rows, step, t_cols)] - sqsum[mad24(t_rows, step, 0)] - sqsum[t_cols] + sqsum[0]; - sqsum_[0] = (float)((sqsum[c_r] - sqsum[c_o]) -(sqsum[o_r] - sqsum[o_o])); - sqsum_[1] = (float)((sqsum[c_r+1] - sqsum[c_o+1])-(sqsum[o_r+1] - sqsum[o_o+1])); - sqsum_[2] = (float)((sqsum[c_r+2] - sqsum[c_o+2])-(sqsum[o_r+2] - sqsum[o_o+2])); - sqsum_[3] = (float)((sqsum[c_r+3] - sqsum[c_o+3])-(sqsum[o_r+3] - sqsum[o_o+3])); +#if cn==2 + temp_sum.x = template_sum.x; + temp_sum.y = template_sum.y; +#else + temp_sum = template_sum; +#endif - float num = sum_[0]*template_sum_0 + sum_[1]*template_sum_1 + sum_[2]*template_sum_2 + sum_[3]*template_sum_3; + float num = convertToDT(mad(value_sum, temp_sum, 0)); - float denum = sqrt( template_sqsum * ( - sqsum_[0] - weight * sum_[0]* sum_[0] + - sqsum_[1] - weight * sum_[1]* sum_[1] + - sqsum_[2] - weight * sum_[2]* sum_[2] + - sqsum_[3] - weight * sum_[3]* sum_[3] )); + value_sqsum -= weight * value_sum * value_sum; + float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), 0)); int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); __global float * dstult = (__global float *)(dst+dst_idx); diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp index ebc4e3f4648153015459841a138d2c7ca268c5aa..164af425e35925e0c91ba3b03dfaaf6075343807 100644 --- a/modules/imgproc/src/templmatch.cpp +++ b/modules/imgproc/src/templmatch.cpp @@ -56,6 +56,26 @@ enum SUM_1 = 0, SUM_2 = 1 }; +static bool extractFirstChannel_32F(InputArray _image, OutputArray _result, int cn) +{ + int depth = _image.depth(); + + ocl::Device dev = ocl::Device::getDefault(); + int pxPerWIy = (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) ? 4 : 1; + + ocl::Kernel k("extractFirstChannel", ocl::imgproc::match_template_oclsrc, format("-D FIRST_CHANNEL -D T1=%s -D cn=%d -D PIX_PER_WI_Y=%d", + ocl::typeToStr(depth), cn, pxPerWIy)); + if (k.empty()) + return false; + + UMat image = _image.getUMat(); + UMat result = _result.getUMat(); + + + size_t globalsize[2] = {result.cols, (result.rows+pxPerWIy-1)/pxPerWIy}; + return k.args(ocl::KernelArg::ReadOnlyNoSize(image), ocl::KernelArg::WriteOnly(result)).run( 2, globalsize, NULL, false); +} + static bool sumTemplate(InputArray _src, UMat & result) { int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); @@ -88,15 +108,181 @@ static bool sumTemplate(InputArray _src, UMat & result) return k.run(1, &globalsize, &wgs, false); } +static bool useNaive(Size size) +{ + if (!ocl::Device::getDefault().isIntel()) + return true; + + int dft_size = 18; + return size.height < dft_size && size.width < dft_size; + +} + +struct ConvolveBuf +{ + Size result_size; + Size block_size; + Size user_block_size; + Size dft_size; + + UMat image_spect, templ_spect, result_spect; + UMat image_block, templ_block, result_data; + + void create(Size image_size, Size templ_size); + static Size estimateBlockSize(Size result_size); +}; + +void ConvolveBuf::create(Size image_size, Size templ_size) +{ + result_size = Size(image_size.width - templ_size.width + 1, + image_size.height - templ_size.height + 1); + + block_size = user_block_size; + if (user_block_size.width == 0 || user_block_size.height == 0) + block_size = estimateBlockSize(result_size); + + dft_size.width = 1 << int(ceil(std::log(block_size.width + templ_size.width - 1.) / std::log(2.))); + dft_size.height = 1 << int(ceil(std::log(block_size.height + templ_size.height - 1.) / std::log(2.))); + + dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1); + dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1); + + // To avoid wasting time doing small DFTs + dft_size.width = std::max(dft_size.width, 512); + dft_size.height = std::max(dft_size.height, 512); + + image_block.create(dft_size, CV_32F); + templ_block.create(dft_size, CV_32F); + result_data.create(dft_size, CV_32F); + + image_spect.create(dft_size.height, dft_size.width / 2 + 1, CV_32FC2); + templ_spect.create(dft_size.height, dft_size.width / 2 + 1, CV_32FC2); + result_spect.create(dft_size.height, dft_size.width / 2 + 1, CV_32FC2); + + // Use maximum result matrix block size for the estimated DFT block size + block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width); + block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height); +} + +Size ConvolveBuf::estimateBlockSize(Size result_size) +{ + int width = (result_size.width + 2) / 3; + int height = (result_size.height + 2) / 3; + width = std::min(width, result_size.width); + height = std::min(height, result_size.height); + return Size(width, height); +} + +static bool convolve_dft(InputArray _image, InputArray _templ, OutputArray _result) +{ + ConvolveBuf buf; + CV_Assert(_image.type() == CV_32F); + CV_Assert(_templ.type() == CV_32F); + + buf.create(_image.size(), _templ.size()); + _result.create(buf.result_size, CV_32F); + + UMat image = _image.getUMat(); + UMat templ = _templ.getUMat(); + + UMat result = _result.getUMat(); + + Size& block_size = buf.block_size; + Size& dft_size = buf.dft_size; + + UMat& image_block = buf.image_block; + UMat& templ_block = buf.templ_block; + UMat& result_data = buf.result_data; + + UMat& image_spect = buf.image_spect; + UMat& templ_spect = buf.templ_spect; + UMat& result_spect = buf.result_spect; + + UMat templ_roi = templ; + copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0, + templ_block.cols - templ_roi.cols, BORDER_ISOLATED); + + dft(templ_block, templ_spect, 0); + + // Process all blocks of the result matrix + for (int y = 0; y < result.rows; y += block_size.height) + { + for (int x = 0; x < result.cols; x += block_size.width) + { + Size image_roi_size(std::min(x + dft_size.width, image.cols) - x, + std::min(y + dft_size.height, image.rows) - y); + Rect roi0(x, y, image_roi_size.width, image_roi_size.height); + + UMat image_roi(image, roi0); + + copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows, + 0, image_block.cols - image_roi.cols, BORDER_ISOLATED); + + dft(image_block, image_spect, 0); + + mulSpectrums(image_spect, templ_spect, result_spect, 0, true); + + dft(result_spect, result_data, cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT | cv::DFT_SCALE); + + Size result_roi_size(std::min(x + block_size.width, result.cols) - x, + std::min(y + block_size.height, result.rows) - y); + + Rect roi1(x, y, result_roi_size.width, result_roi_size.height); + Rect roi2(0, 0, result_roi_size.width, result_roi_size.height); + + UMat result_roi(result, roi1); + UMat result_block(result_data, roi2); + + result_block.copyTo(result_roi); + } + } + return true; +} + +static bool convolve_32F(InputArray _image, InputArray _templ, OutputArray _result) +{ + _result.create(_image.rows() - _templ.rows() + 1, _image.cols() - _templ.cols() + 1, CV_32F); + + if (_image.channels() == 1) + return(convolve_dft(_image, _templ, _result)); + else + { + UMat image = _image.getUMat(); + UMat templ = _templ.getUMat(); + UMat result_(image.rows-templ.rows+1,(image.cols-templ.cols+1)*image.channels(), CV_32F); + bool ok = convolve_dft(image.reshape(1), templ.reshape(1), result_); + if (ok==false) + return false; + UMat result = _result.getUMat(); + return (extractFirstChannel_32F(result_, _result, _image.channels())); + } +} + static bool matchTemplateNaive_CCORR(InputArray _image, InputArray _templ, OutputArray _result) { int type = _image.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - int wdepth = std::max(depth, CV_32S), wtype = CV_MAKE_TYPE(wdepth, cn); + int wdepth = CV_32F, wtype = CV_MAKE_TYPE(wdepth, cn); + + ocl::Device dev = ocl::Device::getDefault(); + int pxPerWIx = (cn==1 && dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) ? 4 : 1; + int rated_cn = cn; + int wtype1 = wtype; + + if (pxPerWIx!=1) + { + rated_cn = pxPerWIx; + type = CV_MAKE_TYPE(depth, rated_cn); + wtype1 = CV_MAKE_TYPE(wdepth, rated_cn); + } char cvt[40]; + char cvt1[40]; + const char* convertToWT1 = ocl::convertTypeStr(depth, wdepth, cn, cvt); + const char* convertToWT = ocl::convertTypeStr(depth, wdepth, rated_cn, cvt1); + ocl::Kernel k("matchTemplate_Naive_CCORR", ocl::imgproc::match_template_oclsrc, - format("-D CCORR -D T=%s -D T1=%s -D WT=%s -D convertToWT=%s -D cn=%d -D wdepth=%d", ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), - ocl::convertTypeStr(depth, wdepth, cn, cvt), cn, wdepth)); + format("-D CCORR -D T=%s -D T1=%s -D WT=%s -D WT1=%s -D convertToWT=%s -D convertToWT1=%s -D cn=%d -D wdepth=%d -D PIX_PER_WI_X=%d", ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype1), ocl::typeToStr(wtype), + convertToWT, convertToWT1, cn, wdepth, pxPerWIx)); if (k.empty()) return false; @@ -107,10 +293,33 @@ static bool matchTemplateNaive_CCORR(InputArray _image, InputArray _templ, Outpu k.args(ocl::KernelArg::ReadOnlyNoSize(image), ocl::KernelArg::ReadOnly(templ), ocl::KernelArg::WriteOnly(result)); - size_t globalsize[2] = { result.cols, result.rows }; + size_t globalsize[2] = { (result.cols+pxPerWIx-1)/pxPerWIx, result.rows}; return k.run(2, globalsize, NULL, false); } + +static bool matchTemplate_CCORR(InputArray _image, InputArray _templ, OutputArray _result) +{ + if (useNaive(_templ.size())) + return( matchTemplateNaive_CCORR(_image, _templ, _result)); + else + { + if(_image.depth() == CV_8U) + { + UMat imagef, templf; + UMat image = _image.getUMat(); + UMat templ = _templ.getUMat(); + image.convertTo(imagef, CV_32F); + templ.convertTo(templf, CV_32F); + return(convolve_32F(imagef, templf, _result)); + } + else + { + return(convolve_32F(_image, _templ, _result)); + } + } +} + static bool matchTemplate_CCORR_NORMED(InputArray _image, InputArray _templ, OutputArray _result) { matchTemplate(_image, _templ, _result, CV_TM_CCORR); @@ -145,7 +354,7 @@ static bool matchTemplate_CCORR_NORMED(InputArray _image, InputArray _templ, Out static bool matchTemplateNaive_SQDIFF(InputArray _image, InputArray _templ, OutputArray _result) { int type = _image.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); - int wdepth = std::max(depth, CV_32S), wtype = CV_MAKE_TYPE(wdepth, cn); + int wdepth = CV_32F, wtype = CV_MAKE_TYPE(wdepth, cn); char cvt[40]; ocl::Kernel k("matchTemplate_Naive_SQDIFF", ocl::imgproc::match_template_oclsrc, @@ -165,6 +374,41 @@ static bool matchTemplateNaive_SQDIFF(InputArray _image, InputArray _templ, Outp return k.run(2, globalsize, NULL, false); } +static bool matchTemplate_SQDIFF(InputArray _image, InputArray _templ, OutputArray _result) +{ + if (useNaive(_templ.size())) + return( matchTemplateNaive_SQDIFF(_image, _templ, _result)); + else + { + matchTemplate(_image, _templ, _result, CV_TM_CCORR); + + int type = _image.type(), cn = CV_MAT_CN(type); + + ocl::Kernel k("matchTemplate_Prepared_SQDIFF", ocl::imgproc::match_template_oclsrc, + format("-D SQDIFF_PREPARED -D T=%s -D cn=%d", ocl::typeToStr(type), cn)); + if (k.empty()) + return false; + + UMat image = _image.getUMat(), templ = _templ.getUMat(); + _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); + UMat result = _result.getUMat(); + + UMat image_sums, image_sqsums; + integral(image.reshape(1), image_sums, image_sqsums, CV_32F, CV_32F); + + UMat templ_sqsum; + if (!sumTemplate(_templ, templ_sqsum)) + return false; + + k.args(ocl::KernelArg::ReadOnlyNoSize(image_sqsums), ocl::KernelArg::ReadWrite(result), + templ.rows, templ.cols, ocl::KernelArg::PtrReadOnly(templ_sqsum)); + + size_t globalsize[2] = { result.cols, result.rows }; + + return k.run(2, globalsize, NULL, false); + } +} + static bool matchTemplate_SQDIFF_NORMED(InputArray _image, InputArray _templ, OutputArray _result) { matchTemplate(_image, _templ, _result, CV_TM_CCORR); @@ -202,47 +446,31 @@ static bool matchTemplate_CCOEFF(InputArray _image, InputArray _templ, OutputArr matchTemplate(_image, _templ, _result, CV_TM_CCORR); UMat image_sums, temp; - integral(_image, temp); - - if (temp.depth() == CV_64F) - temp.convertTo(image_sums, CV_32F); - else - image_sums = temp; + integral(_image, image_sums, CV_32F); int type = image_sums.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); ocl::Kernel k("matchTemplate_Prepared_CCOEFF", ocl::imgproc::match_template_oclsrc, - format("-D CCOEFF -D T=%s -D elem_type=%s -D cn=%d", ocl::typeToStr(type), ocl::typeToStr(depth), cn)); + format("-D CCOEFF -D T=%s -D T1=%s -D cn=%d", ocl::typeToStr(type), ocl::typeToStr(depth), cn)); if (k.empty()) return false; - UMat templ = _templ.getUMat(); - Size size = _image.size(), tsize = templ.size(); - _result.create(size.height - templ.rows + 1, size.width - templ.cols + 1, CV_32F); + UMat templ = _templ.getUMat(); UMat result = _result.getUMat(); + Size tsize = templ.size(); - if (cn == 1) + if (cn==1) { float templ_sum = static_cast(sum(_templ)[0]) / tsize.area(); - k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::ReadWrite(result), - templ.rows, templ.cols, templ_sum); + k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::ReadWrite(result), templ.rows, templ.cols, templ_sum); } else { Vec4f templ_sum = Vec4f::all(0); templ_sum = sum(templ) / tsize.area(); - if (cn == 2) - k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::ReadWrite(result), templ.rows, templ.cols, - templ_sum[0], templ_sum[1]); - else if (cn==3) - k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::ReadWrite(result), templ.rows, templ.cols, - templ_sum[0], templ_sum[1], templ_sum[2]); - else - k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::ReadWrite(result), templ.rows, templ.cols, - templ_sum[0], templ_sum[1], templ_sum[2], templ_sum[3]); - } + k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::ReadWrite(result), templ.rows, templ.cols, templ_sum); } size_t globalsize[2] = { result.cols, result.rows }; return k.run(2, globalsize, NULL, false); @@ -258,7 +486,7 @@ static bool matchTemplate_CCOEFF_NORMED(InputArray _image, InputArray _templ, Ou int type = image_sums.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); ocl::Kernel k("matchTemplate_CCOEFF_NORMED", ocl::imgproc::match_template_oclsrc, - format("-D CCOEFF_NORMED -D type=%s -D elem_type=%s -D cn=%d", ocl::typeToStr(type), ocl::typeToStr(depth), cn)); + format("-D CCOEFF_NORMED -D T=%s -D T1=%s -D cn=%d", ocl::typeToStr(type), ocl::typeToStr(depth), cn)); if (k.empty()) return false; @@ -308,19 +536,9 @@ static bool matchTemplate_CCOEFF_NORMED(InputArray _image, InputArray _templ, Ou return true; } - if (cn == 2) - k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::ReadOnlyNoSize(image_sqsums), - ocl::KernelArg::ReadWrite(result), templ.rows, templ.cols, scale, - templ_sum[0], templ_sum[1], templ_sqsum_sum); - else if (cn == 3) - k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::ReadOnlyNoSize(image_sqsums), - ocl::KernelArg::ReadWrite(result), templ.rows, templ.cols, scale, - templ_sum[0], templ_sum[1], templ_sum[2], templ_sqsum_sum); - else - k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::ReadOnlyNoSize(image_sqsums), + k.args(ocl::KernelArg::ReadOnlyNoSize(image_sums), ocl::KernelArg::ReadOnlyNoSize(image_sqsums), ocl::KernelArg::ReadWrite(result), templ.rows, templ.cols, scale, - templ_sum[0], templ_sum[1], templ_sum[2], templ_sum[3], templ_sqsum_sum); - } + templ_sum, templ_sqsum_sum); } size_t globalsize[2] = { result.cols, result.rows }; return k.run(2, globalsize, NULL, false); @@ -339,7 +557,7 @@ static bool ocl_matchTemplate( InputArray _img, InputArray _templ, OutputArray _ static const Caller callers[] = { - matchTemplateNaive_SQDIFF, matchTemplate_SQDIFF_NORMED, matchTemplateNaive_CCORR, + matchTemplate_SQDIFF, matchTemplate_SQDIFF_NORMED, matchTemplate_CCORR, matchTemplate_CCORR_NORMED, matchTemplate_CCOEFF, matchTemplate_CCOEFF_NORMED }; const Caller caller = callers[method];