diff --git a/modules/imgproc/perf/opencl/perf_imgwarp.cpp b/modules/imgproc/perf/opencl/perf_imgwarp.cpp index 21788260411af04110a7e66627475dea1d2fa9ce..211e2bed636846695caa9c5d0d48c4b3c8283ad5 100644 --- a/modules/imgproc/perf/opencl/perf_imgwarp.cpp +++ b/modules/imgproc/perf/opencl/perf_imgwarp.cpp @@ -172,7 +172,7 @@ typedef tuple RemapParams; typedef TestBaseWithParam RemapFixture; OCL_PERF_TEST_P(RemapFixture, Remap, - ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, InterType::all())) + ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134, InterType::all())) { const RemapParams params = GetParam(); const Size srcSize = get<0>(params); diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index a8c0a846a009ffd6d1cb0f9f1a2a64cbdc955f8d..affe5c3fdca2b10b80e4c32555cd5fb2073493ed 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -3503,7 +3503,7 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input { int cn = _src.channels(), type = _src.type(), depth = _src.depth(); - if (borderType == BORDER_TRANSPARENT || cn == 3 || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST) + if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST) || _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1) return false; @@ -3553,10 +3553,16 @@ static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, Input ocl::convertTypeStr(CV_32S, wdepth, 2, cvt[2]), ocl::typeToStr(CV_MAKE_TYPE(wdepth, 2))); } + int scalarcn = cn == 3 ? 4 : cn; + int sctype = CV_MAKETYPE(depth, scalarcn); + buildOptions += format(" -D T=%s -D T1=%s" + " -D cn=%d -D ST=%s", + ocl::typeToStr(type), ocl::typeToStr(depth), + cn, ocl::typeToStr(sctype)); ocl::Kernel k(kernelName.c_str(), ocl::imgproc::remap_oclsrc, buildOptions); - Mat scalar(1, 1, type, borderValue); + Mat scalar(1, 1, sctype, borderValue); ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst), map1arg = ocl::KernelArg::ReadOnlyNoSize(map1), scalararg = ocl::KernelArg::Constant((void*)scalar.data, scalar.elemSize()); diff --git a/modules/imgproc/src/opencl/remap.cl b/modules/imgproc/src/opencl/remap.cl index aaa9dc371b40c934e5f90865cb3c1f74b554caff..bd043c5e4b4e7e9994ed3c48ca2854cfc8c0879b 100644 --- a/modules/imgproc/src/opencl/remap.cl +++ b/modules/imgproc/src/opencl/remap.cl @@ -53,6 +53,18 @@ #define noconvert +#if cn != 3 +#define loadpix(addr) *(__global const T*)(addr) +#define storepix(val, addr) *(__global T*)(addr) = val +#define TSIZE ((int)sizeof(T)) +#define convertScalar(a) (a) +#else +#define loadpix(addr) vload3(0, (__global const T1*)(addr)) +#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr)) +#define TSIZE ((int)sizeof(T1)*3) +#define convertScalar(a) (T)(a.x, a.y, a.z) +#endif + enum { INTER_BITS = 5, @@ -70,7 +82,7 @@ enum #define EXTRAPOLATE(v2, v) \ { \ v2 = max(min(v2, (int2)(src_cols - 1, src_rows - 1)), (int2)(0)); \ - v = convertToWT(*((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * (int)sizeof(T) + src_offset)))); \ + v = convertToWT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \ } #elif defined BORDER_WRAP #define EXTRAPOLATE(v2, v) \ @@ -84,7 +96,7 @@ enum v2.y -= ((v2.y - src_rows + 1) / src_rows) * src_rows; \ if( v2.y >= src_rows ) \ v2.y %= src_rows; \ - v = convertToWT(*((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * (int)sizeof(T) + src_offset)))); \ + v = convertToWT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \ } #elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101) #ifdef BORDER_REFLECT @@ -118,7 +130,7 @@ enum v2.y = src_rows - 1 - (v2.y - src_rows) - delta; \ } \ while (v2.y >= src_rows || v2.y < 0); \ - v = convertToWT(*((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * (int)sizeof(T) + src_offset)))); \ + v = convertToWT(loadpix((__global const T*)(srcptr + mad24(v2.y, src_step, v2.x * TSIZE + src_offset)))); \ } #else #error No extrapolation method @@ -132,16 +144,18 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, __global const uchar * map1ptr, int map1_step, int map1_offset, __global const uchar * map2ptr, int map2_step, int map2_offset, - T scalar) + ST nVal) { int x = get_global_id(0); int y = get_global_id(1); + T scalar = convertScalar(nVal); + if (x < dst_cols && y < dst_rows) { int map1_index = mad24(y, map1_step, x * (int)sizeof(float) + map1_offset); int map2_index = mad24(y, map2_step, x * (int)sizeof(float) + map2_offset); - int dst_index = mad24(y, dst_step, x * (int)sizeof(T) + dst_offset); + int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset); __global const float * map1 = (__global const float *)(map1ptr + map1_index); __global const float * map2 = (__global const float *)(map2ptr + map2_index); @@ -155,12 +169,14 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src #ifndef BORDER_CONSTANT int2 gxy = (int2)(gx, gy); #endif - EXTRAPOLATE(gxy, dst[0]) + T v; + EXTRAPOLATE(gxy, v) + storepix(v, dst); } else { - int src_index = mad24(gy, src_step, gx * (int)sizeof(T) + src_offset); - dst[0] = *((__global const T*)(srcptr + src_index)); + int src_index = mad24(gy, src_step, gx * TSIZE + src_offset); + storepix(loadpix((__global const T*)(srcptr + src_index)), dst); } } } @@ -168,14 +184,16 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, __global const uchar * mapptr, int map_step, int map_offset, - T scalar) + ST nVal) { int x = get_global_id(0); int y = get_global_id(1); + T scalar = convertScalar(nVal); + if (x < dst_cols && y < dst_rows) { - int dst_index = mad24(y, dst_step, x * (int)sizeof(T) + dst_offset); + int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset); int map_index = mad24(y, map_step, x * (int)sizeof(float2) + map_offset); __global const float2 * map = (__global const float2 *)(mapptr + map_index); @@ -185,11 +203,15 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o int gx = gxy.x, gy = gxy.y; if (NEED_EXTRAPOLATION(gx, gy)) - EXTRAPOLATE(gxy, dst[0]) + { + T v; + EXTRAPOLATE(gxy, v) + storepix(v, dst); + } else { - int src_index = mad24(gy, src_step, gx * (int)sizeof(T) + src_offset); - dst[0] = *((__global const T *)(srcptr + src_index)); + int src_index = mad24(gy, src_step, gx * TSIZE + src_offset); + storepix(loadpix((__global const T *)(srcptr + src_index)), dst); } } } @@ -197,14 +219,16 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o __kernel void remap_16SC2(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, __global const uchar * mapptr, int map_step, int map_offset, - T scalar) + ST nVal) { int x = get_global_id(0); int y = get_global_id(1); + T scalar = convertScalar(nVal); + if (x < dst_cols && y < dst_rows) { - int dst_index = mad24(y, dst_step, x * (int)sizeof(T) + dst_offset); + int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset); int map_index = mad24(y, map_step, x * (int)sizeof(short2) + map_offset); __global const short2 * map = (__global const short2 *)(mapptr + map_index); @@ -214,11 +238,15 @@ __kernel void remap_16SC2(__global const uchar * srcptr, int src_step, int src_o int gx = gxy.x, gy = gxy.y; if (NEED_EXTRAPOLATION(gx, gy)) - EXTRAPOLATE(gxy, dst[0]) + { + T v; + EXTRAPOLATE(gxy, v) + storepix(v, dst); + } else { - int src_index = mad24(gy, src_step, gx * (int)sizeof(T) + src_offset); - dst[0] = *((__global const T *)(srcptr + src_index)); + int src_index = mad24(gy, src_step, gx * TSIZE + src_offset); + storepix(loadpix((__global const T *)(srcptr + src_index)), dst); } } } @@ -227,14 +255,16 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, __global const uchar * map1ptr, int map1_step, int map1_offset, __global const uchar * map2ptr, int map2_step, int map2_offset, - T scalar) + ST nVal) { int x = get_global_id(0); int y = get_global_id(1); + T scalar = convertScalar(nVal); + if (x < dst_cols && y < dst_rows) { - int dst_index = mad24(y, dst_step, x * (int)sizeof(T) + dst_offset); + int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset); int map1_index = mad24(y, map1_step, x * (int)sizeof(short2) + map1_offset); int map2_index = mad24(y, map2_step, x * (int)sizeof(ushort) + map2_offset); @@ -249,11 +279,15 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int int gx = gxy.x, gy = gxy.y; if (NEED_EXTRAPOLATION(gx, gy)) - EXTRAPOLATE(gxy, dst[0]) + { + T v; + EXTRAPOLATE(gxy, v) + storepix(v, dst); + } else { - int src_index = mad24(gy, src_step, gx * (int)sizeof(T) + src_offset); - dst[0] = *((__global const T *)(srcptr + src_index)); + int src_index = mad24(gy, src_step, gx * TSIZE + src_offset); + storepix(loadpix((__global const T *)(srcptr + src_index)), dst); } } } @@ -264,14 +298,14 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, __global const uchar * map1ptr, int map1_step, int map1_offset, __global const uchar * map2ptr, int map2_step, int map2_offset, - T nVal) + ST nVal) { int x = get_global_id(0); int y = get_global_id(1); if (x < dst_cols && y < dst_rows) { - int dst_index = mad24(y, dst_step, x * (int)sizeof(T) + dst_offset); + int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset); int map1_index = mad24(y, map1_step, x * (int)sizeof(short2) + map1_offset); int map2_index = mad24(y, map2_step, x * (int)sizeof(ushort) + map2_offset); @@ -287,26 +321,26 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int ushort map2Value = (ushort)(map2[0] & (INTER_TAB_SIZE2 - 1)); WT2 u = (WT2)(map2Value & (INTER_TAB_SIZE - 1), map2Value >> INTER_BITS) / (WT2)(INTER_TAB_SIZE); - WT scalar = convertToWT(nVal); + WT scalar = convertToWT(convertScalar(nVal)); WT a = scalar, b = scalar, c = scalar, d = scalar; if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y)) - a = convertToWT(*((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * (int)sizeof(T) + src_offset)))); + a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset)))); else EXTRAPOLATE(map_dataA, a); if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y)) - b = convertToWT(*((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * (int)sizeof(T) + src_offset)))); + b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset)))); else EXTRAPOLATE(map_dataB, b); if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y)) - c = convertToWT(*((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * (int)sizeof(T) + src_offset)))); + c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset)))); else EXTRAPOLATE(map_dataC, c); if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y)) - d = convertToWT(*((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * (int)sizeof(T) + src_offset)))); + d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset)))); else EXTRAPOLATE(map_dataD, d); @@ -314,7 +348,7 @@ __kernel void remap_16SC2_16UC1(__global const uchar * srcptr, int src_step, int b * (u.x) * (1 - u.y) + c * (1 - u.x) * (u.y) + d * (u.x) * (u.y); - dst[0] = convertToT(dst_data); + storepix(convertToT(dst_data), dst); } } @@ -322,14 +356,14 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, __global const uchar * map1ptr, int map1_step, int map1_offset, __global const uchar * map2ptr, int map2_step, int map2_offset, - T nVal) + ST nVal) { int x = get_global_id(0); int y = get_global_id(1); if (x < dst_cols && y < dst_rows) { - int dst_index = mad24(y, dst_step, x * (int)sizeof(T) + dst_offset); + int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset); int map1_index = mad24(y, map1_step, x * (int)sizeof(float) + map1_offset); int map2_index = mad24(y, map2_step, x * (int)sizeof(float) + map2_offset); @@ -346,26 +380,26 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src float2 _u = map_data - convert_float2(map_dataA); WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE; - WT scalar = convertToWT(nVal); + WT scalar = convertToWT(convertScalar(nVal)); WT a = scalar, b = scalar, c = scalar, d = scalar; if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y)) - a = convertToWT(*((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * (int)sizeof(T) + src_offset)))); + a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset)))); else EXTRAPOLATE(map_dataA, a); if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y)) - b = convertToWT(*((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * (int)sizeof(T) + src_offset)))); + b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset)))); else EXTRAPOLATE(map_dataB, b); if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y)) - c = convertToWT(*((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * (int)sizeof(T) + src_offset)))); + c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset)))); else EXTRAPOLATE(map_dataC, c); if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y)) - d = convertToWT(*((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * (int)sizeof(T) + src_offset)))); + d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset)))); else EXTRAPOLATE(map_dataD, d); @@ -373,21 +407,21 @@ __kernel void remap_2_32FC1(__global const uchar * srcptr, int src_step, int src b * (u.x) * (1 - u.y) + c * (1 - u.x) * (u.y) + d * (u.x) * (u.y); - dst[0] = convertToT(dst_data); + storepix(convertToT(dst_data), dst); } } __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols, __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, __global const uchar * mapptr, int map_step, int map_offset, - T nVal) + ST nVal) { int x = get_global_id(0); int y = get_global_id(1); if (x < dst_cols && y < dst_rows) { - int dst_index = mad24(y, dst_step, x * (int)sizeof(T) + dst_offset); + int dst_index = mad24(y, dst_step, x * TSIZE + dst_offset); int map_index = mad24(y, map_step, x * (int)sizeof(float2) + map_offset); __global const float2 * map = (__global const float2 *)(mapptr + map_index); @@ -401,26 +435,26 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o float2 _u = map_data - convert_float2(map_dataA); WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)INTER_TAB_SIZE)) / (WT2)INTER_TAB_SIZE; - WT scalar = convertToWT(nVal); + WT scalar = convertToWT(convertScalar(nVal)); WT a = scalar, b = scalar, c = scalar, d = scalar; if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y)) - a = convertToWT(*((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * (int)sizeof(T) + src_offset)))); + a = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataA.y, src_step, map_dataA.x * TSIZE + src_offset)))); else EXTRAPOLATE(map_dataA, a); if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y)) - b = convertToWT(*((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * (int)sizeof(T) + src_offset)))); + b = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataB.y, src_step, map_dataB.x * TSIZE + src_offset)))); else EXTRAPOLATE(map_dataB, b); if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y)) - c = convertToWT(*((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * (int)sizeof(T) + src_offset)))); + c = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataC.y, src_step, map_dataC.x * TSIZE + src_offset)))); else EXTRAPOLATE(map_dataC, c); if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y)) - d = convertToWT(*((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * (int)sizeof(T) + src_offset)))); + d = convertToWT(loadpix((__global const T *)(srcptr + mad24(map_dataD.y, src_step, map_dataD.x * TSIZE + src_offset)))); else EXTRAPOLATE(map_dataD, d); @@ -428,7 +462,7 @@ __kernel void remap_32FC2(__global const uchar * srcptr, int src_step, int src_o b * (u.x) * (1 - u.y) + c * (1 - u.x) * (u.y) + d * (u.x) * (u.y); - dst[0] = convertToT(dst_data); + storepix(convertToT(dst_data), dst); } } diff --git a/modules/imgproc/test/ocl/test_warp.cpp b/modules/imgproc/test/ocl/test_warp.cpp index d5c75f0dad7af6b3961acdaf52ad9a16aba0532f..3963b671a2274edcac1f1ce1105ca8c23a308dcd 100644 --- a/modules/imgproc/test/ocl/test_warp.cpp +++ b/modules/imgproc/test/ocl/test_warp.cpp @@ -342,7 +342,7 @@ OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarpResizeArea, Resize, Combine( OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, Remap_INTER_LINEAR, Combine( Values(CV_8U, CV_16U, CV_32F), - Values(1, 4), + Values(1, 3, 4), Values(std::pair((MatType)CV_32FC1, (MatType)CV_32FC1), std::pair((MatType)CV_16SC2, (MatType)CV_16UC1), std::pair((MatType)CV_32FC2, noType)), @@ -355,7 +355,7 @@ OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, Remap_INTER_LINEAR, Combine( OCL_INSTANTIATE_TEST_CASE_P(ImgprocWarp, Remap_INTER_NEAREST, Combine( Values(CV_8U, CV_16U, CV_32F), - Values(1, 4), + Values(1, 3, 4), Values(std::pair((MatType)CV_32FC1, (MatType)CV_32FC1), std::pair((MatType)CV_32FC2, noType), std::pair((MatType)CV_16SC2, (MatType)CV_16UC1),