diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp index eb6860dac384ffc715f4851c4f4b883d09af040e..cf2c6bb294bbb68b3b1ea2b3b19b5cac3c013af5 100644 --- a/modules/imgproc/src/canny.cpp +++ b/modules/imgproc/src/canny.cpp @@ -97,7 +97,23 @@ static bool ippCanny(const Mat& _src, Mat& _dst, float low, float high) static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float high_thresh, int aperture_size, bool L2gradient, int cn, const Size & size) { - UMat dx(size, CV_16SC(cn)), dy(size, CV_16SC(cn)); + UMat map; + + const ocl::Device &dev = ocl::Device::getDefault(); + int max_wg_size = (int)dev.maxWorkGroupSize(); + + int lSizeX = 32; + int lSizeY = max_wg_size / 32; + + if (lSizeY == 0) + { + lSizeX = 16; + lSizeY = max_wg_size / 16; + } + if (lSizeY == 0) + { + lSizeY = 1; + } if (L2gradient) { @@ -110,144 +126,103 @@ static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float high_thresh *= high_thresh; } int low = cvFloor(low_thresh), high = cvFloor(high_thresh); - Size esize(size.width + 2, size.height + 2); - - UMat mag; - size_t globalsize[2] = { size.width, size.height }, localsize[2] = { 16, 16 }; if (aperture_size == 3 && !_src.isSubmatrix()) { - // Sobel calculation - char cvt[2][40]; - ocl::Kernel calcSobelRowPassKernel("calcSobelRowPass", ocl::imgproc::canny_oclsrc, - format("-D OP_SOBEL -D cn=%d -D shortT=%s -D ucharT=%s" - " -D convertToIntT=%s -D intT=%s -D convertToShortT=%s", cn, - ocl::typeToStr(CV_16SC(cn)), - ocl::typeToStr(CV_8UC(cn)), - ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), - ocl::typeToStr(CV_32SC(cn)), - ocl::convertTypeStr(CV_32S, CV_16S, cn, cvt[1]))); - if (calcSobelRowPassKernel.empty()) + /* + stage1_with_sobel: + Sobel operator + Calc magnitudes + Non maxima suppression + Double thresholding + */ + char cvt[40]; + ocl::Kernel with_sobel("stage1_with_sobel", ocl::imgproc::canny_oclsrc, + format("-D WITH_SOBEL -D cn=%d -D TYPE=%s -D convert_intN=%s -D intN=%s -D GRP_SIZEX=%d -D GRP_SIZEY=%d%s", + cn, ocl::memopTypeToStr(_src.depth()), + ocl::convertTypeStr(_src.type(), CV_32SC(cn), cn, cvt), + ocl::memopTypeToStr(CV_32SC(cn)), + lSizeX, lSizeY, + L2gradient ? " -D L2GRAD" : "")); + if (with_sobel.empty()) return false; - UMat src = _src.getUMat(), dxBuf(size, CV_16SC(cn)), dyBuf(size, CV_16SC(cn)); - calcSobelRowPassKernel.args(ocl::KernelArg::ReadOnly(src), - ocl::KernelArg::WriteOnlyNoSize(dxBuf), - ocl::KernelArg::WriteOnlyNoSize(dyBuf)); + UMat src = _src.getUMat(); + map.create(size, CV_32S); + with_sobel.args(ocl::KernelArg::ReadOnly(src), + ocl::KernelArg::WriteOnlyNoSize(map), + low, high); - if (!calcSobelRowPassKernel.run(2, globalsize, localsize, false)) - return false; - - // magnitude calculation - ocl::Kernel magnitudeKernel("calcMagnitude_buf", ocl::imgproc::canny_oclsrc, - format("-D cn=%d%s -D OP_MAG_BUF -D shortT=%s -D convertToIntT=%s -D intT=%s", - cn, L2gradient ? " -D L2GRAD" : "", - ocl::typeToStr(CV_16SC(cn)), - ocl::convertTypeStr(CV_16S, CV_32S, cn, cvt[0]), - ocl::typeToStr(CV_32SC(cn)))); - if (magnitudeKernel.empty()) - return false; - - mag = UMat(esize, CV_32SC1, Scalar::all(0)); - dx.create(size, CV_16SC(cn)); - dy.create(size, CV_16SC(cn)); + size_t globalsize[2] = { size.width, size.height }, + localsize[2] = { lSizeX, lSizeY }; - magnitudeKernel.args(ocl::KernelArg::ReadOnlyNoSize(dxBuf), ocl::KernelArg::ReadOnlyNoSize(dyBuf), - ocl::KernelArg::WriteOnlyNoSize(dx), ocl::KernelArg::WriteOnlyNoSize(dy), - ocl::KernelArg::WriteOnlyNoSize(mag), size.height, size.width); - - if (!magnitudeKernel.run(2, globalsize, localsize, false)) + if (!with_sobel.run(2, globalsize, localsize, false)) return false; } else { + /* + stage1_without_sobel: + Calc magnitudes + Non maxima suppression + Double thresholding + */ + UMat dx, dy; Sobel(_src, dx, CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE); Sobel(_src, dy, CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE); - // magnitude calculation - ocl::Kernel magnitudeKernel("calcMagnitude", ocl::imgproc::canny_oclsrc, - format("-D OP_MAG -D cn=%d%s -D intT=int -D shortT=short -D convertToIntT=convert_int_sat", - cn, L2gradient ? " -D L2GRAD" : "")); - if (magnitudeKernel.empty()) + ocl::Kernel without_sobel("stage1_without_sobel", ocl::imgproc::canny_oclsrc, + format("-D WITHOUT_SOBEL -D cn=%d -D GRP_SIZEX=%d -D GRP_SIZEY=%d%s", + cn, lSizeX, lSizeY, L2gradient ? " -D L2GRAD" : "")); + if (without_sobel.empty()) return false; - mag = UMat(esize, CV_32SC1, Scalar::all(0)); - magnitudeKernel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy), - ocl::KernelArg::WriteOnlyNoSize(mag), size.height, size.width); + map.create(size, CV_32S); + without_sobel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy), + ocl::KernelArg::WriteOnly(map), + low, high); + + size_t globalsize[2] = { size.width, size.height }, + localsize[2] = { lSizeX, lSizeY }; - if (!magnitudeKernel.run(2, globalsize, NULL, false)) + if (!without_sobel.run(2, globalsize, localsize, false)) return false; } - // map calculation - ocl::Kernel calcMapKernel("calcMap", ocl::imgproc::canny_oclsrc, - format("-D OP_MAP -D cn=%d", cn)); - if (calcMapKernel.empty()) - return false; - - UMat map(esize, CV_32SC1); - calcMapKernel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy), - ocl::KernelArg::ReadOnlyNoSize(mag), ocl::KernelArg::WriteOnlyNoSize(map), - size.height, size.width, low, high); - - if (!calcMapKernel.run(2, globalsize, localsize, false)) - return false; + int PIX_PER_WI = 8; + /* + stage2: + hysteresis (add weak edges if they are connected with strong edges) + */ - // local hysteresis thresholding - ocl::Kernel edgesHysteresisLocalKernel("edgesHysteresisLocal", ocl::imgproc::canny_oclsrc, - "-D OP_HYST_LOCAL"); - if (edgesHysteresisLocalKernel.empty()) - return false; + ocl::Kernel edgesHysteresis("stage2_hysteresis", ocl::imgproc::canny_oclsrc, + format("-D STAGE2 -D PIX_PER_WI=%d", PIX_PER_WI)); - UMat stack(1, size.area(), CV_16UC2), counter(1, 1, CV_32SC1, Scalar::all(0)); - edgesHysteresisLocalKernel.args(ocl::KernelArg::ReadOnlyNoSize(map), ocl::KernelArg::PtrReadWrite(stack), - ocl::KernelArg::PtrReadWrite(counter), size.height, size.width); - if (!edgesHysteresisLocalKernel.run(2, globalsize, localsize, false)) + if (edgesHysteresis.empty()) return false; - // global hysteresis thresholding - UMat stack2(1, size.area(), CV_16UC2); - int count; - - for ( ; ; ) - { - ocl::Kernel edgesHysteresisGlobalKernel("edgesHysteresisGlobal", ocl::imgproc::canny_oclsrc, - "-D OP_HYST_GLOBAL"); - if (edgesHysteresisGlobalKernel.empty()) - return false; - - { - Mat _counter = counter.getMat(ACCESS_RW); - count = _counter.at(0, 0); - if (count == 0) - break; - - _counter.at(0, 0) = 0; - } - - edgesHysteresisGlobalKernel.args(ocl::KernelArg::ReadOnlyNoSize(map), ocl::KernelArg::PtrReadWrite(stack), - ocl::KernelArg::PtrReadWrite(stack2), ocl::KernelArg::PtrReadWrite(counter), - size.height, size.width, count); + edgesHysteresis.args(ocl::KernelArg::ReadWrite(map)); -#define divUp(total, grain) ((total + grain - 1) / grain) - size_t localsize2[2] = { 128, 1 }, globalsize2[2] = { std::min(count, 65535) * 128, divUp(count, 65535) }; -#undef divUp + int sizey = lSizeY / PIX_PER_WI; + if (sizey == 0) + sizey = 1; - if (!edgesHysteresisGlobalKernel.run(2, globalsize2, localsize2, false)) - return false; + size_t globalsize[2] = { size.width, size.height / PIX_PER_WI }, localsize[2] = { lSizeX, sizey }; - std::swap(stack, stack2); - } + if (!edgesHysteresis.run(2, globalsize, localsize, false)) + return false; // get edges - ocl::Kernel getEdgesKernel("getEdges", ocl::imgproc::canny_oclsrc, "-D OP_EDGES"); + + ocl::Kernel getEdgesKernel("getEdges", ocl::imgproc::canny_oclsrc, + format("-D GET_EDGES -D PIX_PER_WI=%d", PIX_PER_WI)); if (getEdgesKernel.empty()) return false; _dst.create(size, CV_8UC1); UMat dst = _dst.getUMat(); - getEdgesKernel.args(ocl::KernelArg::ReadOnlyNoSize(map), ocl::KernelArg::WriteOnly(dst)); + getEdgesKernel.args(ocl::KernelArg::ReadOnly(map), ocl::KernelArg::WriteOnlyNoSize(dst)); return getEdgesKernel.run(2, globalsize, NULL, false); } diff --git a/modules/imgproc/src/opencl/canny.cl b/modules/imgproc/src/opencl/canny.cl index 99cfc3b63734191e3bf6789a7d420fbafa3869c9..da2750e3487a2e21f923b1d7d04075735a4a520c 100644 --- a/modules/imgproc/src/opencl/canny.cl +++ b/modules/imgproc/src/opencl/canny.cl @@ -43,520 +43,433 @@ // //M*/ -#ifdef OP_SOBEL +#define TG22 0.4142135623730950488016887242097f +#define TG67 2.4142135623730950488016887242097f -#if cn != 3 -#define loadpix(addr) convertToIntT(*(__global const ucharT *)(addr)) -#define storepix(val, addr) *(__global shortT *)(addr) = convertToShortT(val) -#define shortSize (int)sizeof(shortT) +#ifdef WITH_SOBEL + +#if cn == 1 +#define loadpix(addr) convert_intN(*(__global const TYPE *)(addr)) #else -#define loadpix(addr) convertToIntT(vload3(0, (__global const uchar *)(addr))) -#define storepix(val, addr) vstore3(convertToShortT(val), 0, (__global short *)(addr)) -#define shortSize (int)sizeof(short) * cn +#define loadpix(addr) convert_intN(vload3(0, (__global const TYPE *)(addr))) #endif +#define storepix(value, addr) *(__global int *)(addr) = (int)(value) + +/* + stage1_with_sobel: + Sobel operator + Calc magnitudes + Non maxima suppression + Double thresholding +*/ + +__constant int prev[4][2] = { + { 0, -1 }, + { -1, 1 }, + { -1, 0 }, + { -1, -1 } +}; -// Smoothing perpendicular to the derivative direction with a triangle filter -// only support 3x3 Sobel kernel -// h (-1) = 1, h (0) = 2, h (1) = 1 -// h'(-1) = -1, h'(0) = 0, h'(1) = 1 -// thus sobel 2D operator can be calculated as: -// h'(x, y) = h'(x)h(y) for x direction -// -// src input 8bit single channel image data -// dx_buf output dx buffer -// dy_buf output dy buffer +__constant int next[4][2] = { + { 0, 1 }, + { 1, -1 }, + { 1, 0 }, + { 1, 1 } +}; -__kernel void calcSobelRowPass(__global const uchar * src, int src_step, int src_offset, int rows, int cols, - __global uchar * dx_buf, int dx_buf_step, int dx_buf_offset, - __global uchar * dy_buf, int dy_buf_step, int dy_buf_offset) +inline int3 sobel(int idx, __local const intN *smem) { - int gidx = get_global_id(0); - int gidy = get_global_id(1); + // result: x, y, mag + int3 res; - int lidx = get_local_id(0); - int lidy = get_local_id(1); + intN dx = smem[idx + 2] - smem[idx] + + 2 * (smem[idx + GRP_SIZEX + 6] - smem[idx + GRP_SIZEX + 4]) + + smem[idx + 2 * GRP_SIZEX + 10] - smem[idx + 2 * GRP_SIZEX + 8]; - __local intT smem[16][18]; + intN dy = smem[idx] - smem[idx + 2 * GRP_SIZEX + 8] + + 2 * (smem[idx + 1] - smem[idx + 2 * GRP_SIZEX + 9]) + + smem[idx + 2] - smem[idx + 2 * GRP_SIZEX + 10]; - smem[lidy][lidx + 1] = loadpix(src + mad24(src_step, min(gidy, rows - 1), mad24(gidx, cn, src_offset))); - if (lidx == 0) +#ifdef L2GRAD + intN magN = dx * dx + dy * dy; +#else + intN magN = convert_intN(abs(dx) + abs(dy)); +#endif +#if cn == 1 + res.z = magN; + res.x = dx; + res.y = dy; +#else + res.z = max(magN.x, max(magN.y, magN.z)); + if (res.z == magN.y) { - smem[lidy][0] = loadpix(src + mad24(src_step, min(gidy, rows - 1), mad24(max(gidx - 1, 0), cn, src_offset))); - smem[lidy][17] = loadpix(src + mad24(src_step, min(gidy, rows - 1), mad24(min(gidx + 16, cols - 1), cn, src_offset))); + dx.x = dx.y; + dy.x = dy.y; } - barrier(CLK_LOCAL_MEM_FENCE); - - if (gidy < rows && gidx < cols) + else if (res.z == magN.z) { - storepix(smem[lidy][lidx + 2] - smem[lidy][lidx], - dx_buf + mad24(gidy, dx_buf_step, mad24(gidx, shortSize, dx_buf_offset))); - storepix(mad24(2, smem[lidy][lidx + 1], smem[lidy][lidx] + smem[lidy][lidx + 2]), - dy_buf + mad24(gidy, dy_buf_step, mad24(gidx, shortSize, dy_buf_offset))); + dx.x = dx.z; + dy.x = dy.z; } -} - -#elif defined OP_MAG_BUF || defined OP_MAG - -inline intT calc(shortT x, shortT y) -{ -#ifdef L2GRAD - intT intx = convertToIntT(x), inty = convertToIntT(y); - return intx * intx + inty * inty; -#else - return convertToIntT( (x >= (shortT)(0) ? x : -x) + (y >= (shortT)(0) ? y : -y) ); + res.x = dx.x; + res.y = dy.x; #endif -} -#ifdef OP_MAG + return res; +} -// calculate the magnitude of the filter pass combining both x and y directions -// This is the non-buffered version(non-3x3 sobel) -// -// dx_buf dx buffer, calculated from calcSobelRowPass -// dy_buf dy buffer, calculated from calcSobelRowPass -// dx direvitive in x direction output -// dy direvitive in y direction output -// mag magnitude direvitive of xy output - -__kernel void calcMagnitude(__global const uchar * dxptr, int dx_step, int dx_offset, - __global const uchar * dyptr, int dy_step, int dy_offset, - __global uchar * magptr, int mag_step, int mag_offset, int rows, int cols) +__kernel void stage1_with_sobel(__global const uchar *src, int src_step, int src_offset, int rows, int cols, + __global uchar *map, int map_step, int map_offset, + int low_thr, int high_thr) { - int x = get_global_id(0); - int y = get_global_id(1); - - if (y < rows && x < cols) - { - int dx_index = mad24(dx_step, y, mad24(x, (int)sizeof(short) * cn, dx_offset)); - int dy_index = mad24(dy_step, y, mad24(x, (int)sizeof(short) * cn, dy_offset)); - int mag_index = mad24(mag_step, y + 1, mad24(x + 1, (int)sizeof(int), mag_offset)); + __local intN smem[(GRP_SIZEX + 4) * (GRP_SIZEY + 4)]; - __global short * dx = (__global short *)(dxptr + dx_index); - __global short * dy = (__global short *)(dyptr + dy_index); - __global int * mag = (__global int *)(magptr + mag_index); + int lidx = get_local_id(0); + int lidy = get_local_id(1); - int cmag = calc(dx[0], dy[0]); -#if cn > 1 - short cx = dx[0], cy = dy[0]; - int pmag; + int start_x = GRP_SIZEX * get_group_id(0); + int start_y = GRP_SIZEY * get_group_id(1); - #pragma unroll - for (int i = 1; i < cn; ++i) - { - pmag = calc(dx[i], dy[i]); - if (pmag > cmag) - cmag = pmag, cx = dx[i], cy = dy[i]; - } - - dx[0] = cx, dy[0] = cy; -#endif - mag[0] = cmag; + int i = lidx + lidy * GRP_SIZEX; + for (int j = i; j < (GRP_SIZEX + 4) * (GRP_SIZEY + 4); j += GRP_SIZEX * GRP_SIZEY) + { + int x = clamp(start_x - 2 + (j % (GRP_SIZEX + 4)), 0, cols - 1); + int y = clamp(start_y - 2 + (j / (GRP_SIZEX + 4)), 0, rows - 1); + smem[j] = loadpix(src + mad24(y, src_step, mad24(x, cn * (int)sizeof(TYPE), src_offset))); } -} -#elif defined OP_MAG_BUF + barrier(CLK_LOCAL_MEM_FENCE); -#if cn != 3 -#define loadpix(addr) *(__global const shortT *)(addr) -#define shortSize (int)sizeof(shortT) -#else -#define loadpix(addr) vload3(0, (__global const short *)(addr)) -#define shortSize (int)sizeof(short)*cn -#endif + //// Sobel, Magnitude + // -// calculate the magnitude of the filter pass combining both x and y directions -// This is the buffered version(3x3 sobel) -// -// dx_buf dx buffer, calculated from calcSobelRowPass -// dy_buf dy buffer, calculated from calcSobelRowPass -// dx direvitive in x direction output -// dy direvitive in y direction output -// mag magnitude direvitive of xy output -__kernel void calcMagnitude_buf(__global const uchar * dx_buf, int dx_buf_step, int dx_buf_offset, - __global const uchar * dy_buf, int dy_buf_step, int dy_buf_offset, - __global uchar * dx, int dx_step, int dx_offset, - __global uchar * dy, int dy_step, int dy_offset, - __global uchar * mag, int mag_step, int mag_offset, int rows, int cols) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); + __local int mag[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)]; - int lidx = get_local_id(0); - int lidy = get_local_id(1); + lidx++; + lidy++; - __local shortT sdx[18][16]; - __local shortT sdy[18][16]; - - sdx[lidy + 1][lidx] = loadpix(dx_buf + mad24(min(gidy, rows - 1), dx_buf_step, mad24(gidx, shortSize, dx_buf_offset))); - sdy[lidy + 1][lidx] = loadpix(dy_buf + mad24(min(gidy, rows - 1), dy_buf_step, mad24(gidx, shortSize, dy_buf_offset))); - if (lidy == 0) + if (i < GRP_SIZEX + 2) { - sdx[0][lidx] = loadpix(dx_buf + mad24(clamp(gidy - 1, 0, rows - 1), dx_buf_step, mad24(gidx, shortSize, dx_buf_offset))); - sdx[17][lidx] = loadpix(dx_buf + mad24(min(gidy + 16, rows - 1), dx_buf_step, mad24(gidx, shortSize, dx_buf_offset))); - - sdy[0][lidx] = loadpix(dy_buf + mad24(clamp(gidy - 1, 0, rows - 1), dy_buf_step, mad24(gidx, shortSize, dy_buf_offset))); - sdy[17][lidx] = loadpix(dy_buf + mad24(min(gidy + 16, rows - 1), dy_buf_step, mad24(gidx, shortSize, dy_buf_offset))); + int grp_sizey = min(GRP_SIZEY + 1, rows - start_y); + mag[i] = (sobel(i, smem)).z; + mag[i + grp_sizey * (GRP_SIZEX + 2)] = (sobel(i + grp_sizey * (GRP_SIZEX + 4), smem)).z; } + if (i < GRP_SIZEY + 2) + { + int grp_sizex = min(GRP_SIZEX + 1, cols - start_x); + mag[i * (GRP_SIZEX + 2)] = (sobel(i * (GRP_SIZEX + 4), smem)).z; + mag[i * (GRP_SIZEX + 2) + grp_sizex] = (sobel(i * (GRP_SIZEX + 4) + grp_sizex, smem)).z; + } + + int idx = lidx + lidy * (GRP_SIZEX + 4); + i = lidx + lidy * (GRP_SIZEX + 2); + + int3 res = sobel(idx, smem); + mag[i] = res.z; + int x = res.x; + int y = res.y; + barrier(CLK_LOCAL_MEM_FENCE); - if (gidx < cols && gidy < rows) - { - shortT x = sdx[lidy + 1][lidx] * (shortT)(2) + sdx[lidy][lidx] + sdx[lidy + 2][lidx]; - shortT y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx]; + //// Threshold + Non maxima suppression + // -#if cn == 1 - *(__global short *)(dx + mad24(gidy, dx_step, mad24(gidx, shortSize, dx_offset))) = x; - *(__global short *)(dy + mad24(gidy, dy_step, mad24(gidx, shortSize, dy_offset))) = y; + /* + Sector numbers - *(__global int *)(mag + mad24(gidy + 1, mag_step, mad24(gidx + 1, (int)sizeof(int), mag_offset))) = calc(x, y); -#elif cn == 3 - intT magv = calc(x, y); - short cx = x.x, cy = y.x; - int cmag = magv.x; + 3 2 1 + * * * + * * * + 0*******0 + * * * + * * * + 1 2 3 - if (cmag < magv.y) - cx = x.y, cy = y.y, cmag = magv.y; - if (cmag < magv.z) - cx = x.z, cy = y.z, cmag = magv.z; + We need to determine arctg(dy / dx) to one of the four directions: 0, 45, 90 or 135 degrees. + Therefore if abs(dy / dx) belongs to the interval + [0, tg(22.5)] -> 0 direction + [tg(22.5), tg(67.5)] -> 1 or 3 + [tg(67,5), +oo) -> 2 - *(__global short *)(dx + mad24(gidy, dx_step, mad24(gidx, shortSize, dx_offset))) = cx; - *(__global short *)(dy + mad24(gidy, dy_step, mad24(gidx, shortSize, dy_offset))) = cy; + Since tg(67.5) = 1 / tg(22.5), if we take + a = abs(dy / dx) * tg(22.5) and b = abs(dy / dx) * tg(67.5) + we can get another intervals - *(__global int *)(mag + mad24(gidy + 1, mag_step, mad24(gidx + 1, (int)sizeof(int), mag_offset))) = cmag; -#endif - } -} + in case a: + [0, tg(22.5)^2] -> 0 + [tg(22.5)^2, 1] -> 1, 3 + [1, +oo) -> 2 -#endif + in case b: + [0, 1] -> 0 + [1, tg(67.5)^2] -> 1,3 + [tg(67.5)^2, +oo) -> 2 -#elif defined OP_MAP - -////////////////////////////////////////////////////////////////////////////////////////// -// 0.4142135623730950488016887242097 is tan(22.5) - -#define CANNY_SHIFT 15 -#define TG22 (int)(0.4142135623730950488016887242097f*(1<= cols || gidy >= rows) + return; - int tid = mad24(lidy, 16, lidx); - int lx = tid % 18; - int ly = tid / 18; + int mag0 = mag[i]; - mag += mag_offset; - if (ly < 14) - smem[ly][lx] = *(__global const int *)(mag + - mad24(mag_step, min(grp_idy + ly, rows - 1), (int)sizeof(int) * (grp_idx + lx))); - if (ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols) - smem[ly + 14][lx] = *(__global const int *)(mag + - mad24(mag_step, min(grp_idy + ly + 14, rows - 1), (int)sizeof(int) * (grp_idx + lx))); - barrier(CLK_LOCAL_MEM_FENCE); - - if (gidy < rows && gidx < cols) + int value = 1; + if (mag0 > low_thr) { - // 0 - the pixel can not belong to an edge - // 1 - the pixel might belong to an edge - // 2 - the pixel does belong to an edge - int edge_type = 0; - int m = smem[lidy + 1][lidx + 1]; + int a = (y / (float)x) * TG22; + int b = (y / (float)x) * TG67; - if (m > low_thresh) - { - short xs = *(__global const short *)(dx + mad24(gidy, dx_step, mad24(gidx, (int)sizeof(short) * cn, dx_offset))); - short ys = *(__global const short *)(dy + mad24(gidy, dy_step, mad24(gidx, (int)sizeof(short) * cn, dy_offset))); - int x = abs(xs), y = abs(ys); + a = min((int)abs(a), 1) + 1; + b = min((int)abs(b), 1); - int tg22x = x * TG22; - y <<= CANNY_SHIFT; + // a = { 1, 2 } + // b = { 0, 1 } + // a * b = { 0, 1, 2 } - directions that we need ( + 3 if x ^ y < 0) - if (y < tg22x) - { - if (m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2]) - edge_type = 1 + (int)(m > high_thresh); - } - else - { - int tg67x = tg22x + (x << (1 + CANNY_SHIFT)); - if (y > tg67x) - { - if (m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1]) - edge_type = 1 + (int)(m > high_thresh); - } - else - { - int s = (xs ^ ys) < 0 ? -1 : 1; - if (m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s]) - edge_type = 1 + (int)(m > high_thresh); - } - } + int dir3 = (a * b) & (((x ^ y) & 0x80000000) >> 31); // if a = 1, b = 1, dy ^ dx < 0 + int dir = a * b + 2 * dir3; + int prev_mag = mag[(lidy + prev[dir][0]) * (GRP_SIZEX + 2) + lidx + prev[dir][1]]; + int next_mag = mag[(lidy + next[dir][0]) * (GRP_SIZEX + 2) + lidx + next[dir][1]] + (dir & 1); + + if (mag0 > prev_mag && mag0 >= next_mag) + { + value = (mag0 > high_thr) ? 2 : 0; } - *(__global int *)(map + mad24(map_step, gidy + 1, mad24(gidx + 1, (int)sizeof(int), + map_offset))) = edge_type; } + + storepix(value, map + mad24(gidy, map_step, mad24(gidx, (int)sizeof(int), map_offset))); } -#undef CANNY_SHIFT -#undef TG22 +#elif defined WITHOUT_SOBEL -#elif defined OP_HYST_LOCAL +/* + stage1_without_sobel: + Calc magnitudes + Non maxima suppression + Double thresholding +*/ -struct PtrStepSz -{ - __global uchar * ptr; - int step, rows, cols; -}; +#define loadpix(addr) (__global short *)(addr) +#define storepix(val, addr) *(__global int *)(addr) = (int)(val) -inline int get(struct PtrStepSz data, int y, int x) -{ - return *(__global int *)(data.ptr + mad24(data.step, y + 1, (int)sizeof(int) * (x + 1))); -} +#ifdef L2GRAD +#define dist(x, y) ((int)(x) * (x) + (int)(y) * (y)) +#else +#define dist(x, y) (abs(x) + abs(y)) +#endif -inline void set(struct PtrStepSz data, int y, int x, int value) -{ - *(__global int *)(data.ptr + mad24(data.step, y + 1, (int)sizeof(int) * (x + 1))) = value; -} +__constant int prev[4][2] = { + { 0, -1 }, + { -1, -1 }, + { -1, 0 }, + { -1, 1 } +}; -// perform Hysteresis for pixel whose edge type is 1 -// -// If candidate pixel (edge type is 1) has a neighbour pixel (in 3x3 area) with type 2, it is believed to be part of an edge and -// marked as edge. Each thread will iterate for 16 times to connect local edges. -// Candidate pixel being identified as edge will then be tested if there is nearby potiential edge points. If there is, counter will -// be incremented by 1 and the point location is stored. These potiential candidates will be processed further in next kernel. -// -// map raw edge type results calculated from calcMap. -// stack the potiential edge points found in this kernel call -// counter the number of potiential edge points +__constant int next[4][2] = { + { 0, 1 }, + { 1, 1 }, + { 1, 0 }, + { 1, -1 } +}; -__kernel void edgesHysteresisLocal(__global uchar * map_ptr, int map_step, int map_offset, - __global ushort2 * st, __global unsigned int * counter, - int rows, int cols) +__kernel void stage1_without_sobel(__global const uchar *dxptr, int dx_step, int dx_offset, + __global const uchar *dyptr, int dy_step, int dy_offset, + __global uchar *map, int map_step, int map_offset, int rows, int cols, + int low_thr, int high_thr) { - struct PtrStepSz map = { map_ptr + map_offset, map_step, rows + 1, cols + 1 }; - - __local int smem[18][18]; - - int2 blockIdx = (int2)(get_group_id(0), get_group_id(1)); - int2 blockDim = (int2)(get_local_size(0), get_local_size(1)); - int2 threadIdx = (int2)(get_local_id(0), get_local_id(1)); - - const int x = blockIdx.x * blockDim.x + threadIdx.x; - const int y = blockIdx.y * blockDim.y + threadIdx.y; - - smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? get(map, y, x) : 0; - if (threadIdx.y == 0) - smem[0][threadIdx.x + 1] = x < map.cols ? get(map, y - 1, x) : 0; - if (threadIdx.y == blockDim.y - 1) - smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? get(map, y + 1, x) : 0; - if (threadIdx.x == 0) - smem[threadIdx.y + 1][0] = y < map.rows ? get(map, y, x - 1) : 0; - if (threadIdx.x == blockDim.x - 1) - smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols && y < map.rows ? get(map, y, x + 1) : 0; - if (threadIdx.x == 0 && threadIdx.y == 0) - smem[0][0] = y > 0 && x > 0 ? get(map, y - 1, x - 1) : 0; - if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0) - smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? get(map, y - 1, x + 1) : 0; - if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1) - smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? get(map, y + 1, x - 1) : 0; - if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1) - smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? get(map, y + 1, x + 1) : 0; - - barrier(CLK_LOCAL_MEM_FENCE); + int start_x = get_group_id(0) * GRP_SIZEX; + int start_y = get_group_id(1) * GRP_SIZEY; - if (x >= cols || y >= rows) - return; + int lidx = get_local_id(0); + int lidy = get_local_id(1); - int n; + __local int mag[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)]; + __local short2 sigma[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)]; - #pragma unroll - for (int k = 0; k < 16; ++k) +#pragma unroll + for (int i = lidx + lidy * GRP_SIZEX; i < (GRP_SIZEX + 2) * (GRP_SIZEY + 2); i += GRP_SIZEX * GRP_SIZEY) { - n = 0; + int x = clamp(start_x - 1 + i % (GRP_SIZEX + 2), 0, cols - 1); + int y = clamp(start_y - 1 + i / (GRP_SIZEX + 2), 0, rows - 1); - if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) - { - n += smem[threadIdx.y ][threadIdx.x ] == 2; - n += smem[threadIdx.y ][threadIdx.x + 1] == 2; - n += smem[threadIdx.y ][threadIdx.x + 2] == 2; + int dx_index = mad24(y, dx_step, mad24(x, cn * (int)sizeof(short), dx_offset)); + int dy_index = mad24(y, dy_step, mad24(x, cn * (int)sizeof(short), dy_offset)); - n += smem[threadIdx.y + 1][threadIdx.x ] == 2; - n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; + __global short *dx = loadpix(dxptr + dx_index); + __global short *dy = loadpix(dyptr + dy_index); - n += smem[threadIdx.y + 2][threadIdx.x ] == 2; - n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; - n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; + int mag0 = dist(dx[0], dy[0]); +#if cn > 1 + short cdx = dx[0], cdy = dy[0]; +#pragma unroll + for (int j = 1; j < cn; ++j) + { + int mag1 = dist(dx[j], dy[j]); + if (mag1 > mag0) + { + mag0 = mag1; + cdx = dx[j]; + cdy = dy[j]; + } } - - if (n > 0) - smem[threadIdx.y + 1][threadIdx.x + 1] = 2; + dx[0] = cdx; + dy[0] = cdy; +#endif + mag[i] = mag0; + sigma[i] = (short2)(dx[0], dy[0]); } - const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; - set(map, y, x, e); - n = 0; + barrier(CLK_LOCAL_MEM_FENCE); - if (e == 2) - { - n += smem[threadIdx.y ][threadIdx.x ] == 1; - n += smem[threadIdx.y ][threadIdx.x + 1] == 1; - n += smem[threadIdx.y ][threadIdx.x + 2] == 1; + int gidx = get_global_id(0); + int gidy = get_global_id(1); - n += smem[threadIdx.y + 1][threadIdx.x ] == 1; - n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; + if (gidx >= cols || gidy >= rows) + return; - n += smem[threadIdx.y + 2][threadIdx.x ] == 1; - n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; - n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; - } + lidx++; + lidy++; + + int mag0 = mag[lidx + lidy * (GRP_SIZEX + 2)]; + short x = (sigma[lidx + lidy * (GRP_SIZEX + 2)]).x; + short y = (sigma[lidx + lidy * (GRP_SIZEX + 2)]).y; - if (n > 0) + int value = 1; + if (mag0 > low_thr) { - const int ind = atomic_inc(counter); - st[ind] = (ushort2)(x + 1, y + 1); + int a = (y / (float)x) * TG22; + int b = (y / (float)x) * TG67; + + a = min((int)abs(a), 1) + 1; + b = min((int)abs(b), 1); + + int dir3 = (a * b) & (((x ^ y) & 0x80000000) >> 31); + int dir = a * b + 2 * dir3; + int prev_mag = mag[(lidy + prev[dir][0]) * (GRP_SIZEX + 2) + lidx + prev[dir][1]]; + int next_mag = mag[(lidy + next[dir][0]) * (GRP_SIZEX + 2) + lidx + next[dir][1]] + (dir & 1); + + if (mag0 > prev_mag && mag0 >= next_mag) + { + value = (mag0 > high_thr) ? 2 : 0; + } } + + storepix(value, map + mad24(gidy, map_step, mad24(gidx, (int)sizeof(int), map_offset))); } -#elif defined OP_HYST_GLOBAL +#undef TG22 +#undef CANNY_SHIFT -__constant int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; -__constant int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; +#elif defined STAGE2 +/* + stage2: + hysteresis (add edges labeled 0 if they are connected with an edge labeled 2) +*/ +#define loadpix(addr) *(__global int *)(addr) +#define storepix(val, addr) *(__global int *)(addr) = (int)(val) +#define l_stack_size 256 +#define p_stack_size 8 -#define stack_size 512 -#define map_index mad24(map_step, pos.y, pos.x * (int)sizeof(int)) +__constant short move_dir[2][8] = { + { -1, -1, -1, 0, 0, 1, 1, 1 }, + { -1, 0, 1, -1, 1, -1, 0, 1 } +}; -__kernel void edgesHysteresisGlobal(__global uchar * map, int map_step, int map_offset, - __global ushort2 * st1, __global ushort2 * st2, __global int * counter, - int rows, int cols, int count) +__kernel void stage2_hysteresis(__global uchar *map, int map_step, int map_offset, int rows, int cols) { map += map_offset; - int lidx = get_local_id(0); + int x = get_global_id(0); + int y0 = get_global_id(1) * PIX_PER_WI; - int grp_idx = get_group_id(0); - int grp_idy = get_group_id(1); + int lid = get_local_id(0) + get_local_id(1) * 32; - __local unsigned int s_counter, s_ind; - __local ushort2 s_st[stack_size]; + __local ushort2 l_stack[l_stack_size]; + __local int l_counter; - if (lidx == 0) - s_counter = 0; + if (lid == 0) + l_counter = 0; barrier(CLK_LOCAL_MEM_FENCE); - int ind = mad24(grp_idy, (int)get_local_size(0), grp_idx); - - if (ind < count) + #pragma unroll + for (int y = y0; y < min(y0 + PIX_PER_WI, rows); ++y) { - ushort2 pos = st1[ind]; - if (lidx < 8) + int type = loadpix(map + mad24(y, map_step, x * (int)sizeof(int))); + if (type == 2) { - pos.x += c_dx[lidx]; - pos.y += c_dy[lidx]; - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows && *(__global int *)(map + map_index) == 1) - { - *(__global int *)(map + map_index) = 2; - ind = atomic_inc(&s_counter); - s_st[ind] = pos; - } + l_stack[atomic_inc(&l_counter)] = (ushort2)(x, y); } - barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); - while (s_counter > 0 && s_counter <= stack_size - get_local_size(0)) - { - const int subTaskIdx = lidx >> 3; - const int portion = min(s_counter, (uint)(get_local_size(0)>> 3)); + ushort2 p_stack[p_stack_size]; + int p_counter = 0; - if (subTaskIdx < portion) - pos = s_st[s_counter - 1 - subTaskIdx]; - barrier(CLK_LOCAL_MEM_FENCE); + while(l_counter != 0) + { + int mod = l_counter % 64; + int pix_per_thr = l_counter / 64 + (lid < mod) ? 1 : 0; - if (lidx == 0) - s_counter -= portion; - barrier(CLK_LOCAL_MEM_FENCE); + #pragma unroll + for (int i = 0; i < pix_per_thr; ++i) + { + ushort2 pos = l_stack[ atomic_dec(&l_counter) - 1 ]; - if (subTaskIdx < portion) + #pragma unroll + for (int j = 0; j < 8; ++j) { - pos.x += c_dx[lidx & 7]; - pos.y += c_dy[lidx & 7]; - if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows && *(__global int *)(map + map_index) == 1) + ushort posx = pos.x + move_dir[0][j]; + ushort posy = pos.y + move_dir[1][j]; + if (posx < 0 || posy < 0 || posx >= cols || posy >= rows) + continue; + __global uchar *addr = map + mad24(posy, map_step, posx * (int)sizeof(int)); + int type = loadpix(addr); + if (type == 0) { - *(__global int *)(map + map_index) = 2; - ind = atomic_inc(&s_counter); - s_st[ind] = pos; + p_stack[p_counter++] = (ushort2)(posx, posy); + storepix(2, addr); } } - barrier(CLK_LOCAL_MEM_FENCE); } + barrier(CLK_LOCAL_MEM_FENCE); - if (s_counter > 0) + while (p_counter > 0) { - if (lidx == 0) - { - ind = atomic_add(counter, s_counter); - s_ind = ind - s_counter; - } - barrier(CLK_LOCAL_MEM_FENCE); - - ind = s_ind; - for (int i = lidx; i < (int)s_counter; i += get_local_size(0)) - st2[ind + i] = s_st[i]; + l_stack[ atomic_inc(&l_counter) ] = p_stack[--p_counter]; } + barrier(CLK_LOCAL_MEM_FENCE); } } -#undef map_index -#undef stack_size - -#elif defined OP_EDGES +#elif defined GET_EDGES // Get the edge result. egde type of value 2 will be marked as an edge point and set to 255. Otherwise 0. -// map edge type mappings -// dst edge output +// map edge type mappings +// dst edge output -__kernel void getEdges(__global const uchar * mapptr, int map_step, int map_offset, - __global uchar * dst, int dst_step, int dst_offset, int rows, int cols) +__kernel void getEdges(__global const uchar *mapptr, int map_step, int map_offset, int rows, int cols, + __global uchar *dst, int dst_step, int dst_offset) { int x = get_global_id(0); - int y = get_global_id(1); + int y0 = get_global_id(1) * PIX_PER_WI; - if (y < rows && x < cols) + #pragma unroll + for (int y = y0; y < min(y0 + PIX_PER_WI, rows); ++y) { - int map_index = mad24(map_step, y + 1, mad24(x + 1, (int)sizeof(int), map_offset)); - int dst_index = mad24(dst_step, y, x + dst_offset); + int map_index = mad24(map_step, y, mad24(x, (int)sizeof(int), map_offset)); + int dst_index = mad24(dst_step, y, x) + dst_offset; __global const int * map = (__global const int *)(mapptr + map_index); - dst[dst_index] = (uchar)(-(map[0] >> 1)); } } -#endif +#endif \ No newline at end of file