diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp
index eb6860dac384ffc715f4851c4f4b883d09af040e..cf2c6bb294bbb68b3b1ea2b3b19b5cac3c013af5 100644
--- a/modules/imgproc/src/canny.cpp
+++ b/modules/imgproc/src/canny.cpp
@@ -97,7 +97,23 @@ static bool ippCanny(const Mat& _src, Mat& _dst, float low,  float high)
 static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float high_thresh,
                       int aperture_size, bool L2gradient, int cn, const Size & size)
 {
-    UMat dx(size, CV_16SC(cn)), dy(size, CV_16SC(cn));
+    UMat map;
+
+    const ocl::Device &dev = ocl::Device::getDefault();
+    int max_wg_size = (int)dev.maxWorkGroupSize();
+
+    int lSizeX = 32;
+    int lSizeY = max_wg_size / 32;
+
+    if (lSizeY == 0)
+    {
+        lSizeX = 16;
+        lSizeY = max_wg_size / 16;
+    }
+    if (lSizeY == 0)
+    {
+        lSizeY = 1;
+    }
 
     if (L2gradient)
     {
@@ -110,144 +126,103 @@ static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float
             high_thresh *= high_thresh;
     }
     int low = cvFloor(low_thresh), high = cvFloor(high_thresh);
-    Size esize(size.width + 2, size.height + 2);
-
-    UMat mag;
-    size_t globalsize[2] = { size.width, size.height }, localsize[2] = { 16, 16 };
 
     if (aperture_size == 3 && !_src.isSubmatrix())
     {
-        // Sobel calculation
-        char cvt[2][40];
-        ocl::Kernel calcSobelRowPassKernel("calcSobelRowPass", ocl::imgproc::canny_oclsrc,
-                                           format("-D OP_SOBEL -D cn=%d -D shortT=%s -D ucharT=%s"
-                                                  " -D convertToIntT=%s -D intT=%s -D convertToShortT=%s", cn,
-                                                  ocl::typeToStr(CV_16SC(cn)),
-                                                  ocl::typeToStr(CV_8UC(cn)),
-                                                  ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]),
-                                                  ocl::typeToStr(CV_32SC(cn)),
-                                                  ocl::convertTypeStr(CV_32S, CV_16S, cn, cvt[1])));
-        if (calcSobelRowPassKernel.empty())
+        /*
+            stage1_with_sobel:
+                Sobel operator
+                Calc magnitudes
+                Non maxima suppression
+                Double thresholding
+        */
+        char cvt[40];
+        ocl::Kernel with_sobel("stage1_with_sobel", ocl::imgproc::canny_oclsrc,
+                               format("-D WITH_SOBEL -D cn=%d -D TYPE=%s -D convert_intN=%s -D intN=%s -D GRP_SIZEX=%d -D GRP_SIZEY=%d%s",
+                                      cn, ocl::memopTypeToStr(_src.depth()),
+                                      ocl::convertTypeStr(_src.type(), CV_32SC(cn), cn, cvt),
+                                      ocl::memopTypeToStr(CV_32SC(cn)),
+                                      lSizeX, lSizeY,
+                                      L2gradient ? " -D L2GRAD" : ""));
+        if (with_sobel.empty())
             return false;
 
-        UMat src = _src.getUMat(), dxBuf(size, CV_16SC(cn)), dyBuf(size, CV_16SC(cn));
-        calcSobelRowPassKernel.args(ocl::KernelArg::ReadOnly(src),
-                                    ocl::KernelArg::WriteOnlyNoSize(dxBuf),
-                                    ocl::KernelArg::WriteOnlyNoSize(dyBuf));
+        UMat src = _src.getUMat();
+        map.create(size, CV_32S);
+        with_sobel.args(ocl::KernelArg::ReadOnly(src),
+                        ocl::KernelArg::WriteOnlyNoSize(map),
+                        low, high);
 
-        if (!calcSobelRowPassKernel.run(2, globalsize, localsize, false))
-            return false;
-
-        // magnitude calculation
-        ocl::Kernel magnitudeKernel("calcMagnitude_buf", ocl::imgproc::canny_oclsrc,
-                                    format("-D cn=%d%s -D OP_MAG_BUF -D shortT=%s -D convertToIntT=%s -D intT=%s",
-                                           cn, L2gradient ? " -D L2GRAD" : "",
-                                           ocl::typeToStr(CV_16SC(cn)),
-                                           ocl::convertTypeStr(CV_16S, CV_32S, cn, cvt[0]),
-                                           ocl::typeToStr(CV_32SC(cn))));
-        if (magnitudeKernel.empty())
-            return false;
-
-        mag = UMat(esize, CV_32SC1, Scalar::all(0));
-        dx.create(size, CV_16SC(cn));
-        dy.create(size, CV_16SC(cn));
+        size_t globalsize[2] = { size.width, size.height },
+                localsize[2] = { lSizeX, lSizeY };
 
-        magnitudeKernel.args(ocl::KernelArg::ReadOnlyNoSize(dxBuf), ocl::KernelArg::ReadOnlyNoSize(dyBuf),
-                             ocl::KernelArg::WriteOnlyNoSize(dx), ocl::KernelArg::WriteOnlyNoSize(dy),
-                             ocl::KernelArg::WriteOnlyNoSize(mag), size.height, size.width);
-
-        if (!magnitudeKernel.run(2, globalsize, localsize, false))
+        if (!with_sobel.run(2, globalsize, localsize, false))
             return false;
     }
     else
     {
+        /*
+            stage1_without_sobel:
+                Calc magnitudes
+                Non maxima suppression
+                Double thresholding
+        */
+        UMat dx, dy;
         Sobel(_src, dx, CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
         Sobel(_src, dy, CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
 
-        // magnitude calculation
-        ocl::Kernel magnitudeKernel("calcMagnitude", ocl::imgproc::canny_oclsrc,
-                                    format("-D OP_MAG -D cn=%d%s -D intT=int -D shortT=short -D convertToIntT=convert_int_sat",
-                                           cn, L2gradient ? " -D L2GRAD" : ""));
-        if (magnitudeKernel.empty())
+        ocl::Kernel without_sobel("stage1_without_sobel", ocl::imgproc::canny_oclsrc,
+                                    format("-D WITHOUT_SOBEL -D cn=%d -D GRP_SIZEX=%d -D GRP_SIZEY=%d%s",
+                                           cn, lSizeX, lSizeY, L2gradient ? " -D L2GRAD" : ""));
+        if (without_sobel.empty())
             return false;
 
-        mag = UMat(esize, CV_32SC1, Scalar::all(0));
-        magnitudeKernel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy),
-                             ocl::KernelArg::WriteOnlyNoSize(mag), size.height, size.width);
+        map.create(size, CV_32S);
+        without_sobel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy),
+                           ocl::KernelArg::WriteOnly(map),
+                           low, high);
+
+        size_t globalsize[2] = { size.width, size.height },
+                localsize[2] = { lSizeX, lSizeY };
 
-        if (!magnitudeKernel.run(2, globalsize, NULL, false))
+        if (!without_sobel.run(2, globalsize, localsize, false))
             return false;
     }
 
-    // map calculation
-    ocl::Kernel calcMapKernel("calcMap", ocl::imgproc::canny_oclsrc,
-                              format("-D OP_MAP -D cn=%d", cn));
-    if (calcMapKernel.empty())
-        return false;
-
-    UMat map(esize, CV_32SC1);
-    calcMapKernel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy),
-                       ocl::KernelArg::ReadOnlyNoSize(mag), ocl::KernelArg::WriteOnlyNoSize(map),
-                       size.height, size.width, low, high);
-
-    if (!calcMapKernel.run(2, globalsize, localsize, false))
-        return false;
+    int PIX_PER_WI = 8;
+    /*
+        stage2:
+            hysteresis (add weak edges if they are connected with strong edges)
+    */
 
-    // local hysteresis thresholding
-    ocl::Kernel edgesHysteresisLocalKernel("edgesHysteresisLocal", ocl::imgproc::canny_oclsrc,
-                                           "-D OP_HYST_LOCAL");
-    if (edgesHysteresisLocalKernel.empty())
-        return false;
+    ocl::Kernel edgesHysteresis("stage2_hysteresis", ocl::imgproc::canny_oclsrc,
+                                format("-D STAGE2 -D PIX_PER_WI=%d", PIX_PER_WI));
 
-    UMat stack(1, size.area(), CV_16UC2), counter(1, 1, CV_32SC1, Scalar::all(0));
-    edgesHysteresisLocalKernel.args(ocl::KernelArg::ReadOnlyNoSize(map), ocl::KernelArg::PtrReadWrite(stack),
-                                    ocl::KernelArg::PtrReadWrite(counter), size.height, size.width);
-    if (!edgesHysteresisLocalKernel.run(2, globalsize, localsize, false))
+    if (edgesHysteresis.empty())
         return false;
 
-    // global hysteresis thresholding
-    UMat stack2(1, size.area(), CV_16UC2);
-    int count;
-
-    for ( ; ; )
-    {
-        ocl::Kernel edgesHysteresisGlobalKernel("edgesHysteresisGlobal", ocl::imgproc::canny_oclsrc,
-                                                "-D OP_HYST_GLOBAL");
-        if (edgesHysteresisGlobalKernel.empty())
-            return false;
-
-        {
-            Mat _counter = counter.getMat(ACCESS_RW);
-            count = _counter.at<int>(0, 0);
-            if (count == 0)
-                break;
-
-            _counter.at<int>(0, 0) = 0;
-        }
-
-        edgesHysteresisGlobalKernel.args(ocl::KernelArg::ReadOnlyNoSize(map), ocl::KernelArg::PtrReadWrite(stack),
-                                         ocl::KernelArg::PtrReadWrite(stack2), ocl::KernelArg::PtrReadWrite(counter),
-                                         size.height, size.width, count);
+    edgesHysteresis.args(ocl::KernelArg::ReadWrite(map));
 
-#define divUp(total, grain) ((total + grain - 1) / grain)
-        size_t localsize2[2] = { 128, 1 }, globalsize2[2] = { std::min(count, 65535) * 128, divUp(count, 65535) };
-#undef divUp
+    int sizey = lSizeY / PIX_PER_WI;
+    if (sizey == 0)
+        sizey = 1;
 
-        if (!edgesHysteresisGlobalKernel.run(2, globalsize2, localsize2, false))
-            return false;
+    size_t globalsize[2] = { size.width, size.height / PIX_PER_WI }, localsize[2] = { lSizeX, sizey };
 
-        std::swap(stack, stack2);
-    }
+    if (!edgesHysteresis.run(2, globalsize, localsize, false))
+        return false;
 
     // get edges
-    ocl::Kernel getEdgesKernel("getEdges", ocl::imgproc::canny_oclsrc, "-D OP_EDGES");
+
+    ocl::Kernel getEdgesKernel("getEdges", ocl::imgproc::canny_oclsrc,
+                                format("-D GET_EDGES -D PIX_PER_WI=%d", PIX_PER_WI));
     if (getEdgesKernel.empty())
         return false;
 
     _dst.create(size, CV_8UC1);
     UMat dst = _dst.getUMat();
 
-    getEdgesKernel.args(ocl::KernelArg::ReadOnlyNoSize(map), ocl::KernelArg::WriteOnly(dst));
+    getEdgesKernel.args(ocl::KernelArg::ReadOnly(map), ocl::KernelArg::WriteOnlyNoSize(dst));
 
     return getEdgesKernel.run(2, globalsize, NULL, false);
 }
diff --git a/modules/imgproc/src/opencl/canny.cl b/modules/imgproc/src/opencl/canny.cl
index 99cfc3b63734191e3bf6789a7d420fbafa3869c9..da2750e3487a2e21f923b1d7d04075735a4a520c 100644
--- a/modules/imgproc/src/opencl/canny.cl
+++ b/modules/imgproc/src/opencl/canny.cl
@@ -43,520 +43,433 @@
 //
 //M*/
 
-#ifdef OP_SOBEL
+#define TG22 0.4142135623730950488016887242097f
+#define TG67 2.4142135623730950488016887242097f
 
-#if cn != 3
-#define loadpix(addr) convertToIntT(*(__global const ucharT *)(addr))
-#define storepix(val, addr) *(__global shortT *)(addr) = convertToShortT(val)
-#define shortSize (int)sizeof(shortT)
+#ifdef WITH_SOBEL
+
+#if cn == 1
+#define loadpix(addr) convert_intN(*(__global const TYPE *)(addr))
 #else
-#define loadpix(addr) convertToIntT(vload3(0, (__global const uchar *)(addr)))
-#define storepix(val, addr) vstore3(convertToShortT(val), 0, (__global short *)(addr))
-#define shortSize (int)sizeof(short) * cn
+#define loadpix(addr) convert_intN(vload3(0, (__global const TYPE *)(addr)))
 #endif
+#define storepix(value, addr) *(__global int *)(addr) = (int)(value)
+
+/*
+    stage1_with_sobel:
+        Sobel operator
+        Calc magnitudes
+        Non maxima suppression
+        Double thresholding
+*/
+
+__constant int prev[4][2] = {
+    { 0, -1 },
+    { -1, 1 },
+    { -1, 0 },
+    { -1, -1 }
+};
 
-// Smoothing perpendicular to the derivative direction with a triangle filter
-// only support 3x3 Sobel kernel
-// h (-1) =  1, h (0) =  2, h (1) =  1
-// h'(-1) = -1, h'(0) =  0, h'(1) =  1
-// thus sobel 2D operator can be calculated as:
-// h'(x, y) = h'(x)h(y) for x direction
-//
-// src		input 8bit single channel image data
-// dx_buf	output dx buffer
-// dy_buf	output dy buffer
+__constant int next[4][2] = {
+    { 0, 1 },
+    { 1, -1 },
+    { 1, 0 },
+    { 1, 1 }
+};
 
-__kernel void calcSobelRowPass(__global const uchar * src, int src_step, int src_offset, int rows, int cols,
-                               __global uchar * dx_buf, int dx_buf_step, int dx_buf_offset,
-                               __global uchar * dy_buf, int dy_buf_step, int dy_buf_offset)
+inline int3 sobel(int idx, __local const intN *smem)
 {
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
+    // result: x, y, mag
+    int3 res;
 
-    int lidx = get_local_id(0);
-    int lidy = get_local_id(1);
+    intN dx = smem[idx + 2] - smem[idx]
+        + 2 * (smem[idx + GRP_SIZEX + 6] - smem[idx + GRP_SIZEX + 4])
+        + smem[idx + 2 * GRP_SIZEX + 10] - smem[idx + 2 * GRP_SIZEX + 8];
 
-    __local intT smem[16][18];
+    intN dy = smem[idx] - smem[idx + 2 * GRP_SIZEX + 8]
+        + 2 * (smem[idx + 1] - smem[idx + 2 * GRP_SIZEX + 9])
+        + smem[idx + 2] - smem[idx + 2 * GRP_SIZEX + 10];
 
-    smem[lidy][lidx + 1] = loadpix(src + mad24(src_step, min(gidy, rows - 1), mad24(gidx, cn, src_offset)));
-    if (lidx == 0)
+#ifdef L2GRAD
+    intN magN = dx * dx + dy * dy;
+#else
+    intN magN = convert_intN(abs(dx) + abs(dy));
+#endif
+#if cn == 1
+    res.z = magN;
+    res.x = dx;
+    res.y = dy;
+#else
+    res.z = max(magN.x, max(magN.y, magN.z));
+    if (res.z == magN.y)
     {
-        smem[lidy][0]  = loadpix(src + mad24(src_step, min(gidy, rows - 1), mad24(max(gidx - 1,  0), cn, src_offset)));
-        smem[lidy][17] = loadpix(src + mad24(src_step, min(gidy, rows - 1), mad24(min(gidx + 16, cols - 1), cn, src_offset)));
+        dx.x = dx.y;
+        dy.x = dy.y;
     }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (gidy < rows && gidx < cols)
+    else if (res.z == magN.z)
     {
-        storepix(smem[lidy][lidx + 2] - smem[lidy][lidx],
-                 dx_buf + mad24(gidy, dx_buf_step, mad24(gidx, shortSize, dx_buf_offset)));
-        storepix(mad24(2, smem[lidy][lidx + 1], smem[lidy][lidx] + smem[lidy][lidx + 2]),
-                 dy_buf + mad24(gidy, dy_buf_step, mad24(gidx, shortSize, dy_buf_offset)));
+        dx.x = dx.z;
+        dy.x = dy.z;
     }
-}
-
-#elif defined OP_MAG_BUF || defined OP_MAG
-
-inline intT calc(shortT x, shortT y)
-{
-#ifdef L2GRAD
-    intT intx = convertToIntT(x), inty = convertToIntT(y);
-    return intx * intx + inty * inty;
-#else
-    return convertToIntT( (x >= (shortT)(0) ? x : -x) + (y >= (shortT)(0) ? y : -y) );
+    res.x = dx.x;
+    res.y = dy.x;
 #endif
-}
 
-#ifdef OP_MAG
+    return res;
+}
 
-// calculate the magnitude of the filter pass combining both x and y directions
-// This is the non-buffered version(non-3x3 sobel)
-//
-// dx_buf		dx buffer, calculated from calcSobelRowPass
-// dy_buf		dy buffer, calculated from calcSobelRowPass
-// dx			direvitive in x direction output
-// dy			direvitive in y direction output
-// mag			magnitude direvitive of xy output
-
-__kernel void calcMagnitude(__global const uchar * dxptr, int dx_step, int dx_offset,
-                            __global const uchar * dyptr, int dy_step, int dy_offset,
-                            __global uchar * magptr, int mag_step, int mag_offset, int rows, int cols)
+__kernel void stage1_with_sobel(__global const uchar *src, int src_step, int src_offset, int rows, int cols,
+                                __global uchar *map, int map_step, int map_offset,
+                                int low_thr, int high_thr)
 {
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (y < rows && x < cols)
-    {
-        int dx_index = mad24(dx_step, y, mad24(x, (int)sizeof(short) * cn, dx_offset));
-        int dy_index = mad24(dy_step, y, mad24(x, (int)sizeof(short) * cn, dy_offset));
-        int mag_index = mad24(mag_step, y + 1, mad24(x + 1, (int)sizeof(int), mag_offset));
+    __local intN smem[(GRP_SIZEX + 4) * (GRP_SIZEY + 4)];
 
-        __global short * dx = (__global short *)(dxptr + dx_index);
-        __global short * dy = (__global short *)(dyptr + dy_index);
-        __global int * mag = (__global int *)(magptr + mag_index);
+    int lidx = get_local_id(0);
+    int lidy = get_local_id(1);
 
-        int cmag = calc(dx[0], dy[0]);
-#if cn > 1
-        short cx = dx[0], cy = dy[0];
-        int pmag;
+    int start_x = GRP_SIZEX * get_group_id(0);
+    int start_y = GRP_SIZEY * get_group_id(1);
 
-        #pragma unroll
-        for (int i = 1; i < cn; ++i)
-        {
-            pmag = calc(dx[i], dy[i]);
-            if (pmag > cmag)
-                cmag = pmag, cx = dx[i], cy = dy[i];
-        }
-
-        dx[0] = cx, dy[0] = cy;
-#endif
-        mag[0] = cmag;
+    int i = lidx + lidy * GRP_SIZEX;
+    for (int j = i;  j < (GRP_SIZEX + 4) * (GRP_SIZEY + 4); j += GRP_SIZEX * GRP_SIZEY)
+    {
+        int x = clamp(start_x - 2 + (j % (GRP_SIZEX + 4)), 0, cols - 1);
+        int y = clamp(start_y - 2 + (j / (GRP_SIZEX + 4)), 0, rows - 1);
+        smem[j] = loadpix(src + mad24(y, src_step, mad24(x, cn * (int)sizeof(TYPE), src_offset)));
     }
-}
 
-#elif defined OP_MAG_BUF
+    barrier(CLK_LOCAL_MEM_FENCE);
 
-#if cn != 3
-#define loadpix(addr) *(__global const shortT *)(addr)
-#define shortSize (int)sizeof(shortT)
-#else
-#define loadpix(addr) vload3(0, (__global const short *)(addr))
-#define shortSize (int)sizeof(short)*cn
-#endif
+    //// Sobel, Magnitude
+    //
 
-// calculate the magnitude of the filter pass combining both x and y directions
-// This is the buffered version(3x3 sobel)
-//
-// dx_buf		dx buffer, calculated from calcSobelRowPass
-// dy_buf		dy buffer, calculated from calcSobelRowPass
-// dx			direvitive in x direction output
-// dy			direvitive in y direction output
-// mag			magnitude direvitive of xy output
-__kernel void calcMagnitude_buf(__global const uchar * dx_buf, int dx_buf_step, int dx_buf_offset,
-                                __global const uchar * dy_buf, int dy_buf_step, int dy_buf_offset,
-                                __global uchar * dx, int dx_step, int dx_offset,
-                                __global uchar * dy, int dy_step, int dy_offset,
-                                __global uchar * mag, int mag_step, int mag_offset, int rows, int cols)
-{
-    int gidx = get_global_id(0);
-    int gidy = get_global_id(1);
+    __local int mag[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)];
 
-    int lidx = get_local_id(0);
-    int lidy = get_local_id(1);
+    lidx++;
+    lidy++;
 
-    __local shortT sdx[18][16];
-    __local shortT sdy[18][16];
-
-    sdx[lidy + 1][lidx] = loadpix(dx_buf + mad24(min(gidy, rows - 1), dx_buf_step, mad24(gidx, shortSize, dx_buf_offset)));
-    sdy[lidy + 1][lidx] = loadpix(dy_buf + mad24(min(gidy, rows - 1), dy_buf_step, mad24(gidx, shortSize, dy_buf_offset)));
-    if (lidy == 0)
+    if (i < GRP_SIZEX + 2)
     {
-        sdx[0][lidx]  = loadpix(dx_buf + mad24(clamp(gidy - 1, 0, rows - 1), dx_buf_step, mad24(gidx, shortSize, dx_buf_offset)));
-        sdx[17][lidx] = loadpix(dx_buf + mad24(min(gidy + 16, rows - 1), dx_buf_step, mad24(gidx, shortSize, dx_buf_offset)));
-
-        sdy[0][lidx]  = loadpix(dy_buf + mad24(clamp(gidy - 1, 0, rows - 1), dy_buf_step, mad24(gidx, shortSize, dy_buf_offset)));
-        sdy[17][lidx] = loadpix(dy_buf + mad24(min(gidy + 16, rows - 1), dy_buf_step, mad24(gidx, shortSize, dy_buf_offset)));
+        int grp_sizey = min(GRP_SIZEY + 1, rows - start_y);
+        mag[i] = (sobel(i, smem)).z;
+        mag[i + grp_sizey * (GRP_SIZEX + 2)] = (sobel(i + grp_sizey * (GRP_SIZEX + 4), smem)).z;
     }
+    if (i < GRP_SIZEY + 2)
+    {
+        int grp_sizex = min(GRP_SIZEX + 1, cols - start_x);
+        mag[i * (GRP_SIZEX + 2)] = (sobel(i * (GRP_SIZEX + 4), smem)).z;
+        mag[i * (GRP_SIZEX + 2) + grp_sizex] = (sobel(i * (GRP_SIZEX + 4) + grp_sizex, smem)).z;
+    }
+
+    int idx = lidx + lidy * (GRP_SIZEX + 4);
+    i = lidx + lidy * (GRP_SIZEX + 2);
+
+    int3 res = sobel(idx, smem);
+    mag[i] = res.z;
+    int x = res.x;
+    int y = res.y;
+
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (gidx < cols && gidy < rows)
-    {
-        shortT x = sdx[lidy + 1][lidx] * (shortT)(2) + sdx[lidy][lidx] + sdx[lidy + 2][lidx];
-        shortT y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx];
+    //// Threshold + Non maxima suppression
+    //
 
-#if cn == 1
-        *(__global short *)(dx + mad24(gidy, dx_step, mad24(gidx, shortSize, dx_offset))) = x;
-        *(__global short *)(dy + mad24(gidy, dy_step, mad24(gidx, shortSize, dy_offset))) = y;
+    /*
+        Sector numbers
 
-        *(__global int *)(mag + mad24(gidy + 1, mag_step, mad24(gidx + 1, (int)sizeof(int), mag_offset))) = calc(x, y);
-#elif cn == 3
-        intT magv = calc(x, y);
-        short cx = x.x, cy = y.x;
-        int cmag = magv.x;
+        3   2   1
+         *  *  *
+          * * *
+        0*******0
+          * * *
+         *  *  *
+        1   2   3
 
-        if (cmag < magv.y)
-            cx = x.y, cy = y.y, cmag = magv.y;
-        if (cmag < magv.z)
-            cx = x.z, cy = y.z, cmag = magv.z;
+        We need to determine arctg(dy / dx) to one of the four directions: 0, 45, 90 or 135 degrees.
+        Therefore if abs(dy / dx) belongs to the interval
+        [0, tg(22.5)]           -> 0 direction
+        [tg(22.5), tg(67.5)]    -> 1 or 3
+        [tg(67,5), +oo)         -> 2
 
-        *(__global short *)(dx + mad24(gidy, dx_step, mad24(gidx, shortSize, dx_offset))) = cx;
-        *(__global short *)(dy + mad24(gidy, dy_step, mad24(gidx, shortSize, dy_offset))) = cy;
+        Since tg(67.5) = 1 / tg(22.5), if we take
+        a = abs(dy / dx) * tg(22.5) and b = abs(dy / dx) * tg(67.5)
+        we can get another intervals
 
-        *(__global int *)(mag + mad24(gidy + 1, mag_step, mad24(gidx + 1, (int)sizeof(int), mag_offset))) = cmag;
-#endif
-    }
-}
+        in case a:
+        [0, tg(22.5)^2]     -> 0
+        [tg(22.5)^2, 1]     -> 1, 3
+        [1, +oo)            -> 2
 
-#endif
+        in case b:
+        [0, 1]              -> 0
+        [1, tg(67.5)^2]     -> 1,3
+        [tg(67.5)^2, +oo)   -> 2
 
-#elif defined OP_MAP
-
-//////////////////////////////////////////////////////////////////////////////////////////
-// 0.4142135623730950488016887242097 is tan(22.5)
-
-#define CANNY_SHIFT 15
-#define TG22        (int)(0.4142135623730950488016887242097f*(1<<CANNY_SHIFT) + 0.5f)
-
-// First pass of edge detection and non-maximum suppression
-// edgetype is set to for each pixel:
-// 0 - below low thres, not an edge
-// 1 - maybe an edge
-// 2 - is an edge, either magnitude is greater than high thres, or
-//     Given estimates of the image gradients, a search is then carried out
-//     to determine if the gradient magnitude assumes a local maximum in the gradient direction.
-//     if the rounded gradient angle is zero degrees (i.e. the edge is in the north-south direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the west and east directions,
-//     if the rounded gradient angle is 90 degrees (i.e. the edge is in the east-west direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north and south directions,
-//     if the rounded gradient angle is 135 degrees (i.e. the edge is in the north east-south west direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north west and south east directions,
-//     if the rounded gradient angle is 45 degrees (i.e. the edge is in the north west-south east direction)the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north east and south west directions.
-//
-// dx, dy		direvitives of x and y direction
-// mag			magnitudes calculated from calcMagnitude function
-// map			output containing raw edge types
-
-__kernel void calcMap(__global const uchar * dx, int dx_step, int dx_offset,
-                      __global const uchar * dy, int dy_step, int dy_offset,
-                      __global const uchar * mag, int mag_step, int mag_offset,
-                      __global uchar * map, int map_step, int map_offset,
-                      int rows, int cols, int low_thresh, int high_thresh)
-{
-    __local int smem[18][18];
+        that can help to find direction without conditions.
+
+        0 - might belong to an edge
+        1 - pixel doesn't belong to an edge
+        2 - belong to an edge
+    */
 
     int gidx = get_global_id(0);
     int gidy = get_global_id(1);
 
-    int lidx = get_local_id(0);
-    int lidy = get_local_id(1);
-
-    int grp_idx = get_global_id(0) & 0xFFFFF0;
-    int grp_idy = get_global_id(1) & 0xFFFFF0;
+    if (gidx >= cols || gidy >= rows)
+        return;
 
-    int tid = mad24(lidy, 16, lidx);
-    int lx = tid % 18;
-    int ly = tid / 18;
+    int mag0 = mag[i];
 
-    mag += mag_offset;
-    if (ly < 14)
-        smem[ly][lx] = *(__global const int *)(mag +
-            mad24(mag_step, min(grp_idy + ly, rows - 1), (int)sizeof(int) * (grp_idx + lx)));
-    if (ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
-        smem[ly + 14][lx] = *(__global const int *)(mag +
-            mad24(mag_step, min(grp_idy + ly + 14, rows - 1), (int)sizeof(int) * (grp_idx + lx)));
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (gidy < rows && gidx < cols)
+    int value = 1;
+    if (mag0 > low_thr)
     {
-        // 0 - the pixel can not belong to an edge
-        // 1 - the pixel might belong to an edge
-        // 2 - the pixel does belong to an edge
-        int edge_type = 0;
-        int m = smem[lidy + 1][lidx + 1];
+        int a = (y / (float)x) * TG22;
+        int b = (y / (float)x) * TG67;
 
-        if (m > low_thresh)
-        {
-            short xs = *(__global const short *)(dx + mad24(gidy, dx_step, mad24(gidx, (int)sizeof(short) * cn, dx_offset)));
-            short ys = *(__global const short *)(dy + mad24(gidy, dy_step, mad24(gidx, (int)sizeof(short) * cn, dy_offset)));
-            int x = abs(xs), y = abs(ys);
+        a = min((int)abs(a), 1) + 1;
+        b = min((int)abs(b), 1);
 
-            int tg22x = x * TG22;
-            y <<= CANNY_SHIFT;
+        //  a = { 1, 2 }
+        //  b = { 0, 1 }
+        //  a * b = { 0, 1, 2 } - directions that we need ( + 3 if x ^ y < 0)
 
-            if (y < tg22x)
-            {
-                if (m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2])
-                    edge_type = 1 + (int)(m > high_thresh);
-            }
-            else
-            {
-                int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
-                if (y > tg67x)
-                {
-                    if (m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1])
-                        edge_type = 1 + (int)(m > high_thresh);
-                }
-                else
-                {
-                    int s = (xs ^ ys) < 0 ? -1 : 1;
-                    if (m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s])
-                        edge_type = 1 + (int)(m > high_thresh);
-                }
-            }
+        int dir3 = (a * b) & (((x ^ y) & 0x80000000) >> 31); // if a = 1, b = 1, dy ^ dx < 0
+        int dir = a * b + 2 * dir3;
+        int prev_mag = mag[(lidy + prev[dir][0]) * (GRP_SIZEX + 2) + lidx + prev[dir][1]];
+        int next_mag = mag[(lidy + next[dir][0]) * (GRP_SIZEX + 2) + lidx + next[dir][1]] + (dir & 1);
+
+        if (mag0 > prev_mag && mag0 >= next_mag)
+        {
+            value = (mag0 > high_thr) ? 2 : 0;
         }
-        *(__global int *)(map + mad24(map_step, gidy + 1, mad24(gidx + 1, (int)sizeof(int), + map_offset))) = edge_type;
     }
+
+    storepix(value, map + mad24(gidy, map_step, mad24(gidx, (int)sizeof(int), map_offset)));
 }
 
-#undef CANNY_SHIFT
-#undef TG22
+#elif defined WITHOUT_SOBEL
 
-#elif defined OP_HYST_LOCAL
+/*
+    stage1_without_sobel:
+        Calc magnitudes
+        Non maxima suppression
+        Double thresholding
+*/
 
-struct PtrStepSz
-{
-    __global uchar * ptr;
-    int step, rows, cols;
-};
+#define loadpix(addr) (__global short *)(addr)
+#define storepix(val, addr) *(__global int *)(addr) = (int)(val)
 
-inline int get(struct PtrStepSz data, int y, int x)
-{
-    return *(__global int *)(data.ptr + mad24(data.step, y + 1, (int)sizeof(int) * (x + 1)));
-}
+#ifdef L2GRAD
+#define dist(x, y) ((int)(x) * (x) + (int)(y) * (y))
+#else
+#define dist(x, y) (abs(x) + abs(y))
+#endif
 
-inline void set(struct PtrStepSz data, int y, int x, int value)
-{
-    *(__global int *)(data.ptr + mad24(data.step, y + 1, (int)sizeof(int) * (x + 1))) = value;
-}
+__constant int prev[4][2] = {
+    { 0, -1 },
+    { -1, -1 },
+    { -1, 0 },
+    { -1, 1 }
+};
 
-// perform Hysteresis for pixel whose edge type is 1
-//
-// If candidate pixel (edge type is 1) has a neighbour pixel (in 3x3 area) with type 2, it is believed to be part of an edge and
-// marked as edge. Each thread will iterate for 16 times to connect local edges.
-// Candidate pixel being identified as edge will then be tested if there is nearby potiential edge points. If there is, counter will
-// be incremented by 1 and the point location is stored. These potiential candidates will be processed further in next kernel.
-//
-// map		raw edge type results calculated from calcMap.
-// stack	the potiential edge points found in this kernel call
-// counter	the number of potiential edge points
+__constant int next[4][2] = {
+    { 0, 1 },
+    { 1, 1 },
+    { 1, 0 },
+    { 1, -1 }
+};
 
-__kernel void edgesHysteresisLocal(__global uchar * map_ptr, int map_step, int map_offset,
-                                   __global ushort2 * st, __global unsigned int * counter,
-                                   int rows, int cols)
+__kernel void stage1_without_sobel(__global const uchar *dxptr, int dx_step, int dx_offset,
+                                   __global const uchar *dyptr, int dy_step, int dy_offset,
+                                   __global uchar *map, int map_step, int map_offset, int rows, int cols,
+                                   int low_thr, int high_thr)
 {
-    struct PtrStepSz map = { map_ptr + map_offset, map_step, rows + 1, cols + 1 };
-
-    __local int smem[18][18];
-
-    int2 blockIdx = (int2)(get_group_id(0), get_group_id(1));
-    int2 blockDim = (int2)(get_local_size(0), get_local_size(1));
-    int2 threadIdx = (int2)(get_local_id(0), get_local_id(1));
-
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? get(map, y, x) : 0;
-    if (threadIdx.y == 0)
-        smem[0][threadIdx.x + 1] = x < map.cols ? get(map, y - 1, x) : 0;
-    if (threadIdx.y == blockDim.y - 1)
-        smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? get(map, y + 1, x) : 0;
-    if (threadIdx.x == 0)
-        smem[threadIdx.y + 1][0] = y < map.rows ? get(map, y, x - 1) : 0;
-    if (threadIdx.x == blockDim.x - 1)
-        smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols && y < map.rows ? get(map, y, x + 1) : 0;
-    if (threadIdx.x == 0 && threadIdx.y == 0)
-        smem[0][0] = y > 0 && x > 0 ? get(map, y - 1, x - 1) : 0;
-    if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
-        smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? get(map, y - 1, x + 1) : 0;
-    if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
-        smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? get(map, y + 1, x - 1) : 0;
-    if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
-        smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? get(map, y + 1, x + 1) : 0;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
+    int start_x = get_group_id(0) * GRP_SIZEX;
+    int start_y = get_group_id(1) * GRP_SIZEY;
 
-    if (x >= cols || y >= rows)
-        return;
+    int lidx = get_local_id(0);
+    int lidy = get_local_id(1);
 
-    int n;
+    __local int mag[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)];
+    __local short2 sigma[(GRP_SIZEX + 2) * (GRP_SIZEY + 2)];
 
-    #pragma unroll
-    for (int k = 0; k < 16; ++k)
+#pragma unroll
+    for (int i = lidx + lidy * GRP_SIZEX; i < (GRP_SIZEX + 2) * (GRP_SIZEY + 2); i += GRP_SIZEX * GRP_SIZEY)
     {
-        n = 0;
+        int x = clamp(start_x - 1 + i % (GRP_SIZEX + 2), 0, cols - 1);
+        int y = clamp(start_y - 1 + i / (GRP_SIZEX + 2), 0, rows - 1);
 
-        if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
-        {
-            n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
-            n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
-            n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;
+        int dx_index = mad24(y, dx_step, mad24(x, cn * (int)sizeof(short), dx_offset));
+        int dy_index = mad24(y, dy_step, mad24(x, cn * (int)sizeof(short), dy_offset));
 
-            n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
-            n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
+        __global short *dx = loadpix(dxptr + dx_index);
+        __global short *dy = loadpix(dyptr + dy_index);
 
-            n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
-            n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
-            n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
+        int mag0 = dist(dx[0], dy[0]);
+#if cn > 1
+        short cdx = dx[0], cdy = dy[0];
+#pragma unroll
+        for (int j = 1; j < cn; ++j)
+        {
+            int mag1 = dist(dx[j], dy[j]);
+            if (mag1 > mag0)
+            {
+                mag0 = mag1;
+                cdx = dx[j];
+                cdy = dy[j];
+            }
         }
-
-        if (n > 0)
-            smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
+        dx[0] = cdx;
+        dy[0] = cdy;
+#endif
+        mag[i] = mag0;
+        sigma[i] = (short2)(dx[0], dy[0]);
     }
 
-    const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
-    set(map, y, x, e);
-    n = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (e == 2)
-    {
-        n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
-        n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
-        n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
 
-        n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
-        n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
+    if (gidx >= cols || gidy >= rows)
+        return;
 
-        n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
-        n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
-        n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
-    }
+    lidx++;
+    lidy++;
+
+    int mag0 = mag[lidx + lidy * (GRP_SIZEX + 2)];
+    short x = (sigma[lidx + lidy * (GRP_SIZEX + 2)]).x;
+    short y = (sigma[lidx + lidy * (GRP_SIZEX + 2)]).y;
 
-    if (n > 0)
+    int value = 1;
+    if (mag0 > low_thr)
     {
-        const int ind = atomic_inc(counter);
-        st[ind] = (ushort2)(x + 1, y + 1);
+        int a = (y / (float)x) * TG22;
+        int b = (y / (float)x) * TG67;
+
+        a = min((int)abs(a), 1) + 1;
+        b = min((int)abs(b), 1);
+
+        int dir3 = (a * b) & (((x ^ y) & 0x80000000) >> 31);
+        int dir = a * b + 2 * dir3;
+        int prev_mag = mag[(lidy + prev[dir][0]) * (GRP_SIZEX + 2) + lidx + prev[dir][1]];
+        int next_mag = mag[(lidy + next[dir][0]) * (GRP_SIZEX + 2) + lidx + next[dir][1]] + (dir & 1);
+
+        if (mag0 > prev_mag && mag0 >= next_mag)
+        {
+            value = (mag0 > high_thr) ? 2 : 0;
+        }
     }
+
+    storepix(value, map + mad24(gidy, map_step, mad24(gidx, (int)sizeof(int), map_offset)));
 }
 
-#elif defined OP_HYST_GLOBAL
+#undef TG22
+#undef CANNY_SHIFT
 
-__constant int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
-__constant int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
+#elif defined STAGE2
+/*
+    stage2:
+        hysteresis (add edges labeled 0 if they are connected with an edge labeled 2)
+*/
 
+#define loadpix(addr) *(__global int *)(addr)
+#define storepix(val, addr) *(__global int *)(addr) = (int)(val)
+#define l_stack_size 256
+#define p_stack_size 8
 
-#define stack_size 512
-#define map_index mad24(map_step, pos.y, pos.x * (int)sizeof(int))
+__constant short move_dir[2][8] = {
+    { -1, -1, -1, 0, 0, 1, 1, 1 },
+    { -1, 0, 1, -1, 1, -1, 0, 1 }
+};
 
-__kernel void edgesHysteresisGlobal(__global uchar * map, int map_step, int map_offset,
-                                    __global ushort2 * st1, __global ushort2 * st2, __global int * counter,
-                                    int rows, int cols, int count)
+__kernel void stage2_hysteresis(__global uchar *map, int map_step, int map_offset, int rows, int cols)
 {
     map += map_offset;
 
-    int lidx = get_local_id(0);
+    int x = get_global_id(0);
+    int y0 = get_global_id(1) * PIX_PER_WI;
 
-    int grp_idx = get_group_id(0);
-    int grp_idy = get_group_id(1);
+    int lid = get_local_id(0) + get_local_id(1) * 32;
 
-    __local unsigned int s_counter, s_ind;
-    __local ushort2 s_st[stack_size];
+    __local ushort2 l_stack[l_stack_size];
+    __local int l_counter;
 
-    if (lidx == 0)
-        s_counter = 0;
+    if (lid == 0)
+        l_counter = 0;
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    int ind = mad24(grp_idy, (int)get_local_size(0), grp_idx);
-
-    if (ind < count)
+    #pragma unroll
+    for (int y = y0; y < min(y0 + PIX_PER_WI, rows); ++y)
     {
-        ushort2 pos = st1[ind];
-        if (lidx < 8)
+        int type = loadpix(map + mad24(y, map_step, x * (int)sizeof(int)));
+        if (type == 2)
         {
-            pos.x += c_dx[lidx];
-            pos.y += c_dy[lidx];
-            if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows && *(__global int *)(map + map_index) == 1)
-            {
-                *(__global int *)(map + map_index) = 2;
-                ind = atomic_inc(&s_counter);
-                s_st[ind] = pos;
-            }
+            l_stack[atomic_inc(&l_counter)] = (ushort2)(x, y);
         }
-        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
 
-        while (s_counter > 0 && s_counter <= stack_size - get_local_size(0))
-        {
-            const int subTaskIdx = lidx >> 3;
-            const int portion = min(s_counter, (uint)(get_local_size(0)>> 3));
+    ushort2 p_stack[p_stack_size];
+    int p_counter = 0;
 
-            if (subTaskIdx < portion)
-                pos = s_st[s_counter - 1 - subTaskIdx];
-            barrier(CLK_LOCAL_MEM_FENCE);
+    while(l_counter != 0)
+    {
+        int mod = l_counter % 64;
+        int pix_per_thr = l_counter / 64 + (lid < mod) ? 1 : 0;
 
-            if (lidx == 0)
-                s_counter -= portion;
-            barrier(CLK_LOCAL_MEM_FENCE);
+        #pragma unroll
+        for (int i = 0; i < pix_per_thr; ++i)
+        {
+            ushort2 pos = l_stack[ atomic_dec(&l_counter) - 1 ];
 
-            if (subTaskIdx < portion)
+            #pragma unroll
+            for (int j = 0; j < 8; ++j)
             {
-                pos.x += c_dx[lidx & 7];
-                pos.y += c_dy[lidx & 7];
-                if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows && *(__global int *)(map + map_index) == 1)
+                ushort posx = pos.x + move_dir[0][j];
+                ushort posy = pos.y + move_dir[1][j];
+                if (posx < 0 || posy < 0 || posx >= cols || posy >= rows)
+                    continue;
+                __global uchar *addr = map + mad24(posy, map_step, posx * (int)sizeof(int));
+                int type = loadpix(addr);
+                if (type == 0)
                 {
-                    *(__global int *)(map + map_index) = 2;
-                    ind = atomic_inc(&s_counter);
-                    s_st[ind] = pos;
+                    p_stack[p_counter++] = (ushort2)(posx, posy);
+                    storepix(2, addr);
                 }
             }
-            barrier(CLK_LOCAL_MEM_FENCE);
         }
+        barrier(CLK_LOCAL_MEM_FENCE);
 
-        if (s_counter > 0)
+        while (p_counter > 0)
         {
-            if (lidx == 0)
-            {
-                ind = atomic_add(counter, s_counter);
-                s_ind = ind - s_counter;
-            }
-            barrier(CLK_LOCAL_MEM_FENCE);
-
-            ind = s_ind;
-            for (int i = lidx; i < (int)s_counter; i += get_local_size(0))
-                st2[ind + i] = s_st[i];
+            l_stack[ atomic_inc(&l_counter) ] = p_stack[--p_counter];
         }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 }
 
-#undef map_index
-#undef stack_size
-
-#elif defined OP_EDGES
+#elif defined GET_EDGES
 
 // Get the edge result. egde type of value 2 will be marked as an edge point and set to 255. Otherwise 0.
-// map		edge type mappings
-// dst		edge output
+// map      edge type mappings
+// dst      edge output
 
-__kernel void getEdges(__global const uchar * mapptr, int map_step, int map_offset,
-                       __global uchar * dst, int dst_step, int dst_offset, int rows, int cols)
+__kernel void getEdges(__global const uchar *mapptr, int map_step, int map_offset, int rows, int cols,
+                       __global uchar *dst, int dst_step, int dst_offset)
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y0 = get_global_id(1) * PIX_PER_WI;
 
-    if (y < rows && x < cols)
+    #pragma unroll
+    for (int y = y0; y < min(y0 + PIX_PER_WI, rows); ++y)
     {
-        int map_index = mad24(map_step, y + 1, mad24(x + 1, (int)sizeof(int), map_offset));
-        int dst_index = mad24(dst_step, y, x + dst_offset);
+        int map_index = mad24(map_step, y, mad24(x, (int)sizeof(int), map_offset));
+        int dst_index = mad24(dst_step, y, x) + dst_offset;
 
         __global const int * map = (__global const int *)(mapptr + map_index);
-
         dst[dst_index] = (uchar)(-(map[0] >> 1));
     }
 }
 
-#endif
+#endif
\ No newline at end of file