diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp
index 25d376a4e1bfa3a9df10c8da4239352f117a31d1..e334ad913bf8e25e60055e2ff72e0bc5e441e21c 100644
--- a/modules/ocl/src/haar.cpp
+++ b/modules/ocl/src/haar.cpp
@@ -866,16 +866,17 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
 
         if(gcascade->is_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE))
         {
-            //setup local group size
-            localThreads[0] = 8;
-            localThreads[1] = 16;
+            //setup local group size for "pixel step" = 1
+            localThreads[0] = 16;
+            localThreads[1] = 32;
             localThreads[2] = 1;
 
-            //init maximal number of workgroups
+            //calc maximal number of workgroups
             int WGNumX = 1+(sizev[0].width /(localThreads[0]));
             int WGNumY = 1+(sizev[0].height/(localThreads[1]));
             int WGNumZ = loopcount;
-            int WGNum = 0; //accurate number of non -empty workgroups
+            int WGNumTotal = 0; //accurate number of non-empty workgroups
+            int WGNumSampled = 0; //accurate number of workgroups processed only 1/4 part of all pixels. it is made for large images with scale <= 2
             oclMat      oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U);
             {
                 cl_int4*    pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE, 0, oclWGInfo.step, 0,0,0,&status);
@@ -895,12 +896,16 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
                             if(gx>=(Width-cascade->orig_window_size.width))
                                 continue; // no data to process
 
+                            if(scaleinfo[z].factor<=2)
+                            {
+                                WGNumSampled++;
+                            }
                             // save no-empty workgroup info into array
-                            pWGInfo[WGNum].s[0] = scaleinfo[z].width_height;
-                            pWGInfo[WGNum].s[1] = (gx << 16) | gy;
-                            pWGInfo[WGNum].s[2] = scaleinfo[z].imgoff;
-                            memcpy(&(pWGInfo[WGNum].s[3]),&(scaleinfo[z].factor),sizeof(float));
-                            WGNum++;
+                            pWGInfo[WGNumTotal].s[0] = scaleinfo[z].width_height;
+                            pWGInfo[WGNumTotal].s[1] = (gx << 16) | gy;
+                            pWGInfo[WGNumTotal].s[2] = scaleinfo[z].imgoff;
+                            memcpy(&(pWGInfo[WGNumTotal].s[3]),&(scaleinfo[z].factor),sizeof(float));
+                            WGNumTotal++;
                         }
                     }
                 }
@@ -908,13 +913,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
                 pWGInfo = NULL;
             }
 
-            // setup global sizes to have linear array of workgroups with WGNum size
-            globalThreads[0] = localThreads[0]*WGNum;
-            globalThreads[1] = localThreads[1];
-            globalThreads[2] = 1;
-
 #define NODE_SIZE 12
-            // pack node info to have less memory loads
+            // pack node info to have less memory loads on the device side
             oclMat  oclNodesPK(1,sizeof(cl_int) * NODE_SIZE * nodenum,CV_8U);
             {
                 cl_int  status;
@@ -963,8 +963,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
             options += format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width);
             options += format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height);
             options += format(" -D STUMP_BASED=%d",gcascade->is_stump_based);
-            options += format(" -D LSx=%d",localThreads[0]);
-            options += format(" -D LSy=%d",localThreads[1]);
             options += format(" -D SPLITNODE=%d",splitnode);
             options += format(" -D SPLITSTAGE=%d",splitstage);
             options += format(" -D OUTPUTSZ=%d",outputsz);
@@ -972,8 +970,39 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
             // init candiate global count by 0
             int pattern = 0;
             openCLSafeCall(clEnqueueWriteBuffer(qu, candidatebuffer, 1, 0, 1 * sizeof(pattern),&pattern, 0, NULL, NULL));
-            // execute face detector
-            openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, localThreads, args, -1, -1, options.c_str());
+
+            if(WGNumTotal>WGNumSampled)
+            {// small images and each pixel is processed
+                // setup global sizes to have linear array of workgroups with WGNum size
+                int     pixelstep = 1;
+                size_t  LS[3]={localThreads[0]/pixelstep,localThreads[1]/pixelstep,1};
+                globalThreads[0] = LS[0]*(WGNumTotal-WGNumSampled);
+                globalThreads[1] = LS[1];
+                globalThreads[2] = 1;
+                string options1 = options;
+                options1 += format(" -D PIXEL_STEP=%d",pixelstep);
+                options1 += format(" -D WGSTART=%d",WGNumSampled);
+                options1 += format(" -D LSx=%d",LS[0]);
+                options1 += format(" -D LSy=%d",LS[1]);
+                // execute face detector
+                openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, LS, args, -1, -1, options1.c_str());
+            }
+            if(WGNumSampled>0)
+            {// large images each 4th pixel is processed
+                // setup global sizes to have linear array of workgroups with WGNum size
+                int     pixelstep = 2;
+                size_t  LS[3]={localThreads[0]/pixelstep,localThreads[1]/pixelstep,1};
+                globalThreads[0] = LS[0]*WGNumSampled;
+                globalThreads[1] = LS[1];
+                globalThreads[2] = 1;
+                string options2 = options;
+                options2 += format(" -D PIXEL_STEP=%d",pixelstep);
+                options2 += format(" -D WGSTART=%d",0);
+                options2 += format(" -D LSx=%d",LS[0]);
+                options2 += format(" -D LSy=%d",LS[1]);
+                // execute face detector
+                openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, LS, args, -1, -1, options2.c_str());
+            }
             //read candidate buffer back and put it into host list
             openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
             assert(candidate[0]<outputsz);
diff --git a/modules/ocl/src/opencl/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl
index 980e85dd27cba7ac7264800dc1fcbf9b8d2746f5..d6e5fb9ba3e3e11556d3244d6a6c30f7b3695664 100644
--- a/modules/ocl/src/opencl/haarobjectdetect.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect.cl
@@ -126,13 +126,11 @@ __kernel void gpuRunHaarClassifierCascadePacked(
     )
 
 {
-// this version used information provided for each workgroup
-// no empty WG
     int     gid = (int)get_group_id(0);
     int     lid_x = (int)get_local_id(0);
     int     lid_y = (int)get_local_id(1);
     int     lid = lid_y*LSx+lid_x;
-    int4    WGInfo = pWGInfo[gid];
+    int4    WGInfo = pWGInfo[WGSTART+gid];
     int     GroupX = (WGInfo.y >> 16)&0xFFFF;
     int     GroupY = (WGInfo.y >> 0 )& 0xFFFF;
     int     Width  = (WGInfo.x >> 16)&0xFFFF;
@@ -140,8 +138,8 @@ __kernel void gpuRunHaarClassifierCascadePacked(
     int     ImgOffset = WGInfo.z;
     float   ScaleFactor = as_float(WGInfo.w);
 
-#define DATA_SIZE_X (LSx+WND_SIZE_X)
-#define DATA_SIZE_Y (LSy+WND_SIZE_Y)
+#define DATA_SIZE_X (PIXEL_STEP*LSx+WND_SIZE_X)
+#define DATA_SIZE_Y (PIXEL_STEP*LSy+WND_SIZE_Y)
 #define DATA_SIZE (DATA_SIZE_X*DATA_SIZE_Y)
 
     local int SumL[DATA_SIZE];
@@ -165,9 +163,11 @@ __kernel void gpuRunHaarClassifierCascadePacked(
     int4    info1 = p;
     int4    info2 = pq;
 
-    {
-        int     xl = lid_x;
-        int     yl = lid_y;
+    // calc processed ROI coordinate in local mem
+    int     xl = lid_x*PIXEL_STEP;
+    int     yl = lid_y*PIXEL_STEP;
+
+    {// calc variance_norm_factor for all stages
         int     OffsetLocal =          yl * DATA_SIZE_X +         xl;
         int     OffsetGlobal = (GroupY+yl)* pixelstep   + (GroupX+xl);
 
@@ -194,13 +194,13 @@ __kernel void gpuRunHaarClassifierCascadePacked(
 
     int result = (1.0f>0.0f);
     for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ )
-    {// iterate until candidate is exist
+    {// iterate until candidate is valid
         float   stage_sum = 0.0f;
         __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
             ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier));
+        int     lcl_off = (yl*DATA_SIZE_X)+(xl);
         int stagecount = stageinfo->count;
         float stagethreshold = stageinfo->threshold;
-        int     lcl_off = (lid_y*DATA_SIZE_X)+(lid_x);
         for(int nodeloop = 0; nodeloop < stagecount; nodecounter++,nodeloop++ )
         {
         // simple macro to extract shorts from int
@@ -212,7 +212,7 @@ __kernel void gpuRunHaarClassifierCascadePacked(
             int4    n1 = pN[1];
             int4    n2 = pN[2];
             float   nodethreshold  = as_float(n2.y) * variance_norm_factor;
-            // calc sum of intensity pixels according to node information
+            // calc sum of intensity pixels according to classifier node information
             float classsum =
                 (SumL[M0(n0.x)+lcl_off] - SumL[M1(n0.x)+lcl_off] - SumL[M0(n0.y)+lcl_off] + SumL[M1(n0.y)+lcl_off]) * as_float(n1.z) +
                 (SumL[M0(n0.z)+lcl_off] - SumL[M1(n0.z)+lcl_off] - SumL[M0(n0.w)+lcl_off] + SumL[M1(n0.w)+lcl_off]) * as_float(n1.w) +
@@ -228,8 +228,8 @@ __kernel void gpuRunHaarClassifierCascadePacked(
         int index = 1+atomic_inc((volatile global int*)candidate); //get index to write global data with face info
         if(index<OUTPUTSZ)
         {
-            int     x = GroupX+lid_x;
-            int     y = GroupY+lid_y;
+            int     x = GroupX+xl;
+            int     y = GroupY+yl;
             int4 candidate_result;
             candidate_result.x = convert_int_rtn(x*ScaleFactor);
             candidate_result.y = convert_int_rtn(y*ScaleFactor);