diff --git a/modules/ocl/src/haar.cpp b/modules/ocl/src/haar.cpp index 25d376a4e1bfa3a9df10c8da4239352f117a31d1..e334ad913bf8e25e60055e2ff72e0bc5e441e21c 100644 --- a/modules/ocl/src/haar.cpp +++ b/modules/ocl/src/haar.cpp @@ -866,16 +866,17 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS if(gcascade->is_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE)) { - //setup local group size - localThreads[0] = 8; - localThreads[1] = 16; + //setup local group size for "pixel step" = 1 + localThreads[0] = 16; + localThreads[1] = 32; localThreads[2] = 1; - //init maximal number of workgroups + //calc maximal number of workgroups int WGNumX = 1+(sizev[0].width /(localThreads[0])); int WGNumY = 1+(sizev[0].height/(localThreads[1])); int WGNumZ = loopcount; - int WGNum = 0; //accurate number of non -empty workgroups + int WGNumTotal = 0; //accurate number of non-empty workgroups + int WGNumSampled = 0; //accurate number of workgroups processed only 1/4 part of all pixels. it is made for large images with scale <= 2 oclMat oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U); { cl_int4* pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE, 0, oclWGInfo.step, 0,0,0,&status); @@ -895,12 +896,16 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS if(gx>=(Width-cascade->orig_window_size.width)) continue; // no data to process + if(scaleinfo[z].factor<=2) + { + WGNumSampled++; + } // save no-empty workgroup info into array - pWGInfo[WGNum].s[0] = scaleinfo[z].width_height; - pWGInfo[WGNum].s[1] = (gx << 16) | gy; - pWGInfo[WGNum].s[2] = scaleinfo[z].imgoff; - memcpy(&(pWGInfo[WGNum].s[3]),&(scaleinfo[z].factor),sizeof(float)); - WGNum++; + pWGInfo[WGNumTotal].s[0] = scaleinfo[z].width_height; + pWGInfo[WGNumTotal].s[1] = (gx << 16) | gy; + pWGInfo[WGNumTotal].s[2] = scaleinfo[z].imgoff; + memcpy(&(pWGInfo[WGNumTotal].s[3]),&(scaleinfo[z].factor),sizeof(float)); + WGNumTotal++; } } } @@ -908,13 +913,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS pWGInfo = NULL; } - // setup global sizes to have linear array of workgroups with WGNum size - globalThreads[0] = localThreads[0]*WGNum; - globalThreads[1] = localThreads[1]; - globalThreads[2] = 1; - #define NODE_SIZE 12 - // pack node info to have less memory loads + // pack node info to have less memory loads on the device side oclMat oclNodesPK(1,sizeof(cl_int) * NODE_SIZE * nodenum,CV_8U); { cl_int status; @@ -963,8 +963,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS options += format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width); options += format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height); options += format(" -D STUMP_BASED=%d",gcascade->is_stump_based); - options += format(" -D LSx=%d",localThreads[0]); - options += format(" -D LSy=%d",localThreads[1]); options += format(" -D SPLITNODE=%d",splitnode); options += format(" -D SPLITSTAGE=%d",splitstage); options += format(" -D OUTPUTSZ=%d",outputsz); @@ -972,8 +970,39 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS // init candiate global count by 0 int pattern = 0; openCLSafeCall(clEnqueueWriteBuffer(qu, candidatebuffer, 1, 0, 1 * sizeof(pattern),&pattern, 0, NULL, NULL)); - // execute face detector - openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, localThreads, args, -1, -1, options.c_str()); + + if(WGNumTotal>WGNumSampled) + {// small images and each pixel is processed + // setup global sizes to have linear array of workgroups with WGNum size + int pixelstep = 1; + size_t LS[3]={localThreads[0]/pixelstep,localThreads[1]/pixelstep,1}; + globalThreads[0] = LS[0]*(WGNumTotal-WGNumSampled); + globalThreads[1] = LS[1]; + globalThreads[2] = 1; + string options1 = options; + options1 += format(" -D PIXEL_STEP=%d",pixelstep); + options1 += format(" -D WGSTART=%d",WGNumSampled); + options1 += format(" -D LSx=%d",LS[0]); + options1 += format(" -D LSy=%d",LS[1]); + // execute face detector + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, LS, args, -1, -1, options1.c_str()); + } + if(WGNumSampled>0) + {// large images each 4th pixel is processed + // setup global sizes to have linear array of workgroups with WGNum size + int pixelstep = 2; + size_t LS[3]={localThreads[0]/pixelstep,localThreads[1]/pixelstep,1}; + globalThreads[0] = LS[0]*WGNumSampled; + globalThreads[1] = LS[1]; + globalThreads[2] = 1; + string options2 = options; + options2 += format(" -D PIXEL_STEP=%d",pixelstep); + options2 += format(" -D WGSTART=%d",0); + options2 += format(" -D LSx=%d",LS[0]); + options2 += format(" -D LSy=%d",LS[1]); + // execute face detector + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, LS, args, -1, -1, options2.c_str()); + } //read candidate buffer back and put it into host list openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz ); assert(candidate[0]> 16)&0xFFFF; int GroupY = (WGInfo.y >> 0 )& 0xFFFF; int Width = (WGInfo.x >> 16)&0xFFFF; @@ -140,8 +138,8 @@ __kernel void gpuRunHaarClassifierCascadePacked( int ImgOffset = WGInfo.z; float ScaleFactor = as_float(WGInfo.w); -#define DATA_SIZE_X (LSx+WND_SIZE_X) -#define DATA_SIZE_Y (LSy+WND_SIZE_Y) +#define DATA_SIZE_X (PIXEL_STEP*LSx+WND_SIZE_X) +#define DATA_SIZE_Y (PIXEL_STEP*LSy+WND_SIZE_Y) #define DATA_SIZE (DATA_SIZE_X*DATA_SIZE_Y) local int SumL[DATA_SIZE]; @@ -165,9 +163,11 @@ __kernel void gpuRunHaarClassifierCascadePacked( int4 info1 = p; int4 info2 = pq; - { - int xl = lid_x; - int yl = lid_y; + // calc processed ROI coordinate in local mem + int xl = lid_x*PIXEL_STEP; + int yl = lid_y*PIXEL_STEP; + + {// calc variance_norm_factor for all stages int OffsetLocal = yl * DATA_SIZE_X + xl; int OffsetGlobal = (GroupY+yl)* pixelstep + (GroupX+xl); @@ -194,13 +194,13 @@ __kernel void gpuRunHaarClassifierCascadePacked( int result = (1.0f>0.0f); for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ ) - {// iterate until candidate is exist + {// iterate until candidate is valid float stage_sum = 0.0f; __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*) ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier)); + int lcl_off = (yl*DATA_SIZE_X)+(xl); int stagecount = stageinfo->count; float stagethreshold = stageinfo->threshold; - int lcl_off = (lid_y*DATA_SIZE_X)+(lid_x); for(int nodeloop = 0; nodeloop < stagecount; nodecounter++,nodeloop++ ) { // simple macro to extract shorts from int @@ -212,7 +212,7 @@ __kernel void gpuRunHaarClassifierCascadePacked( int4 n1 = pN[1]; int4 n2 = pN[2]; float nodethreshold = as_float(n2.y) * variance_norm_factor; - // calc sum of intensity pixels according to node information + // calc sum of intensity pixels according to classifier node information float classsum = (SumL[M0(n0.x)+lcl_off] - SumL[M1(n0.x)+lcl_off] - SumL[M0(n0.y)+lcl_off] + SumL[M1(n0.y)+lcl_off]) * as_float(n1.z) + (SumL[M0(n0.z)+lcl_off] - SumL[M1(n0.z)+lcl_off] - SumL[M0(n0.w)+lcl_off] + SumL[M1(n0.w)+lcl_off]) * as_float(n1.w) + @@ -228,8 +228,8 @@ __kernel void gpuRunHaarClassifierCascadePacked( int index = 1+atomic_inc((volatile global int*)candidate); //get index to write global data with face info if(index