Merge pull request #1731 from perping:2.4_haar

2767be9a · Andrey Pavlenko · OpenCV Buildbot · 9d263db8 · af77111c · 2767be9a
3 changed files
--- a/modules/ocl/src/haar.cpp
+++ b/modules/ocl/src/haar.cpp
@@ -1059,11 +1059,11 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
        {
            sz = sizev[i];
            factor = scalev[i];
-            int ystep = cvRound(std::max(2., factor));
-            int equRect_x = (int)(factor * gcascade->p0 + 0.5);
-            int equRect_y = (int)(factor * gcascade->p1 + 0.5);
-            int equRect_w = (int)(factor * gcascade->p3 + 0.5);
-            int equRect_h = (int)(factor * gcascade->p2 + 0.5);
+            double ystep = std::max(2., factor);
+            int equRect_x = cvRound(factor * gcascade->p0);
+            int equRect_y = cvRound(factor * gcascade->p1);
+            int equRect_w = cvRound(factor * gcascade->p3);
+            int equRect_h = cvRound(factor * gcascade->p2);
            p[i].s[0] = equRect_x;
            p[i].s[1] = equRect_y;
            p[i].s[2] = equRect_x + equRect_w;
@@ -1676,9 +1676,9 @@ void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs(
        {
            sz = sizev[i];
            factor = scalev[i];
-            int ystep = cvRound(std::max(2., factor));
-            int width = (cols - 1 - sz.width  + ystep - 1) / ystep;
-            int height = (rows - 1 - sz.height + ystep - 1) / ystep;
+            double ystep = cv::max(2.,factor);
+            int width = cvRound((cols - 1 - sz.width  + ystep - 1) / ystep);
+            int height = cvRound((rows - 1 - sz.height + ystep - 1) / ystep);
            int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
            int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;


--- a/modules/ocl/src/opencl/haarobjectdetect.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect.cl
@@ -11,6 +11,7 @@
 //    Jia Haipeng, jiahaipeng95@gmail.com
 //    Nathan, liujun@multicorewareinc.com
 //    Peng Xiao, pengxiao@outlook.com
+//    Erping Pang, erping@multicorewareinc.com
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -320,7 +321,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
                int glb_x = grpoffx + (lcl_x<<2);
                int glb_y = grpoffy + lcl_y;

-                int glb_off = mad24(min(glb_y, height - 1),pixelstep,glb_x);
+                int glb_off = mad24(min(glb_y, height + WINDOWSIZE - 1),pixelstep,glb_x);
                int4 data = *(__global int4*)&sum[glb_off];
                int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);

@@ -420,12 +421,23 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa

                result = (stage_sum >= stagethreshold);
            }
-
-            if(result && (x < width) && (y < height))
+            if(factor < 2)
+            {
+                if(result && lclidx %2 ==0 && lclidy %2 ==0 )
+                {
+                    int queueindex = atomic_inc(lclcount);
+                    lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
+                    lcloutindex[(queueindex<<1)+1] = as_int((float)variance_norm_factor);
+                }
+            }
+            else
            {
-                int queueindex = atomic_inc(lclcount);
-                lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
-                lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
+                if(result)
+                {
+                    int queueindex = atomic_inc(lclcount);
+                    lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
+                    lcloutindex[(queueindex<<1)+1] = as_int((float)variance_norm_factor);
+                }
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            int queuecount  = lclcount[0];
@@ -548,11 +560,27 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
                int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
                temp = glboutindex[0];
                int4 candidate_result;
-                candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
-                candidate_result.x = convert_int_rtn(x*factor);
-                candidate_result.y = convert_int_rtn(y*factor);
+                candidate_result.zw = (int2)convert_int_rte(factor*20.f);
+                candidate_result.x = convert_int_rte(x*factor);
+                candidate_result.y = convert_int_rte(y*factor);
                atomic_inc(glboutindex);
-                candidate[outputoff+temp+lcl_id] = candidate_result;
+
+                int i = outputoff+temp+lcl_id;
+                if(candidate[i].z == 0)
+                {
+                    candidate[i] = candidate_result;
+                }
+                else
+                {
+                    for(i=i+1;;i++)
+                    {
+                        if(candidate[i].z == 0)
+                        {
+                            candidate[i] = candidate_result;
+                            break;
+                        }
+                    }
+                }
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)

--- a/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
@@ -18,6 +18,7 @@
 //    Wu Xinglong, wxl370@126.com
 //    Sen Liu, swjtuls1987@126.com
 //    Peng Xiao, pengxiao@outlook.com
+//    Erping Pang, erping@multicorewareinc.com
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -141,7 +142,7 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
        int totalgrp = scaleinfo1.y & 0xffff;
        float factor = as_float(scaleinfo1.w);
        float correction_t = correction[scalei];
-        int ystep = (int)(max(2.0f, factor) + 0.5f);
+        float ystep = max(2.0f, factor);

        for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
        {
@@ -150,8 +151,8 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
            int grpidx = grploop - mul24(grpidy, grpnumperline);
            int ix = mad24(grpidx, grpszx, lclidx);
            int iy = mad24(grpidy, grpszy, lclidy);
-            int x = ix * ystep;
-            int y = iy * ystep;
+            int x = round(ix * ystep);
+            int y = round(iy * ystep);
            lcloutindex[lcl_id] = 0;
            lclcount[0] = 0;
            int nodecounter;
@@ -242,7 +243,7 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(

                barrier(CLK_LOCAL_MEM_FENCE);

-                if (result && (ix < width) && (iy < height))
+                if (result)
                {
                    int queueindex = atomic_inc(lclcount);
                    lcloutindex[queueindex] = (y << 16) | x;
@@ -257,10 +258,26 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
                    int y = (temp & (int)0xffff0000) >> 16;
                    temp = atomic_inc(glboutindex);
                    int4 candidate_result;
-                    candidate_result.zw = (int2)convert_int_rtn(factor * 20.f);
+                    candidate_result.zw = (int2)convert_int_rte(factor * 20.f);
                    candidate_result.x = x;
                    candidate_result.y = y;
-                    candidate[outputoff + temp + lcl_id] = candidate_result;
+
+                    int i = outputoff+temp+lcl_id;
+                    if(candidate[i].z == 0)
+                    {
+                        candidate[i] = candidate_result;
+                    }
+                    else
+                    {
+                        for(i=i+1;;i++)
+                        {
+                            if(candidate[i].z == 0)
+                            {
+                                candidate[i] = candidate_result;
+                                break;
+                            }
+                        }
+                    }
                }

                barrier(CLK_LOCAL_MEM_FENCE);
@@ -283,7 +300,7 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH
        tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f);
    }

-    t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]);
+    t1.weight[0] = -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]);
    counter += nodenum;
 #pragma unroll