LBP features: GPU representation

a743eca0 · Marina Kolpakova · 55567582 · a743eca0 · a743eca0 · a743eca0
4 changed file
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1460,6 +1460,7 @@ private:
    GpuMat nodes_mat;
    GpuMat leaves_mat;
    GpuMat subsets_mat;
+    GpuMat features_mat;

    // current integral image
    GpuMat integral;

--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -99,10 +99,9 @@ cv::gpu::CascadeClassifier_GPU_LBP::~CascadeClassifier_GPU_LBP()

 void cv::gpu::CascadeClassifier_GPU_LBP::preallocateIntegralBuffer(cv::Size desired)
 {
-    integral.create(desired.width + 1, desired.height + 1, CV_32FC1);
+    integral.create(desired.width + 1, desired.height + 1, CV_32SC1);
 }

-
 bool cv::gpu::CascadeClassifier_GPU_LBP::empty() const
 {
    return stage_mat.empty();
@@ -132,6 +131,8 @@ bool cv::gpu::CascadeClassifier_GPU_LBP::load(const string& classifierAsXml)
 #define GPU_CC_WEAK_CLASSIFIERS     "weakClassifiers"
 #define GPU_CC_INTERNAL_NODES       "internalNodes"
 #define GPU_CC_LEAF_VALUES          "leafValues"
+#define GPU_CC_FEATURES             "features"
+#define GPU_CC_RECT                 "rect"

 bool CascadeClassifier_GPU_LBP::read(const FileNode &root)
 {
@@ -225,6 +226,22 @@ bool CascadeClassifier_GPU_LBP::read(const FileNode &root)
                cl_leaves.push_back((float)*iIt);
        }
    }
+    fn = root[GPU_CC_FEATURES];
+    if( fn.empty() )
+        return false;
+    std::vector<char> features;
+    features.reserve(fn.size() * 4);
+    FileNodeIterator f_it = fn.begin(), f_end = fn.end();
+    for (; f_it != f_end; ++f_it)
+    {
+        FileNode rect = fn[GPU_CC_RECT];
+        FileNodeIterator r_it = rect.begin();
+        features.push_back(saturate_cast<uchar>((int)*(r_it++)));
+        features.push_back(saturate_cast<uchar>((int)*(r_it++)));
+        features.push_back(saturate_cast<uchar>((int)*(r_it++)));
+        features.push_back(saturate_cast<uchar>((int)*(r_it++)));
+    }
+
    // copy data structures on gpu
    stage_mat = cv::gpu::GpuMat(1, (int)stages.size() * sizeof(Stage), CV_8UC1);
    stage_mat.upload(cv::Mat(1, stages.size() * sizeof(Stage), CV_8UC1, &(stages[0]) ));
@@ -241,6 +258,9 @@ bool CascadeClassifier_GPU_LBP::read(const FileNode &root)
    subsets_mat = cv::gpu::GpuMat(1, (int)subsets.size(), CV_32SC1);
    stage_mat.upload(cv::Mat(subsets));

+    features_mat = cv::gpu::GpuMat(1, (int)features.size(), CV_8UC1);
+    features_mat.upload(cv::Mat(features));
+
    return true;
 }

@@ -270,22 +290,25 @@ namespace cv { namespace gpu { namespace device
 {
    namespace lbp
    {
-        void cascadeClassify(const DevMem2Db stages, const DevMem2Di trees, const DevMem2Db nodes, const DevMem2Df leaves, const DevMem2Di subsets,
-            const DevMem2Db integral, int workWidth, int workHeight, int step, int subsetSize, DevMem2D_<int4> objects, int minNeighbors = 4, cudaStream_t stream = 0);
+        void cascadeClassify(const DevMem2Db stages, const DevMem2Di trees, const DevMem2Db nodes, const DevMem2Df leaves, const DevMem2Di subsets, const DevMem2Db features,
+            const DevMem2Di integral, int workWidth, int workHeight, int clWidth, int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, int minNeighbors = 4, cudaStream_t stream = 0);
    }
 }}}

 int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, GpuMat& scaledImageBuffer, GpuMat& objects, double scaleFactor, int minNeighbors /*, Size minSize=Size()*/)
 {
    CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );
-    CV_Assert(empty());
+    CV_Assert(!empty());

    const int defaultObjSearchNum = 100;

-    if( !objects.empty() && objects.depth() == CV_32S)
-        objects.reshape(4, 1);
-    else
-        objects.create(1 , defaultObjSearchNum, CV_32SC4);
+    // if( !objects.empty() && objects.depth() == CV_32S)
+    //     objects.reshape(4, 1);
+    // else
+    //     objects.create(1 , defaultObjSearchNum, CV_32SC4);
+
+    // temp solution
+    objects.create(image.rows, image.cols, CV_32SC4);

    scaledImageBuffer.create(image.size(), image.type());

@@ -302,14 +325,13 @@ int cv::gpu::CascadeClassifier_GPU_LBP::detectMultiScale(const GpuMat& image, Gp
        // TODO: min max object sizes cheching
        cv::gpu::resize(image, scaledImageBuffer, scaledImageSize, 0, 0, INTER_NEAREST);
        //prepare image for evaluation
-        integral.create(cv::Size(scaledImageSize.width + 1, scaledImageSize.height + 1), CV_32FC1);
+        integral.create(cv::Size(scaledImageSize.width + 1, scaledImageSize.height + 1), CV_32SC1);
        cv::gpu::integral(scaledImageBuffer, integral);

        int step = (factor <= 2.) + 1;
-        int stripCount = 1, stripSize = processingRectSize.height;

-        cv::gpu::device::lbp::cascadeClassify(stage_mat, trees_mat, nodes_mat, leaves_mat, subsets_mat,
-         integral, processingRectSize.width, processingRectSize.height, step, subsetSize, objects, minNeighbors);
+        cv::gpu::device::lbp::cascadeClassify(stage_mat, trees_mat, nodes_mat, leaves_mat, subsets_mat, features_mat,
+         integral, processingRectSize.width, processingRectSize.height, windowSize.width, windowSize.height, scaleFactor, step, subsetSize, objects, minNeighbors);
    }
    // TODO: reject levels


--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@@ -46,13 +46,14 @@ namespace cv { namespace gpu { namespace device
 {
    namespace lbp
    {
-        __global__ void lbp_classify(const DevMem2D_< ::cv::gpu::device::Stage> stages, const DevMem2Di trees, const DevMem2Db nodes, const DevMem2Df leaves, const DevMem2Di subsets,
-            const DevMem2Db integral, float step, int subsetSize, DevMem2D_<int4> objects)
+        __global__ void lbp_classify(const DevMem2D_< ::cv::gpu::device::Stage> stages, const DevMem2Di trees, const DevMem2D_< ::cv::gpu::device::ClNode> nodes,
+            const DevMem2Df leaves, const DevMem2Di subsets,
+            const DevMem2D_<uchar4> features, const DevMem2Di integral, float step, int subsetSize, DevMem2D_<int4> objects, float scale, int clWidth, int clHeight)
        {
-            unsigned int x = threadIdx.x;
-            unsigned int y = blockIdx.x;
+            unsigned int x = threadIdx.x * step;
+            unsigned int y = blockIdx.x  * step;
            int nodeOfs = 0, leafOfs = 0;
-            ::cv::gpu::device::Feature feature;
+            ::cv::gpu::device::Feature evaluator;

            for (int s = 0; s < stages.cols; s++ )
            {
@@ -61,7 +62,9 @@ namespace cv { namespace gpu { namespace device
                for (int w = 0; w < stage.ntrees; w++)
                {
                    ::cv::gpu::device::ClNode node = nodes(0, nodeOfs);
-                    char c = feature();// TODO: inmplement it
+                    uchar4 feature = features(0, node.featureIdx);
+
+                    uchar c = evaluator(y, x, feature, integral);
                    const int subsetIdx = (nodeOfs * subsetSize);
                    int idx = subsetIdx + ((c >> 5) & ( 1 << (c & 31)) ? leafOfs : leafOfs + 1);
                    sum += leaves(0, subsets(0, idx) );
@@ -70,21 +73,27 @@ namespace cv { namespace gpu { namespace device
                }

                if (sum < stage.threshold)
-                    return; // nothing matched
-                return;//mathed
+                    return;
            }
-
+            int4 rect;
+            rect.x = roundf(x * scale);
+            rect.y = roundf(y * scale);
+            rect.z = roundf(clWidth * scale);
+            rect.w = roundf(clHeight * scale);
+            objects(blockIdx.x, threadIdx.x) = rect;
        }

-        void cascadeClassify(const DevMem2Db bstages, const DevMem2Di trees, const DevMem2Db nodes, const DevMem2Df leaves, const DevMem2Di subsets,
-            const DevMem2Db integral, int workWidth, int workHeight, int step, int subsetSize, DevMem2D_<int4> objects, int minNeighbors, cudaStream_t stream)
+        void cascadeClassify(const DevMem2Db bstages, const DevMem2Di trees, const DevMem2Db bnodes, const DevMem2Df leaves, const DevMem2Di subsets, const DevMem2Db bfeatures,
+            const DevMem2Di integral, int workWidth, int workHeight, int clWidth, int clHeight, float scale, int step, int subsetSize, DevMem2D_<int4> objects, int minNeighbors, cudaStream_t stream)
        {
            printf("CascadeClassify");
            int blocks = ceilf(workHeight / (float)step);
            int threads = ceilf(workWidth / (float)step);
            DevMem2D_< ::cv::gpu::device::Stage> stages = DevMem2D_< ::cv::gpu::device::Stage>(bstages);
+            DevMem2D_<uchar4> features = (DevMem2D_<uchar4>)bfeatures;
+            DevMem2D_< ::cv::gpu::device::ClNode> nodes = DevMem2D_< ::cv::gpu::device::ClNode>(bnodes);

-            lbp_classify<<<blocks, threads>>>(stages, trees, nodes, leaves, subsets, integral, step, subsetSize, objects);
+            lbp_classify<<<blocks, threads>>>(stages, trees, nodes, leaves, subsets, features, integral, step, subsetSize, objects, scale, clWidth, clHeight);
        }
    }
 }}}
\ No newline at end of file
--- a/modules/gpu/src/opencv2/gpu/device/lbp.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/lbp.hpp
@@ -82,9 +82,48 @@ namespace cv { namespace gpu { namespace device {
    {
        __device__ __forceinline__ Feature(const Feature& other) {(void)other;}
        __device__ __forceinline__ Feature() {}
-        __device__ __forceinline__ char operator() ()//(volatile int* ptr, int offset)
+
+        //feature as uchar x, y - left top, z,w - right bottom
+        __device__ __forceinline__ uchar operator() (unsigned int y, unsigned int x, uchar4 feature, const DevMem2Di integral) const
        {
-            return char(0);
+            int x_off = 2 * feature.z;
+            int y_off = 2 * feature.w;
+
+            // load feature key points
+            int anchors[16];
+            anchors[0]  = integral(y + feature.y, x + feature.x);
+            anchors[1]  = integral(y + feature.y, x + feature.z);
+            anchors[2]  = integral(y + feature.y, x + x_off + feature.x);
+            anchors[3]  = integral(y + feature.y, x + x_off + feature.z);
+
+            anchors[4]  = integral(y + feature.w, x + feature.x);
+            anchors[5]  = integral(y + feature.w, x + feature.z);
+            anchors[6]  = integral(y + feature.w, x + x_off + feature.x);
+            anchors[7]  = integral(y + feature.w, x + x_off + feature.z);
+
+            anchors[8]  = integral(y + y_off + feature.y, x + feature.x);
+            anchors[9]  = integral(y + y_off + feature.y, x + feature.z);
+            anchors[10] = integral(y + y_off + feature.y, x + x_off + feature.x);
+            anchors[11] = integral(y + y_off + feature.y, x + x_off + feature.z);
+
+            anchors[12] = integral(y + y_off + feature.w, x + feature.x);
+            anchors[13] = integral(y + y_off + feature.w, x + feature.z);
+            anchors[14] = integral(y + y_off + feature.w, x + x_off + feature.x);
+            anchors[15] = integral(y + y_off + feature.w, x + x_off + feature.z);
+
+            // calculate feature
+            int sum = anchors[5] - anchors[6] - anchors[9] + anchors[10];
+
+            uchar response = (( (anchors[ 0] - anchors[ 1] - anchors[ 4] + anchors[ 5]) >= sum )? 128 : 0)
+                            |(( (anchors[ 1] - anchors[ 2] - anchors[ 5] + anchors[ 6]) >= sum )? 64  : 0)
+                            |(( (anchors[ 2] - anchors[ 3] - anchors[ 6] + anchors[ 7]) >= sum )? 32  : 0)
+                            |(( (anchors[ 6] - anchors[ 7] - anchors[10] + anchors[11]) >= sum )? 16  : 0)
+                            |(( (anchors[10] - anchors[11] - anchors[14] + anchors[15]) >= sum )? 8   : 0)
+                            |(( (anchors[ 9] - anchors[10] - anchors[13] + anchors[14]) >= sum )? 4   : 0)
+                            |(( (anchors[ 8] - anchors[ 9] - anchors[12] + anchors[13]) >= sum )? 2   : 0)
+                            |(( (anchors[ 4] - anchors[ 5] - anchors[ 8] + anchors[ 9]) >= sum )? 1   : 0);
+
+            return response;
        }

    };