Commit 15f7918e authored by P. Druzhkov

problem with the supported data matrix types is fixed

minor fixes in CvGBTrees, its test and sample
Parent 6a03be26
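The fix concerns which matrix types and orientations CvGBTrees::train() accepts. A minimal usage sketch of what the new code below supports (illustrative values, not part of the commit): integer responses in a column vector plus an 8-bit sample mask.

    // Illustrative sketch only. After this commit, train() accepts responses
    // as a row or column vector of CV_32FC1 or CV_32SC1, and _sample_idx as
    // a 32sC1 index list or an 8uC1/8sC1 mask (see the switches below).
    CvMat* samples   = cvCreateMat( 100, 5, CV_32FC1 );  // 100 samples, 5 variables
    CvMat* responses = cvCreateMat( 100, 1, CV_32SC1 );  // integer column vector
    CvMat* mask      = cvCreateMat( 1, 100, CV_8UC1 );   // active-sample mask
    for( int i = 0; i < 100; ++i )
    {
        for( int j = 0; j < 5; ++j )
            CV_MAT_ELEM( *samples, float, i, j ) = (float)((i*j) % 7);
        CV_MAT_ELEM( *responses, int, i, 0 ) = i % 3;
        mask->data.ptr[i] = (uchar)(i % 2);   // train on every other sample
    }
    CvGBTrees gbt;
    gbt.train( samples, CV_ROW_SAMPLE, responses, 0, mask );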
@@ -1832,6 +1832,7 @@ protected:
// RESULT
*/
virtual void read_params( CvFileStorage* fs, CvFileNode* fnode );
int get_len(const CvMat* mat) const;
CvDTreeTrainData* data;
......
@@ -20,23 +20,18 @@ string ToString(int i)
return tmp.str();
}
//===========================================================================
int get_len(const CvMat* mat)
{
return (mat->cols > mat->rows) ? mat->cols : mat->rows;
}
//===========================================================================
//----------------------------- CvGBTreesParams -----------------------------
//===========================================================================
CvGBTreesParams::CvGBTreesParams()
: CvDTreeParams( 3, 10, 0, true, 10, 0, false, false, 0 )
: CvDTreeParams( 3, 10, 0, false, 10, 0, false, false, 0 )
{
weak_count = 50;
weak_count = 200;
loss_function_type = CvGBTrees::SQUARED_LOSS;
subsample_portion = 1.0f;
shrinkage = 1.0f;
subsample_portion = 0.8f;
shrinkage = 0.01f;
}
//===========================================================================
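Note the change of defaults in this hunk, from a few unshrunk trees on the full sample to the usual gradient-boosting regime: many trees, strong shrinkage, stochastic subsampling. The same defaults written out through the explicit constructor shown in the next hunk (values copied from this one):

    // Equivalent to the new defaults, spelled out explicitly.
    CvGBTreesParams params( CvGBTrees::SQUARED_LOSS,
                            /*weak_count=*/        200,
                            /*shrinkage=*/         0.01f,
                            /*subsample_portion=*/ 0.8f,
                            /*max_depth=*/         3,
                            /*use_surrogates=*/    false );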
@@ -44,7 +39,7 @@ CvGBTreesParams::CvGBTreesParams()
CvGBTreesParams::CvGBTreesParams( int _loss_function_type, int _weak_count,
float _shrinkage, float _subsample_portion,
int _max_depth, bool _use_surrogates )
: CvDTreeParams( 3, 10, 0, true, 10, 0, false, false, 0 )
: CvDTreeParams( 3, 10, 0, false, 10, 0, false, false, 0 )
{
loss_function_type = _loss_function_type;
weak_count = _weak_count;
@@ -75,18 +70,25 @@ CvGBTrees::CvGBTrees()
//===========================================================================
int CvGBTrees::get_len(const CvMat* mat) const
{
return (mat->cols > mat->rows) ? mat->cols : mat->rows;
}
//===========================================================================
void CvGBTrees::clear()
{
if( weak )
{
CvSeqReader reader;
CvSlice slice = CV_WHOLE_SEQ;
int weak_count = cvSliceLength( slice, weak[class_count-1] );
CvDTree* tree;
//data->shared = false;
for (int i=0; i<class_count; ++i)
{
int weak_count = cvSliceLength( slice, weak[i] );
if ((weak[i]) && (weak_count))
{
cvStartReadSeq( weak[i], &reader );
@@ -192,9 +194,19 @@ CvGBTrees::train( const CvMat* _train_data, int _tflag,
bool is_regression = problem_type();
clear();
int len = get_len(_responses);
/*
n - count of samples
m - count of variables
*/
int n = _train_data->rows;
int m = _train_data->cols;
if (_tflag != CV_ROW_SAMPLE)
{
int tmp;
CV_SWAP(n,m,tmp);
}
CvMat* new_responses = cvCreateMat( len, 1, CV_32F);
CvMat* new_responses = cvCreateMat( n, 1, CV_32F);
cvZero(new_responses);
data = new CvDTreeTrainData( _train_data, _tflag, new_responses, _var_idx,
@@ -204,88 +216,118 @@ CvGBTrees::train( const CvMat* _train_data, int _tflag,
missing = cvCreateMat(_missing_mask->rows, _missing_mask->cols,
_missing_mask->type);
cvCopy( _missing_mask, missing);
}
}
orig_response = cvCreateMat( _responses->rows, _responses->cols,
_responses->type );
cvCopy( _responses, orig_response);
orig_response->step = CV_ELEM_SIZE(_responses->type);
orig_response = cvCreateMat( 1, n, CV_32F );
int step = (_responses->cols > _responses->rows) ? 1 : _responses->step / CV_ELEM_SIZE(_responses->type);
switch (CV_MAT_TYPE(_responses->type))
{
case CV_32FC1:
{
for (int i=0; i<n; ++i)
orig_response->data.fl[i] = _responses->data.fl[i*step];
}; break;
case CV_32SC1:
{
for (int i=0; i<n; ++i)
orig_response->data.fl[i] = (float) _responses->data.i[i*step];
}; break;
default:
CV_Error(CV_StsUnmatchedFormats, "Response should be a 32fC1 or 32sC1 vector.");
}
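The step computation above is the heart of the type fix: for a row vector consecutive elements are adjacent (step 1), while for a column vector they sit one row stride apart (step divided by element size). The same normalization as a standalone helper, with a hypothetical name, for illustration only:

    // Hypothetical helper mirroring the normalization above: copy a row or
    // column response vector (CV_32FC1 or CV_32SC1) into a flat float row.
    static CvMat* flatten_responses( const CvMat* r, int n )
    {
        CvMat* out = cvCreateMat( 1, n, CV_32F );
        int step = (r->cols > r->rows) ? 1 : r->step / CV_ELEM_SIZE(r->type);
        for( int i = 0; i < n; ++i )
            out->data.fl[i] = (CV_MAT_TYPE(r->type) == CV_32FC1)
                              ? r->data.fl[i*step]
                              : (float)r->data.i[i*step];
        return out;
    }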
/*
if (!is_regression)
{
int max_label = -1;
for (int i=0; i<get_len(orig_response); ++i)
if (max_label < orig_response->data.fl[i])
max_label = int(orig_response->data.fl[i]);
max_label++;
class_labels = cvCreateMat(1, max_label, CV_32S);
cvZero(class_labels);
for (int i=0; i<get_len(orig_response); ++i)
class_labels->data.i[int(orig_response->data.fl[i])] = 1;
class_count = 0;
for (int i=0; i<max_label; ++i)
if (class_labels->data.i[i])
class_labels->data.i[i] = ++class_count;
unsigned char * mask = new unsigned char[n];
memset(mask, 0, n);
// compute the count of different output classes
for (int i=0; i<n; ++i)
if (!mask[i])
{
class_count++;
for (int j=i; j<n; ++j)
if (int(orig_response->data.fl[j]) == int(orig_response->data.fl[i]))
mask[j] = 1;
}
delete[] mask;
class_labels = cvCreateMat(1, class_count, CV_32S);
class_labels->data.i[0] = int(orig_response->data.fl[0]);
int j = 1;
for (int i=1; i<n; ++i)
{
int k = 0;
while ((int(orig_response->data.fl[i]) - class_labels->data.i[k]) && (k<j))
k++;
if (k == j)
{
class_labels->data.i[k] = int(orig_response->data.fl[i]);
j++;
}
}
}
*/
if (!is_regression)
{
class_count = 0;
unsigned char * mask = new unsigned char[get_len(orig_response)];
for (int i=0; i<get_len(orig_response); ++i)
mask[i] = 0;
for (int i=0; i<get_len(orig_response); ++i)
if (!mask[i])
{
class_count++;
for (int j=i; j<get_len(orig_response); ++j)
if (int(orig_response->data.fl[j]) == int(orig_response->data.fl[i]))
mask[j] = 1;
}
delete[] mask;
class_labels = cvCreateMat(1, class_count, CV_32S);
class_labels->data.i[0] = int(orig_response->data.fl[0]);
int j = 1;
for (int i=1; i<get_len(orig_response); ++i)
{
int k = 0;
while ((int(orig_response->data.fl[i]) - class_labels->data.i[k]) && (k<j))
k++;
if (k == j)
{
class_labels->data.i[k] = int(orig_response->data.fl[i]);
j++;
}
}
}
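The mask loop above counts and collects the distinct class labels in O(n^2) while preserving first-appearance order. For clarity, an equivalent count using std::set (illustrative, not in the commit; note that a set would sort the labels rather than keep encounter order):

    #include <set>
    // Illustrative equivalent of the counting part of the loop above.
    static int count_classes( const float* resp, int n )
    {
        std::set<int> labels;
        for( int i = 0; i < n; ++i )
            labels.insert( (int)resp[i] );
        return (int)labels.size();   // == class_count
    }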
// inside the gbt learning process only regression decision trees are built
data->is_classifier = false;
// preprocessing sample indices
if (_sample_idx)
{
sample_idx = cvCreateMat( _sample_idx->rows, _sample_idx->cols,
_sample_idx->type );
cvCopy( _sample_idx, sample_idx);
icvSortFloat(sample_idx->data.fl, get_len(sample_idx), 0);
int sample_idx_len = get_len(_sample_idx);
switch (CV_ELEM_SIZE(_sample_idx->type))
{
case CV_32SC1:
{
sample_idx = cvCreateMat( 1, sample_idx_len, CV_32S );
for (int i=0; i<sample_idx_len; ++i)
sample_idx->data.i[i] = _sample_idx->data.i[i];
} break;
case CV_8S:
case CV_8U:
{
int active_samples_count = 0;
for (int i=0; i<sample_idx_len; ++i)
active_samples_count += int( _sample_idx->data.ptr[i] );
sample_idx = cvCreateMat( 1, active_samples_count, CV_32S );
active_samples_count = 0;
for (int i=0; i<sample_idx_len; ++i)
if (int( _sample_idx->data.ptr[i] ))
sample_idx->data.i[active_samples_count++] = i;
} break;
default: CV_Error(CV_StsUnmatchedFormats, "_sample_idx should be a 32sC1, 8sC1 or 8uC1 vector.");
}
icvSortFloat(sample_idx->data.fl, sample_idx_len, 0);
}
else
{
int n = (_tflag == CV_ROW_SAMPLE) ? _train_data->rows
: _train_data->cols;
sample_idx = cvCreateMat( 1, n, CV_32S );
for (int i=0; i<n; ++i)
sample_idx->data.i[i] = i;
}
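For the 8-bit mask branch above, the new code makes two passes over the mask: one to count the active samples, one to collect their indices. A compact equivalent with std::vector, reusing the names from this hunk (illustrative; assumes at least one active sample):

    #include <vector>
    #include <algorithm>
    // Illustrative equivalent of the 8sC1/8uC1 branch: turn a 0/1 mask into
    // a CV_32S list of active sample indices in a single pass.
    std::vector<int> active;
    for( int i = 0; i < sample_idx_len; ++i )
        if( _sample_idx->data.ptr[i] )
            active.push_back( i );
    sample_idx = cvCreateMat( 1, (int)active.size(), CV_32S );
    std::copy( active.begin(), active.end(), sample_idx->data.i );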
sum_response = cvCreateMat(class_count, len, CV_32F);
sum_response_tmp = cvCreateMat(class_count, len, CV_32F);
sum_response = cvCreateMat(class_count, n, CV_32F);
sum_response_tmp = cvCreateMat(class_count, n, CV_32F);
cvZero(sum_response);
delta = 0.0f;
/*
in the case of a regression problem the initial guess (the zero term
in the sum) is set to the mean of all the training responses, that is
the best constant model
*/
if (is_regression) base_value = find_optimal_value(sample_idx);
/*
in the case of a classification problem the initial guess (the zero term
in the sum) is set to zero for all the trees sequences
*/
else base_value = 0.0f;
/*
current prediction on all training samples is set to be
equal to the base_value
*/
cvSet( sum_response, cvScalar(base_value) );
weak = new pCvSeq[class_count];
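A side note on base_value: under squared loss the "best constant model" referred to in the comments above is the mean of the training responses, since setting the derivative of sum_i (y_i - c)^2 with respect to c to zero gives c = mean(y). As a tiny standalone check (illustrative only):

    // Illustrative: the optimal constant under squared loss is the mean.
    static float best_constant( const float* y, int n )
    {
        double s = 0.0;
        for( int i = 0; i < n; ++i )
            s += y[i];
        return (float)(s / n);
    }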
@@ -299,10 +341,8 @@ CvGBTrees::train( const CvMat* _train_data, int _tflag,
// subsample params and data
rng = &cv::theRNG();
int samples_count = get_len(sample_idx);
int samples_count = get_len(sample_idx);
//if ( params.subsample_portion > 1) params.subsample_portion = 1;
//if ( params.subsample_portion < 0) params.subsample_portion = 1;
params.subsample_portion = params.subsample_portion <= FLT_EPSILON ||
1 - params.subsample_portion <= FLT_EPSILON
? 1 : params.subsample_portion;
@@ -319,19 +359,18 @@ CvGBTrees::train( const CvMat* _train_data, int _tflag,
*subsample_test = cvMat( 1, test_sample_count, CV_32SC1,
idx_data + train_sample_count );
}
// training procedure
for ( int i=0; i < params.weak_count; ++i )
{
for ( int m=0; m < class_count; ++m )
do_subsample();
for ( int k=0; k < class_count; ++k )
{
do_subsample();
find_gradient(m);
find_gradient(k);
CvDTree* tree = new CvDTree;
tree->train( data, subsample_train );
change_values(tree, m);
change_values(tree, k);
if (subsample_test)
{
@@ -343,30 +382,35 @@ CvGBTrees::train( const CvMat* _train_data, int _tflag,
: sample_idx->step/CV_ELEM_SIZE(sample_idx->type);
for (int j=0; j<get_len(subsample_test); ++j)
{
for (int k=0; k<class_count; ++k)
{
int idx = *(sample_data + subsample_data[j]*s_step);
float res = 0.0f;
int idx = *(sample_data + subsample_data[j]*s_step);
float res = 0.0f;
if (_tflag == CV_ROW_SAMPLE)
cvGetRow( data->train_data, &x, idx);
if (missing)
{
else
cvGetCol( data->train_data, &x, idx);
if (missing)
{
if (_tflag == CV_ROW_SAMPLE)
cvGetRow( missing, &x_miss, idx);
res = (float)tree->predict(&x, &x_miss)->value;
}
else
{
res = (float)tree->predict(&x)->value;
}
sum_response_tmp->data.fl[idx + k*len] =
sum_response->data.fl[idx + k*len] +
params.shrinkage * res;
cvGetCol( missing, &x_miss, idx);
res = (float)tree->predict(&x, &x_miss)->value;
}
else
{
res = (float)tree->predict(&x)->value;
}
sum_response_tmp->data.fl[idx + k*n] =
sum_response->data.fl[idx + k*n] +
params.shrinkage * res;
}
}
cvSeqPush( weak[m], &tree );
cvSeqPush( weak[k], &tree );
tree = 0;
} // m=0..class_count
} // k=0..class_count
CvMat* tmp;
tmp = sum_response_tmp;
sum_response_tmp = sum_response;
@@ -377,7 +421,8 @@ CvGBTrees::train( const CvMat* _train_data, int _tflag,
delete[] idx_data;
cvReleaseMat(&new_responses);
data->free_train_data();
return true;
return true;
} // CvGBTrees::train(...)
@@ -506,17 +551,26 @@ void CvGBTrees::change_values(CvDTree* tree, const int _k)
for (int i=0; i<get_len(subsample_train); ++i)
{
int idx = *(sample_data + subsample_data[i]*s_step);
cvGetRow( data->train_data, &x, idx);
int idx = *(sample_data + subsample_data[i]*s_step);
if (data->tflag == CV_ROW_SAMPLE)
cvGetRow( data->train_data, &x, idx);
else
cvGetCol( data->train_data, &x, idx);
if (missing)
{
cvGetRow( missing, &miss_x, idx);
if (data->tflag == CV_ROW_SAMPLE)
cvGetRow( missing, &miss_x, idx);
else
cvGetCol( missing, &miss_x, idx);
predictions[i] = tree->predict(&x, &miss_x);
}
else
else
predictions[i] = tree->predict(&x);
}
CvDTreeNode** leaves;
int leaves_count = 0;
leaves = GetLeaves( tree, leaves_count);
@@ -574,6 +628,7 @@ void CvGBTrees::change_values(CvDTree* tree, const int _k)
leaves[i] = 0;
}
delete[] leaves;
}
//===========================================================================
@@ -583,6 +638,9 @@ void CvGBTrees::change_values(CvDTree* tree, const int _k)
CvDTreeNode** leaves;
int leaves_count = 0;
int offset = _k*sum_response_tmp->cols;
CvMat leaf_idx;
leaf_idx.rows = 1;
leaves = GetLeaves( tree, leaves_count);
@@ -591,21 +649,26 @@ void CvGBTrees::change_values(CvDTree* tree, const int _k)
int n = leaves[i]->sample_count;
int* leaf_idx_data = new int[n];
data->get_sample_indices(leaves[i], leaf_idx_data);
CvMat* leaf_idx = 0;
cvInitMatHeader(leaf_idx, n, 1, CV_32S, leaf_idx_data);
//CvMat* leaf_idx = new CvMat();
//cvInitMatHeader(leaf_idx, n, 1, CV_32S, leaf_idx_data);
leaf_idx.cols = n;
leaf_idx.data.i = leaf_idx_data;
float value = find_optimal_value(leaf_idx);
float value = find_optimal_value(&leaf_idx);
leaves[i]->value = value;
float val = params.shrinkage * value;
int len = sum_response_tmp->cols;
for (int j=0; j<n; ++j)
{
int idx = leaf_idx_data[j] + _k*len;
sum_response_tmp->data.fl[idx] = sum_response->data.fl[idx] +
params.shrinkage * value;
int idx = leaf_idx_data[j] + offset;
sum_response_tmp->data.fl[idx] = sum_response->data.fl[idx] + val;
}
leaf_idx_data = 0;
cvReleaseMat(&leaf_idx);
//leaf_idx_data = 0;
//cvReleaseMat(&leaf_idx);
leaf_idx.data.i = 0;
//delete leaf_idx;
delete[] leaf_idx_data;
}
// releasing the memory
@@ -614,6 +677,7 @@ void CvGBTrees::change_values(CvDTree* tree, const int _k)
leaves[i] = 0;
}
delete[] leaves;
} //change_values(...);
*/
//===========================================================================
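One more note on the leaf_idx change in the hunks above (inside the commented-out matrix version of change_values): the old code created a heap CvMat header per leaf and released it with cvReleaseMat, which is only appropriate for matrices allocated by cvCreateMat, while the rewrite keeps a single stack-allocated header and swaps its data pointer. The pattern in isolation (illustrative only):

    // Illustrative: wrap an existing int buffer in a stack CvMat header.
    // The header owns no data, so no cvReleaseMat call is needed (or valid);
    // only the buffer itself is freed.
    int* leaf_idx_data = new int[n];
    // ... fill leaf_idx_data, e.g. via data->get_sample_indices(...) ...
    CvMat leaf_idx = cvMat( 1, n, CV_32S, leaf_idx_data );
    float value = find_optimal_value( &leaf_idx );
    delete[] leaf_idx_data;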
......
@@ -25,6 +25,7 @@ protected:
string model_file_name1;
string model_file_name2;
string* datasets;
string data_path;
@@ -33,6 +34,8 @@ protected:
vector<float> test_resps1;
vector<float> test_resps2;
int64 initSeed;
};
@@ -44,6 +47,18 @@ int _get_len(const CvMat* mat)
CV_GBTreesTest::CV_GBTreesTest()
{
int64 seeds[] = { CV_BIG_INT(0x00009fff4f9c8d52),
CV_BIG_INT(0x0000a17166072c7c),
CV_BIG_INT(0x0201b32115cd1f9a),
CV_BIG_INT(0x0513cb37abcd1234),
CV_BIG_INT(0x0001a2b3c4d5f678)
};
int seedCount = sizeof(seeds)/sizeof(seeds[0]);
cv::RNG& rng = cv::theRNG();
initSeed = rng.state;
rng.state = seeds[rng(seedCount)];
datasets = 0;
data = 0;
gtb = 0;
@@ -54,6 +69,7 @@ CV_GBTreesTest::~CV_GBTreesTest()
if (data)
delete data;
delete[] datasets;
cv::theRNG().state = initSeed;
}
@@ -65,7 +81,7 @@ int CV_GBTreesTest::TestTrainPredict(int test_num)
float shrinkage = 0.1f;
float subsample_portion = 0.5f;
int max_depth = 5;
bool use_surrogates = true;
bool use_surrogates = false;
int loss_function_type = 0;
switch (test_num)
{
@@ -137,8 +153,10 @@ int CV_GBTreesTest::checkPredictError(int test_num)
if (!gtb)
return cvtest::TS::FAIL_GENERIC;
float mean[] = {5.430247f, 13.5654f, 12.6569f, 13.1661f};
float sigma[] = {0.4162694f, 3.21161f, 3.43297f, 3.00624f};
//float mean[] = {5.430247f, 13.5654f, 12.6569f, 13.1661f};
//float sigma[] = {0.4162694f, 3.21161f, 3.43297f, 3.00624f};
float mean[] = {5.80226f, 12.68689f, 13.49095f, 13.19628f};
float sigma[] = {0.4764534f, 3.166919f, 3.022405f, 2.868722f};
float current_error = gtb->calc_error(data, CV_TEST_ERROR);
......
@@ -22,9 +22,9 @@ vector<Scalar> classColors;
#define NBC 0 // normal Bayesian classifier
#define KNN 0 // k nearest neighbors classifier
#define SVM 0 // support vector machine
#define DT 1 // decision tree
#define DT 0 // decision tree
#define BT 0 // AdaBoost
#define GBT 0 // gradient boosted trees
#define GBT 1 // gradient boosted trees
#define RF 0 // random forest
#define ERT 0 // extremely randomized trees
#define ANN 0 // artificial neural networks
@@ -272,7 +272,6 @@ void find_decision_boundary_GBT()
Mat trainSamples, trainClasses;
prepare_train_data( trainSamples, trainClasses );
trainClasses.convertTo( trainClasses, CV_32FC1 );
// learn classifier
CvGBTrees gbtrees;
......