Merge remote-tracking branch 'origin/2.4' into merge-2.4

Conflicts: modules/core/include/opencv2/core/operations.hpp modules/core/include/opencv2/core/version.hpp modules/core/src/gpumat.cpp modules/cudaimgproc/src/color.cpp modules/features2d/src/orb.cpp modules/imgproc/src/samplers.cpp modules/ocl/include/opencv2/ocl/matrix_operations.hpp modules/ocl/include/opencv2/ocl/ocl.hpp samples/ocl/facedetect.cpp

Merge remote-tracking branch 'origin/2.4' into merge-2.4
Conflicts: modules/core/include/opencv2/core/operations.hpp modules/core/include/opencv2/core/version.hpp modules/core/src/gpumat.cpp modules/cudaimgproc/src/color.cpp modules/features2d/src/orb.cpp modules/imgproc/src/samplers.cpp modules/ocl/include/opencv2/ocl/matrix_operations.hpp modules/ocl/include/opencv2/ocl/ocl.hpp samples/ocl/facedetect.cpp
464a010f · Roman Donchenko · 8db53c17 · 8f10ca51 · 464a010f · 464a010f
26 changed files
--- a/3rdparty/include/opencl/1.2/CL/cl_platform.h
+++ b/3rdparty/include/opencl/1.2/CL/cl_platform.h
@@ -454,7 +454,7 @@ typedef unsigned int cl_GLenum;
 /* Define alignment keys */
 #if defined( __GNUC__ )
    #define CL_ALIGNED(_x)          __attribute__ ((aligned(_x)))
-#elif defined( _WIN32) && (_MSC_VER)
+#elif defined( _WIN32) && defined(_MSC_VER)
    /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements     */
    /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx                                                 */
    /* #include <crtdefs.h>                                                                                             */

--- a/apps/traincascade/imagestorage.cpp
+++ b/apps/traincascade/imagestorage.cpp
@@ -70,7 +70,7 @@ bool CvCascadeImageReader::NegReader::nextImg()
        _offset.x = std::min( (int)round % winSize.width, src.cols - winSize.width );
        _offset.y = std::min( (int)round / winSize.width, src.rows - winSize.height );
        if( !src.empty() && src.type() == CV_8UC1
-                && offset.x >= 0 && offset.y >= 0 )
+                && _offset.x >= 0 && _offset.y >= 0 )
            break;
    }


--- a/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst
+++ b/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst
@@ -105,7 +105,7 @@ Building OpenCV
 Enable hardware optimizations
 -----------------------------

-Depending on target platfrom architecture different instruction sets can be used. By default
+Depending on target platform architecture different instruction sets can be used. By default
 compiler generates code for armv5l without VFPv3 and NEON extensions. Add ``-DUSE_VFPV3=ON``
 to cmake command line to enable code generation for VFPv3 and ``-DUSE_NEON=ON`` for using
 NEON SIMD extensions.

--- a/modules/cudaimgproc/src/color.cpp
+++ b/modules/cudaimgproc/src/color.cpp
@@ -2110,6 +2110,8 @@ void cv::cuda::cvtColor(InputArray src, OutputArray dst, int code, int dcn, Stre

 void cv::cuda::demosaicing(InputArray _src, OutputArray _dst, int code, int dcn, Stream& stream)
 {
+    CV_Assert( !_src.empty() );
+
    switch (code)
    {
    case cv::COLOR_BayerBG2GRAY: case cv::COLOR_BayerGB2GRAY: case cv::COLOR_BayerRG2GRAY: case cv::COLOR_BayerGR2GRAY:

--- a/modules/cudaimgproc/test/test_color.cpp
+++ b/modules/cudaimgproc/test/test_color.cpp
@@ -2357,6 +2357,7 @@ struct Demosaicing : testing::TestWithParam<cv::cuda::DeviceInfo>
 CUDA_TEST_P(Demosaicing, BayerBG2BGR)
 {
    cv::Mat img = readImage("stereobm/aloe-L.png");
+    ASSERT_FALSE(img.empty()) << "Can't load input image";

    cv::Mat_<uchar> src;
    mosaic(img, src, cv::Point(1, 1));
@@ -2370,6 +2371,7 @@ CUDA_TEST_P(Demosaicing, BayerBG2BGR)
 CUDA_TEST_P(Demosaicing, BayerGB2BGR)
 {
    cv::Mat img = readImage("stereobm/aloe-L.png");
+    ASSERT_FALSE(img.empty()) << "Can't load input image";

    cv::Mat_<uchar> src;
    mosaic(img, src, cv::Point(0, 1));
@@ -2383,6 +2385,7 @@ CUDA_TEST_P(Demosaicing, BayerGB2BGR)
 CUDA_TEST_P(Demosaicing, BayerRG2BGR)
 {
    cv::Mat img = readImage("stereobm/aloe-L.png");
+    ASSERT_FALSE(img.empty()) << "Can't load input image";

    cv::Mat_<uchar> src;
    mosaic(img, src, cv::Point(0, 0));
@@ -2396,6 +2399,7 @@ CUDA_TEST_P(Demosaicing, BayerRG2BGR)
 CUDA_TEST_P(Demosaicing, BayerGR2BGR)
 {
    cv::Mat img = readImage("stereobm/aloe-L.png");
+    ASSERT_FALSE(img.empty()) << "Can't load input image";

    cv::Mat_<uchar> src;
    mosaic(img, src, cv::Point(1, 0));
@@ -2409,6 +2413,7 @@ CUDA_TEST_P(Demosaicing, BayerGR2BGR)
 CUDA_TEST_P(Demosaicing, BayerBG2BGR_MHT)
 {
    cv::Mat img = readImage("stereobm/aloe-L.png");
+    ASSERT_FALSE(img.empty()) << "Can't load input image";

    cv::Mat_<uchar> src;
    mosaic(img, src, cv::Point(1, 1));
@@ -2422,6 +2427,7 @@ CUDA_TEST_P(Demosaicing, BayerBG2BGR_MHT)
 CUDA_TEST_P(Demosaicing, BayerGB2BGR_MHT)
 {
    cv::Mat img = readImage("stereobm/aloe-L.png");
+    ASSERT_FALSE(img.empty()) << "Can't load input image";

    cv::Mat_<uchar> src;
    mosaic(img, src, cv::Point(0, 1));
@@ -2435,6 +2441,7 @@ CUDA_TEST_P(Demosaicing, BayerGB2BGR_MHT)
 CUDA_TEST_P(Demosaicing, BayerRG2BGR_MHT)
 {
    cv::Mat img = readImage("stereobm/aloe-L.png");
+    ASSERT_FALSE(img.empty()) << "Can't load input image";

    cv::Mat_<uchar> src;
    mosaic(img, src, cv::Point(0, 0));
@@ -2448,6 +2455,7 @@ CUDA_TEST_P(Demosaicing, BayerRG2BGR_MHT)
 CUDA_TEST_P(Demosaicing, BayerGR2BGR_MHT)
 {
    cv::Mat img = readImage("stereobm/aloe-L.png");
+    ASSERT_FALSE(img.empty()) << "Can't load input image";

    cv::Mat_<uchar> src;
    mosaic(img, src, cv::Point(1, 0));

--- a/modules/features2d/src/orb.cpp
+++ b/modules/features2d/src/orb.cpp
@@ -141,12 +141,12 @@ static void computeOrbDescriptor(const KeyPoint& kpt,
    float x, y;
    int ix, iy;
 #if 1
-#define GET_VALUE(idx) \
-       (x = pattern[idx].x*a - pattern[idx].y*b, \
-        y = pattern[idx].x*b + pattern[idx].y*a, \
-        ix = cvRound(x), \
-        iy = cvRound(y), \
-        *(center + iy*step + ix) )
+    #define GET_VALUE(idx) \
+           (x = pattern[idx].x*a - pattern[idx].y*b, \
+            y = pattern[idx].x*b + pattern[idx].y*a, \
+            ix = cvRound(x), \
+            iy = cvRound(y), \
+            *(center + iy*step + ix) )
 #else
    #define GET_VALUE(idx) \
        (x = pattern[idx].x*a - pattern[idx].y*b, \

--- a/modules/highgui/src/window_gtk.cpp
+++ b/modules/highgui/src/window_gtk.cpp
@@ -1552,9 +1552,9 @@ static gboolean icvOnMouse( GtkWidget *widget, GdkEvent *event, gpointer user_da
            // image origin is not necessarily at (0,0)
            int x0 = (widget->allocation.width - image_widget->scaled_image->cols)/2;
            int y0 = (widget->allocation.height - image_widget->scaled_image->rows)/2;
-            pt.x = cvRound( ((pt32f.x-x0)*image_widget->original_image->cols)/
+            pt.x = cvFloor( ((pt32f.x-x0)*image_widget->original_image->cols)/
                                            image_widget->scaled_image->cols );
-            pt.y = cvRound( ((pt32f.y-y0)*image_widget->original_image->rows)/
+            pt.y = cvFloor( ((pt32f.y-y0)*image_widget->original_image->rows)/
                                            image_widget->scaled_image->rows );
        }
        else{

--- a/modules/imgproc/src/samplers.cpp
+++ b/modules/imgproc/src/samplers.cpp
@@ -64,7 +64,7 @@ adjustRect( const uchar* src, size_t src_step, int pix_size,
            rect.x = win_size.width;
    }

-    if( ip.x + win_size.width < src_size.width )
+    if( ip.x < src_size.width - win_size.width )
        rect.width = win_size.width;
    else
    {
@@ -85,7 +85,7 @@ adjustRect( const uchar* src, size_t src_step, int pix_size,
    else
        rect.y = -ip.y;

-    if( ip.y + win_size.height < src_size.height )
+    if( ip.y < src_size.height - win_size.height )
        rect.height = win_size.height;
    else
    {
@@ -155,8 +155,8 @@ void getRectSubPix_Cn_(const _Tp* src, size_t src_step, Size src_size,
    src_step /= sizeof(src[0]);
    dst_step /= sizeof(dst[0]);

-    if( 0 <= ip.x && ip.x + win_size.width < src_size.width &&
-       0 <= ip.y && ip.y + win_size.height < src_size.height )
+    if( 0 <= ip.x && ip.x < src_size.width - win_size.width &&
+       0 <= ip.y && ip.y < src_size.height - win_size.height)
    {
        // extracted rectangle is totally inside the image
        src += ip.y * src_step + ip.x*cn;

--- a/modules/ocl/doc/data_structures.rst
+++ b/modules/ocl/doc/data_structures.rst
@@ -144,14 +144,6 @@ OpenCV C++ 1-D or 2-D dense array class ::
        //! returns true if oclMatrix data is NULL
        bool empty() const;

-        //! returns pointer to y-th row
-        uchar* ptr(int y = 0);
-        const uchar *ptr(int y = 0) const;
-
-        //! template version of the above method
-        template<typename _Tp> _Tp *ptr(int y = 0);
-        template<typename _Tp> const _Tp *ptr(int y = 0) const;
-
        //! matrix transposition
        oclMat t() const;


--- a/modules/ocl/include/opencv2/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl.hpp
@@ -378,14 +378,6 @@ namespace cv
            //! returns true if oclMatrix data is NULL
            bool empty() const;

-            //! returns pointer to y-th row
-            uchar* ptr(int y = 0);
-            const uchar *ptr(int y = 0) const;
-
-            //! template version of the above method
-            template<typename _Tp> _Tp *ptr(int y = 0);
-            template<typename _Tp> const _Tp *ptr(int y = 0) const;
-
            //! matrix transposition
            oclMat t() const;


--- a/modules/ocl/include/opencv2/ocl/matrix_operations.hpp
+++ b/modules/ocl/include/opencv2/ocl/matrix_operations.hpp
@@ -456,36 +456,6 @@ namespace cv
            return data == 0;
        }

-
-
-        inline uchar *oclMat::ptr(int y)
-        {
-            CV_DbgAssert( (unsigned)y < (unsigned)rows );
-            CV_Error(Error::GpuNotSupported, "This function hasn't been supported yet.\n");
-            return data + step * y;
-        }
-
-        inline const uchar *oclMat::ptr(int y) const
-        {
-            CV_DbgAssert( (unsigned)y < (unsigned)rows );
-            CV_Error(Error::GpuNotSupported, "This function hasn't been supported yet.\n");
-            return data + step * y;
-        }
-
-        template<typename _Tp> inline _Tp *oclMat::ptr(int y)
-        {
-            CV_DbgAssert( (unsigned)y < (unsigned)rows );
-            CV_Error(Error::GpuNotSupported, "This function hasn't been supported yet.\n");
-            return (_Tp *)(data + step * y);
-        }
-
-        template<typename _Tp> inline const _Tp *oclMat::ptr(int y) const
-        {
-            CV_DbgAssert( (unsigned)y < (unsigned)rows );
-            CV_Error(Error::GpuNotSupported, "This function hasn't been supported yet.\n");
-            return (const _Tp *)(data + step * y);
-        }
-
        inline oclMat oclMat::t() const
        {
            oclMat tmp;

--- a/modules/ocl/perf/perf_moments.cpp
+++ b/modules/ocl/perf/perf_moments.cpp
@@ -73,10 +73,10 @@ PERF_TEST_P(MomentsFixture, Moments,
    Mat  src(srcSize, type), dst(7, 1, CV_64F);
    randu(src, 0, 255);

-    oclMat src_d(src);
    cv::Moments mom;
    if (RUN_OCL_IMPL)
    {
+        oclMat src_d(src);
        OCL_TEST_CYCLE() mom = cv::ocl::ocl_moments(src_d, binaryImage);
    }
    else if (RUN_PLAIN_IMPL)

--- a/modules/ocl/src/brute_force_matcher.cpp
+++ b/modules/ocl/src/brute_force_matcher.cpp
@@ -676,7 +676,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchCollection(const oclMat &query, c
    ensureSizeIsEnough(1, nQuery, CV_32S, imgIdx);
    ensureSizeIsEnough(1, nQuery, CV_32F, distance);

-    matchDispatcher(query, (const oclMat *)trainCollection.ptr(), trainCollection.cols, masks, trainIdx, imgIdx, distance, distType);
+    matchDispatcher(query, &trainCollection, trainCollection.cols, masks, trainIdx, imgIdx, distance, distType);

    return;
 }

--- a/modules/ocl/src/cl_operations.cpp
+++ b/modules/ocl/src/cl_operations.cpp
@@ -290,8 +290,9 @@ void openCLFree(void *devPtr)
        }
 #else
        // TODO FIXIT Attach clReleaseMemObject call to event completion callback
-        Context* ctx = Context::getContext();
-        clFinish(getClCommandQueue(ctx));
+        // TODO 2013/12/04 Disable workaround
+        // Context* ctx = Context::getContext();
+        // clFinish(getClCommandQueue(ctx));
 #endif
        openCLSafeCall(clReleaseMemObject(data.mainBuffer));
    }

--- a/modules/ocl/src/opencl/haarobjectdetect.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect.cl
@@ -62,13 +62,13 @@ typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
 GpuHidHaarTreeNode;


-typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier
-{
-    int count __attribute__((aligned (4)));
-    GpuHidHaarTreeNode* node __attribute__((aligned (8)));
-    float* alpha __attribute__((aligned (8)));
-}
-GpuHidHaarClassifier;
+//typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier
+//{
+//    int count __attribute__((aligned (4)));
+//    GpuHidHaarTreeNode* node __attribute__((aligned (8)));
+//    float* alpha __attribute__((aligned (8)));
+//}
+//GpuHidHaarClassifier;


 typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier
@@ -84,22 +84,22 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier
 GpuHidHaarStageClassifier;


-typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
-{
-    int  count __attribute__((aligned (4)));
-    int  is_stump_based __attribute__((aligned (4)));
-    int  has_tilted_features __attribute__((aligned (4)));
-    int  is_tree __attribute__((aligned (4)));
-    int pq0 __attribute__((aligned (4)));
-    int pq1 __attribute__((aligned (4)));
-    int pq2 __attribute__((aligned (4)));
-    int pq3 __attribute__((aligned (4)));
-    int p0 __attribute__((aligned (4)));
-    int p1 __attribute__((aligned (4)));
-    int p2 __attribute__((aligned (4)));
-    int p3 __attribute__((aligned (4)));
-    float inv_window_area __attribute__((aligned (4)));
-} GpuHidHaarClassifierCascade;
+//typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
+//{
+//    int  count __attribute__((aligned (4)));
+//    int  is_stump_based __attribute__((aligned (4)));
+//    int  has_tilted_features __attribute__((aligned (4)));
+//    int  is_tree __attribute__((aligned (4)));
+//    int pq0 __attribute__((aligned (4)));
+//    int pq1 __attribute__((aligned (4)));
+//    int pq2 __attribute__((aligned (4)));
+//    int pq3 __attribute__((aligned (4)));
+//    int p0 __attribute__((aligned (4)));
+//    int p1 __attribute__((aligned (4)));
+//    int p2 __attribute__((aligned (4)));
+//    int p3 __attribute__((aligned (4)));
+//    float inv_window_area __attribute__((aligned (4)));
+//} GpuHidHaarClassifierCascade;


 #ifdef PACKED_CLASSIFIER
@@ -196,10 +196,12 @@ __kernel void gpuRunHaarClassifierCascadePacked(
    for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ )
    {// iterate until candidate is exist
        float   stage_sum = 0.0f;
-        int2    stageinfo = *(global int2*)(stagecascadeptr+stageloop);
-        float   stagethreshold = as_float(stageinfo.y);
+        __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
+            ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier));
+        int stagecount = stageinfo->count;
+        float stagethreshold = stageinfo->threshold;
        int     lcl_off = (lid_y*DATA_SIZE_X)+(lid_x);
-        for(int nodeloop = 0; nodeloop < stageinfo.x; nodecounter++,nodeloop++ )
+        for(int nodeloop = 0; nodeloop < stagecount; nodecounter++,nodeloop++ )
        {
        // simple macro to extract shorts from int
 #define M0(_t) ((_t)&0xFFFF)
@@ -355,14 +357,17 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
            variance_norm_factor = variance_norm_factor * correction - mean * mean;
            variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;

-            for(int stageloop = start_stage; (stageloop < split_stage)  && result; stageloop++ )
+            for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ )
            {
                float stage_sum = 0.f;
-                int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
-                float stagethreshold = as_float(stageinfo.y);
-                for(int nodeloop = 0; nodeloop < stageinfo.x; )
+                __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
+                    ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier));
+                int stagecount = stageinfo->count;
+                float stagethreshold = stageinfo->threshold;
+                for(int nodeloop = 0; nodeloop < stagecount; )
                {
-                    __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
+                    __global GpuHidHaarTreeNode* currentnodeptr = (__global GpuHidHaarTreeNode*)
+                        (((__global uchar*)nodeptr) + nodecounter * sizeof(GpuHidHaarTreeNode));

                    int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
                    int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
@@ -418,7 +423,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
 #endif
                }

-                result = (stage_sum >= stagethreshold);
+                result = (stage_sum >= stagethreshold) ? 1 : 0;
            }
            if(factor < 2)
            {
@@ -447,14 +452,17 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
                lclcount[0]=0;
                barrier(CLK_LOCAL_MEM_FENCE);

-                int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
-                float stagethreshold = as_float(stageinfo.y);
+                //int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
+                __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
+                    ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier));
+                int stagecount = stageinfo->count;
+                float stagethreshold = stageinfo->threshold;

                int perfscale = queuecount > 4 ? 3 : 2;
                int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
                int lcl_compute_win = lcl_sz >> perfscale;
                int lcl_compute_win_id = (lcl_id >>(6-perfscale));
-                int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale);
+                int lcl_loops = (stagecount + lcl_compute_win -1) >> (6-perfscale);
                int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
                for(int queueloop=0; queueloop<queuecount_loop; queueloop++)
                {
@@ -469,10 +477,10 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
                        float part_sum = 0.f;
                        const int stump_factor = STUMP_BASED ? 1 : 2;
                        int root_offset = 0;
-                        for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;)
+                        for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stagecount;)
                        {
-                            __global GpuHidHaarTreeNode* currentnodeptr =
-                                nodeptr + (nodecounter + tempnodecounter) * stump_factor + root_offset;
+                            __global GpuHidHaarTreeNode* currentnodeptr = (__global GpuHidHaarTreeNode*)
+                                    (((__global uchar*)nodeptr) + sizeof(GpuHidHaarTreeNode) * ((nodecounter + tempnodecounter) * stump_factor + root_offset));

                            int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
                            int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
@@ -549,7 +557,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa

                queuecount = lclcount[0];
                barrier(CLK_LOCAL_MEM_FENCE);
-                nodecounter += stageinfo.x;
+                nodecounter += stagecount;
            }//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)

            if(lcl_id<queuecount)

--- a/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
@@ -59,13 +59,13 @@ typedef struct __attribute__((aligned(128))) GpuHidHaarTreeNode
    int right __attribute__((aligned(4)));
 }
 GpuHidHaarTreeNode;
-typedef struct __attribute__((aligned(32))) GpuHidHaarClassifier
-{
-    int count __attribute__((aligned(4)));
-    GpuHidHaarTreeNode *node __attribute__((aligned(8)));
-    float *alpha __attribute__((aligned(8)));
-}
-GpuHidHaarClassifier;
+//typedef struct __attribute__((aligned(32))) GpuHidHaarClassifier
+//{
+//    int count __attribute__((aligned(4)));
+//    GpuHidHaarTreeNode *node __attribute__((aligned(8)));
+//    float *alpha __attribute__((aligned(8)));
+//}
+//GpuHidHaarClassifier;
 typedef struct __attribute__((aligned(64))) GpuHidHaarStageClassifier
 {
    int  count __attribute__((aligned(4)));
@@ -77,29 +77,29 @@ typedef struct __attribute__((aligned(64))) GpuHidHaarStageClassifier
    int reserved3 __attribute__((aligned(8)));
 }
 GpuHidHaarStageClassifier;
-typedef struct __attribute__((aligned(64))) GpuHidHaarClassifierCascade
-{
-    int  count __attribute__((aligned(4)));
-    int  is_stump_based __attribute__((aligned(4)));
-    int  has_tilted_features __attribute__((aligned(4)));
-    int  is_tree __attribute__((aligned(4)));
-    int pq0 __attribute__((aligned(4)));
-    int pq1 __attribute__((aligned(4)));
-    int pq2 __attribute__((aligned(4)));
-    int pq3 __attribute__((aligned(4)));
-    int p0 __attribute__((aligned(4)));
-    int p1 __attribute__((aligned(4)));
-    int p2 __attribute__((aligned(4)));
-    int p3 __attribute__((aligned(4)));
-    float inv_window_area __attribute__((aligned(4)));
-} GpuHidHaarClassifierCascade;
+//typedef struct __attribute__((aligned(64))) GpuHidHaarClassifierCascade
+//{
+//    int  count __attribute__((aligned(4)));
+//    int  is_stump_based __attribute__((aligned(4)));
+//    int  has_tilted_features __attribute__((aligned(4)));
+//    int  is_tree __attribute__((aligned(4)));
+//    int pq0 __attribute__((aligned(4)));
+//    int pq1 __attribute__((aligned(4)));
+//    int pq2 __attribute__((aligned(4)));
+//    int pq3 __attribute__((aligned(4)));
+//    int p0 __attribute__((aligned(4)));
+//    int p1 __attribute__((aligned(4)));
+//    int p2 __attribute__((aligned(4)));
+//    int p3 __attribute__((aligned(4)));
+//    float inv_window_area __attribute__((aligned(4)));
+//} GpuHidHaarClassifierCascade;

 __kernel void gpuRunHaarClassifierCascade_scaled2(
-    global GpuHidHaarStageClassifier *stagecascadeptr,
+    global GpuHidHaarStageClassifier *stagecascadeptr_,
    global int4 *info,
-    global GpuHidHaarTreeNode *nodeptr,
+    global GpuHidHaarTreeNode *nodeptr_,
    global const int *restrict sum,
-    global const float   *restrict sqsum,
+    global const float *restrict sqsum,
    global int4 *candidate,
    const int rows,
    const int cols,
@@ -132,8 +132,7 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
    int max_idx = rows * cols - 1;
    for (int scalei = 0; scalei < loopcount; scalei++)
    {
-        int4 scaleinfo1;
-        scaleinfo1 = info[scalei];
+        int4 scaleinfo1 = info[scalei];
        int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
        int totalgrp = scaleinfo1.y & 0xffff;
        float factor = as_float(scaleinfo1.w);
@@ -174,15 +173,18 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
                for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
                {
                    float stage_sum = 0.f;
-                    int   stagecount = stagecascadeptr[stageloop].count;
+                    __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
+                        (((__global uchar*)stagecascadeptr_)+stageloop*sizeof(GpuHidHaarStageClassifier));
+                    int stagecount = stageinfo->count;
                    for (int nodeloop = 0; nodeloop < stagecount;)
                    {
-                        __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
+                        __global GpuHidHaarTreeNode* currentnodeptr = (__global GpuHidHaarTreeNode*)
+                            (((__global uchar*)nodeptr_) + nodecounter * sizeof(GpuHidHaarTreeNode));
                        int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
                        int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
                        int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
                        float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
-                        float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0]));
+                        float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
                        float nodethreshold  = w.w * variance_norm_factor;

                        info1.x += p_offset;
@@ -204,7 +206,7 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
                                     sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)]
                        + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;

-                        bool passThres = classsum >= nodethreshold;
+                        bool passThres = (classsum >= nodethreshold) ? 1 : 0;

 #if STUMP_BASED
                        stage_sum += passThres ? alpha3.y : alpha3.x;
@@ -234,7 +236,8 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
                        }
 #endif
                    }
-                    result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold);
+
+                    result = (stage_sum >= stageinfo->threshold) ? 1 : 0;
                }

                barrier(CLK_LOCAL_MEM_FENCE);
@@ -281,11 +284,14 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
        }
    }
 }
-__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum)
+__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, const int nodenum)
 {
-    int counter = get_global_id(0);
+    const int counter = get_global_id(0);
    int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0;
-    GpuHidHaarTreeNode t1 = *(orinode + counter);
+    GpuHidHaarTreeNode t1 = *(__global GpuHidHaarTreeNode*)
+        (((__global uchar*)orinode) + counter * sizeof(GpuHidHaarTreeNode));
+    __global GpuHidHaarTreeNode* pNew = (__global GpuHidHaarTreeNode*)
+        (((__global uchar*)newnode) + (counter + nodenum) * sizeof(GpuHidHaarTreeNode));

    #pragma unroll
    for (i = 0; i < 3; i++)
@@ -297,22 +303,21 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH
    }

    t1.weight[0] = -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]);
-    counter += nodenum;

    #pragma unroll
    for (i = 0; i < 3; i++)
    {
-        newnode[counter].p[i][0] = tr_x[i];
-        newnode[counter].p[i][1] = tr_y[i];
-        newnode[counter].p[i][2] = tr_x[i] + tr_w[i];
-        newnode[counter].p[i][3] = tr_y[i] + tr_h[i];
-        newnode[counter].weight[i] = t1.weight[i] * weight_scale;
+        pNew->p[i][0] = tr_x[i];
+        pNew->p[i][1] = tr_y[i];
+        pNew->p[i][2] = tr_x[i] + tr_w[i];
+        pNew->p[i][3] = tr_y[i] + tr_h[i];
+        pNew->weight[i] = t1.weight[i] * weight_scale;
    }

-    newnode[counter].left = t1.left;
-    newnode[counter].right = t1.right;
-    newnode[counter].threshold = t1.threshold;
-    newnode[counter].alpha[0] = t1.alpha[0];
-    newnode[counter].alpha[1] = t1.alpha[1];
-    newnode[counter].alpha[2] = t1.alpha[2];
+    pNew->left = t1.left;
+    pNew->right = t1.right;
+    pNew->threshold = t1.threshold;
+    pNew->alpha[0] = t1.alpha[0];
+    pNew->alpha[1] = t1.alpha[1];
+    pNew->alpha[2] = t1.alpha[2];
 }
--- a/modules/ocl/src/opencl/imgproc_threshold.cl
+++ b/modules/ocl/src/opencl/imgproc_threshold.cl
@@ -74,11 +74,11 @@ __kernel void threshold(__global const T * restrict src, int src_offset, int src
        VT vthresh = (VT)(thresh);

 #ifdef THRESH_BINARY
-        VT vecValue = sdata > vthresh ? max_val : (VT)(0);
+        VT vecValue = sdata > vthresh ? (VT)max_val : (VT)(0);
 #elif defined THRESH_BINARY_INV
-        VT vecValue = sdata > vthresh ? (VT)(0) : max_val;
+        VT vecValue = sdata > vthresh ? (VT)(0) : (VT)max_val;
 #elif defined THRESH_TRUNC
-        VT vecValue = sdata > vthresh ? thresh : sdata;
+        VT vecValue = sdata > vthresh ? (VT)thresh : sdata;
 #elif defined THRESH_TOZERO
        VT vecValue = sdata > vthresh ? sdata : (VT)(0);
 #elif defined THRESH_TOZERO_INV

--- a/platforms/android/service/engine/AndroidManifest.xml
+++ b/platforms/android/service/engine/AndroidManifest.xml
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
    package="org.opencv.engine"
-    android:versionCode="214@ANDROID_PLATFORM_VERSION_CODE@"
-    android:versionName="2.14" >
+    android:versionCode="216@ANDROID_PLATFORM_VERSION_CODE@"
+    android:versionName="2.16" >

    <uses-sdk android:minSdkVersion="@ANDROID_NATIVE_API_LEVEL@" />
    <uses-feature android:name="android.hardware.touchscreen" android:required="false"/>

--- a/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp
+++ b/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp
@@ -170,7 +170,7 @@ inline string JoinPlatform(int platform)
    return result;
 }

-inline int SplitPlatfrom(const vector<string>& features)
+inline int SplitPlatform(const vector<string>& features)
 {
    int result = 0;

@@ -419,7 +419,7 @@ InstallPath(install_path)
            return;
        }

-        Platform = SplitPlatfrom(features);
+        Platform = SplitPlatform(features);
        if (PLATFORM_UNKNOWN != Platform)
        {
            switch (Platform)

--- a/platforms/android/service/engine/jni/Tests/HardwareDetectionTest.cpp
+++ b/platforms/android/service/engine/jni/Tests/HardwareDetectionTest.cpp
@@ -170,7 +170,7 @@ TEST(CpuID, CheckVFPv3)
    EXPECT_TRUE(cpu_id & FEATURES_HAS_VFPv3);
 }

-TEST(PlatfromDetector, CheckTegra)
+TEST(PlatformDetector, CheckTegra)
 {
    EXPECT_NE(PLATFORM_UNKNOWN, DetectKnownPlatforms());
 }

--- a/platforms/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java
+++ b/platforms/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java
@@ -90,28 +90,28 @@ public class ManagerActivity extends Activity
        mInstalledPackageView.setAdapter(mInstalledPacksAdapter);

        TextView HardwarePlatformView = (TextView)findViewById(R.id.HardwareValue);
-        int Platfrom = HardwareDetector.DetectKnownPlatforms();
+        int Platform = HardwareDetector.DetectKnownPlatforms();
        int CpuId = HardwareDetector.GetCpuID();

-        if (HardwareDetector.PLATFORM_UNKNOWN != Platfrom)
+        if (HardwareDetector.PLATFORM_UNKNOWN != Platform)
        {
-            if (HardwareDetector.PLATFORM_TEGRA == Platfrom)
+            if (HardwareDetector.PLATFORM_TEGRA == Platform)
            {
                HardwarePlatformView.setText("Tegra");
            }
-            else if (HardwareDetector.PLATFORM_TEGRA2 == Platfrom)
+            else if (HardwareDetector.PLATFORM_TEGRA2 == Platform)
            {
                HardwarePlatformView.setText("Tegra 2");
            }
-            else if (HardwareDetector.PLATFORM_TEGRA3 == Platfrom)
+            else if (HardwareDetector.PLATFORM_TEGRA3 == Platform)
            {
                HardwarePlatformView.setText("Tegra 3");
            }
-            else if (HardwareDetector.PLATFORM_TEGRA4i == Platfrom)
+            else if (HardwareDetector.PLATFORM_TEGRA4i == Platform)
            {
                HardwarePlatformView.setText("Tegra 4i");
            }
-            else if (HardwareDetector.PLATFORM_TEGRA4 == Platfrom)
+            else if (HardwareDetector.PLATFORM_TEGRA4 == Platform)
            {
                HardwarePlatformView.setText("Tegra 4");
            }

--- a/platforms/android/service/readme.txt
+++ b/platforms/android/service/readme.txt
@@ -14,20 +14,20 @@ manually using adb tool:

 .. code-block:: sh

-    adb install OpenCV-2.4.7-android-sdk/apk/OpenCV_2.4.7_Manager_2.14_<platform>.apk
+    adb install OpenCV-2.4.7.1-android-sdk/apk/OpenCV_2.4.7.1_Manager_2.15_<platform>.apk

 Use the table below to determine proper OpenCV Manager package for your device:

-+------------------------------+--------------+----------------------------------------------------+
-| Hardware Platform            | Android ver. | Package name                                       |
-+==============================+==============+====================================================+
-| armeabi-v7a (ARMv7-A + NEON) |    >= 2.3    | OpenCV_2.4.7_Manager_2.14_armv7a-neon.apk          |
-+------------------------------+--------------+----------------------------------------------------+
-| armeabi-v7a (ARMv7-A + NEON) |     = 2.2    | OpenCV_2.4.7_Manager_2.14_armv7a-neon-android8.apk |
-+------------------------------+--------------+----------------------------------------------------+
-| armeabi (ARMv5, ARMv6)       |    >= 2.3    | OpenCV_2.4.7_Manager_2.14_armeabi.apk              |
-+------------------------------+--------------+----------------------------------------------------+
-| Intel x86                    |    >= 2.3    | OpenCV_2.4.7_Manager_2.14_x86.apk                  |
-+------------------------------+--------------+----------------------------------------------------+
-| MIPS                         |    >= 2.3    | OpenCV_2.4.7_Manager_2.14_mips.apk                 |
-+------------------------------+--------------+----------------------------------------------------+
+------------------------------+--------------+------------------------------------------------------+
+| Hardware Platform            | Android ver. | Package name                                         |
+==============================+==============+======================================================+
+| armeabi-v7a (ARMv7-A + NEON) |    >= 2.3    | OpenCV_2.4.7.1_Manager_2.15_armv7a-neon.apk          |
+------------------------------+--------------+------------------------------------------------------+
+| armeabi-v7a (ARMv7-A + NEON) |     = 2.2    | OpenCV_2.4.7.1_Manager_2.15_armv7a-neon-android8.apk |
+------------------------------+--------------+------------------------------------------------------+
+| armeabi (ARMv5, ARMv6)       |    >= 2.3    | OpenCV_2.4.7.1_Manager_2.15_armeabi.apk              |
+------------------------------+--------------+------------------------------------------------------+
+| Intel x86                    |    >= 2.3    | OpenCV_2.4.7.1_Manager_2.15_x86.apk                  |
+------------------------------+--------------+------------------------------------------------------+
+| MIPS                         |    >= 2.3    | OpenCV_2.4.7.1_Manager_2.15_mips.apk                 |
+------------------------------+--------------+------------------------------------------------------+
--- a/samples/cpp/CMakeLists.txt
+++ b/samples/cpp/CMakeLists.txt
@@ -29,6 +29,10 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
    ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/cudafilters/include")
  endif()

+  if(HAVE_opencv_ocl)
+    ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/ocl/include")
+  endif()
+
  if(CMAKE_COMPILER_IS_GNUCXX AND NOT ENABLE_NOISY_WARNINGS)
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function")
  endif()
@@ -56,6 +60,10 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND)
      target_link_libraries(${the_target} opencv_cudaarithm opencv_cudafilters)
    endif()

+    if(HAVE_opencv_ocl)
+      target_link_libraries(${the_target} opencv_ocl)
+    endif()
+
    set_target_properties(${the_target} PROPERTIES
      OUTPUT_NAME "cpp-${sample_kind}-${name}"
      PROJECT_LABEL "(${sample_KIND}) ${name}")

--- a/samples/cpp/bagofwords_classification.cpp
+++ b/samples/cpp/bagofwords_classification.cpp
+#include "opencv2/opencv_modules.hpp"
 #include "opencv2/highgui/highgui.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
 #include "opencv2/features2d/features2d.hpp"
 #include "opencv2/nonfree/nonfree.hpp"
 #include "opencv2/ml/ml.hpp"
+#ifdef HAVE_OPENCV_OCL
+#define _OCL_SVM_ 1 // set to 1 to use cv::ocl::CvSVM_OCL, 0 to use CvSVM; enabled by default
+#include "opencv2/ocl/ocl.hpp"
+#endif

 #include <fstream>
 #include <iostream>
@@ -2373,9 +2378,15 @@ static void setSVMTrainAutoParams( CvParamGrid& c_grid, CvParamGrid& gamma_grid,
    degree_grid.step = 0;
 }

+#if defined HAVE_OPENCV_OCL && _OCL_SVM_
+static void trainSVMClassifier( cv::ocl::CvSVM_OCL& svm, const SVMTrainParamsExt& svmParamsExt, const string& objClassName, VocData& vocData,
+                               Ptr<BOWImgDescriptorExtractor>& bowExtractor, const Ptr<FeatureDetector>& fdetector,
+                               const string& resPath )
+#else
 static void trainSVMClassifier( CvSVM& svm, const SVMTrainParamsExt& svmParamsExt, const string& objClassName, VocData& vocData,
                         Ptr<BOWImgDescriptorExtractor>& bowExtractor, const Ptr<FeatureDetector>& fdetector,
                         const string& resPath )
+#endif
 {
    /* first check if a previously trained svm for the current class has been saved to file */
    string svmFilename = resPath + svmsDir + "/" + objClassName + ".xml.gz";
@@ -2448,9 +2459,15 @@ static void trainSVMClassifier( CvSVM& svm, const SVMTrainParamsExt& svmParamsEx
    }
 }

+#if defined HAVE_OPENCV_OCL && _OCL_SVM_
+static void computeConfidences( cv::ocl::CvSVM_OCL& svm, const string& objClassName, VocData& vocData,
+                               Ptr<BOWImgDescriptorExtractor>& bowExtractor, const Ptr<FeatureDetector>& fdetector,
+                               const string& resPath )
+#else
 static void computeConfidences( CvSVM& svm, const string& objClassName, VocData& vocData,
                         Ptr<BOWImgDescriptorExtractor>& bowExtractor, const Ptr<FeatureDetector>& fdetector,
                         const string& resPath )
+#endif
 {
    cout << "*** CALCULATING CONFIDENCES FOR CLASS " << objClassName << " ***" << endl;
    cout << "CALCULATING BOW VECTORS FOR TEST SET OF " << objClassName << "..." << endl;
@@ -2589,7 +2606,11 @@ int main(int argc, char** argv)
    for( size_t classIdx = 0; classIdx < objClasses.size(); ++classIdx )
    {
        // Train a classifier on train dataset
+#if defined HAVE_OPENCV_OCL && _OCL_SVM_
+        cv::ocl::CvSVM_OCL svm;
+#else
        CvSVM svm;
+#endif
        trainSVMClassifier( svm, svmTrainParamsExt, objClasses[classIdx], vocData,
                            bowExtractor, featureDetector, resPath );


--- a/samples/cpp/points_classifier.cpp
+++ b/samples/cpp/points_classifier.cpp
+#include "opencv2/opencv_modules.hpp"
 #include "opencv2/core/core.hpp"
 #include "opencv2/ml/ml.hpp"
 #include "opencv2/highgui/highgui.hpp"
+#ifdef HAVE_OPENCV_OCL
+#define _OCL_KNN_ 1 // set to 1 to use cv::ocl::KNearestNeighbour, 0 to use CvKNearest; enabled by default
+#define _OCL_SVM_ 1 // set to 1 to use cv::ocl::CvSVM_OCL, 0 to use CvSVM; enabled by default
+#include "opencv2/ocl/ocl.hpp"
+#endif

 #include <stdio.h>

@@ -133,7 +139,14 @@ static void find_decision_boundary_KNN( int K )
    prepare_train_data( trainSamples, trainClasses );

    // learn classifier
+#if defined HAVE_OPENCV_OCL && _OCL_KNN_
+    cv::ocl::KNearestNeighbour knnClassifier;
+    Mat temp, result;
+    knnClassifier.train(trainSamples, trainClasses, temp, false, K);
+    cv::ocl::oclMat testSample_ocl, reslut_ocl;
+#else
    CvKNearest knnClassifier( trainSamples, trainClasses, Mat(), false, K );
+#endif

    Mat testSample( 1, 2, CV_32FC1 );
    for( int y = 0; y < img.rows; y += testStep )
@@ -142,9 +155,19 @@ static void find_decision_boundary_KNN( int K )
        {
            testSample.at<float>(0) = (float)x;
            testSample.at<float>(1) = (float)y;
+#if defined HAVE_OPENCV_OCL && _OCL_KNN_
+            testSample_ocl.upload(testSample);
+
+            knnClassifier.find_nearest(testSample_ocl, K, reslut_ocl);
+
+            reslut_ocl.download(result);
+            int response = saturate_cast<int>(result.at<float>(0));
+            circle(imgDst, Point(x, y), 1, classColors[response]);
+#else

            int response = (int)knnClassifier.find_nearest( testSample, K );
            circle( imgDst, Point(x,y), 1, classColors[response] );
+#endif
        }
    }
 }
@@ -159,7 +182,11 @@ static void find_decision_boundary_SVM( CvSVMParams params )
    prepare_train_data( trainSamples, trainClasses );

    // learn classifier
+#if defined HAVE_OPENCV_OCL && _OCL_SVM_
+    cv::ocl::CvSVM_OCL svmClassifier(trainSamples, trainClasses, Mat(), Mat(), params);
+#else
    CvSVM svmClassifier( trainSamples, trainClasses, Mat(), Mat(), params );
+#endif

    Mat testSample( 1, 2, CV_32FC1 );
    for( int y = 0; y < img.rows; y += testStep )
@@ -178,7 +205,7 @@ static void find_decision_boundary_SVM( CvSVMParams params )
    for( int i = 0; i < svmClassifier.get_support_vector_count(); i++ )
    {
        const float* supportVector = svmClassifier.get_support_vector(i);
-        circle( imgDst, Point(supportVector[0],supportVector[1]), 5, Scalar(255,255,255), -1 );
+        circle( imgDst, Point(saturate_cast<int>(supportVector[0]),saturate_cast<int>(supportVector[1])), 5, CV_RGB(255,255,255), -1 );
    }

 }

--- a/samples/ocl/facedetect.cpp
+++ b/samples/ocl/facedetect.cpp
@@ -8,11 +8,16 @@
 #include <iostream>
 #include <stdio.h>

+#if defined(_MSC_VER) && (_MSC_VER >= 1700)
+    # include <thread>
+#endif

 using namespace std;
 using namespace cv;
 #define LOOP_NUM 1

+/////////////////////////// single-threaded face detection ///////////////////////////////
+
 const static Scalar colors[] =  { CV_RGB(0,0,255),
                                  CV_RGB(0,128,255),
                                  CV_RGB(0,255,255),
@@ -26,7 +31,7 @@ const static Scalar colors[] =  { CV_RGB(0,0,255),

 int64 work_begin = 0;
 int64 work_end = 0;
-string outputName;
+string inputName, outputName, cascadeName;

 static void workBegin()
 {
@@ -61,41 +66,17 @@ static void Draw(Mat& img, vector<Rect>& faces, double scale);
 // Else if will return (total diff of each cpu and gpu rects covered pixels)/(total cpu rects covered pixels)
 double checkRectSimilarity(Size sz, vector<Rect>& cpu_rst, vector<Rect>& gpu_rst);

-int main( int argc, const char** argv )
+static int facedetect_one_thread(bool useCPU, double scale )
 {
-    const char* keys =
-        "{ h  help       | false       | print help message }"
-        "{ i  input      |             | specify input image }"
-        "{ t  template   | haarcascade_frontalface_alt.xml |"
-        " specify template file path }"
-        "{ c  scale      |   1.0       | scale image }"
-        "{ s  use_cpu    | false       | use cpu or gpu to process the image }"
-        "{ o  output     | facedetect_output.jpg  |"
-        " specify output image save path(only works when input is images) }";
-
-    CommandLineParser cmd(argc, argv, keys);
-    if (cmd.get<bool>("help"))
-    {
-        cout << "Usage : facedetect [options]" << endl;
-        cout << "Available options:" << endl;
-        cmd.printMessage();
-        return EXIT_SUCCESS;
-    }
-
    CvCapture* capture = 0;
    Mat frame, frameCopy0, frameCopy, image;

-    bool useCPU = cmd.get<bool>("s");
-    string inputName = cmd.get<string>("i");
-    outputName = cmd.get<string>("o");
-    string cascadeName = cmd.get<string>("t");
-    double scale = cmd.get<double>("c");
    ocl::OclCascadeClassifier cascade;
    CascadeClassifier  cpu_cascade;

    if( !cascade.load( cascadeName ) || !cpu_cascade.load(cascadeName) )
    {
-        cout << "ERROR: Could not load classifier cascade" << endl;
+        cout << "ERROR: Could not load classifier cascade: " << cascadeName << endl;
        return EXIT_FAILURE;
    }

@@ -186,9 +167,114 @@ int main( int argc, const char** argv )
    }

    cvDestroyWindow("result");
+    std::cout<< "single-threaded sample has finished" <<std::endl;
    return 0;
 }

+/////////////////////////////////////// face detection with multithreading ////////////////////////////////////////////
+#if defined(_MSC_VER) && (_MSC_VER >= 1700)
+
+#define MAX_THREADS 10
+
+/**
+ * Worker routine of the multi-threaded sample: loads one image, detects
+ * faces in it with an OpenCL cascade classifier, draws the detections and
+ * shows (and, when possible, saves) the annotated image.
+ *
+ * Reads the file-level globals cascadeName and outputName. Every call loads
+ * its own classifier instance, so no classifier object is shared between
+ * callers.
+ *
+ * The output file name is derived from outputName by inserting
+ * "_<thread id>" in front of the extension; when outputName has no '.',
+ * nothing is written and the image is only displayed.
+ *
+ * NOTE(review): imshow()/waitKey() are invoked from the calling (worker)
+ * thread -- confirm the HighGUI backend in use tolerates that.
+ *
+ * @param fileName path of the image to read with imread().
+ */
+static void detectFaces(std::string fileName)
+{
+    ocl::OclCascadeClassifier cascade;
+    if(!cascade.load(cascadeName))
+    {
+        std::cout << "ERROR: Could not load classifier cascade: " << cascadeName << std::endl;
+        return;
+    }
+
+    Mat img = imread(fileName, CV_LOAD_IMAGE_COLOR);
+    if (img.empty())
+    {
+        std::cout << "can't open file " + fileName <<std::endl;
+        return;
+    }
+
+    // Detection runs on the device copy of the image.
+    ocl::oclMat d_img;
+    d_img.upload(img);
+
+    std::vector<Rect> oclfaces;
+    cascade.detectMultiScale(d_img, oclfaces,  1.1, 3, 0|CV_HAAR_SCALE_IMAGE, Size(30, 30), Size(0, 0));
+
+    // Outline every detection on the host image, cycling through the palette.
+    for(size_t i = 0; i < oclfaces.size(); i++)
+    {
+        const Rect& face = oclfaces[i];
+        rectangle(img, Point(face.x, face.y), Point(face.x + face.width, face.y + face.height), colors[i%8], 3);
+    }
+
+    // The thread id keeps window titles and output files of concurrent workers apart.
+    const std::string tid = std::to_string(_threadid);
+    const std::string::size_type pos = outputName.rfind('.');
+    std::string outputNameTid = outputName + '-' + tid;
+    if(pos == std::string::npos)
+    {
+        std::cout << "Invalid output file name: " << outputName << std::endl;
+    }
+    else
+    {
+        outputNameTid = outputName.substr(0, pos) + "_" + tid + outputName.substr(pos);
+        imwrite(outputNameTid, img);
+    }
+    imshow(outputNameTid, img);
+    waitKey(0);
+}
+
+// Starts detectFaces on inputName in nthreads concurrent threads (capped at
+// MAX_THREADS) and returns only after all of them have been joined.
+static void facedetect_multithreading(int nthreads)
+{
+    int workerCount = nthreads;
+    if (MAX_THREADS < workerCount)
+        workerCount = MAX_THREADS;
+
+    std::vector<std::thread> workers;
+    for (int launched = 0; launched < workerCount; ++launched)
+        workers.push_back(std::thread(detectFaces, inputName));
+
+    for (std::vector<std::thread>::iterator it = workers.begin(); it != workers.end(); ++it)
+        it->join();
+}
+#endif
+
+/**
+ * Sample entry point: parses the command line, stores the input, output and
+ * cascade paths in the file-level globals, then runs either the
+ * multi-threaded or the single-threaded face detection.
+ *
+ * The multi-threaded path is compiled only for MSVC >= 1700 (where
+ * <thread> is included above); on other compilers a request for more than
+ * one thread falls back to the single-threaded version. A thread count
+ * below 1 is reported as incorrect and also runs single-threaded.
+ *
+ * @return EXIT_SUCCESS after printing help, 0 after the multi-threaded run,
+ *         otherwise the result of facedetect_one_thread().
+ */
+int main( int argc, const char** argv )
+{
+    const char* keys =
+        "{ h help       | false       | print help message }"
+        "{ i input      |             | specify input image }"
+        "{ t template   | haarcascade_frontalface_alt.xml |"
+        " specify template file path }"
+        "{ c scale      |   1.0       | scale image }"
+        "{ s use_cpu    | false       | use cpu or gpu to process the image }"
+        "{ o output     | facedetect_output.jpg  |"
+        " specify output image save path(only works when input is images) }"
+        "{ n thread_num |      1      | set number of threads >= 1 }";
+
+    CommandLineParser cmd(argc, argv, keys);
+    if (cmd.has("help"))
+    {
+        cout << "Usage : facedetect [options]" << endl;
+        cout << "Available options:" << endl;
+        cmd.printMessage();
+        return EXIT_SUCCESS;
+    }
+    bool useCPU = cmd.get<bool>("s");
+    inputName = cmd.get<string>("i");
+    outputName = cmd.get<string>("o");
+    cascadeName = cmd.get<string>("t");
+    double scale = cmd.get<double>("c");
+    int n = cmd.get<int>("n");
+
+    if(n > 1)
+    {
+#if defined(_MSC_VER) && (_MSC_VER >= 1700)
+            std::cout<<"multi-threaded sample is running" <<std::endl;
+            facedetect_multithreading(n);
+            std::cout<<"multi-threaded sample has finished" <<std::endl;
+            return 0;
+#else
+            std::cout << "std::thread is not supported, running a single-threaded version" << std::endl;
+#endif
+    }
+    // The option is documented as ">= 1", so zero is as invalid as a negative count.
+    if (n < 1)
+        std::cout<<"incorrect number of threads:" << n << ", running a single-threaded version" <<std::endl;
+    else
+        std::cout<<"single-threaded sample is running" <<std::endl;
+    return facedetect_one_thread(useCPU, scale);
+}
+
 void detect( Mat& img, vector<Rect>& faces,
             ocl::OclCascadeClassifier& cascade,
             double scale)