diff --git a/3rdparty/include/opencl/1.2/CL/cl_platform.h b/3rdparty/include/opencl/1.2/CL/cl_platform.h
index 46b3d9dcdc33379f79a72ee208025c6351e29e4b..e94949a31cbab68f0c761438fd33b79ccc0366f4 100644
--- a/3rdparty/include/opencl/1.2/CL/cl_platform.h
+++ b/3rdparty/include/opencl/1.2/CL/cl_platform.h
@@ -454,7 +454,7 @@ typedef unsigned int cl_GLenum;
 /* Define alignment keys */
 #if defined( __GNUC__ )
     #define CL_ALIGNED(_x) __attribute__ ((aligned(_x)))
-#elif defined( _WIN32) && (_MSC_VER)
+#elif defined( _WIN32) && defined(_MSC_VER)
     /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */
     /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */
     /* #include */
diff --git a/apps/traincascade/imagestorage.cpp b/apps/traincascade/imagestorage.cpp
index 528f9c167d3e820636f3ac9b9648a7842c772b17..e69a7df1ad485f2dfcf390e5afe556889e465db1 100644
--- a/apps/traincascade/imagestorage.cpp
+++ b/apps/traincascade/imagestorage.cpp
@@ -70,7 +70,7 @@ bool CvCascadeImageReader::NegReader::nextImg()
         _offset.x = std::min( (int)round % winSize.width, src.cols - winSize.width );
         _offset.y = std::min( (int)round / winSize.width, src.rows - winSize.height );
         if( !src.empty() && src.type() == CV_8UC1
-            && offset.x >= 0 && offset.y >= 0 )
+            && _offset.x >= 0 && _offset.y >= 0 )
             break;
     }

diff --git a/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst b/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst
index c40b86c9740d75c6dcdb9cbd8eb9f547286ca251..0b2253aceac6e88f2de0768834159602fd267574 100644
--- a/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst
+++ b/doc/tutorials/introduction/crosscompilation/arm_crosscompile_with_cmake.rst
@@ -105,7 +105,7 @@ Building OpenCV
 Enable hardware optimizations
 -----------------------------

-Depending on target platfrom architecture different instruction sets can be used. By default
+Depending on target platform architecture different instruction sets can be used. By default
 compiler generates code for armv5l without VFPv3 and NEON extensions. Add ``-DUSE_VFPV3=ON``
 to cmake command line to enable code generation for VFPv3 and ``-DUSE_NEON=ON`` for using
 NEON SIMD extensions.
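The cl_platform.h change above swaps a bare `(_MSC_VER)` test for `defined(_MSC_VER)`. A minimal standalone sketch of why that matters (not part of the patch; the DEMO_* names are illustrative only): an identifier that is not defined evaluates to 0 inside `#if`/`#elif`, which is semantically harmless here but triggers `-Wundef` warnings on non-MSVC Windows toolchains such as MinGW-w64 g++, whereas wrapping the token in `defined()` is always well-formed.

// Standalone sketch, assuming a GCC-compatible or MSVC compiler.
// Mirrors the guard pattern from the cl_platform.h hunk above.
#include <iostream>

#if defined(__GNUC__)
#  define DEMO_ALIGNED(x) __attribute__((aligned(x)))
#elif defined(_WIN32) && defined(_MSC_VER)
#  define DEMO_ALIGNED(x) /* neutered on MSVC, as in the real header */
#else
#  define DEMO_ALIGNED(x)
#endif

struct DemoFloat4
{
    // With GCC/Clang the attribute raises the member's alignment to 16 bytes;
    // with MSVC the macro expands to nothing, matching the patched header.
    float v[4] DEMO_ALIGNED(16);
};

int main()
{
    std::cout << "alignof(DemoFloat4) = " << alignof(DemoFloat4) << std::endl;
    return 0;
}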
diff --git a/modules/cudaimgproc/src/color.cpp b/modules/cudaimgproc/src/color.cpp index 955c5ef374820f2f6db4faedef8fe85ae70ffe84..a06b746a7f61c6b48b61f0a44dcbef595f7f4cac 100644 --- a/modules/cudaimgproc/src/color.cpp +++ b/modules/cudaimgproc/src/color.cpp @@ -2110,6 +2110,8 @@ void cv::cuda::cvtColor(InputArray src, OutputArray dst, int code, int dcn, Stre void cv::cuda::demosaicing(InputArray _src, OutputArray _dst, int code, int dcn, Stream& stream) { + CV_Assert( !_src.empty() ); + switch (code) { case cv::COLOR_BayerBG2GRAY: case cv::COLOR_BayerGB2GRAY: case cv::COLOR_BayerRG2GRAY: case cv::COLOR_BayerGR2GRAY: diff --git a/modules/cudaimgproc/test/test_color.cpp b/modules/cudaimgproc/test/test_color.cpp index 6816cc8fd86c76c93c965ed5341248f347ff2058..9188725020a85a9065774493de459a93c4f7a0e5 100644 --- a/modules/cudaimgproc/test/test_color.cpp +++ b/modules/cudaimgproc/test/test_color.cpp @@ -2357,6 +2357,7 @@ struct Demosaicing : testing::TestWithParam CUDA_TEST_P(Demosaicing, BayerBG2BGR) { cv::Mat img = readImage("stereobm/aloe-L.png"); + ASSERT_FALSE(img.empty()) << "Can't load input image"; cv::Mat_ src; mosaic(img, src, cv::Point(1, 1)); @@ -2370,6 +2371,7 @@ CUDA_TEST_P(Demosaicing, BayerBG2BGR) CUDA_TEST_P(Demosaicing, BayerGB2BGR) { cv::Mat img = readImage("stereobm/aloe-L.png"); + ASSERT_FALSE(img.empty()) << "Can't load input image"; cv::Mat_ src; mosaic(img, src, cv::Point(0, 1)); @@ -2383,6 +2385,7 @@ CUDA_TEST_P(Demosaicing, BayerGB2BGR) CUDA_TEST_P(Demosaicing, BayerRG2BGR) { cv::Mat img = readImage("stereobm/aloe-L.png"); + ASSERT_FALSE(img.empty()) << "Can't load input image"; cv::Mat_ src; mosaic(img, src, cv::Point(0, 0)); @@ -2396,6 +2399,7 @@ CUDA_TEST_P(Demosaicing, BayerRG2BGR) CUDA_TEST_P(Demosaicing, BayerGR2BGR) { cv::Mat img = readImage("stereobm/aloe-L.png"); + ASSERT_FALSE(img.empty()) << "Can't load input image"; cv::Mat_ src; mosaic(img, src, cv::Point(1, 0)); @@ -2409,6 +2413,7 @@ CUDA_TEST_P(Demosaicing, BayerGR2BGR) CUDA_TEST_P(Demosaicing, BayerBG2BGR_MHT) { cv::Mat img = readImage("stereobm/aloe-L.png"); + ASSERT_FALSE(img.empty()) << "Can't load input image"; cv::Mat_ src; mosaic(img, src, cv::Point(1, 1)); @@ -2422,6 +2427,7 @@ CUDA_TEST_P(Demosaicing, BayerBG2BGR_MHT) CUDA_TEST_P(Demosaicing, BayerGB2BGR_MHT) { cv::Mat img = readImage("stereobm/aloe-L.png"); + ASSERT_FALSE(img.empty()) << "Can't load input image"; cv::Mat_ src; mosaic(img, src, cv::Point(0, 1)); @@ -2435,6 +2441,7 @@ CUDA_TEST_P(Demosaicing, BayerGB2BGR_MHT) CUDA_TEST_P(Demosaicing, BayerRG2BGR_MHT) { cv::Mat img = readImage("stereobm/aloe-L.png"); + ASSERT_FALSE(img.empty()) << "Can't load input image"; cv::Mat_ src; mosaic(img, src, cv::Point(0, 0)); @@ -2448,6 +2455,7 @@ CUDA_TEST_P(Demosaicing, BayerRG2BGR_MHT) CUDA_TEST_P(Demosaicing, BayerGR2BGR_MHT) { cv::Mat img = readImage("stereobm/aloe-L.png"); + ASSERT_FALSE(img.empty()) << "Can't load input image"; cv::Mat_ src; mosaic(img, src, cv::Point(1, 0)); diff --git a/modules/features2d/src/orb.cpp b/modules/features2d/src/orb.cpp index a6e8ffab3c8143334e8c1a14397d26b1332e2fc5..263c27350b5ba6ac47724939821b1410855e83cf 100644 --- a/modules/features2d/src/orb.cpp +++ b/modules/features2d/src/orb.cpp @@ -141,12 +141,12 @@ static void computeOrbDescriptor(const KeyPoint& kpt, float x, y; int ix, iy; #if 1 -#define GET_VALUE(idx) \ - (x = pattern[idx].x*a - pattern[idx].y*b, \ - y = pattern[idx].x*b + pattern[idx].y*a, \ - ix = cvRound(x), \ - iy = cvRound(y), \ - *(center + iy*step + ix) ) + #define GET_VALUE(idx) \ + (x = 
pattern[idx].x*a - pattern[idx].y*b, \ + y = pattern[idx].x*b + pattern[idx].y*a, \ + ix = cvRound(x), \ + iy = cvRound(y), \ + *(center + iy*step + ix) ) #else #define GET_VALUE(idx) \ (x = pattern[idx].x*a - pattern[idx].y*b, \ diff --git a/modules/highgui/src/window_gtk.cpp b/modules/highgui/src/window_gtk.cpp index 79ae638cfc4f7f60f1a18a749e04395d82593fc4..02c25644465c12ad8a9f1b769c04df5cd13874d5 100644 --- a/modules/highgui/src/window_gtk.cpp +++ b/modules/highgui/src/window_gtk.cpp @@ -1552,9 +1552,9 @@ static gboolean icvOnMouse( GtkWidget *widget, GdkEvent *event, gpointer user_da // image origin is not necessarily at (0,0) int x0 = (widget->allocation.width - image_widget->scaled_image->cols)/2; int y0 = (widget->allocation.height - image_widget->scaled_image->rows)/2; - pt.x = cvRound( ((pt32f.x-x0)*image_widget->original_image->cols)/ + pt.x = cvFloor( ((pt32f.x-x0)*image_widget->original_image->cols)/ image_widget->scaled_image->cols ); - pt.y = cvRound( ((pt32f.y-y0)*image_widget->original_image->rows)/ + pt.y = cvFloor( ((pt32f.y-y0)*image_widget->original_image->rows)/ image_widget->scaled_image->rows ); } else{ diff --git a/modules/imgproc/src/samplers.cpp b/modules/imgproc/src/samplers.cpp index 81cd23afc0e1ac689dc6627e512ebe0da4a0d65e..d6cc8a56fea5b6a32e643c1a74196d5b1cc47d1f 100644 --- a/modules/imgproc/src/samplers.cpp +++ b/modules/imgproc/src/samplers.cpp @@ -64,7 +64,7 @@ adjustRect( const uchar* src, size_t src_step, int pix_size, rect.x = win_size.width; } - if( ip.x + win_size.width < src_size.width ) + if( ip.x < src_size.width - win_size.width ) rect.width = win_size.width; else { @@ -85,7 +85,7 @@ adjustRect( const uchar* src, size_t src_step, int pix_size, else rect.y = -ip.y; - if( ip.y + win_size.height < src_size.height ) + if( ip.y < src_size.height - win_size.height ) rect.height = win_size.height; else { @@ -155,8 +155,8 @@ void getRectSubPix_Cn_(const _Tp* src, size_t src_step, Size src_size, src_step /= sizeof(src[0]); dst_step /= sizeof(dst[0]); - if( 0 <= ip.x && ip.x + win_size.width < src_size.width && - 0 <= ip.y && ip.y + win_size.height < src_size.height ) + if( 0 <= ip.x && ip.x < src_size.width - win_size.width && + 0 <= ip.y && ip.y < src_size.height - win_size.height) { // extracted rectangle is totally inside the image src += ip.y * src_step + ip.x*cn; diff --git a/modules/ocl/doc/data_structures.rst b/modules/ocl/doc/data_structures.rst index 01a16739bafd022d927038bcd56a67492708b229..bde3d14af47b81089a21a5c58da0a3045a58fb11 100644 --- a/modules/ocl/doc/data_structures.rst +++ b/modules/ocl/doc/data_structures.rst @@ -144,14 +144,6 @@ OpenCV C++ 1-D or 2-D dense array class :: //! returns true if oclMatrix data is NULL bool empty() const; - //! returns pointer to y-th row - uchar* ptr(int y = 0); - const uchar *ptr(int y = 0) const; - - //! template version of the above method - template _Tp *ptr(int y = 0); - template const _Tp *ptr(int y = 0) const; - //! matrix transposition oclMat t() const; diff --git a/modules/ocl/include/opencv2/ocl.hpp b/modules/ocl/include/opencv2/ocl.hpp index 19af00bef4fa43164c412e3f389a524c7fe81cbc..1f03170e5f8e7480b328f12b03f7bff9b65514db 100644 --- a/modules/ocl/include/opencv2/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl.hpp @@ -378,14 +378,6 @@ namespace cv //! returns true if oclMatrix data is NULL bool empty() const; - //! returns pointer to y-th row - uchar* ptr(int y = 0); - const uchar *ptr(int y = 0) const; - - //! 
template version of the above method - template _Tp *ptr(int y = 0); - template const _Tp *ptr(int y = 0) const; - //! matrix transposition oclMat t() const; diff --git a/modules/ocl/include/opencv2/ocl/matrix_operations.hpp b/modules/ocl/include/opencv2/ocl/matrix_operations.hpp index 76db479035a0803d726b2b1d6ca6fcb38e625339..ab1fe4fb7036f38e391af063fd11ae319f01ac9d 100644 --- a/modules/ocl/include/opencv2/ocl/matrix_operations.hpp +++ b/modules/ocl/include/opencv2/ocl/matrix_operations.hpp @@ -456,36 +456,6 @@ namespace cv return data == 0; } - - - inline uchar *oclMat::ptr(int y) - { - CV_DbgAssert( (unsigned)y < (unsigned)rows ); - CV_Error(Error::GpuNotSupported, "This function hasn't been supported yet.\n"); - return data + step * y; - } - - inline const uchar *oclMat::ptr(int y) const - { - CV_DbgAssert( (unsigned)y < (unsigned)rows ); - CV_Error(Error::GpuNotSupported, "This function hasn't been supported yet.\n"); - return data + step * y; - } - - template inline _Tp *oclMat::ptr(int y) - { - CV_DbgAssert( (unsigned)y < (unsigned)rows ); - CV_Error(Error::GpuNotSupported, "This function hasn't been supported yet.\n"); - return (_Tp *)(data + step * y); - } - - template inline const _Tp *oclMat::ptr(int y) const - { - CV_DbgAssert( (unsigned)y < (unsigned)rows ); - CV_Error(Error::GpuNotSupported, "This function hasn't been supported yet.\n"); - return (const _Tp *)(data + step * y); - } - inline oclMat oclMat::t() const { oclMat tmp; diff --git a/modules/ocl/perf/perf_moments.cpp b/modules/ocl/perf/perf_moments.cpp index c5d616f83dd81afa848c3a9f2e9a4e3645c656e9..631031ecb49991f49240bbfca044569b617a3353 100644 --- a/modules/ocl/perf/perf_moments.cpp +++ b/modules/ocl/perf/perf_moments.cpp @@ -73,10 +73,10 @@ PERF_TEST_P(MomentsFixture, Moments, Mat src(srcSize, type), dst(7, 1, CV_64F); randu(src, 0, 255); - oclMat src_d(src); cv::Moments mom; if (RUN_OCL_IMPL) { + oclMat src_d(src); OCL_TEST_CYCLE() mom = cv::ocl::ocl_moments(src_d, binaryImage); } else if (RUN_PLAIN_IMPL) diff --git a/modules/ocl/src/brute_force_matcher.cpp b/modules/ocl/src/brute_force_matcher.cpp index d0e09320de675c83d1eb847938a503bdba228388..ca16f43462759d8952f15ede91ca34060c6b2c7e 100644 --- a/modules/ocl/src/brute_force_matcher.cpp +++ b/modules/ocl/src/brute_force_matcher.cpp @@ -676,7 +676,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchCollection(const oclMat &query, c ensureSizeIsEnough(1, nQuery, CV_32S, imgIdx); ensureSizeIsEnough(1, nQuery, CV_32F, distance); - matchDispatcher(query, (const oclMat *)trainCollection.ptr(), trainCollection.cols, masks, trainIdx, imgIdx, distance, distType); + matchDispatcher(query, &trainCollection, trainCollection.cols, masks, trainIdx, imgIdx, distance, distType); return; } diff --git a/modules/ocl/src/cl_operations.cpp b/modules/ocl/src/cl_operations.cpp index 1e415772355132832397b5d5cb8fb7dc139c90c4..aa44c48749da89716b2cb2e572e8d4614d6387a8 100644 --- a/modules/ocl/src/cl_operations.cpp +++ b/modules/ocl/src/cl_operations.cpp @@ -290,8 +290,9 @@ void openCLFree(void *devPtr) } #else // TODO FIXIT Attach clReleaseMemObject call to event completion callback - Context* ctx = Context::getContext(); - clFinish(getClCommandQueue(ctx)); + // TODO 2013/12/04 Disable workaround + // Context* ctx = Context::getContext(); + // clFinish(getClCommandQueue(ctx)); #endif openCLSafeCall(clReleaseMemObject(data.mainBuffer)); } diff --git a/modules/ocl/src/opencl/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl index 
a62b3af8cbf2eb0adc979e7b480d69fecd40f795..980e85dd27cba7ac7264800dc1fcbf9b8d2746f5 100644 --- a/modules/ocl/src/opencl/haarobjectdetect.cl +++ b/modules/ocl/src/opencl/haarobjectdetect.cl @@ -62,13 +62,13 @@ typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode GpuHidHaarTreeNode; -typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier -{ - int count __attribute__((aligned (4))); - GpuHidHaarTreeNode* node __attribute__((aligned (8))); - float* alpha __attribute__((aligned (8))); -} -GpuHidHaarClassifier; +//typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier +//{ +// int count __attribute__((aligned (4))); +// GpuHidHaarTreeNode* node __attribute__((aligned (8))); +// float* alpha __attribute__((aligned (8))); +//} +//GpuHidHaarClassifier; typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier @@ -84,22 +84,22 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier GpuHidHaarStageClassifier; -typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade -{ - int count __attribute__((aligned (4))); - int is_stump_based __attribute__((aligned (4))); - int has_tilted_features __attribute__((aligned (4))); - int is_tree __attribute__((aligned (4))); - int pq0 __attribute__((aligned (4))); - int pq1 __attribute__((aligned (4))); - int pq2 __attribute__((aligned (4))); - int pq3 __attribute__((aligned (4))); - int p0 __attribute__((aligned (4))); - int p1 __attribute__((aligned (4))); - int p2 __attribute__((aligned (4))); - int p3 __attribute__((aligned (4))); - float inv_window_area __attribute__((aligned (4))); -} GpuHidHaarClassifierCascade; +//typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade +//{ +// int count __attribute__((aligned (4))); +// int is_stump_based __attribute__((aligned (4))); +// int has_tilted_features __attribute__((aligned (4))); +// int is_tree __attribute__((aligned (4))); +// int pq0 __attribute__((aligned (4))); +// int pq1 __attribute__((aligned (4))); +// int pq2 __attribute__((aligned (4))); +// int pq3 __attribute__((aligned (4))); +// int p0 __attribute__((aligned (4))); +// int p1 __attribute__((aligned (4))); +// int p2 __attribute__((aligned (4))); +// int p3 __attribute__((aligned (4))); +// float inv_window_area __attribute__((aligned (4))); +//} GpuHidHaarClassifierCascade; #ifdef PACKED_CLASSIFIER @@ -196,10 +196,12 @@ __kernel void gpuRunHaarClassifierCascadePacked( for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ ) {// iterate until candidate is exist float stage_sum = 0.0f; - int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); - float stagethreshold = as_float(stageinfo.y); + __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*) + ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier)); + int stagecount = stageinfo->count; + float stagethreshold = stageinfo->threshold; int lcl_off = (lid_y*DATA_SIZE_X)+(lid_x); - for(int nodeloop = 0; nodeloop < stageinfo.x; nodecounter++,nodeloop++ ) + for(int nodeloop = 0; nodeloop < stagecount; nodecounter++,nodeloop++ ) { // simple macro to extract shorts from int #define M0(_t) ((_t)&0xFFFF) @@ -355,14 +357,17 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa variance_norm_factor = variance_norm_factor * correction - mean * mean; variance_norm_factor = variance_norm_factor >=0.f ? 
sqrt(variance_norm_factor) : 1.f; - for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ ) + for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ ) { float stage_sum = 0.f; - int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); - float stagethreshold = as_float(stageinfo.y); - for(int nodeloop = 0; nodeloop < stageinfo.x; ) + __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*) + ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier)); + int stagecount = stageinfo->count; + float stagethreshold = stageinfo->threshold; + for(int nodeloop = 0; nodeloop < stagecount; ) { - __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter); + __global GpuHidHaarTreeNode* currentnodeptr = (__global GpuHidHaarTreeNode*) + (((__global uchar*)nodeptr) + nodecounter * sizeof(GpuHidHaarTreeNode)); int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0])); int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); @@ -418,7 +423,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa #endif } - result = (stage_sum >= stagethreshold); + result = (stage_sum >= stagethreshold) ? 1 : 0; } if(factor < 2) { @@ -447,14 +452,17 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa lclcount[0]=0; barrier(CLK_LOCAL_MEM_FENCE); - int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); - float stagethreshold = as_float(stageinfo.y); + //int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); + __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*) + ((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier)); + int stagecount = stageinfo->count; + float stagethreshold = stageinfo->threshold; int perfscale = queuecount > 4 ? 
3 : 2; int queuecount_loop = (queuecount + (1<> perfscale; int lcl_compute_win = lcl_sz >> perfscale; int lcl_compute_win_id = (lcl_id >>(6-perfscale)); - int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale); + int lcl_loops = (stagecount + lcl_compute_win -1) >> (6-perfscale); int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale)); for(int queueloop=0; queueloopp[0][0])); int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); @@ -549,7 +557,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa queuecount = lclcount[0]; barrier(CLK_LOCAL_MEM_FENCE); - nodecounter += stageinfo.x; + nodecounter += stagecount; }//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++) if(lcl_id> 16; int totalgrp = scaleinfo1.y & 0xffff; float factor = as_float(scaleinfo1.w); @@ -174,15 +173,18 @@ __kernel void gpuRunHaarClassifierCascade_scaled2( for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++) { float stage_sum = 0.f; - int stagecount = stagecascadeptr[stageloop].count; + __global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*) + (((__global uchar*)stagecascadeptr_)+stageloop*sizeof(GpuHidHaarStageClassifier)); + int stagecount = stageinfo->count; for (int nodeloop = 0; nodeloop < stagecount;) { - __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter); + __global GpuHidHaarTreeNode* currentnodeptr = (__global GpuHidHaarTreeNode*) + (((__global uchar*)nodeptr_) + nodecounter * sizeof(GpuHidHaarTreeNode)); int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0])); int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0])); int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0])); float4 w = *(__global float4 *)(&(currentnodeptr->weight[0])); - float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0])); + float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0])); float nodethreshold = w.w * variance_norm_factor; info1.x += p_offset; @@ -204,7 +206,7 @@ __kernel void gpuRunHaarClassifierCascade_scaled2( sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)] + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z; - bool passThres = classsum >= nodethreshold; + bool passThres = (classsum >= nodethreshold) ? 1 : 0; #if STUMP_BASED stage_sum += passThres ? alpha3.y : alpha3.x; @@ -234,7 +236,8 @@ __kernel void gpuRunHaarClassifierCascade_scaled2( } #endif } - result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold); + + result = (stage_sum >= stageinfo->threshold) ? 
1 : 0; } barrier(CLK_LOCAL_MEM_FENCE); @@ -281,11 +284,14 @@ __kernel void gpuRunHaarClassifierCascade_scaled2( } } } -__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum) +__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, const int nodenum) { - int counter = get_global_id(0); + const int counter = get_global_id(0); int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0; - GpuHidHaarTreeNode t1 = *(orinode + counter); + GpuHidHaarTreeNode t1 = *(__global GpuHidHaarTreeNode*) + (((__global uchar*)orinode) + counter * sizeof(GpuHidHaarTreeNode)); + __global GpuHidHaarTreeNode* pNew = (__global GpuHidHaarTreeNode*) + (((__global uchar*)newnode) + (counter + nodenum) * sizeof(GpuHidHaarTreeNode)); #pragma unroll for (i = 0; i < 3; i++) @@ -297,22 +303,21 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH } t1.weight[0] = -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]); - counter += nodenum; #pragma unroll for (i = 0; i < 3; i++) { - newnode[counter].p[i][0] = tr_x[i]; - newnode[counter].p[i][1] = tr_y[i]; - newnode[counter].p[i][2] = tr_x[i] + tr_w[i]; - newnode[counter].p[i][3] = tr_y[i] + tr_h[i]; - newnode[counter].weight[i] = t1.weight[i] * weight_scale; + pNew->p[i][0] = tr_x[i]; + pNew->p[i][1] = tr_y[i]; + pNew->p[i][2] = tr_x[i] + tr_w[i]; + pNew->p[i][3] = tr_y[i] + tr_h[i]; + pNew->weight[i] = t1.weight[i] * weight_scale; } - newnode[counter].left = t1.left; - newnode[counter].right = t1.right; - newnode[counter].threshold = t1.threshold; - newnode[counter].alpha[0] = t1.alpha[0]; - newnode[counter].alpha[1] = t1.alpha[1]; - newnode[counter].alpha[2] = t1.alpha[2]; + pNew->left = t1.left; + pNew->right = t1.right; + pNew->threshold = t1.threshold; + pNew->alpha[0] = t1.alpha[0]; + pNew->alpha[1] = t1.alpha[1]; + pNew->alpha[2] = t1.alpha[2]; } diff --git a/modules/ocl/src/opencl/imgproc_threshold.cl b/modules/ocl/src/opencl/imgproc_threshold.cl index 63e410297b372b5a89829a8876030c00c3b98f86..85631be368b6780ef86c7e35ba947f2782bc9365 100644 --- a/modules/ocl/src/opencl/imgproc_threshold.cl +++ b/modules/ocl/src/opencl/imgproc_threshold.cl @@ -74,11 +74,11 @@ __kernel void threshold(__global const T * restrict src, int src_offset, int src VT vthresh = (VT)(thresh); #ifdef THRESH_BINARY - VT vecValue = sdata > vthresh ? max_val : (VT)(0); + VT vecValue = sdata > vthresh ? (VT)max_val : (VT)(0); #elif defined THRESH_BINARY_INV - VT vecValue = sdata > vthresh ? (VT)(0) : max_val; + VT vecValue = sdata > vthresh ? (VT)(0) : (VT)max_val; #elif defined THRESH_TRUNC - VT vecValue = sdata > vthresh ? thresh : sdata; + VT vecValue = sdata > vthresh ? (VT)thresh : sdata; #elif defined THRESH_TOZERO VT vecValue = sdata > vthresh ? 
sdata : (VT)(0); #elif defined THRESH_TOZERO_INV diff --git a/platforms/android/service/engine/AndroidManifest.xml b/platforms/android/service/engine/AndroidManifest.xml index 162d31eb02866200337da3efb46b961418bc87da..7cae6ce8a027204ff2d1d48bb24fbcba56fc58a6 100644 --- a/platforms/android/service/engine/AndroidManifest.xml +++ b/platforms/android/service/engine/AndroidManifest.xml @@ -1,8 +1,8 @@ + android:versionCode="216@ANDROID_PLATFORM_VERSION_CODE@" + android:versionName="2.16" > diff --git a/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp b/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp index 64ea70dae84d792dd200ec36032379ab59d7f90b..98ea828747f13435dfcb099cb263448d25e90c47 100644 --- a/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp +++ b/platforms/android/service/engine/jni/NativeService/PackageInfo.cpp @@ -170,7 +170,7 @@ inline string JoinPlatform(int platform) return result; } -inline int SplitPlatfrom(const vector& features) +inline int SplitPlatform(const vector& features) { int result = 0; @@ -419,7 +419,7 @@ InstallPath(install_path) return; } - Platform = SplitPlatfrom(features); + Platform = SplitPlatform(features); if (PLATFORM_UNKNOWN != Platform) { switch (Platform) diff --git a/platforms/android/service/engine/jni/Tests/HardwareDetectionTest.cpp b/platforms/android/service/engine/jni/Tests/HardwareDetectionTest.cpp index 8637dfee30672fc868bee62fd62a262221813a42..83dd9c27e1ea57b76694a926958d4bc37fbff425 100644 --- a/platforms/android/service/engine/jni/Tests/HardwareDetectionTest.cpp +++ b/platforms/android/service/engine/jni/Tests/HardwareDetectionTest.cpp @@ -170,7 +170,7 @@ TEST(CpuID, CheckVFPv3) EXPECT_TRUE(cpu_id & FEATURES_HAS_VFPv3); } -TEST(PlatfromDetector, CheckTegra) +TEST(PlatformDetector, CheckTegra) { EXPECT_NE(PLATFORM_UNKNOWN, DetectKnownPlatforms()); } diff --git a/platforms/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java b/platforms/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java index 4e9050fa4dc12210b1d012ecdad3ba8dbb11a765..8e8389dcc7a756c1340a47f4004f910b465bea5f 100644 --- a/platforms/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java +++ b/platforms/android/service/engine/src/org/opencv/engine/manager/ManagerActivity.java @@ -90,28 +90,28 @@ public class ManagerActivity extends Activity mInstalledPackageView.setAdapter(mInstalledPacksAdapter); TextView HardwarePlatformView = (TextView)findViewById(R.id.HardwareValue); - int Platfrom = HardwareDetector.DetectKnownPlatforms(); + int Platform = HardwareDetector.DetectKnownPlatforms(); int CpuId = HardwareDetector.GetCpuID(); - if (HardwareDetector.PLATFORM_UNKNOWN != Platfrom) + if (HardwareDetector.PLATFORM_UNKNOWN != Platform) { - if (HardwareDetector.PLATFORM_TEGRA == Platfrom) + if (HardwareDetector.PLATFORM_TEGRA == Platform) { HardwarePlatformView.setText("Tegra"); } - else if (HardwareDetector.PLATFORM_TEGRA2 == Platfrom) + else if (HardwareDetector.PLATFORM_TEGRA2 == Platform) { HardwarePlatformView.setText("Tegra 2"); } - else if (HardwareDetector.PLATFORM_TEGRA3 == Platfrom) + else if (HardwareDetector.PLATFORM_TEGRA3 == Platform) { HardwarePlatformView.setText("Tegra 3"); } - else if (HardwareDetector.PLATFORM_TEGRA4i == Platfrom) + else if (HardwareDetector.PLATFORM_TEGRA4i == Platform) { HardwarePlatformView.setText("Tegra 4i"); } - else if (HardwareDetector.PLATFORM_TEGRA4 == Platfrom) + else if (HardwareDetector.PLATFORM_TEGRA4 == Platform) { 
HardwarePlatformView.setText("Tegra 4"); } diff --git a/platforms/android/service/readme.txt b/platforms/android/service/readme.txt index 1e757a0e5f91e5ebb6fa1a030969c9866cd78f15..a280b506f01ed6d9f68634c96bfd39ab444e0c57 100644 --- a/platforms/android/service/readme.txt +++ b/platforms/android/service/readme.txt @@ -14,20 +14,20 @@ manually using adb tool: .. code-block:: sh - adb install OpenCV-2.4.7-android-sdk/apk/OpenCV_2.4.7_Manager_2.14_.apk + adb install OpenCV-2.4.7.1-android-sdk/apk/OpenCV_2.4.7.1_Manager_2.15_.apk Use the table below to determine proper OpenCV Manager package for your device: -+------------------------------+--------------+----------------------------------------------------+ -| Hardware Platform | Android ver. | Package name | -+==============================+==============+====================================================+ -| armeabi-v7a (ARMv7-A + NEON) | >= 2.3 | OpenCV_2.4.7_Manager_2.14_armv7a-neon.apk | -+------------------------------+--------------+----------------------------------------------------+ -| armeabi-v7a (ARMv7-A + NEON) | = 2.2 | OpenCV_2.4.7_Manager_2.14_armv7a-neon-android8.apk | -+------------------------------+--------------+----------------------------------------------------+ -| armeabi (ARMv5, ARMv6) | >= 2.3 | OpenCV_2.4.7_Manager_2.14_armeabi.apk | -+------------------------------+--------------+----------------------------------------------------+ -| Intel x86 | >= 2.3 | OpenCV_2.4.7_Manager_2.14_x86.apk | -+------------------------------+--------------+----------------------------------------------------+ -| MIPS | >= 2.3 | OpenCV_2.4.7_Manager_2.14_mips.apk | -+------------------------------+--------------+----------------------------------------------------+ ++------------------------------+--------------+------------------------------------------------------+ +| Hardware Platform | Android ver. 
| Package name | ++==============================+==============+======================================================+ +| armeabi-v7a (ARMv7-A + NEON) | >= 2.3 | OpenCV_2.4.7.1_Manager_2.15_armv7a-neon.apk | ++------------------------------+--------------+------------------------------------------------------+ +| armeabi-v7a (ARMv7-A + NEON) | = 2.2 | OpenCV_2.4.7.1_Manager_2.15_armv7a-neon-android8.apk | ++------------------------------+--------------+------------------------------------------------------+ +| armeabi (ARMv5, ARMv6) | >= 2.3 | OpenCV_2.4.7.1_Manager_2.15_armeabi.apk | ++------------------------------+--------------+------------------------------------------------------+ +| Intel x86 | >= 2.3 | OpenCV_2.4.7.1_Manager_2.15_x86.apk | ++------------------------------+--------------+------------------------------------------------------+ +| MIPS | >= 2.3 | OpenCV_2.4.7.1_Manager_2.15_mips.apk | ++------------------------------+--------------+------------------------------------------------------+ diff --git a/samples/cpp/CMakeLists.txt b/samples/cpp/CMakeLists.txt index 2aa71f047c4373c7129789329f308bf0c3037552..4b0bf011d9f8f5c5945dc7252ab730377b1942fb 100644 --- a/samples/cpp/CMakeLists.txt +++ b/samples/cpp/CMakeLists.txt @@ -29,6 +29,10 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND) ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/cudafilters/include") endif() + if(HAVE_opencv_ocl) + ocv_include_directories("${OpenCV_SOURCE_DIR}/modules/ocl/include") + endif() + if(CMAKE_COMPILER_IS_GNUCXX AND NOT ENABLE_NOISY_WARNINGS) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-function") endif() @@ -56,6 +60,10 @@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_FOUND) target_link_libraries(${the_target} opencv_cudaarithm opencv_cudafilters) endif() + if(HAVE_opencv_ocl) + target_link_libraries(${the_target} opencv_ocl) + endif() + set_target_properties(${the_target} PROPERTIES OUTPUT_NAME "cpp-${sample_kind}-${name}" PROJECT_LABEL "(${sample_KIND}) ${name}") diff --git a/samples/cpp/bagofwords_classification.cpp b/samples/cpp/bagofwords_classification.cpp index b841b63f8b8f8d469d71874e70b9790f0f6a78f4..20ff1dcfbb8b9458b62e02f8e9866d75bdfbe793 100644 --- a/samples/cpp/bagofwords_classification.cpp +++ b/samples/cpp/bagofwords_classification.cpp @@ -1,8 +1,13 @@ +#include "opencv2/opencv_modules.hpp" #include "opencv2/highgui/highgui.hpp" #include "opencv2/imgproc/imgproc.hpp" #include "opencv2/features2d/features2d.hpp" #include "opencv2/nonfree/nonfree.hpp" #include "opencv2/ml/ml.hpp" +#ifdef HAVE_OPENCV_OCL +#define _OCL_SVM_ 1 //select whether using ocl::svm method or not, default is using +#include "opencv2/ocl/ocl.hpp" +#endif #include #include @@ -2373,9 +2378,15 @@ static void setSVMTrainAutoParams( CvParamGrid& c_grid, CvParamGrid& gamma_grid, degree_grid.step = 0; } +#if defined HAVE_OPENCV_OCL && _OCL_SVM_ +static void trainSVMClassifier( cv::ocl::CvSVM_OCL& svm, const SVMTrainParamsExt& svmParamsExt, const string& objClassName, VocData& vocData, + Ptr& bowExtractor, const Ptr& fdetector, + const string& resPath ) +#else static void trainSVMClassifier( CvSVM& svm, const SVMTrainParamsExt& svmParamsExt, const string& objClassName, VocData& vocData, Ptr& bowExtractor, const Ptr& fdetector, const string& resPath ) +#endif { /* first check if a previously trained svm for the current class has been saved to file */ string svmFilename = resPath + svmsDir + "/" + objClassName + ".xml.gz"; @@ -2448,9 +2459,15 @@ static void trainSVMClassifier( CvSVM& svm, const SVMTrainParamsExt& 
svmParamsEx } } +#if defined HAVE_OPENCV_OCL && _OCL_SVM_ +static void computeConfidences( cv::ocl::CvSVM_OCL& svm, const string& objClassName, VocData& vocData, + Ptr& bowExtractor, const Ptr& fdetector, + const string& resPath ) +#else static void computeConfidences( CvSVM& svm, const string& objClassName, VocData& vocData, Ptr& bowExtractor, const Ptr& fdetector, const string& resPath ) +#endif { cout << "*** CALCULATING CONFIDENCES FOR CLASS " << objClassName << " ***" << endl; cout << "CALCULATING BOW VECTORS FOR TEST SET OF " << objClassName << "..." << endl; @@ -2589,7 +2606,11 @@ int main(int argc, char** argv) for( size_t classIdx = 0; classIdx < objClasses.size(); ++classIdx ) { // Train a classifier on train dataset +#if defined HAVE_OPENCV_OCL && _OCL_SVM_ + cv::ocl::CvSVM_OCL svm; +#else CvSVM svm; +#endif trainSVMClassifier( svm, svmTrainParamsExt, objClasses[classIdx], vocData, bowExtractor, featureDetector, resPath ); diff --git a/samples/cpp/points_classifier.cpp b/samples/cpp/points_classifier.cpp index ecf8ae8a30e2f93784afa0cc42d2e6253e27994a..26858da886e1c9c04b6268fb219484803d1c1f8d 100644 --- a/samples/cpp/points_classifier.cpp +++ b/samples/cpp/points_classifier.cpp @@ -1,6 +1,12 @@ +#include "opencv2/opencv_modules.hpp" #include "opencv2/core/core.hpp" #include "opencv2/ml/ml.hpp" #include "opencv2/highgui/highgui.hpp" +#ifdef HAVE_OPENCV_OCL +#define _OCL_KNN_ 1 // select whether using ocl::KNN method or not, default is using +#define _OCL_SVM_ 1 // select whether using ocl::svm method or not, default is using +#include "opencv2/ocl/ocl.hpp" +#endif #include @@ -133,7 +139,14 @@ static void find_decision_boundary_KNN( int K ) prepare_train_data( trainSamples, trainClasses ); // learn classifier +#if defined HAVE_OPENCV_OCL && _OCL_KNN_ + cv::ocl::KNearestNeighbour knnClassifier; + Mat temp, result; + knnClassifier.train(trainSamples, trainClasses, temp, false, K); + cv::ocl::oclMat testSample_ocl, reslut_ocl; +#else CvKNearest knnClassifier( trainSamples, trainClasses, Mat(), false, K ); +#endif Mat testSample( 1, 2, CV_32FC1 ); for( int y = 0; y < img.rows; y += testStep ) @@ -142,9 +155,19 @@ static void find_decision_boundary_KNN( int K ) { testSample.at(0) = (float)x; testSample.at(1) = (float)y; +#if defined HAVE_OPENCV_OCL && _OCL_KNN_ + testSample_ocl.upload(testSample); + + knnClassifier.find_nearest(testSample_ocl, K, reslut_ocl); + + reslut_ocl.download(result); + int response = saturate_cast(result.at(0)); + circle(imgDst, Point(x, y), 1, classColors[response]); +#else int response = (int)knnClassifier.find_nearest( testSample, K ); circle( imgDst, Point(x,y), 1, classColors[response] ); +#endif } } } @@ -159,7 +182,11 @@ static void find_decision_boundary_SVM( CvSVMParams params ) prepare_train_data( trainSamples, trainClasses ); // learn classifier +#if defined HAVE_OPENCV_OCL && _OCL_SVM_ + cv::ocl::CvSVM_OCL svmClassifier(trainSamples, trainClasses, Mat(), Mat(), params); +#else CvSVM svmClassifier( trainSamples, trainClasses, Mat(), Mat(), params ); +#endif Mat testSample( 1, 2, CV_32FC1 ); for( int y = 0; y < img.rows; y += testStep ) @@ -178,7 +205,7 @@ static void find_decision_boundary_SVM( CvSVMParams params ) for( int i = 0; i < svmClassifier.get_support_vector_count(); i++ ) { const float* supportVector = svmClassifier.get_support_vector(i); - circle( imgDst, Point(supportVector[0],supportVector[1]), 5, Scalar(255,255,255), -1 ); + circle( imgDst, Point(saturate_cast(supportVector[0]),saturate_cast(supportVector[1])), 5, CV_RGB(255,255,255), 
-1 ); } } diff --git a/samples/ocl/facedetect.cpp b/samples/ocl/facedetect.cpp index 781efa066f526a47e2ac0ed8a5d739e3fb3c9346..b452ab8d8993fa0da16f4a9a225b8136b0f08a92 100644 --- a/samples/ocl/facedetect.cpp +++ b/samples/ocl/facedetect.cpp @@ -8,11 +8,16 @@ #include #include +#if defined(_MSC_VER) && (_MSC_VER >= 1700) + # include +#endif using namespace std; using namespace cv; #define LOOP_NUM 1 +///////////////////////////single-threading faces detecting/////////////////////////////// + const static Scalar colors[] = { CV_RGB(0,0,255), CV_RGB(0,128,255), CV_RGB(0,255,255), @@ -26,7 +31,7 @@ const static Scalar colors[] = { CV_RGB(0,0,255), int64 work_begin = 0; int64 work_end = 0; -string outputName; +string inputName, outputName, cascadeName; static void workBegin() { @@ -61,41 +66,17 @@ static void Draw(Mat& img, vector& faces, double scale); // Else if will return (total diff of each cpu and gpu rects covered pixels)/(total cpu rects covered pixels) double checkRectSimilarity(Size sz, vector& cpu_rst, vector& gpu_rst); -int main( int argc, const char** argv ) +static int facedetect_one_thread(bool useCPU, double scale ) { - const char* keys = - "{ h help | false | print help message }" - "{ i input | | specify input image }" - "{ t template | haarcascade_frontalface_alt.xml |" - " specify template file path }" - "{ c scale | 1.0 | scale image }" - "{ s use_cpu | false | use cpu or gpu to process the image }" - "{ o output | facedetect_output.jpg |" - " specify output image save path(only works when input is images) }"; - - CommandLineParser cmd(argc, argv, keys); - if (cmd.get("help")) - { - cout << "Usage : facedetect [options]" << endl; - cout << "Available options:" << endl; - cmd.printMessage(); - return EXIT_SUCCESS; - } - CvCapture* capture = 0; Mat frame, frameCopy0, frameCopy, image; - bool useCPU = cmd.get("s"); - string inputName = cmd.get("i"); - outputName = cmd.get("o"); - string cascadeName = cmd.get("t"); - double scale = cmd.get("c"); ocl::OclCascadeClassifier cascade; CascadeClassifier cpu_cascade; if( !cascade.load( cascadeName ) || !cpu_cascade.load(cascadeName) ) { - cout << "ERROR: Could not load classifier cascade" << endl; + cout << "ERROR: Could not load classifier cascade: " << cascadeName << endl; return EXIT_FAILURE; } @@ -186,9 +167,114 @@ int main( int argc, const char** argv ) } cvDestroyWindow("result"); + std::cout<< "single-threaded sample has finished" <= 1700) + +#define MAX_THREADS 10 + +static void detectFaces(std::string fileName) +{ + ocl::OclCascadeClassifier cascade; + if(!cascade.load(cascadeName)) + { + std::cout << "ERROR: Could not load classifier cascade: " << cascadeName << std::endl; + return; + } + + Mat img = imread(fileName, CV_LOAD_IMAGE_COLOR); + if (img.empty()) + { + std::cout << "cann't open file " + fileName < oclfaces; + cascade.detectMultiScale(d_img, oclfaces, 1.1, 3, 0|CV_HAAR_SCALE_IMAGE, Size(30, 30), Size(0, 0)); + + for(unsigned int i = 0; i threads; + for(int i = 0; i= 1 }"; + + CommandLineParser cmd(argc, argv, keys); + if (cmd.has("help")) + { + cout << "Usage : facedetect [options]" << endl; + cout << "Available options:" << endl; + cmd.printMessage(); + return EXIT_SUCCESS; + } + bool useCPU = cmd.get("s"); + inputName = cmd.get("i"); + outputName = cmd.get("o"); + cascadeName = cmd.get("t"); + double scale = cmd.get("c"); + int n = cmd.get("n"); + + if(n > 1) + { +#if defined(_MSC_VER) && (_MSC_VER >= 1700) + std::cout<<"multi-threaded sample is running" <& faces, ocl::OclCascadeClassifier& cascade, double scale)
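The multi-threaded path added to facedetect.cpp follows a simple pattern: spawn one std::thread per input image (capped by MAX_THREADS), let each worker run detectFaces() independently, then join them all before exiting. A minimal standalone sketch of that pattern, assuming only the C++11 standard library; the worker body is a placeholder print statement rather than the real ocl::OclCascadeClassifier::detectMultiScale call, and the file names are made up for illustration.

// Standalone sketch of the sample's thread-per-input structure.
#include <iostream>
#include <sstream>
#include <string>
#include <thread>
#include <vector>

static void detectFacesStub(const std::string& fileName)
{
    // The real sample loads the cascade, uploads the image to an oclMat and
    // runs detectMultiScale; here we only report which work item ran.
    std::cout << "processing " << fileName << std::endl;
}

int main()
{
    const int numThreads = 4; // the sample caps this with MAX_THREADS = 10

    std::vector<std::thread> workers;
    for (int i = 0; i < numThreads; ++i)
    {
        std::ostringstream name;
        name << "input_" << i << ".jpg"; // hypothetical input names
        workers.push_back(std::thread(detectFacesStub, name.str()));
    }

    // Join every worker, as the sample does before printing its final message.
    for (size_t i = 0; i < workers.size(); ++i)
        workers[i].join();

    return 0;
}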