Merge branch '2.4' of https://github.com/Itseez/opencv into 2.4_pyrup_fix

1ecc7659 · peng xiao · df3997b1 · 2a646f76 · 1ecc7659 · 1ecc7659
58 changed files
--- a/3rdparty/libjasper/CMakeLists.txt
+++ b/3rdparty/libjasper/CMakeLists.txt
@@ -23,7 +23,8 @@ if(WIN32 AND NOT MINGW)
  add_definitions(-DJAS_WIN_MSVC_BUILD)
 endif(WIN32 AND NOT MINGW)

-ocv_warnings_disable(CMAKE_C_FLAGS -Wno-implicit-function-declaration -Wno-uninitialized -Wmissing-prototypes -Wmissing-declarations -Wunused -Wshadow -Wsign-compare)
+ocv_warnings_disable(CMAKE_C_FLAGS -Wno-implicit-function-declaration -Wno-uninitialized -Wmissing-prototypes
+                                   -Wno-unused-but-set-parameter -Wmissing-declarations -Wunused -Wshadow -Wsign-compare)
 ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-parameter) # clang
 ocv_warnings_disable(CMAKE_C_FLAGS /wd4013 /wd4018 /wd4101 /wd4244 /wd4267 /wd4715) # vs2005


--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -594,12 +594,15 @@ endif()
 status("")
 status("  GUI: ")

-if(HAVE_QT)
+if(HAVE_QT5)
+  status("    QT 5.x:"            HAVE_QT        THEN "YES (ver ${Qt5Core_VERSION_STRING})" ELSE NO)
+  status("    QT OpenGL support:" HAVE_QT_OPENGL THEN "YES (${Qt5OpenGL_LIBRARIES} ${Qt5OpenGL_VERSION_STRING})" ELSE NO)
+elseif(HAVE_QT)
  status("    QT 4.x:"            HAVE_QT        THEN "YES (ver ${QT_VERSION_MAJOR}.${QT_VERSION_MINOR}.${QT_VERSION_PATCH} ${QT_EDITION})" ELSE NO)
  status("    QT OpenGL support:" HAVE_QT_OPENGL THEN "YES (${QT_QTOPENGL_LIBRARY})" ELSE NO)
 else()
  if(DEFINED WITH_QT)
-    status("    QT 4.x:" NO)
+    status("    QT:" NO)
  endif()
  if(DEFINED WITH_WIN32UI)
    status("    Win32 UI:" HAVE_WIN32UI THEN YES ELSE NO)

--- a/cmake/OpenCVDetectAndroidSDK.cmake
+++ b/cmake/OpenCVDetectAndroidSDK.cmake
@@ -176,7 +176,8 @@ macro(android_get_compatible_target VAR)
 endmacro()

 unset(__android_project_chain CACHE)
-#add_android_project(target_name ${path} NATIVE_DEPS opencv_core LIBRARY_DEPS ${OpenCV_BINARY_DIR} SDK_TARGET 11)
+
+# add_android_project(target_name ${path} NATIVE_DEPS opencv_core LIBRARY_DEPS ${OpenCV_BINARY_DIR} SDK_TARGET 11)
 macro(add_android_project target path)
  # parse arguments
  set(android_proj_arglist NATIVE_DEPS LIBRARY_DEPS SDK_TARGET IGNORE_JAVA IGNORE_MANIFEST)
@@ -212,6 +213,16 @@ macro(add_android_project target path)
    ocv_check_dependencies(${android_proj_NATIVE_DEPS} opencv_java)
  endif()

+  if(EXISTS "${path}/jni/Android.mk" )
+    # find if native_app_glue is used
+    file(STRINGS "${path}/jni/Android.mk" NATIVE_APP_GLUE REGEX ".*(call import-module,android/native_app_glue)" )
+    if(NATIVE_APP_GLUE)
+      if(ANDROID_NATIVE_API_LEVEL LESS 9 OR NOT EXISTS "${ANDROID_NDK}/sources/android/native_app_glue")
+        set(OCV_DEPENDENCIES_FOUND FALSE)
+      endif()
+    endif()
+  endif()
+
  if(OCV_DEPENDENCIES_FOUND AND android_proj_sdk_target AND ANDROID_EXECUTABLE AND ANT_EXECUTABLE AND ANDROID_TOOLS_Pkg_Revision GREATER 13 AND EXISTS "${path}/${ANDROID_MANIFEST_FILE}")

    project(${target})
@@ -268,9 +279,6 @@ macro(add_android_project target path)
      file(STRINGS "${path}/jni/Android.mk" JNI_LIB_NAME REGEX "LOCAL_MODULE[ ]*:=[ ]*.*" )
      string(REGEX REPLACE "LOCAL_MODULE[ ]*:=[ ]*([a-zA-Z_][a-zA-Z_0-9]*)[ ]*" "\\1" JNI_LIB_NAME "${JNI_LIB_NAME}")

-      # find using of native app glue to determine native activity
-      file(STRINGS "${path}/jni/Android.mk" NATIVE_APP_GLUE REGEX ".*(call import-module,android/native_app_glue)" )
-
      if(JNI_LIB_NAME)
        ocv_include_modules_recurse(${android_proj_NATIVE_DEPS})
        ocv_include_directories("${path}/jni")
@@ -291,9 +299,9 @@ macro(add_android_project target path)
            )

        get_target_property(android_proj_jni_location "${JNI_LIB_NAME}" LOCATION)
-    if (NOT (CMAKE_BUILD_TYPE MATCHES "debug"))
-        add_custom_command(TARGET ${JNI_LIB_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} --strip-unneeded "${android_proj_jni_location}")
-    endif()
+        if (NOT (CMAKE_BUILD_TYPE MATCHES "debug"))
+            add_custom_command(TARGET ${JNI_LIB_NAME} POST_BUILD COMMAND ${CMAKE_STRIP} --strip-unneeded "${android_proj_jni_location}")
+        endif()
      endif()
    endif()


--- a/cmake/OpenCVDetectCXXCompiler.cmake
+++ b/cmake/OpenCVDetectCXXCompiler.cmake
@@ -101,7 +101,7 @@ endif()

 if(MSVC64 OR MINGW64)
  set(X86_64 1)
-elseif(MSVC AND NOT CMAKE_CROSSCOMPILING)
+elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING))
  set(X86 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
  set(X86_64 1)

--- a/cmake/OpenCVFindLibsGUI.cmake
+++ b/cmake/OpenCVFindLibsGUI.cmake
@@ -13,12 +13,31 @@ if(WITH_WIN32UI)
 endif(WITH_WIN32UI)

 # --- QT4 ---
-ocv_clear_vars(HAVE_QT)
+ocv_clear_vars(HAVE_QT HAVE_QT5)
 if(WITH_QT)
-  find_package(Qt4)
-  if(QT4_FOUND)
-    set(HAVE_QT TRUE)
-    add_definitions(-DHAVE_QT) # We need to define the macro this way, using cvconfig.h does not work
+  if(NOT CMAKE_VERSION VERSION_LESS 2.8.3 AND NOT WITH_QT EQUAL 4)
+    find_package(Qt5Core)
+    find_package(Qt5Gui)
+    find_package(Qt5Widgets)
+    find_package(Qt5Test)
+    find_package(Qt5Concurrent)
+    if(Qt5Core_FOUND AND Qt5Gui_FOUND AND Qt5Widgets_FOUND AND Qt5Test_FOUND AND Qt5Concurrent_FOUND)
+      set(HAVE_QT5 ON)
+      set(HAVE_QT  ON)
+      add_definitions(-DHAVE_QT)
+      find_package(Qt5OpenGL)
+      if(Qt5OpenGL_FOUND)
+        set(QT_QTOPENGL_FOUND ON)
+      endif()
+    endif()
+  endif()
+
+  if(NOT HAVE_QT)
+    find_package(Qt4)
+    if(QT4_FOUND)
+      set(HAVE_QT TRUE)
+      add_definitions(-DHAVE_QT) # We need to define the macro this way, using cvconfig.h does not work
+    endif()
  endif()
 endif()


--- a/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.rst
+++ b/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.rst
@@ -1477,6 +1477,6 @@ The function reconstructs 3-dimensional points (in homogeneous coordinates) by u

 .. [HH08] Hirschmuller, H. Stereo Processing by Semiglobal Matching and Mutual Information, PAMI(30), No. 2, February 2008, pp. 328-341.

-.. [Slabaugh] Slabaugh, G.G. Computing Euler angles from a rotation matrix. http://gregslabaugh.name/publications/euler.pdf
+.. [Slabaugh] Slabaugh, G.G. Computing Euler angles from a rotation matrix. http://www.soi.city.ac.uk/~sbbh653/publications/euler.pdf (verified: 2013-04-15)

 .. [Zhang2000] Z. Zhang. A Flexible New Technique for Camera Calibration. IEEE Transactions on Pattern Analysis and Machine Intelligence, 22(11):1330-1334, 2000.
--- a/modules/core/src/matmul.cpp
+++ b/modules/core/src/matmul.cpp
@@ -2855,8 +2855,9 @@ PCA& PCA::operator()(InputArray _data, InputArray __mean, int flags, int maxComp

    if( _mean.data )
    {
-        CV_Assert( _mean.size() == mean_sz );
+        CV_Assert( _mean.size() == mean_sz );        
        _mean.convertTo(mean, ctype);
+        covar_flags |= CV_COVAR_USE_AVG; 
    }

    calcCovarMatrix( data, covar, mean, covar_flags, ctype );

--- a/modules/flann/include/opencv2/flann/ground_truth.h
+++ b/modules/flann/include/opencv2/flann/ground_truth.h
@@ -42,7 +42,6 @@ template <typename Distance>
 void find_nearest(const Matrix<typename Distance::ElementType>& dataset, typename Distance::ElementType* query, int* matches, int nn,
                  int skip = 0, Distance distance = Distance())
 {
-    typedef typename Distance::ElementType ElementType;
    typedef typename Distance::ResultType DistanceType;
    int n = nn + skip;


--- a/modules/highgui/CMakeLists.txt
+++ b/modules/highgui/CMakeLists.txt
@@ -70,7 +70,26 @@ set(highgui_srcs

 file(GLOB highgui_ext_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")

-if(HAVE_QT)
+if(HAVE_QT5)
+  set(CMAKE_AUTOMOC ON)
+  set(CMAKE_INCLUDE_CURRENT_DIR ON)
+
+  QT5_ADD_RESOURCES(_RCC_OUTFILES src/window_QT.qrc)
+  list(APPEND highgui_srcs src/window_QT.cpp src/window_QT.h ${_RCC_OUTFILES})
+
+  foreach(dt5_dep Core Gui Widgets Test Concurrent)
+    add_definitions(${Qt5${dt5_dep}_DEFINITIONS})
+    include_directories(${Qt5${dt5_dep}_INCLUDE_DIRS})
+    list(APPEND HIGHGUI_LIBRARIES ${Qt5${dt5_dep}_LIBRARIES})
+  endforeach()
+
+  if(HAVE_QT_OPENGL)
+    add_definitions(${Qt5OpenGL_DEFINITIONS})
+    include_directories(${Qt5OpenGL_INCLUDE_DIRS})
+    list(APPEND HIGHGUI_LIBRARIES ${Qt5OpenGL_LIBRARIES})
+  endif()
+
+elseif(HAVE_QT)
  if (HAVE_QT_OPENGL)
    set(QT_USE_QTOPENGL TRUE)
  endif()

--- a/modules/highgui/src/cap_libv4l.cpp
+++ b/modules/highgui/src/cap_libv4l.cpp
@@ -1665,6 +1665,17 @@ static int icvSetPropertyCAM_V4L(CvCaptureCAM_V4L* capture, int property_id, dou
            width = height = 0;
        }
        break;
+    case CV_CAP_PROP_FPS:
+        struct v4l2_streamparm setfps;
+        memset (&setfps, 0, sizeof(struct v4l2_streamparm));
+        setfps.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+        setfps.parm.capture.timeperframe.numerator = 1;
+        setfps.parm.capture.timeperframe.denominator = value;
+        if (xioctl (capture->deviceHandle, VIDIOC_S_PARM, &setfps) < 0){
+            fprintf(stderr, "HIGHGUI ERROR: V4L: Unable to set camera FPS\n");
+            retval=0;
+        }
+        break;
    default:
        retval = icvSetControl(capture, property_id, value);
    }

--- a/modules/highgui/src/grfmt_jpeg.cpp
+++ b/modules/highgui/src/grfmt_jpeg.cpp
@@ -52,6 +52,11 @@
 #include <stdio.h>
 #include <setjmp.h>

+// the following defines are a hack to avoid multiple problems with frame pointer handling and setjmp
+// see http://gcc.gnu.org/ml/gcc/2011-10/msg00324.html for some details
+#define mingw_getsp(...) 0
+#define __builtin_frame_address(...) 0
+
 #ifdef WIN32

 #define XMD_H // prevent redefinition of INT32

--- a/modules/highgui/src/grfmt_png.cpp
+++ b/modules/highgui/src/grfmt_png.cpp
@@ -73,6 +73,11 @@
    #pragma warning( disable: 4611 )
 #endif

+// the following defines are a hack to avoid multiple problems with frame pointer handling and setjmp
+// see http://gcc.gnu.org/ml/gcc/2011-10/msg00324.html for some details
+#define mingw_getsp(...) 0
+#define __builtin_frame_address(...) 0
+
 namespace cv
 {


--- a/modules/highgui/src/window_QT.h
+++ b/modules/highgui/src/window_QT.h
@@ -48,13 +48,13 @@
 #endif

 #include <QAbstractEventDispatcher>
-#include <QtGui/QApplication>
+#include <QApplication>
 #include <QFile>
 #include <QPushButton>
-#include <QtGui/QGraphicsView>
+#include <QGraphicsView>
 #include <QSizePolicy>
 #include <QInputDialog>
-#include <QtGui/QBoxLayout>
+#include <QBoxLayout>
 #include <QSettings>
 #include <qtimer.h>
 #include <QtConcurrentRun>
@@ -78,7 +78,7 @@
 #include <QRadioButton>
 #include <QButtonGroup>
 #include <QMenu>
-#include <QtTest/QTest>
+#include <QTest>

 //start private enum
 enum { CV_MODE_NORMAL = 0, CV_MODE_OPENGL = 1 };

--- a/modules/imgproc/doc/structural_analysis_and_shape_descriptors.rst
+++ b/modules/imgproc/doc/structural_analysis_and_shape_descriptors.rst
@@ -406,7 +406,7 @@ Finds the convex hull of a point set.

    :param hull_storage: Output memory storage in the old API (``cvConvexHull2`` returns a sequence containing the convex hull points or their indices).

-    :param clockwise: Orientation flag. If it is true, the output convex hull is oriented clockwise. Otherwise, it is oriented counter-clockwise. The usual screen coordinate system is assumed so that the origin is at the top-left corner, x axis is oriented to the right, and y axis is oriented downwards.
+    :param clockwise: Orientation flag. If it is true, the output convex hull is oriented clockwise. Otherwise, it is oriented counter-clockwise. The assumed coordinate system has its X axis pointing to the right, and its Y axis pointing upwards.

    :param orientation: Convex hull orientation parameter in the old API, ``CV_CLOCKWISE`` or ``CV_COUNTERCLOCKWISE``.


--- a/modules/imgproc/src/floodfill.cpp
+++ b/modules/imgproc/src/floodfill.cpp
@@ -93,7 +93,6 @@ icvFloodFill_CnIR( uchar* pImage, int step, CvSize roi, CvPoint seed,
                   _Tp newVal, CvConnectedComp* region, int flags,
                   std::vector<CvFFillSegment>* buffer )
 {
-    typedef typename cv::DataType<_Tp>::channel_type _CTp;
    _Tp* img = (_Tp*)(pImage + step * seed.y);
    int i, L, R;
    int area = 0;
@@ -252,7 +251,6 @@ icvFloodFillGrad_CnIR( uchar* pImage, int step, uchar* pMask, int maskStep,
                       CvConnectedComp* region, int flags,
                       std::vector<CvFFillSegment>* buffer )
 {
-    typedef typename cv::DataType<_Tp>::channel_type _CTp;
    _Tp* img = (_Tp*)(pImage + step*seed.y);
    uchar* mask = (pMask += maskStep + 1) + maskStep*seed.y;
    int i, L, R;

--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -1219,8 +1219,6 @@ static void resizeGeneric_( const Mat& src, Mat& dst,
                            const int* yofs, const void* _beta,
                            int xmin, int xmax, int ksize )
 {
-    typedef typename HResize::value_type T;
-    typedef typename HResize::buf_type WT;
    typedef typename HResize::alpha_type AT;

    const AT* beta = (const AT*)_beta;

--- a/modules/legacy/src/blobtrackgenyml.cpp
+++ b/modules/legacy/src/blobtrackgenyml.cpp
@@ -61,7 +61,7 @@ protected:
    {
        int     ObjNum = m_TrackList.GetBlobNum();
        int     i;
-        char    video_name[1024];
+        char    video_name[1024+1];
        char*   struct_name = NULL;
        CvFileStorage* storage = cvOpenFileStorage(m_pFileName,NULL,CV_STORAGE_WRITE_TEXT);


--- a/modules/legacy/src/kdtree.cpp
+++ b/modules/legacy/src/kdtree.cpp
@@ -117,10 +117,10 @@ class CvKDTreeWrap : public CvFeatureTree {
           CvMat* results) {
    int rn = results->rows * results->cols;
    std::vector<int> inbounds;
-    dispatch_cvtype(mat, ((__treetype*)data)->
-        find_ortho_range((typename __treetype::scalar_type*)bounds_min->data.ptr,
+    assert(CV_MAT_DEPTH(mat->type) == CV_32F || CV_MAT_DEPTH(mat->type) == CV_64F);
+    ((__treetype*)data)->find_ortho_range((typename __treetype::scalar_type*)bounds_min->data.ptr,
             (typename __treetype::scalar_type*)bounds_max->data.ptr,
-             inbounds));
+             inbounds);
    std::copy(inbounds.begin(),
        inbounds.begin() + std::min((int)inbounds.size(), rn),
        (int*) results->data.ptr);

--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@@ -802,6 +802,44 @@ namespace cv
                                        int minNeighbors, int flags, CvSize minSize = cvSize(0, 0), CvSize maxSize = cvSize(0, 0));
        };

+        class CV_EXPORTS OclCascadeClassifierBuf : public  cv::CascadeClassifier
+        {
+        public:
+            OclCascadeClassifierBuf() :
+                m_flags(0), initialized(false), m_scaleFactor(0), buffers(NULL) {}
+
+            ~OclCascadeClassifierBuf() {}
+
+            void detectMultiScale(oclMat &image, CV_OUT std::vector<cv::Rect>& faces,
+                                  double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0,
+                                  Size minSize = Size(), Size maxSize = Size());
+            void release();
+
+        private:
+            void Init(const int rows, const int cols, double scaleFactor, int flags,
+                      const int outputsz, const size_t localThreads[],
+                      CvSize minSize, CvSize maxSize);
+            void CreateBaseBufs(const int datasize, const int totalclassifier, const int flags, const int outputsz);
+            void CreateFactorRelatedBufs(const int rows, const int cols, const int flags,
+                                         const double scaleFactor, const size_t localThreads[],
+                                         CvSize minSize, CvSize maxSize);
+            void GenResult(CV_OUT std::vector<cv::Rect>& faces, const std::vector<cv::Rect> &rectList, const std::vector<int> &rweights);
+
+            int m_rows;
+            int m_cols;
+            int m_flags;
+            int m_loopcount;
+            int m_nodenum;
+            bool findBiggestObject;
+            bool initialized;
+            double m_scaleFactor;
+            Size m_minSize;
+            Size m_maxSize;
+            vector<CvSize> sizev;
+            vector<float> scalev;
+            oclMat gimg1, gsum, gsqsum;
+            void * buffers;
+        };


        /////////////////////////////// Pyramid /////////////////////////////////////
@@ -1731,6 +1769,44 @@ namespace cv
            std::vector<oclMat> datas;
            oclMat out;
        };
+        class CV_EXPORTS StereoConstantSpaceBP
+        {
+        public:
+            enum { DEFAULT_NDISP    = 128 };
+            enum { DEFAULT_ITERS    = 8   };
+            enum { DEFAULT_LEVELS   = 4   };
+            enum { DEFAULT_NR_PLANE = 4   };
+            static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane);
+            explicit StereoConstantSpaceBP(
+                int ndisp    = DEFAULT_NDISP,
+                int iters    = DEFAULT_ITERS,
+                int levels   = DEFAULT_LEVELS,
+                int nr_plane = DEFAULT_NR_PLANE,
+                int msg_type = CV_32F);
+            StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
+                float max_data_term, float data_weight, float max_disc_term, float disc_single_jump,
+                int min_disp_th = 0,
+                int msg_type = CV_32F);
+            void operator()(const oclMat &left, const oclMat &right, oclMat &disparity);
+            int ndisp;
+            int iters;
+            int levels;
+            int nr_plane;
+            float max_data_term;
+            float data_weight;
+            float max_disc_term;
+            float disc_single_jump;
+            int min_disp_th;
+            int msg_type;
+            bool use_local_init_data_cost;
+        private:
+            oclMat u[2], d[2], l[2], r[2];
+            oclMat disp_selected_pyr[2];
+            oclMat data_cost;
+            oclMat data_cost_selected;
+            oclMat temp;
+            oclMat out;
+        };
    }
 }
 #if defined _MSC_VER && _MSC_VER >= 1200

--- a/modules/ocl/src/brute_force_matcher.cpp
+++ b/modules/ocl/src/brute_force_matcher.cpp
@@ -65,12 +65,12 @@ namespace cv
 static const int OPT_SIZE = 100;

 static const char * T_ARR [] = {
-    "uchar", 
-    "char", 
-    "ushort", 
-    "short", 
-    "int", 
-    "float -D T_FLOAT", 
+    "uchar",
+    "char",
+    "ushort",
+    "short",
+    "int",
+    "float -D T_FLOAT",
    "double"};

 template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
@@ -86,8 +86,8 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
    vector< pair<size_t, const void *> > args;

    char opt [OPT_SIZE] = "";
-    sprintf(opt, 
-        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d", 
+    sprintf(opt,
+        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
        T_ARR[query.depth()], distType, block_size, m_size);

    if(globalSize[0] != 0)
@@ -128,8 +128,8 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
    vector< pair<size_t, const void *> > args;

    char opt [OPT_SIZE] = "";
-    sprintf(opt, 
-        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d", 
+    sprintf(opt,
+        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
        T_ARR[query.depth()], distType, block_size);
    if(globalSize[0] != 0)
    {
@@ -171,8 +171,8 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
    vector< pair<size_t, const void *> > args;

    char opt [OPT_SIZE] = "";
-    sprintf(opt, 
-        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d", 
+    sprintf(opt,
+        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
        T_ARR[query.depth()], distType, block_size, m_size);

    if(globalSize[0] != 0)
@@ -212,8 +212,8 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
    vector< pair<size_t, const void *> > args;

    char opt [OPT_SIZE] = "";
-    sprintf(opt, 
-        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d", 
+    sprintf(opt,
+        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
        T_ARR[query.depth()], distType, block_size);

    if(globalSize[0] != 0)
@@ -312,8 +312,8 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
    vector< pair<size_t, const void *> > args;

    char opt [OPT_SIZE] = "";
-    sprintf(opt, 
-        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d", 
+    sprintf(opt,
+        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
        T_ARR[query.depth()], distType, block_size, m_size);

    if(globalSize[0] != 0)
@@ -348,8 +348,8 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
    vector< pair<size_t, const void *> > args;

    char opt [OPT_SIZE] = "";
-    sprintf(opt, 
-        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d", 
+    sprintf(opt,
+        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
        T_ARR[query.depth()], distType, block_size);

    if(globalSize[0] != 0)
@@ -384,8 +384,8 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat
    vector< pair<size_t, const void *> > args;

    char opt [OPT_SIZE] = "";
-    sprintf(opt, 
-        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d", 
+    sprintf(opt,
+        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
        T_ARR[query.depth()], distType, block_size, m_size);

    if(globalSize[0] != 0)
@@ -420,8 +420,8 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask
    vector< pair<size_t, const void *> > args;

    char opt [OPT_SIZE] = "";
-    sprintf(opt, 
-        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d", 
+    sprintf(opt,
+        "-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
        T_ARR[query.depth()], distType, block_size);

    if(globalSize[0] != 0)
@@ -561,7 +561,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchSingle(const oclMat &query, const
 {
    if (query.empty() || train.empty())
        return;
-    
+
    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(train.cols == query.cols && train.type() == query.type());

@@ -673,7 +673,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchCollection(const oclMat &query, c
        return;

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
-    
+
    const int nQuery = query.rows;

    ensureSizeIsEnough(1, nQuery, CV_32S, trainIdx);
@@ -845,8 +845,8 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch2Collection(const oclMat &quer
    if (query.empty() || trainCollection.empty())
        return;

-    typedef void (*caller_t)(const oclMat & query, const oclMat & trains, const oclMat & masks,
-                             const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance);
+    // typedef void (*caller_t)(const oclMat & query, const oclMat & trains, const oclMat & masks,
+    //                          const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance);

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);

@@ -993,7 +993,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, vector<

 // radiusMatchSingle
 void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat &query, const oclMat &train,
-        oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance, const oclMat &mask)
+        oclMat &trainIdx,   oclMat &distance, oclMat &nMatches, float maxDistance, const oclMat &mask)
 {
    if (query.empty() || train.empty())
        return;
@@ -1095,9 +1095,9 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchCollection(const oclMat &qu
    if (query.empty() || empty())
        return;

+#if 0
    typedef void (*caller_t)(const oclMat & query, const oclMat * trains, int n, float maxDistance, const oclMat * masks,
                             const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance, const oclMat & nMatches);
-#if 0
    static const caller_t callers[3][6] =
    {
        {

--- a/modules/ocl/src/gemm.cpp
+++ b/modules/ocl/src/gemm.cpp
@@ -60,7 +60,7 @@ void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
                   const oclMat &src3, double beta, oclMat &dst, int flags)
 {
    CV_Assert(src1.cols == src2.rows &&
-              (src3.empty() || src1.rows == src3.rows && src2.cols == src3.cols));
+              (src3.empty() || (src1.rows == src3.rows && src2.cols == src3.cols)));
    CV_Assert(!(cv::GEMM_3_T & flags)); // cv::GEMM_3_T is not supported
    if(!src3.empty())
    {

--- a/modules/ocl/src/haar.cpp
+++ b/modules/ocl/src/haar.cpp
--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
@@ -43,6 +43,7 @@
 //
 //M*/

+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
 #include "precomp.hpp"

 using namespace std;

--- a/modules/ocl/src/moments.cpp
+++ b/modules/ocl/src/moments.cpp
@@ -327,7 +327,7 @@ static void ocl_cvMoments( const void* array, CvMoments* mom, int binary )
    mom->m12 = dstsum[8];
    mom->m03 = dstsum[9];
    delete [] dstsum;
-
+    openCLSafeCall(clReleaseMemObject(sum));
    icvCompleteMomentState( mom );
 }


--- a/modules/ocl/src/opencl/filtering_boxFilter.cl
+++ b/modules/ocl/src/opencl/filtering_boxFilter.cl
--- a/modules/ocl/src/opencl/haarobjectdetect.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect.cl
--- a/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
@@ -16,6 +16,7 @@
 //
 // @Authors
 //    Wu Xinglong, wxl370@126.com
+//    Sen Liu, swjtuls1987@126.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -52,11 +53,11 @@ typedef struct  __attribute__((aligned(128)))  GpuHidHaarFeature
 {
    struct __attribute__((aligned(32)))
 {
-        int p0 __attribute__((aligned(4)));
-        int p1 __attribute__((aligned(4)));
-        int p2 __attribute__((aligned(4)));
-        int p3 __attribute__((aligned(4)));
-        float weight __attribute__((aligned(4)));
+    int p0 __attribute__((aligned(4)));
+    int p1 __attribute__((aligned(4)));
+    int p2 __attribute__((aligned(4)));
+    int p3 __attribute__((aligned(4)));
+    float weight __attribute__((aligned(4)));
 }
 rect[CV_HAAR_FEATURE_MAX] __attribute__((aligned(32)));
 }
@@ -113,173 +114,168 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
    global const int *restrict sum,
    global const float   *restrict sqsum,
    global int4 *candidate,
+    const int rows,
+    const int cols,
    const int step,
    const int loopcount,
    const int start_stage,
    const int split_stage,
    const int end_stage,
    const int startnode,
-    const int splitnode,
    global int4 *p,
-    //const int4 * pq,
    global float *correction,
    const int nodecount)
 {
-        int grpszx = get_local_size(0);
-        int grpszy = get_local_size(1);
-        int grpnumx = get_num_groups(0);
-        int grpidx = get_group_id(0);
-        int lclidx = get_local_id(0);
-        int lclidy = get_local_id(1);
-        int lcl_sz = mul24(grpszx, grpszy);
-        int lcl_id = mad24(lclidy, grpszx, lclidx);
-        __local int lclshare[1024];
-        __local int *glboutindex = lclshare + 0;
-        __local int *lclcount = glboutindex + 1;
-        __local int *lcloutindex = lclcount + 1;
-        __local float *partialsum = (__local float *)(lcloutindex + (lcl_sz << 1));
-        glboutindex[0] = 0;
-        int outputoff = mul24(grpidx, 256);
-        candidate[outputoff + (lcl_id << 2)] = (int4)0;
-        candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
-        candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
-        candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
+    int grpszx = get_local_size(0);
+    int grpszy = get_local_size(1);
+    int grpnumx = get_num_groups(0);
+    int grpidx = get_group_id(0);
+    int lclidx = get_local_id(0);
+    int lclidy = get_local_id(1);
+    int lcl_sz = mul24(grpszx, grpszy);
+    int lcl_id = mad24(lclidy, grpszx, lclidx);
+    __local int glboutindex[1];
+    __local int lclcount[1];
+    __local int lcloutindex[64];
+    glboutindex[0] = 0;
+    int outputoff = mul24(grpidx, 256);
+    candidate[outputoff + (lcl_id << 2)] = (int4)0;
+    candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
+    candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
+    candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
+    int max_idx = rows * cols - 1;
+    for (int scalei = 0; scalei < loopcount; scalei++)
+    {
+        int4 scaleinfo1;
+        scaleinfo1 = info[scalei];
+        int width = (scaleinfo1.x & 0xffff0000) >> 16;
+        int height = scaleinfo1.x & 0xffff;
+        int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
+        int totalgrp = scaleinfo1.y & 0xffff;
+        float factor = as_float(scaleinfo1.w);
+        float correction_t = correction[scalei];
+        int ystep = (int)(max(2.0f, factor) + 0.5f);

-        for (int scalei = 0; scalei < loopcount; scalei++)
+        for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
        {
-                int4 scaleinfo1;
-                scaleinfo1 = info[scalei];
-                int width = (scaleinfo1.x & 0xffff0000) >> 16;
-                int height = scaleinfo1.x & 0xffff;
-                int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
-                int totalgrp = scaleinfo1.y & 0xffff;
-                float factor = as_float(scaleinfo1.w);
-                float correction_t = correction[scalei];
-                int ystep = (int)(max(2.0f, factor) + 0.5f);
+            int4 cascadeinfo = p[scalei];
+            int grpidy = grploop / grpnumperline;
+            int grpidx = grploop - mul24(grpidy, grpnumperline);
+            int ix = mad24(grpidx, grpszx, lclidx);
+            int iy = mad24(grpidy, grpszy, lclidy);
+            int x = ix * ystep;
+            int y = iy * ystep;
+            lcloutindex[lcl_id] = 0;
+            lclcount[0] = 0;
+            int nodecounter;
+            float mean, variance_norm_factor;
+            //if((ix < width) && (iy < height))
+            {
+                const int p_offset = mad24(y, step, x);
+                cascadeinfo.x += p_offset;
+                cascadeinfo.z += p_offset;
+                mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
+                        sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
+                       * correction_t;
+                variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
+                                       sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)] + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
+                variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
+                variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
+                bool result = true;
+                nodecounter = startnode + nodecount * scalei;

-                for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
+                for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
                {
-                        int4 cascadeinfo = p[scalei];
-                        int grpidy = grploop / grpnumperline;
-                        int grpidx = grploop - mul24(grpidy, grpnumperline);
-                        int ix = mad24(grpidx, grpszx, lclidx);
-                        int iy = mad24(grpidy, grpszy, lclidy);
-                        int x = ix * ystep;
-                        int y = iy * ystep;
-                        lcloutindex[lcl_id] = 0;
-                        lclcount[0] = 0;
-                        int result = 1, nodecounter;
-                        float mean, variance_norm_factor;
-                        //if((ix < width) && (iy < height))
-                        {
-                                const int p_offset = mad24(y, step, x);
-                                cascadeinfo.x += p_offset;
-                                cascadeinfo.z += p_offset;
-                                mean = (sum[mad24(cascadeinfo.y, step, cascadeinfo.x)] - sum[mad24(cascadeinfo.y, step, cascadeinfo.z)] -
-                                        sum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sum[mad24(cascadeinfo.w, step, cascadeinfo.z)])
-                                       * correction_t;
-                                variance_norm_factor = sqsum[mad24(cascadeinfo.y, step, cascadeinfo.x)] - sqsum[mad24(cascadeinfo.y, step, cascadeinfo.z)] -
-                                                       sqsum[mad24(cascadeinfo.w, step, cascadeinfo.x)] + sqsum[mad24(cascadeinfo.w, step, cascadeinfo.z)];
-                                variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
-                                variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
-                                result = 1;
-                                nodecounter = startnode + nodecount * scalei;
-
-                                for (int stageloop = start_stage; stageloop < end_stage && result; stageloop++)
-                                {
-                                        float stage_sum = 0.f;
-                                        int4 stageinfo = *(global int4 *)(stagecascadeptr + stageloop);
-                                        float stagethreshold = as_float(stageinfo.y);
-
-                                        for (int nodeloop = 0; nodeloop < stageinfo.x; nodeloop++)
-                                        {
-                                                __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
-                                                int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
-                                                int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
-                                                int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
-                                                float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
-                                                float2 alpha2 = *(__global float2 *)(&(currentnodeptr->alpha[0]));
-                                                float nodethreshold  = w.w * variance_norm_factor;
-                                                info1.x += p_offset;
-                                                info1.z += p_offset;
-                                                info2.x += p_offset;
-                                                info2.z += p_offset;
-                                                float classsum = (sum[mad24(info1.y, step, info1.x)] - sum[mad24(info1.y, step, info1.z)] -
-                                                                  sum[mad24(info1.w, step, info1.x)] + sum[mad24(info1.w, step, info1.z)]) * w.x;
-                                                classsum += (sum[mad24(info2.y, step, info2.x)] - sum[mad24(info2.y, step, info2.z)] -
-                                                             sum[mad24(info2.w, step, info2.x)] + sum[mad24(info2.w, step, info2.z)]) * w.y;
-                                                info3.x += p_offset;
-                                                info3.z += p_offset;
-                                                classsum += (sum[mad24(info3.y, step, info3.x)] - sum[mad24(info3.y, step, info3.z)] -
-                                                             sum[mad24(info3.w, step, info3.x)] + sum[mad24(info3.w, step, info3.z)]) * w.z;
-                                                stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
-                                                nodecounter++;
-                                        }
-
-                                        result = (stage_sum >= stagethreshold);
-                                }
+                    float stage_sum = 0.f;
+                    int   stagecount = stagecascadeptr[stageloop].count;
+                    for (int nodeloop = 0; nodeloop < stagecount; nodeloop++)
+                    {
+                        __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
+                        int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
+                        int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
+                        int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
+                        float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
+                        float2 alpha2 = *(__global float2 *)(&(currentnodeptr->alpha[0]));
+                        float nodethreshold  = w.w * variance_norm_factor;
+                        info1.x += p_offset;
+                        info1.z += p_offset;
+                        info2.x += p_offset;
+                        info2.z += p_offset;
+                        float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)] - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
+                                          sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)] + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
+                        classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)] - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
+                                     sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)] + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
+                        info3.x += p_offset;
+                        info3.z += p_offset;
+                        classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)] - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
+                                     sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)] + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
+                        stage_sum += classsum >= nodethreshold ? alpha2.y : alpha2.x;
+                        nodecounter++;
+                    }
+                    result = (bool)(stage_sum >= stagecascadeptr[stageloop].threshold);
+                }

-                                if (result && (ix < width) && (iy < height))
-                                {
-                                        int queueindex = atomic_inc(lclcount);
-                                        lcloutindex[queueindex << 1] = (y << 16) | x;
-                                        lcloutindex[(queueindex << 1) + 1] = as_int(variance_norm_factor);
-                                }
+                barrier(CLK_LOCAL_MEM_FENCE);

-                                barrier(CLK_LOCAL_MEM_FENCE);
-                                int queuecount = lclcount[0];
-                                nodecounter = splitnode + nodecount * scalei;
+                if (result && (ix < width) && (iy < height))
+                {
+                    int queueindex = atomic_inc(lclcount);
+                    lcloutindex[queueindex] = (y << 16) | x;
+                }

-                                if (lcl_id < queuecount)
-                                {
-                                        int temp = lcloutindex[lcl_id << 1];
-                                        int x = temp & 0xffff;
-                                        int y = (temp & (int)0xffff0000) >> 16;
-                                        temp = glboutindex[0];
-                                        int4 candidate_result;
-                                        candidate_result.zw = (int2)convert_int_rtn(factor * 20.f);
-                                        candidate_result.x = x;
-                                        candidate_result.y = y;
-                                        atomic_inc(glboutindex);
-                                        candidate[outputoff + temp + lcl_id] = candidate_result;
-                                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+                int queuecount = lclcount[0];

-                                barrier(CLK_LOCAL_MEM_FENCE);
-                        }
+                if (lcl_id < queuecount) // one work-item per queued hit publishes it
+                {
+                    int temp = lcloutindex[lcl_id];
+                    int x = temp & 0xffff; // unpack the (y << 16) | x value queued above
+                    int y = (temp & (int)0xffff0000) >> 16;
+                    temp = atomic_inc(glboutindex); // old counter value: a slot no other work-item of this group receives
+                    int4 candidate_result;
+                    candidate_result.zw = (int2)convert_int_rtn(factor * 20.f);
+                    candidate_result.x = x;
+                    candidate_result.y = y;
+                    candidate[outputoff + temp] = candidate_result; // temp alone is unique; adding lcl_id let two work-items hit the same slot
                 }
+
+                barrier(CLK_LOCAL_MEM_FENCE);
+            }
        }
+    }
 }
 __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum)
 {
-        int counter = get_global_id(0);
-        int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0;
-        GpuHidHaarTreeNode t1 = *(orinode + counter);
+    int counter = get_global_id(0); // each work-item handles the node at its global id
+    int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0;
+    GpuHidHaarTreeNode t1 = *(orinode + counter); // by-value copy of the source node
 #pragma unroll

-        for (i = 0; i < 3; i++)
-        {
-                tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f);
-                tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f);
-                tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f);
-                tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f);
-        }
+    for (i = 0; i < 3; i++)
+    {
+        tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f); // scale, then add 0.5 and truncate to int
+        tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f);
+        tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f);
+        tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f);
+    }

-        t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]);
-        counter += nodenum;
+    t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]); // weight[0] is rebuilt from the scaled tr_h*tr_w products; entry 2 contributes only when p[2][0] is non-zero
+    counter += nodenum; // results are stored nodenum entries past the input index
 #pragma unroll

-        for (i = 0; i < 3; i++)
-        {
-                newnode[counter].p[i][0] = tr_x[i];
-                newnode[counter].p[i][1] = tr_y[i];
-                newnode[counter].p[i][2] = tr_x[i] + tr_w[i];
-                newnode[counter].p[i][3] = tr_y[i] + tr_h[i];
-                newnode[counter].weight[i] = t1.weight[i] * weight_scale;
-        }
+    for (i = 0; i < 3; i++)
+    {
+        newnode[counter].p[i][0] = tr_x[i];
+        newnode[counter].p[i][1] = tr_y[i];
+        newnode[counter].p[i][2] = tr_x[i] + tr_w[i]; // stored as the sum tr_x + tr_w, not tr_w itself
+        newnode[counter].p[i][3] = tr_y[i] + tr_h[i]; // likewise tr_y + tr_h
+        newnode[counter].weight[i] = t1.weight[i] * weight_scale;
+    }

-        newnode[counter].left = t1.left;
-        newnode[counter].right = t1.right;
-        newnode[counter].threshold = t1.threshold;
-        newnode[counter].alpha[0] = t1.alpha[0];
-        newnode[counter].alpha[1] = t1.alpha[1];
+    newnode[counter].left = t1.left; // the remaining fields are copied unchanged
+    newnode[counter].right = t1.right;
+    newnode[counter].threshold = t1.threshold;
+    newnode[counter].alpha[0] = t1.alpha[0];
+    newnode[counter].alpha[1] = t1.alpha[1];
 }
+
--- a/modules/ocl/src/opencl/moments.cl
+++ b/modules/ocl/src/opencl/moments.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Sen Liu, swjtuls1987@126.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
 #if defined (DOUBLE_SUPPORT)

 #ifdef cl_khr_fp64
@@ -609,22 +654,33 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
    int y = wgidy*TILE_SIZE;  // real Y index of pixel
    int x = wgidx*TILE_SIZE;  // real X index of pixel
    int kcn = (cn==2)?2:4;
-    int rstep = min(src_step/4, TILE_SIZE);
+    src_step /= sizeof(*src_data);
+    int rstep = min(src_step, TILE_SIZE);
    tileSize_height = min(TILE_SIZE, src_rows - y);
    tileSize_width = min(TILE_SIZE, src_cols -x);
-    if(tileSize_width < TILE_SIZE)
-        for(int i = tileSize_width; i < rstep; i++ )
-            *((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0;
+    int maxIdx = mul24(src_rows, src_cols);
+    int yOff = (y+lidy)*src_step;
+    int index;
+    if(tileSize_width < TILE_SIZE && yOff < src_rows) // NOTE(review): yOff is an element offset ((y+lidy)*src_step) compared with a row count - confirm whether (y+lidy) < src_rows was intended
+        for(int i = tileSize_width; i < rstep && (yOff+x+i) < maxIdx; i++ ) // runs from the tile's valid width up to rstep, bounded by maxIdx
+            *(src_data+yOff+x+i) = 0; // zero-fill writes into src_data itself
    if( coi > 0 )
        for(int i=0; i < tileSize_width; i+=VLEN_F)
        {
+#pragma unroll
            for(int j=0; j<4; j++)
-                tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1);
+            {
+                index = yOff+(x+i+j)*kcn+coi-1;
+                if (index < maxIdx)
+                    tmp_coi[j] = *(src_data+index);
+                else
+                    tmp_coi[j] = 0;
+            }
            tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
        }
    else
-        for(int i=0; i < tileSize_width; i+=VLEN_F)
-            tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3));
+        for(int i=0; i < tileSize_width && (yOff+x+i) < maxIdx; i+=VLEN_F)
+            tmp[i/VLEN_F] = (*(__global float4 *)(src_data+yOff+x+i));
    float4 zero = (float4)(0);
    float4 full = (float4)(255);
    if( binary )
@@ -714,35 +770,59 @@ __kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols
        // accumulate moments computed in each tile
        dst_step /= sizeof(F);

+        int dst_x_off = mad24(wgidy, dst_cols, wgidx);
+        int dst_off = 0;
+        int max_dst_index = 10 * blocky * get_global_size(1);
+
        // + m00 ( = m00' )
-        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
+        dst_off = mad24(DST_ROW_00 * blocky, dst_step, dst_x_off);
+        if (dst_off < max_dst_index)
+            *(dst_m + dst_off) = mom[0];

        // + m10 ( = m10' + x*m00' )
-        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
+        dst_off = mad24(DST_ROW_10 * blocky, dst_step, dst_x_off);
+        if (dst_off < max_dst_index)
+            *(dst_m + dst_off) = mom[1] + xm;

        // + m01 ( = m01' + y*m00' )
-        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
+        dst_off = mad24(DST_ROW_01 * blocky, dst_step, dst_x_off);
+        if (dst_off < max_dst_index)
+            *(dst_m + dst_off) = mom[2] + ym;

        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
-        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
+        dst_off = mad24(DST_ROW_20 * blocky, dst_step, dst_x_off);
+        if (dst_off < max_dst_index)
+            *(dst_m + dst_off) = mom[3] + x * (mom[1] * 2 + xm);

        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
-        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
+        dst_off = mad24(DST_ROW_11 * blocky, dst_step, dst_x_off);
+        if (dst_off < max_dst_index)
+            *(dst_m + dst_off) = mom[4] + x * (mom[2] + ym) + y * mom[1];

        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
-        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
+        dst_off = mad24(DST_ROW_02 * blocky, dst_step, dst_x_off);
+        if (dst_off < max_dst_index)
+            *(dst_m + dst_off) = mom[5] + y * (mom[2] * 2 + ym);

        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
-        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+        dst_off = mad24(DST_ROW_30 * blocky, dst_step, dst_x_off);
+        if (dst_off < max_dst_index)
+            *(dst_m + dst_off) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));

        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
-        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+        dst_off = mad24(DST_ROW_21 * blocky, dst_step, dst_x_off);
+        if (dst_off < max_dst_index)
+            *(dst_m + dst_off) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];

        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
-        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+        dst_off = mad24(DST_ROW_12 * blocky, dst_step, dst_x_off);
+        if (dst_off < max_dst_index)
+            *(dst_m + dst_off) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];

        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
-        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+        dst_off = mad24(DST_ROW_03 * blocky, dst_step, dst_x_off);
+        if (dst_off < max_dst_index)
+            *(dst_m + dst_off) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
    }
 }


--- a/modules/ocl/src/opencl/stereobm.cl
+++ b/modules/ocl/src/opencl/stereobm.cl
@@ -16,6 +16,8 @@
 //
 // @Authors
 //    Jia Haipeng, jiahaipeng95@gmail.com
+//    Sen Liu, swjtuls1987@126.com
+//    Peng Xiao, pengxiao@outlook.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -50,59 +52,40 @@
 #define STEREO_MIND 0                    // The minimum d range to check
 #define STEREO_DISP_STEP N_DISPARITIES   // the d step, must be <= 1 to avoid aliasing

-int SQ(int a)
-{
-    return a * a;
-}
+#ifndef radius
+#define radius 64
+#endif

-unsigned int CalcSSD(volatile __local unsigned int *col_ssd_cache,
-                     volatile __local unsigned int *col_ssd, int radius)
+unsigned int CalcSSD(__local unsigned int *col_ssd) // returns the sum of col_ssd[0] .. col_ssd[2 * radius]
 {
-    unsigned int cache = 0;
-    unsigned int cache2 = 0;
+    unsigned int cache = col_ssd[0]; // start the running sum with element 0

-    for(int i = 1; i <= radius; i++)
+#pragma unroll
+    for(int i = 1; i <= (radius << 1); i++) // radius is now a macro (default 64 above), no longer a parameter
         cache += col_ssd[i];

-    col_ssd_cache[0] = cache;
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (get_local_id(0) < BLOCK_W - radius)
-        cache2 = col_ssd_cache[radius];
-    else
-        for(int i = radius + 1; i < (2 * radius + 1); i++)
-            cache2 += col_ssd[i];
-
-    return col_ssd[0] + cache + cache2;
+    return cache;
 }

-uint2 MinSSD(volatile __local unsigned int *col_ssd_cache,
-             volatile __local unsigned int *col_ssd, int radius)
+uint2 MinSSD(__local unsigned int *col_ssd)
 {
    unsigned int ssd[N_DISPARITIES];
-
-    //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * radius)
-    ssd[0] = CalcSSD(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * radius), radius);
-    barrier(CLK_LOCAL_MEM_FENCE);
-    ssd[1] = CalcSSD(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * radius), radius);
-    barrier(CLK_LOCAL_MEM_FENCE);
-    ssd[2] = CalcSSD(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * radius), radius);
-    barrier(CLK_LOCAL_MEM_FENCE);
-    ssd[3] = CalcSSD(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * radius), radius);
-    barrier(CLK_LOCAL_MEM_FENCE);
-    ssd[4] = CalcSSD(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * radius), radius);
-    barrier(CLK_LOCAL_MEM_FENCE);
-    ssd[5] = CalcSSD(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * radius), radius);
-    barrier(CLK_LOCAL_MEM_FENCE);
-    ssd[6] = CalcSSD(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * radius), radius);
-    barrier(CLK_LOCAL_MEM_FENCE);
-    ssd[7] = CalcSSD(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * radius), radius);
-    barrier(CLK_LOCAL_MEM_FENCE);
+    const int win_size = (radius << 1);
+
+    //See above:  #define COL_SSD_SIZE (BLOCK_W + WIN_SIZE)
+    ssd[0] = CalcSSD(col_ssd + 0 * (BLOCK_W + win_size));
+    ssd[1] = CalcSSD(col_ssd + 1 * (BLOCK_W + win_size));
+    ssd[2] = CalcSSD(col_ssd + 2 * (BLOCK_W + win_size));
+    ssd[3] = CalcSSD(col_ssd + 3 * (BLOCK_W + win_size));
+    ssd[4] = CalcSSD(col_ssd + 4 * (BLOCK_W + win_size));
+    ssd[5] = CalcSSD(col_ssd + 5 * (BLOCK_W + win_size));
+    ssd[6] = CalcSSD(col_ssd + 6 * (BLOCK_W + win_size));
+    ssd[7] = CalcSSD(col_ssd + 7 * (BLOCK_W + win_size));

    unsigned int mssd = min(min(min(ssd[0], ssd[1]), min(ssd[4], ssd[5])), min(min(ssd[2], ssd[3]), min(ssd[6], ssd[7])));

    int bestIdx = 0;
+
    for (int i = 0; i < N_DISPARITIES; i++)
    {
        if (mssd == ssd[i])
@@ -113,124 +96,66 @@ uint2 MinSSD(volatile __local unsigned int *col_ssd_cache,
 }

 void StepDown(int idx1, int idx2, __global unsigned char* imageL,
-              __global unsigned char* imageR, int d, volatile  __local unsigned int *col_ssd, int radius)
+              __global unsigned char* imageR, int d,   __local unsigned int *col_ssd)
 {
-    unsigned char leftPixel1;
-    unsigned char leftPixel2;
-    unsigned char rightPixel1[8];
-    unsigned char rightPixel2[8];
-    unsigned int diff1, diff2;
-
-    leftPixel1 = imageL[idx1];
-    leftPixel2 = imageL[idx2];
-
-    idx1 = idx1 - d;
-    idx2 = idx2 - d;
-
-    rightPixel1[7] = imageR[idx1 - 7];
-    rightPixel1[0] = imageR[idx1 - 0];
-    rightPixel1[1] = imageR[idx1 - 1];
-    rightPixel1[2] = imageR[idx1 - 2];
-    rightPixel1[3] = imageR[idx1 - 3];
-    rightPixel1[4] = imageR[idx1 - 4];
-    rightPixel1[5] = imageR[idx1 - 5];
-    rightPixel1[6] = imageR[idx1 - 6];
-
-    rightPixel2[7] = imageR[idx2 - 7];
-    rightPixel2[0] = imageR[idx2 - 0];
-    rightPixel2[1] = imageR[idx2 - 1];
-    rightPixel2[2] = imageR[idx2 - 2];
-    rightPixel2[3] = imageR[idx2 - 3];
-    rightPixel2[4] = imageR[idx2 - 4];
-    rightPixel2[5] = imageR[idx2 - 5];
-    rightPixel2[6] = imageR[idx2 - 6];
-
-    //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * radius)
-    diff1 = leftPixel1 - rightPixel1[0];
-    diff2 = leftPixel2 - rightPixel2[0];
-    col_ssd[0 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
-
-    diff1 = leftPixel1 - rightPixel1[1];
-    diff2 = leftPixel2 - rightPixel2[1];
-    col_ssd[1 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
-
-    diff1 = leftPixel1 - rightPixel1[2];
-    diff2 = leftPixel2 - rightPixel2[2];
-    col_ssd[2 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
-
-    diff1 = leftPixel1 - rightPixel1[3];
-    diff2 = leftPixel2 - rightPixel2[3];
-    col_ssd[3 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
-
-    diff1 = leftPixel1 - rightPixel1[4];
-    diff2 = leftPixel2 - rightPixel2[4];
-    col_ssd[4 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
-
-    diff1 = leftPixel1 - rightPixel1[5];
-    diff2 = leftPixel2 - rightPixel2[5];
-    col_ssd[5 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
-
-    diff1 = leftPixel1 - rightPixel1[6];
-    diff2 = leftPixel2 - rightPixel2[6];
-    col_ssd[6 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
-
-    diff1 = leftPixel1 - rightPixel1[7];
-    diff2 = leftPixel2 - rightPixel2[7];
-    col_ssd[7 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
+    uint8 imgR1 = convert_uint8(vload8(0, imageR + (idx1 - d - 7)));
+    uint8 imgR2 = convert_uint8(vload8(0, imageR + (idx2 - d - 7)));
+    uint8 diff1 = (uint8)(imageL[idx1]) - imgR1;
+    uint8 diff2 = (uint8)(imageL[idx2]) - imgR2;
+    uint8 res = diff2 * diff2 - diff1 * diff1;
+    const int win_size = (radius << 1);
+    col_ssd[0 * (BLOCK_W + win_size)] += res.s7;
+    col_ssd[1 * (BLOCK_W + win_size)] += res.s6;
+    col_ssd[2 * (BLOCK_W + win_size)] += res.s5;
+    col_ssd[3 * (BLOCK_W + win_size)] += res.s4;
+    col_ssd[4 * (BLOCK_W + win_size)] += res.s3;
+    col_ssd[5 * (BLOCK_W + win_size)] += res.s2;
+    col_ssd[6 * (BLOCK_W + win_size)] += res.s1;
+    col_ssd[7 * (BLOCK_W + win_size)] += res.s0;
 }

 void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL,
                __global unsigned char* imageR, int d,
-                volatile __local unsigned int *col_ssd, int radius)
+                 __local unsigned int *col_ssd)
 {
-    unsigned char leftPixel1;
-    int idx;
-    unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0};
-
-    for(int i = 0; i < (2 * radius + 1); i++)
+    uint8 leftPixel1;
+    uint8 diffa = 0;
+    int idx = y_tex * im_pitch + x_tex;
+    const int win_size = (radius << 1);
+    for(int i = 0; i < (win_size + 1); i++)
    {
-        idx = y_tex * im_pitch + x_tex;
-        leftPixel1 = imageL[idx];
-        idx = idx - d;
-
-        diffa[0] += SQ(leftPixel1 - imageR[idx - 0]);
-        diffa[1] += SQ(leftPixel1 - imageR[idx - 1]);
-        diffa[2] += SQ(leftPixel1 - imageR[idx - 2]);
-        diffa[3] += SQ(leftPixel1 - imageR[idx - 3]);
-        diffa[4] += SQ(leftPixel1 - imageR[idx - 4]);
-        diffa[5] += SQ(leftPixel1 - imageR[idx - 5]);
-        diffa[6] += SQ(leftPixel1 - imageR[idx - 6]);
-        diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);
-
-        y_tex += 1;
+        leftPixel1 = (uint8)(imageL[idx]);
+        uint8 imgR = convert_uint8(vload8(0, imageR + (idx - d - 7)));
+        uint8 res = leftPixel1 - imgR;
+        diffa += res * res;
+
+        idx += im_pitch;
    }
-    //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * radius)
-    col_ssd[0 * (BLOCK_W + 2 * radius)] = diffa[0];
-    col_ssd[1 * (BLOCK_W + 2 * radius)] = diffa[1];
-    col_ssd[2 * (BLOCK_W + 2 * radius)] = diffa[2];
-    col_ssd[3 * (BLOCK_W + 2 * radius)] = diffa[3];
-    col_ssd[4 * (BLOCK_W + 2 * radius)] = diffa[4];
-    col_ssd[5 * (BLOCK_W + 2 * radius)] = diffa[5];
-    col_ssd[6 * (BLOCK_W + 2 * radius)] = diffa[6];
-    col_ssd[7 * (BLOCK_W + 2 * radius)] = diffa[7];
+    //See above:  #define COL_SSD_SIZE (BLOCK_W + WIN_SIZE)
+    col_ssd[0 * (BLOCK_W + win_size)] = diffa.s7;
+    col_ssd[1 * (BLOCK_W + win_size)] = diffa.s6;
+    col_ssd[2 * (BLOCK_W + win_size)] = diffa.s5;
+    col_ssd[3 * (BLOCK_W + win_size)] = diffa.s4;
+    col_ssd[4 * (BLOCK_W + win_size)] = diffa.s3;
+    col_ssd[5 * (BLOCK_W + win_size)] = diffa.s2;
+    col_ssd[6 * (BLOCK_W + win_size)] = diffa.s1;
+    col_ssd[7 * (BLOCK_W + win_size)] = diffa.s0;
 }

 __kernel void stereoKernel(__global unsigned char *left, __global unsigned char *right,
                           __global unsigned int *cminSSDImage, int cminSSD_step,
                           __global unsigned char *disp, int disp_step,int cwidth, int cheight,
-                           int img_step, int maxdisp, int radius,
+                           int img_step, int maxdisp,
                           __local unsigned int *col_ssd_cache)
 {
-
-    volatile __local unsigned int *col_ssd = col_ssd_cache + BLOCK_W + get_local_id(0);
-    volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0;
+    __local unsigned int *col_ssd = col_ssd_cache + get_local_id(0);
+    __local unsigned int *col_ssd_extra = get_local_id(0) < (radius << 1) ? col_ssd + BLOCK_W : 0;

    int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius;
-    // int Y = get_group_id(1) * ROWSperTHREAD + radius;

 #define Y (get_group_id(1) * ROWSperTHREAD + radius)

-    volatile __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
+    __global unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
    __global unsigned char* disparImage = disp + X + Y * disp_step;

    int end_row = ROWSperTHREAD < (cheight - Y) ? ROWSperTHREAD:(cheight - Y);
@@ -244,14 +169,14 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
    {
        y_tex = Y - radius;

-        InitColSSD(x_tex, y_tex, img_step, left, right, d, col_ssd, radius);
+        InitColSSD(x_tex, y_tex, img_step, left, right, d, col_ssd);
        if (col_ssd_extra > 0)
            if (x_tex + BLOCK_W < cwidth)
-                InitColSSD(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra, radius);
+                InitColSSD(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);

        barrier(CLK_LOCAL_MEM_FENCE); //before MinSSD function

-        uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius);
+        uint2 minSSD = MinSSD(col_ssd);
        if (X < cwidth - radius && Y < cheight - radius)
        {
            if (minSSD.x < minSSDImage[0])
@@ -264,21 +189,18 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
        for(int row = 1; row < end_row; row++)
        {
            int idx1 = y_tex * img_step + x_tex;
-            int idx2 = min(y_tex + (2 * radius + 1), cheight - 1) * img_step + x_tex;
-
-            barrier(CLK_GLOBAL_MEM_FENCE);
+            int idx2 = min(y_tex + ((radius << 1) + 1), cheight - 1) * img_step + x_tex;
+            
            barrier(CLK_LOCAL_MEM_FENCE);

-            StepDown(idx1, idx2, left, right, d, col_ssd, radius);
+            StepDown(idx1, idx2, left, right, d, col_ssd);
            if (col_ssd_extra > 0)
                if (x_tex + BLOCK_W < cwidth)
-                    StepDown(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra, radius);
-
-            y_tex += 1;
+                    StepDown(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);

            barrier(CLK_LOCAL_MEM_FENCE);

-            uint2 minSSD = MinSSD(col_ssd_cache + get_local_id(0), col_ssd, radius);
+            uint2 minSSD = MinSSD(col_ssd);
            if (X < cwidth - radius && row < cheight - radius - Y)
            {
                int idx = row * cminSSD_step;
@@ -288,10 +210,11 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
                    minSSDImage[idx] = minSSD.x;
                }
            }
+
+            y_tex++;
        } // for row loop
    } // for d loop
 }
-
 //////////////////////////////////////////////////////////////////////////////////////////////////
 //////////////////////////// Sobel Prefiler (signal channel)//////////////////////////////////////
 //////////////////////////////////////////////////////////////////////////////////////////////////

--- a/modules/ocl/src/opencl/stereocsbp.cl
+++ b/modules/ocl/src/opencl/stereocsbp.cl
--- a/modules/ocl/src/stereo_csbp.cpp
+++ b/modules/ocl/src/stereo_csbp.cpp
--- a/modules/ocl/src/stereobm.cpp
+++ b/modules/ocl/src/stereobm.cpp
@@ -74,28 +74,21 @@ namespace stereoBM
 ////////////////////////////////////////////////////////////////////////
 static void prefilter_xsobel(const oclMat &input, oclMat &output, int prefilterCap)
 {
-    Context *clCxt = input.clCxt;
-
    string kernelName = "prefilter_xsobel";
-    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereobm, kernelName);

    size_t blockSize = 1;
    size_t globalThreads[3] = { input.cols, input.rows, 1 };
    size_t localThreads[3]  = { blockSize, blockSize, 1 };

-    openCLVerifyKernel(clCxt, kernel,  localThreads);
-    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&input.data));
-    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&output.data));
-    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&input.rows));
-    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&input.cols));
-    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&prefilterCap));
-
-    openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 3, NULL,
-                                          globalThreads, localThreads, 0, NULL, NULL));
-
-    clFinish((cl_command_queue)clCxt->oclCommandQueue());
-    openCLSafeCall(clReleaseKernel(kernel));
+    std::vector< std::pair<size_t, const void *> > args;
+    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data));
+    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void *)&input.rows));
+    args.push_back(std::make_pair(sizeof(cl_int), (void *)&input.cols));
+    args.push_back(std::make_pair(sizeof(cl_int), (void *)&prefilterCap));

+    openCLExecuteKernel(Context::getContext(), &stereobm, kernelName,
+        globalThreads, localThreads, args, -1, -1);
 }
 //////////////////////////////////////////////////////////////////////////
 //////////////////////////////common////////////////////////////////////
@@ -115,19 +108,13 @@ static void stereo_bm(const oclMat &left, const oclMat &right,  oclMat &disp,
 {
    int winsz2 = winSize >> 1;

-    //if(winsz2 == 0 || winsz2 >= calles_num)
-    //cv::ocl:error("Unsupported window size", __FILE__, __LINE__, __FUNCTION__);
-
-    Context *clCxt = left.clCxt;
-
    string kernelName = "stereoKernel";
-    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereobm, kernelName);

    disp.setTo(Scalar_<unsigned char>::all(0));
    minSSD_buf.setTo(Scalar_<unsigned int>::all(0xFFFFFFFF));

    size_t minssd_step = minSSD_buf.step / minSSD_buf.elemSize();
-    size_t local_mem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * winsz2)) *
+    size_t local_mem_size = (N_DISPARITIES * (BLOCK_W + 2 * winsz2)) *
                            sizeof(cl_uint);
    //size_t blockSize = 1;
    size_t localThreads[]  = { BLOCK_W, 1,1};
@@ -136,26 +123,23 @@ static void stereo_bm(const oclMat &left, const oclMat &right,  oclMat &disp,
                               1
                             };

-    openCLVerifyKernel(clCxt, kernel, localThreads);
-    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&left.data));
-    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&right.data));
-    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&minSSD_buf.data));
-    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&minssd_step));
-    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&disp.data));
-    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&disp.step));
-    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.cols));
-    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&left.rows));
-    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&left.step));
-    openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&maxdisp));
-    openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&winsz2));
-    openCLSafeCall(clSetKernelArg(kernel, 11, local_mem_size, (void *)NULL));
-
-    openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 2, NULL,
-                                          globalThreads, localThreads, 0, NULL, NULL));
-
-
-    clFinish((cl_command_queue)clCxt->oclCommandQueue());
-    openCLSafeCall(clReleaseKernel(kernel));
+    std::vector< std::pair<size_t, const void *> > args;
+    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data));
+    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&right.data));
+    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&minSSD_buf.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void *)&minssd_step));
+    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&disp.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void *)&disp.step));
+    args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.cols));
+    args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.rows));
+    args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.step));
+    args.push_back(std::make_pair(sizeof(cl_int), (void *)&maxdisp));
+    args.push_back(std::make_pair(local_mem_size, (void *)NULL));
+
+    char opt [128];
+    sprintf(opt, "-D radius=%d", winsz2);
+    openCLExecuteKernel(Context::getContext(), &stereobm, kernelName,
+        globalThreads, localThreads, args, -1, -1, opt);
 }
 ////////////////////////////////////////////////////////////////////////////
 ///////////////////////////////postfilter_textureness///////////////////////
@@ -163,10 +147,7 @@ static void stereo_bm(const oclMat &left, const oclMat &right,  oclMat &disp,
 static void postfilter_textureness(oclMat &left, int winSize,
                            float avergeTexThreshold, oclMat &disparity)
 {
-    Context *clCxt = left.clCxt;
-
    string kernelName = "textureness_kernel";
-    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereobm, kernelName);

    size_t blockSize = 1;
    size_t localThreads[]  = { BLOCK_W, blockSize ,1};
@@ -177,22 +158,19 @@ static void postfilter_textureness(oclMat &left, int winSize,

    size_t local_mem_size = (localThreads[0] + localThreads[0] + (winSize / 2) * 2) * sizeof(float);

-    openCLVerifyKernel(clCxt, kernel,  localThreads);
-    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&disparity.data));
-    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&disparity.rows));
-    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&disparity.cols));
-    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&disparity.step));
-    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&left.data));
-    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&left.rows));
-    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.cols));
-    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&winSize));
-    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_float), (void *)&avergeTexThreshold));
-    openCLSafeCall(clSetKernelArg(kernel, 9, local_mem_size, NULL));
-    openCLSafeCall(clEnqueueNDRangeKernel((cl_command_queue)clCxt->oclCommandQueue(), kernel, 2, NULL,
-                                          globalThreads, localThreads, 0, NULL, NULL));
-
-    clFinish((cl_command_queue)clCxt->oclCommandQueue());
-    openCLSafeCall(clReleaseKernel(kernel));
+    std::vector< std::pair<size_t, const void *> > args;
+    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&disparity.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.rows));
+    args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.cols));
+    args.push_back(std::make_pair(sizeof(cl_int), (void *)&disparity.step));
+    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&left.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.rows));
+    args.push_back(std::make_pair(sizeof(cl_int), (void *)&left.cols));
+    args.push_back(std::make_pair(sizeof(cl_int), (void *)&winSize));
+    args.push_back(std::make_pair(sizeof(cl_float), (void *)&avergeTexThreshold));
+    args.push_back(std::make_pair(local_mem_size, (void*)NULL));
+    openCLExecuteKernel(Context::getContext(), &stereobm, kernelName,
+        globalThreads, localThreads, args, -1, -1);
 }
 //////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////operator/////////////////////////////////

--- a/modules/ocl/test/interpolation.hpp
+++ b/modules/ocl/test/interpolation.hpp
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                        Intel License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of Intel Corporation may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_TEST_INTERPOLATION_HPP__
-#define __OPENCV_TEST_INTERPOLATION_HPP__
-
-template <typename T> T readVal(const cv::Mat &src, int y, int x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-{
-    if (border_type == cv::BORDER_CONSTANT)
-        return (y >= 0 && y < src.rows && x >= 0 && x < src.cols) ? src.at<T>(y, x * src.channels() + c) : cv::saturate_cast<T>(borderVal.val[c]);
-
-    return src.at<T>(cv::borderInterpolate(y, src.rows, border_type), cv::borderInterpolate(x, src.cols, border_type) * src.channels() + c);
-}
-
-template <typename T> struct NearestInterpolator
-{
-    static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-    {
-        return readVal<T>(src, cvFloor(y), cvFloor(x), c, border_type, borderVal);
-    }
-};
-
-template <typename T> struct LinearInterpolator
-{
-    static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-    {
-        x -= 0.5f;
-        y -= 0.5f;
-
-        int x1 = cvFloor(x);
-        int y1 = cvFloor(y);
-        int x2 = x1 + 1;
-        int y2 = y1 + 1;
-
-        float res = 0;
-
-        res += readVal<T>(src, y1, x1, c, border_type, borderVal) * ((x2 - x) * (y2 - y));
-        res += readVal<T>(src, y1, x2, c, border_type, borderVal) * ((x - x1) * (y2 - y));
-        res += readVal<T>(src, y2, x1, c, border_type, borderVal) * ((x2 - x) * (y - y1));
-        res += readVal<T>(src, y2, x2, c, border_type, borderVal) * ((x - x1) * (y - y1));
-
-        return cv::saturate_cast<T>(res);
-    }
-};
-
-template <typename T> struct CubicInterpolator
-{
-    static float getValue(float p[4], float x)
-    {
-        return p[1] + 0.5 * x * (p[2] - p[0] + x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
-    }
-
-    static float getValue(float p[4][4], float x, float y)
-    {
-        float arr[4];
-
-        arr[0] = getValue(p[0], x);
-        arr[1] = getValue(p[1], x);
-        arr[2] = getValue(p[2], x);
-        arr[3] = getValue(p[3], x);
-
-        return getValue(arr, y);
-    }
-
-    static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
-    {
-        int ix = cvRound(x);
-        int iy = cvRound(y);
-
-        float vals[4][4] =
-        {
-            {readVal<T>(src, iy - 2, ix - 2, c, border_type, borderVal), readVal<T>(src, iy - 2, ix - 1, c, border_type, borderVal), readVal<T>(src, iy - 2, ix, c, border_type, borderVal), readVal<T>(src, iy - 2, ix + 1, c, border_type, borderVal)},
-            {readVal<T>(src, iy - 1, ix - 2, c, border_type, borderVal), readVal<T>(src, iy - 1, ix - 1, c, border_type, borderVal), readVal<T>(src, iy - 1, ix, c, border_type, borderVal), readVal<T>(src, iy - 1, ix + 1, c, border_type, borderVal)},
-            {readVal<T>(src, iy    , ix - 2, c, border_type, borderVal), readVal<T>(src, iy    , ix - 1, c, border_type, borderVal), readVal<T>(src, iy    , ix, c, border_type, borderVal), readVal<T>(src, iy    , ix + 1, c, border_type, borderVal)},
-            {readVal<T>(src, iy + 1, ix - 2, c, border_type, borderVal), readVal<T>(src, iy + 1, ix - 1, c, border_type, borderVal), readVal<T>(src, iy + 1, ix, c, border_type, borderVal), readVal<T>(src, iy + 1, ix + 1, c, border_type, borderVal)},
-        };
-
-        return cv::saturate_cast<T>(getValue(vals, (x - ix + 2.0) / 4.0, (y - iy + 2.0) / 4.0));
-    }
-};
-
-#endif // __OPENCV_TEST_INTERPOLATION_HPP__
--- a/modules/ocl/test/precomp.hpp
+++ b/modules/ocl/test/precomp.hpp
@@ -71,7 +71,6 @@
 #include "opencv2/ocl/ocl.hpp"

 #include "utility.hpp"
-#include "interpolation.hpp"
 //#include "add_test_info.h"

 #endif

--- a/modules/ocl/test/test_arithm.cpp
+++ b/modules/ocl/test/test_arithm.cpp
--- a/modules/ocl/test/test_blend.cpp
+++ b/modules/ocl/test/test_blend.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Nathan, liujun@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
 #include "precomp.hpp"
 #include <iomanip>

@@ -33,20 +77,14 @@ void blendLinearGold(const cv::Mat &img1, const cv::Mat &img2, const cv::Mat &we

 PARAM_TEST_CASE(Blend, cv::Size, MatType/*, UseRoi*/)
 {
-    //std::vector<cv::ocl::Info> oclinfo;
    cv::Size size;
    int type;
    bool useRoi;

    virtual void SetUp()
    {
-        //devInfo = GET_PARAM(0);
        size = GET_PARAM(0);
        type = GET_PARAM(1);
-        /*useRoi = GET_PARAM(3);*/
-
-        //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
-        //CV_Assert(devnums > 0);
    }
 };

@@ -59,12 +97,9 @@ TEST_P(Blend, Accuracy)
    cv::Mat weights1 = randomMat(size, CV_32F, 0, 1);
    cv::Mat weights2 = randomMat(size, CV_32F, 0, 1);

-    cv::ocl::oclMat gimg1(size, type), gimg2(size, type), gweights1(size, CV_32F), gweights2(size, CV_32F);
-    cv::ocl::oclMat dst(size, type);
-    gimg1.upload(img1);
-    gimg2.upload(img2);
-    gweights1.upload(weights1);
-    gweights2.upload(weights2);
+    cv::ocl::oclMat gimg1(img1), gimg2(img2), gweights1(weights1), gweights2(weights2);
+    cv::ocl::oclMat dst;
+
    cv::ocl::blendLinear(gimg1, gimg2, gweights1, gweights2, dst);
    cv::Mat result;
    cv::Mat result_gold;
@@ -74,10 +109,10 @@ TEST_P(Blend, Accuracy)
    else
        blendLinearGold<float>(img1, img2, weights1, weights2, result_gold);

-    EXPECT_MAT_NEAR(result_gold, result, CV_MAT_DEPTH(type) == CV_8U ? 1.f : 1e-5f, 0);
+    EXPECT_MAT_NEAR(result_gold, result, CV_MAT_DEPTH(type) == CV_8U ? 1.f : 1e-5f);
 }

-INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Blend, Combine(
+INSTANTIATE_TEST_CASE_P(OCL_ImgProc, Blend, Combine(
                            DIFFERENT_SIZES,
                            testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4))
                        ));

--- a/modules/ocl/test/test_brute_force_matcher.cpp
+++ b/modules/ocl/test/test_brute_force_matcher.cpp
@@ -7,12 +7,16 @@
 //  copy or use the software.
 //
 //
-//                        Intel License Agreement
+//                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2010-2012, Multicoreware inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
+// @Authors
+//    Nathan, liujun@multicorewareinc.com
+//
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
@@ -21,12 +25,12 @@
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
+//     and/or other materials provided with the distribution.
 //
-//   * The name of Intel Corporation may not be used to endorse or promote products
+//   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
-// This software is provided by the copyright holders and contributors "as is" and
+// This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,

--- a/modules/ocl/test/test_calib3d.cpp
+++ b/modules/ocl/test/test_calib3d.cpp
--- a/modules/ocl/test/test_color.cpp
+++ b/modules/ocl/test/test_color.cpp
--- a/modules/ocl/test/test_columnsum.cpp
+++ b/modules/ocl/test/test_columnsum.cpp
--- a/modules/ocl/test/test_fft.cpp
+++ b/modules/ocl/test/test_fft.cpp
--- a/modules/ocl/test/test_filters.cpp
+++ b/modules/ocl/test/test_filters.cpp
--- a/modules/ocl/test/test_gemm.cpp
+++ b/modules/ocl/test/test_gemm.cpp
@@ -53,13 +53,12 @@ PARAM_TEST_CASE(Gemm, int, cv::Size, int)
    int      type;
    cv::Size mat_size;
    int		 flags;
-    //vector<cv::ocl::Info> info;
+
    virtual void SetUp()
    {
        type     = GET_PARAM(0);
        mat_size = GET_PARAM(1);
        flags    = GET_PARAM(2);
-        //cv::ocl::getDevice(info);
    }
 };


--- a/modules/ocl/test/test_haar.cpp
+++ b/modules/ocl/test/test_haar.cpp
--- a/modules/ocl/test/test_hog.cpp
+++ b/modules/ocl/test/test_hog.cpp
--- a/modules/ocl/test/test_imgproc.cpp
+++ b/modules/ocl/test/test_imgproc.cpp
--- a/modules/ocl/test/test_match_template.cpp
+++ b/modules/ocl/test/test_match_template.cpp
--- a/modules/ocl/test/test_matrix_operation.cpp
+++ b/modules/ocl/test/test_matrix_operation.cpp
--- a/modules/ocl/test/test_moments.cpp
+++ b/modules/ocl/test/test_moments.cpp
--- a/modules/ocl/test/test_pyrdown.cpp
+++ b/modules/ocl/test/test_pyrdown.cpp
--- a/modules/ocl/test/test_pyrlk.cpp
+++ b/modules/ocl/test/test_pyrlk.cpp
--- a/modules/ocl/test/test_pyrup.cpp
+++ b/modules/ocl/test/test_pyrup.cpp
--- a/modules/ocl/test/test_split_merge.cpp
+++ b/modules/ocl/test/test_split_merge.cpp
--- a/modules/ocl/test/utility.hpp
+++ b/modules/ocl/test/utility.hpp
--- a/modules/ts/include/opencv2/ts/ts.hpp
+++ b/modules/ts/include/opencv2/ts/ts.hpp
--- a/modules/ts/include/opencv2/ts/ts_perf.hpp
+++ b/modules/ts/include/opencv2/ts/ts_perf.hpp
--- a/modules/ts/src/ts_func.cpp
+++ b/modules/ts/src/ts_func.cpp
--- a/samples/android/CMakeLists.txt
+++ b/samples/android/CMakeLists.txt