updated patch to bring in the first functions with "transparent API"

d914f20a · Vadim Pisarevsky · bb4bf7a1 · d914f20a · d914f20a · d914f20a
64 changed files
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -501,9 +501,10 @@ macro(ocv_glob_module_sources)
  file(GLOB cl_kernels "src/opencl/*.cl")
  if(HAVE_opencv_ocl AND cl_kernels)
    ocv_include_directories(${OPENCL_INCLUDE_DIRS})
+    string(REGEX REPLACE "^opencv_" "" the_module_barename "${the_module}") # strip only the leading "opencv_" prefix
    add_custom_command(
      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp"
-      COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
+      COMMAND ${CMAKE_COMMAND} -DMODULE_NAME="${the_module_barename}" -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
      DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake")
    source_group("OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
    list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")

--- a/cmake/cl2cpp.cmake
+++ b/cmake/cl2cpp.cmake
@@ -4,6 +4,15 @@ list(SORT cl_list)
 string(REPLACE ".cpp" ".hpp" OUTPUT_HPP "${OUTPUT}")
 get_filename_component(OUTPUT_HPP_NAME "${OUTPUT_HPP}" NAME)

+if("${MODULE_NAME}" STREQUAL "ocl") # the ocl module keeps its entries directly in cv::ocl
+    set(nested_namespace_start "")
+    set(nested_namespace_end "")
+else() # other modules get a nested cv::ocl::<module> namespace and ProgramSource2 wrappers (new_mode)
+    set(new_mode ON)
+    set(nested_namespace_start "namespace ${MODULE_NAME}\n{")
+    set(nested_namespace_end "}")
+endif()
+
 set(STR_CPP "// This file is auto-generated. Do not edit!

 #include \"precomp.hpp\"
@@ -13,16 +22,19 @@ namespace cv
 {
 namespace ocl
 {
+${nested_namespace_start}
+
 ")

 set(STR_HPP "// This file is auto-generated. Do not edit!

-#include \"opencv2/ocl/private/util.hpp\"
+#include \"opencv2/core/ocl_genbase.hpp\"

 namespace cv
 {
 namespace ocl
 {
+${nested_namespace_start}

 ")

@@ -49,12 +61,19 @@ foreach(cl ${cl_list})

  string(MD5 hash "${lines}")

-  set(STR_CPP "${STR_CPP}const struct ProgramEntry ${cl_filename}={\"${cl_filename}\",\n\"${lines}, \"${hash}\"};\n")
-  set(STR_HPP "${STR_HPP}extern const struct ProgramEntry ${cl_filename};\n")
+  set(STR_CPP_DECL "const struct ProgramEntry ${cl_filename}={\"${cl_filename}\",\n\"${lines}, \"${hash}\"};\n")
+  set(STR_HPP_DECL "extern const struct ProgramEntry ${cl_filename};\n")
+  if(new_mode)
+    set(STR_CPP_DECL "${STR_CPP_DECL}ProgramSource2 ${cl_filename}_oclsrc(${cl_filename}.programStr);\n")
+    set(STR_HPP_DECL "${STR_HPP_DECL}extern ProgramSource2 ${cl_filename}_oclsrc;\n")
+  endif()
+
+  set(STR_CPP "${STR_CPP}${STR_CPP_DECL}")
+  set(STR_HPP "${STR_HPP}${STR_HPP_DECL}")
 endforeach()

-set(STR_CPP "${STR_CPP}}\n}\n")
-set(STR_HPP "${STR_HPP}}\n}\n")
+set(STR_CPP "${STR_CPP}}\n${nested_namespace_end}}\n")
+set(STR_HPP "${STR_HPP}}\n${nested_namespace_end}}\n")

 file(WRITE "${OUTPUT}" "${STR_CPP}")


--- a/modules/bioinspired/src/precomp.hpp
+++ b/modules/bioinspired/src/precomp.hpp
@@ -47,6 +47,7 @@
 #include "opencv2/bioinspired.hpp"
 #include "opencv2/core/utility.hpp"
 #include "opencv2/core/private.hpp"
+#include "opencv2/core/ocl.hpp"

 #include <valarray>


--- a/modules/bioinspired/src/retina_ocl.cpp
+++ b/modules/bioinspired/src/retina_ocl.cpp
@@ -56,6 +56,8 @@

 namespace cv
 {
+static ocl::ProgramEntry retina_kernel = ocl::bioinspired::retina_kernel;
+
 namespace bioinspired
 {
 namespace ocl

--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -347,6 +347,10 @@ CV_EXPORTS_W void max(InputArray src1, InputArray src2, OutputArray dst);
 CV_EXPORTS void min(const Mat& src1, const Mat& src2, Mat& dst);
 //! computes per-element maximum of two arrays (dst = max(src1, src2))
 CV_EXPORTS void max(const Mat& src1, const Mat& src2, Mat& dst);
+//! computes per-element minimum of two arrays (dst = min(src1, src2))
+CV_EXPORTS void min(const UMat& src1, const UMat& src2, UMat& dst);
+//! computes per-element maximum of two arrays (dst = max(src1, src2))
+CV_EXPORTS void max(const UMat& src1, const UMat& src2, UMat& dst);

 //! computes square root of each matrix element (dst = src**0.5)
 CV_EXPORTS_W void sqrt(InputArray src, OutputArray dst);

--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -58,6 +58,8 @@ namespace cv
 enum { ACCESS_READ=1<<24, ACCESS_WRITE=1<<25,
    ACCESS_RW=3<<24, ACCESS_MASK=ACCESS_RW, ACCESS_FAST=1<<26 };

+class CV_EXPORTS _OutputArray;
+
 //////////////////////// Input/Output Array Arguments /////////////////////////////////

 /*!
@@ -116,12 +118,22 @@ public:
    void* getObj() const;

    virtual int kind() const;
+    virtual int dims(int i=-1) const;
    virtual Size size(int i=-1) const;
+    virtual int sizend(int* sz, int i=-1) const;
+    virtual bool sameSize(const _InputArray& arr) const;
    virtual size_t total(int i=-1) const;
    virtual int type(int i=-1) const;
    virtual int depth(int i=-1) const;
    virtual int channels(int i=-1) const;
+    virtual bool isContinuous(int i=-1) const;
    virtual bool empty() const;
+    virtual void copyTo(const _OutputArray& arr) const;
+    bool isMat() const;
+    bool isUMat() const;
+    bool isMatVectot() const;
+    bool isUMatVector() const;
+    bool isMatx() const; // const like the other is*() predicates so it works on a const _InputArray&

    virtual ~_InputArray();

@@ -197,8 +209,10 @@ public:
    virtual void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
    virtual void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
    virtual void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
+    virtual void createSameSize(const _InputArray& arr, int mtype) const;
    virtual void release() const;
    virtual void clear() const;
+    virtual void setTo(const _InputArray& value) const;
 };



--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -108,6 +108,12 @@ inline _InputArray::_InputArray(const cuda::CudaMem& cuda_mem)

 inline _InputArray::~_InputArray() {}

+inline bool _InputArray::isMat() const { return kind() == _InputArray::MAT; }
+inline bool _InputArray::isUMat() const  { return kind() == _InputArray::UMAT; }
+inline bool _InputArray::isMatVectot() const { return kind() == _InputArray::STD_VECTOR_MAT; }
+inline bool _InputArray::isUMatVector() const  { return kind() == _InputArray::STD_VECTOR_UMAT; }
+inline bool _InputArray::isMatx() const { return kind() == _InputArray::MATX; }
+
 ////////////////////////////////////////////////////////////////////////////////////////

 inline _OutputArray::_OutputArray() { init(ACCESS_WRITE, 0); }

--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@@ -49,13 +49,13 @@ namespace cv { namespace ocl {
 CV_EXPORTS bool haveOpenCL();
 CV_EXPORTS bool useOpenCL();
 CV_EXPORTS void setUseOpenCL(bool flag);
-CV_EXPORTS void finish();
+CV_EXPORTS void finish2();

-class CV_EXPORTS Context;
+class CV_EXPORTS Context2;
 class CV_EXPORTS Device;
 class CV_EXPORTS Kernel;
 class CV_EXPORTS Program;
-class CV_EXPORTS ProgramSource;
+class CV_EXPORTS ProgramSource2;
 class CV_EXPORTS Queue;

 class CV_EXPORTS Device
@@ -199,22 +199,22 @@ protected:
 };


-class CV_EXPORTS Context
+class CV_EXPORTS Context2
 {
 public:
-    Context();
-    explicit Context(int dtype);
-    ~Context();
-    Context(const Context& c);
-    Context& operator = (const Context& c);
+    Context2();
+    explicit Context2(int dtype);
+    ~Context2();
+    Context2(const Context2& c);
+    Context2& operator = (const Context2& c);

    bool create(int dtype);
    size_t ndevices() const;
    const Device& device(size_t idx) const;
-    Program getProg(const ProgramSource& prog,
+    Program getProg(const ProgramSource2& prog,
                    const String& buildopt, String& errmsg);

-    static Context& getDefault();
+    static Context2& getDefault();
    void* ptr() const;
 protected:
    struct Impl;
@@ -226,12 +226,12 @@ class CV_EXPORTS Queue
 {
 public:
    Queue();
-    explicit Queue(const Context& c, const Device& d=Device());
+    explicit Queue(const Context2& c, const Device& d=Device());
    ~Queue();
    Queue(const Queue& q);
    Queue& operator = (const Queue& q);

-    bool create(const Context& c=Context(), const Device& d=Device());
+    bool create(const Context2& c=Context2(), const Device& d=Device());
    void finish();
    void* ptr() const;
    static Queue& getDefault();
@@ -245,41 +245,55 @@ protected:
 class CV_EXPORTS KernelArg
 {
 public:
-    enum { LOCAL=1, READ_ONLY=2, WRITE_ONLY=4, READ_WRITE=6, CONSTANT=8 };
-    KernelArg(int _flags, UMat* _m, void* _obj=0, size_t _sz=0);
+    enum { LOCAL=1, READ_ONLY=2, WRITE_ONLY=4, READ_WRITE=6, CONSTANT=8, NO_SIZE=256 };
+    KernelArg(int _flags, UMat* _m, int wscale=1, const void* _obj=0, size_t _sz=0);
+    KernelArg();

    static KernelArg Local() { return KernelArg(LOCAL, 0); }
-    static KernelArg ReadOnly(const UMat& m) { return KernelArg(READ_ONLY, (UMat*)&m); }
-    static KernelArg WriteOnly(const UMat& m) { return KernelArg(WRITE_ONLY, (UMat*)&m); }
+    static KernelArg ReadWrite(const UMat& m, int wscale=1)
+    { return KernelArg(READ_WRITE, (UMat*)&m, wscale); }
+    static KernelArg ReadWriteNoSize(const UMat& m, int wscale=1)
+    { return KernelArg(READ_WRITE+NO_SIZE, (UMat*)&m, wscale); }
+    static KernelArg ReadOnly(const UMat& m, int wscale=1)
+    { return KernelArg(READ_ONLY, (UMat*)&m, wscale); }
+    static KernelArg WriteOnly(const UMat& m, int wscale=1)
+    { return KernelArg(WRITE_ONLY, (UMat*)&m, wscale); }
+    static KernelArg ReadOnlyNoSize(const UMat& m, int wscale=1)
+    { return KernelArg(READ_ONLY+NO_SIZE, (UMat*)&m, wscale); }
+    static KernelArg WriteOnlyNoSize(const UMat& m, int wscale=1)
+    { return KernelArg(WRITE_ONLY+NO_SIZE, (UMat*)&m, wscale); }
    static KernelArg Constant(const Mat& m);
    template<typename _Tp> static KernelArg Constant(const _Tp* arr, size_t n)
-    { return KernelArg(CONSTANT, 0, (void*)arr, n); }
+    { return KernelArg(CONSTANT, 0, 1, (const void*)arr, n); } // obj is const void*, so arr's constness is kept

    int flags;
    UMat* m;
-    void* obj;
+    const void* obj;
    size_t sz;
+    int wscale;
 };

+
 class CV_EXPORTS Kernel
 {
 public:
    Kernel();
    Kernel(const char* kname, const Program& prog);
-    Kernel(const char* kname, const ProgramSource& prog,
-           const String& buildopts, String& errmsg);
+    Kernel(const char* kname, const ProgramSource2& prog,
+           const String& buildopts, String* errmsg=0);
    ~Kernel();
    Kernel(const Kernel& k);
    Kernel& operator = (const Kernel& k);

+    bool empty() const;
    bool create(const char* kname, const Program& prog);
-    bool create(const char* kname, const ProgramSource& prog,
-                const String& buildopts, String& errmsg);
+    bool create(const char* kname, const ProgramSource2& prog,
+                const String& buildopts, String* errmsg=0);

-    void set(int i, const void* value, size_t sz);
-    void set(int i, const UMat& m);
-    void set(int i, const KernelArg& arg);
-    template<typename _Tp> void set(int i, const _Tp& value)
+    int set(int i, const void* value, size_t sz);
+    int set(int i, const UMat& m);
+    int set(int i, const KernelArg& arg);
+    template<typename _Tp> int set(int i, const _Tp& value)
    { return set(i, &value, sizeof(value)); }

    template<typename _Tp0>
@@ -291,26 +305,27 @@ public:
    template<typename _Tp0, typename _Tp1>
    Kernel& args(const _Tp0& a0, const _Tp1& a1)
    {
-        set(0, a0); set(1, a1); return *this;
+        int i = set(0, a0); set(i, a1); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2>
    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2)
    {
-        set(0, a0); set(1, a1); set(2, a2); return *this;
+        int i = set(0, a0); i = set(i, a1); set(i, a2); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3>
    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); set(i, a3); return *this; // each set() result is the index passed to the next call
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4>
    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2,
                 const _Tp3& a3, const _Tp4& a4)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2);
+        i = set(i, a3); set(i, a4); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2,
@@ -318,8 +333,8 @@ public:
    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2,
                 const _Tp3& a3, const _Tp4& a4, const _Tp5& a5)
    {
-        set(0, a0); set(1, a1); set(2, a2);
-        set(3, a3); set(4, a4); set(5, a5); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2);
+        i = set(i, a3); i = set(i, a4); set(i, a5); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
@@ -327,8 +342,8 @@ public:
    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3,
                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3);
-        set(4, a4); set(5, a5); set(6, a6); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3);
+        i = set(i, a4); i = set(i, a5); set(i, a6); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
@@ -336,8 +351,8 @@ public:
    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3,
                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3);
-        set(4, a4); set(5, a5); set(6, a6); set(7, a7); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3);
+        i = set(i, a4); i = set(i, a5); i = set(i, a6); set(i, a7); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4,
@@ -346,8 +361,8 @@ public:
                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
                 const _Tp8& a8)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4);
-        set(5, a5); set(6, a6); set(7, a7); set(8, a8); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
+        i = set(i, a5); i = set(i, a6); i = set(i, a7); set(i, a8); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4,
@@ -356,8 +371,8 @@ public:
                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
                 const _Tp8& a8, const _Tp9& a9)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5);
-        set(6, a6); set(7, a7); set(8, a8); set(9, a9); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); i = set(i, a5);
+        i = set(i, a6); i = set(i, a7); i = set(i, a8); set(i, a9); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
@@ -367,8 +382,8 @@ public:
                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
                 const _Tp8& a8, const _Tp9& a9, const _Tp10& a10)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5);
-        set(6, a6); set(7, a7); set(8, a8); set(9, a9); set(10, a10); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); i = set(i, a5);
+        i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); set(i, a10); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
@@ -378,13 +393,13 @@ public:
                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
                 const _Tp8& a8, const _Tp9& a9, const _Tp10& a10, const _Tp11& a11)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5);
-        set(6, a6); set(7, a7); set(8, a8); set(9, a9); set(10, a10); set(11, a11); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); i = set(i, a5);
+        i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); i = set(i, a10); set(i, a11); return *this;
    }

-    void run(int dims, size_t offset[], size_t globalsize[],
+    bool run(int dims, size_t globalsize[],
             size_t localsize[], bool sync, const Queue& q=Queue());
-    void runTask(bool sync, const Queue& q=Queue());
+    bool runTask(bool sync, const Queue& q=Queue());

    size_t workGroupSize() const;
    bool compileWorkGroupSize(size_t wsz[]) const;
@@ -401,7 +416,7 @@ class CV_EXPORTS Program
 {
 public:
    Program();
-    Program(const ProgramSource& src,
+    Program(const ProgramSource2& src,
            const String& buildflags, String& errmsg);
    explicit Program(const String& buf);
    Program(const Program& prog);
@@ -409,12 +424,12 @@ public:
    Program& operator = (const Program& prog);
    ~Program();

-    bool create(const ProgramSource& src,
+    bool create(const ProgramSource2& src,
                const String& buildflags, String& errmsg);
    bool read(const String& buf, const String& buildflags);
    bool write(String& buf) const;

-    const ProgramSource& source() const;
+    const ProgramSource2& source() const;
    void* ptr() const;

    String getPrefix() const;
@@ -426,17 +441,17 @@ protected:
 };


-class CV_EXPORTS ProgramSource
+class CV_EXPORTS ProgramSource2
 {
 public:
    typedef uint64 hash_t;

-    ProgramSource();
-    explicit ProgramSource(const String& prog);
-    explicit ProgramSource(const char* prog);
-    ~ProgramSource();
-    ProgramSource(const ProgramSource& prog);
-    ProgramSource& operator = (const ProgramSource& prog);
+    ProgramSource2();
+    explicit ProgramSource2(const String& prog);
+    explicit ProgramSource2(const char* prog);
+    ~ProgramSource2();
+    ProgramSource2(const ProgramSource2& prog);
+    ProgramSource2& operator = (const ProgramSource2& prog);

    const String& source() const;
    hash_t hash() const;
@@ -446,6 +461,10 @@ protected:
    Impl* p;
 };

+CV_EXPORTS const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf);
+CV_EXPORTS const char* typeToStr(int t);
+CV_EXPORTS const char* memopTypeToStr(int t);
+
 }}

 #endif
--- a/modules/core/include/opencv2/core/ocl_genbase.hpp
+++ b/modules/core/include/opencv2/core/ocl_genbase.hpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_OPENCL_GENBASE_HPP__
+#define __OPENCV_OPENCL_GENBASE_HPP__
+
+namespace cv
+{
+namespace ocl
+{
+
+struct ProgramEntry
+{
+    const char* name;
+    const char* programStr;
+    const char* programHash;
+};
+
+}
+}
+
+#endif
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -220,6 +220,21 @@ void Mat::copyTo( OutputArray _dst ) const
        return;
    }

+    if( _dst.isUMat() )
+    {
+        _dst.create( dims, size.p, type() );
+        UMat dst = _dst.getUMat();
+
+        size_t i, sz[CV_MAX_DIM], dstofs[CV_MAX_DIM], esz = elemSize();
+        for( i = 0; i < (size_t)dims; i++ )
+            sz[i] = size.p[i];
+        sz[dims-1] *= esz;
+        dst.ndoffset(dstofs);
+        dstofs[dims-1] *= esz;
+        dst.u->currAllocator->upload(dst.u, data, dims, sz, dstofs, dst.step.p, step.p);
+        return;
+    }
+
    if( dims <= 2 )
    {
        _dst.create( rows, cols, type() );

--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -1436,6 +1436,181 @@ Size _InputArray::size(int i) const
    }
 }

+
+int _InputArray::sizend(int* sz, int i) const
+{
+    int j, d=0, k = kind();
+
+    if( k == NONE )
+        ;
+    else if( k == MAT )
+    {
+        CV_Assert( i < 0 );
+        const Mat& m = *(const Mat*)obj;
+        d = m.dims;
+        if(sz)
+            for(j = 0; j < d; j++)
+                sz[j] = m.size.p[j];
+    }
+    else if( k == UMAT )
+    {
+        CV_Assert( i < 0 );
+        const UMat& m = *(const UMat*)obj;
+        d = m.dims;
+        if(sz)
+            for(j = 0; j < d; j++)
+                sz[j] = m.size.p[j];
+    }
+    else if( k == STD_VECTOR_MAT && i >= 0 )
+    {
+        const std::vector<Mat>& vv = *(const std::vector<Mat>*)obj;
+        CV_Assert( i < (int)vv.size() );
+        const Mat& m = vv[i];
+        d = m.dims;
+        if(sz)
+            for(j = 0; j < d; j++)
+                sz[j] = m.size.p[j];
+    }
+    else if( k == STD_VECTOR_UMAT && i >= 0 )
+    {
+        const std::vector<UMat>& vv = *(const std::vector<UMat>*)obj;
+        CV_Assert( i < (int)vv.size() );
+        const UMat& m = vv[i];
+        d = m.dims;
+        if(sz)
+            for(j = 0; j < d; j++)
+                sz[j] = m.size.p[j];
+    }
+    else
+    {
+        Size sz2d = size(i);
+        d = 2;
+        if(sz)
+        {
+            sz[0] = sz2d.height;
+            sz[1] = sz2d.width;
+        }
+    }
+
+    return d;
+}
+
+
+bool _InputArray::sameSize(const _InputArray& arr) const
+{
+    int k1 = kind(), k2 = arr.kind();
+    Size sz1;
+
+    if( k1 == MAT )
+    {
+        const Mat* m = ((const Mat*)obj);
+        if( k2 == MAT )
+            return m->size == ((const Mat*)arr.obj)->size;
+        if( k2 == UMAT )
+            return m->size == ((const UMat*)arr.obj)->size;
+        if( m->dims > 2 )
+            return false;
+        sz1 = m->size();
+    }
+    else if( k1 == UMAT )
+    {
+        const UMat* m = ((const UMat*)obj);
+        if( k2 == MAT )
+            return m->size == ((const Mat*)arr.obj)->size;
+        if( k2 == UMAT )
+            return m->size == ((const UMat*)arr.obj)->size;
+        if( m->dims > 2 )
+            return false;
+        sz1 = m->size();
+    }
+    else
+        sz1 = size();
+    if( arr.dims() > 2 )
+        return false;
+    return sz1 == arr.size();
+}
+
+// Returns the number of dimensions of the wrapped array. With i < 0 the object
+// itself is queried (a vector of vectors/Mat/UMat reports 1); with i >= 0 the
+// i-th element of such a vector is queried. Single-array kinds require i < 0.
+int _InputArray::dims(int i) const
+{
+    int k = kind();
+
+    if( k == MAT )
+    {
+        CV_Assert( i < 0 );
+        return ((const Mat*)obj)->dims;
+    }
+
+    if( k == EXPR )
+    {
+        CV_Assert( i < 0 );
+        return ((const MatExpr*)obj)->a.dims;
+    }
+
+    if( k == UMAT )
+    {
+        CV_Assert( i < 0 );
+        return ((const UMat*)obj)->dims;
+    }
+
+    if( k == MATX || k == STD_VECTOR )
+    {
+        CV_Assert( i < 0 );
+        return 2;
+    }
+
+    if( k == NONE )
+        return 0;
+
+    if( k == STD_VECTOR_VECTOR )
+    {
+        const std::vector<std::vector<uchar> >& vv = *(const std::vector<std::vector<uchar> >*)obj;
+        if( i < 0 )
+            return 1;
+        CV_Assert( i < (int)vv.size() );
+        return 2;
+    }
+
+    if( k == STD_VECTOR_MAT )
+    {
+        const std::vector<Mat>& vv = *(const std::vector<Mat>*)obj;
+        if( i < 0 )
+            return 1;
+        CV_Assert( i < (int)vv.size() );
+
+        return vv[i].dims;
+    }
+
+    // vector<UMat> mirrors vector<Mat>; sizend() and isContinuous() already handle this kind
+    if( k == STD_VECTOR_UMAT )
+    {
+        const std::vector<UMat>& vv = *(const std::vector<UMat>*)obj;
+        if( i < 0 )
+            return 1;
+        CV_Assert( i < (int)vv.size() );
+
+        return vv[i].dims;
+    }
+
+    if( k == OPENGL_BUFFER || k == GPU_MAT )
+    {
+        CV_Assert( i < 0 );
+        return 2;
+    }
+
+    // unlike the other single-array kinds, OCL_MAT does not check i
+    if( k == OCL_MAT )
+    {
+        return 2;
+    }
+
+    CV_Assert( k == CUDA_MEM );
+    CV_Assert( i < 0 );
+    return 2;
+}
+
 size_t _InputArray::total(int i) const
 {
    int k = kind();
@@ -1570,6 +1745,61 @@ bool _InputArray::empty() const
        return ((const cuda::CudaMem*)obj)->empty();
 }

+bool _InputArray::isContinuous(int i) const
+{
+    int k = kind();
+
+    if( k == MAT )
+        return i < 0 ? ((const Mat*)obj)->isContinuous() : true;
+
+    if( k == UMAT )
+        return i < 0 ? ((const UMat*)obj)->isContinuous() : true;
+
+    if( k == EXPR || k == MATX || k == STD_VECTOR || k == NONE || k == STD_VECTOR_VECTOR)
+        return true;
+
+    if( k == STD_VECTOR_MAT )
+    {
+        const std::vector<Mat>& vv = *(const std::vector<Mat>*)obj;
+        CV_Assert((size_t)i < vv.size()); // a negative i wraps to a huge size_t and is rejected as well
+        return vv[i].isContinuous();
+    }
+
+    if( k == STD_VECTOR_UMAT )
+    {
+        const std::vector<UMat>& vv = *(const std::vector<UMat>*)obj;
+        CV_Assert((size_t)i < vv.size()); // a negative i wraps to a huge size_t and is rejected as well
+        return vv[i].isContinuous();
+    }
+    // every kind not handled above ends up here, so the message must not single out oclMat
+    CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
+    return false;
+}
+
+void _InputArray::copyTo(const _OutputArray& arr) const
+{
+    int k = kind();
+
+    if( k == NONE )
+        arr.release();
+    else if( k == MAT || k == MATX || k == STD_VECTOR )
+    {
+        Mat m = getMat();
+        m.copyTo(arr);
+    }
+    else if( k == EXPR )
+    {
+        const MatExpr& e = *((MatExpr*)obj);
+        if( arr.kind() == MAT )
+            arr.getMatRef() = e;
+        else
+            Mat(e).copyTo(arr);
+    }
+    else if( k == UMAT )
+        ((UMat*)obj)->copyTo(arr);
+    else
+        CV_Error(Error::StsNotImplemented, "");
+}

 bool _OutputArray::fixedSize() const
 {
@@ -1899,6 +2129,12 @@ void _OutputArray::create(int dims, const int* sizes, int mtype, int i,
    CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
 }

+void _OutputArray::createSameSize(const _InputArray& arr, int mtype) const
+{
+    int sz[CV_MAX_DIM], d = arr.sizend(sz);
+    create(d, sz, mtype);
+}
+
 void _OutputArray::release() const
 {
    CV_Assert(!fixedSize());
@@ -2010,6 +2246,23 @@ cuda::CudaMem& _OutputArray::getCudaMemRef() const
    return *(cuda::CudaMem*)obj;
 }

+// Forwards setTo(arr) to the wrapped array.
+// A NONE output is left untouched; MAT, MATX and STD_VECTOR are handled
+// through the Mat header returned by getMat(); UMAT calls UMat::setTo.
+// Every other kind raises Error::StsNotImplemented.
+void _OutputArray::setTo(const _InputArray& arr) const
+{
+    switch( kind() )
+    {
+    case NONE:
+        break;
+
+    case MAT:
+    case MATX:
+    case STD_VECTOR:
+    {
+        Mat header = getMat();
+        header.setTo(arr);
+        break;
+    }
+
+    case UMAT:
+        ((UMat*)obj)->setTo(arr);
+        break;
+
+    default:
+        CV_Error(Error::StsNotImplemented, "");
+    }
+}
+
 static _InputOutputArray _none;
 InputOutputArray noArray() { return _none; }


--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
--- a/modules/core/src/opencl/arithm.cl
+++ b/modules/core/src/opencl/arithm.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/*
+  Usage:
+     after compiling this program user gets a single kernel called KF.
+     the following flags should be passed:
+     1) one of "-D BINARY_OP", "-D UNARY_OP", "-D MASK_BINARY_OP" or "-D MASK_UNARY_OP"
+     2) the actual operation performed, one of "-D OP_...", see below the list of operations.
+     2a) "-D dstDepth=<destination depth> [-D cn=<num channels>]"
+         for some operations, like min/max/and/or/xor it's enough
+     2b) "-D srcDepth1=<source1 depth> -D srcDepth2=<source2 depth> -D dstDepth=<destination depth>
+          -D workDepth=<work depth> [-D cn=<num channels>]" - for mixed-type operations
+*/
+
+// Enable double precision only when the host asks for it (DOUBLE_SUPPORT)
+// and the compiler advertises one of the fp64 extensions.
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
+
+#define CV_32S 4
+#define CV_32F 5
+
+// Element accessors. The buffers are byte pointers in __global memory, so the
+// casts must keep the __global qualifier: an unqualified pointer type in a
+// cast names the private address space, and OpenCL C forbids casting between
+// address spaces. Each accessor is parenthesised so it can be used safely
+// inside larger expressions.
+#define dstelem (*(__global dstT*)(dstptr + dst_index))
+#define noconvert(x) x
+
+#ifndef workT
+
+    // Same-type mode: sources, work type and destination all use dstT,
+    // so no conversion is applied on load or store.
+    #define srcT1 dstT
+    #define srcT2 dstT
+    #define workT dstT
+    #define srcelem1 (*(__global const dstT*)(srcptr1 + src1_index))
+    #define srcelem2 (*(__global const dstT*)(srcptr2 + src2_index))
+    #define convertToDT noconvert
+
+#else
+
+    // Mixed-type mode: the host supplies srcT1/srcT2/workT together with the
+    // convertToWT1/convertToWT2/convertToDT conversions.
+    #define srcelem1 convertToWT1(*(__global const srcT1*)(srcptr1 + src1_index))
+    #define srcelem2 convertToWT2(*(__global const srcT2*)(srcptr2 + src2_index))
+
+#endif
+
+// Extra kernel arguments; empty unless an operation below redefines it.
+#define EXTRA_PARAMS
+
+// PROCESS_ELEM is the per-element statement executed by every KF variant.
+// The first OP_* macro found defined in the chain below selects it; if none
+// is defined the build stops with "unknown op type". Operations that take
+// scalar arguments redefine EXTRA_PARAMS to append them to the kernel
+// signature.
+#if defined OP_ADD_SAT
+#define PROCESS_ELEM dstelem = add_sat(srcelem1, srcelem2)
+
+#elif defined OP_ADD
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1 + srcelem2)
+
+#elif defined OP_SUB_SAT
+#define PROCESS_ELEM dstelem = sub_sat(srcelem1, srcelem2)
+
+#elif defined OP_SUB
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1 - srcelem2)
+
+#elif defined OP_RSUB_SAT
+#define PROCESS_ELEM dstelem = sub_sat(srcelem2, srcelem1)
+
+#elif defined OP_RSUB
+#define PROCESS_ELEM dstelem = convertToDT(srcelem2 - srcelem1)
+
+#elif defined OP_ABSDIFF
+#define PROCESS_ELEM dstelem = abs_diff(srcelem1, srcelem2)
+
+#elif defined OP_AND
+#define PROCESS_ELEM dstelem = srcelem1 & srcelem2
+
+#elif defined OP_OR
+#define PROCESS_ELEM dstelem = srcelem1 | srcelem2
+
+#elif defined OP_XOR
+#define PROCESS_ELEM dstelem = srcelem1 ^ srcelem2
+
+#elif defined OP_NOT
+#define PROCESS_ELEM dstelem = ~srcelem1
+
+#elif defined OP_MIN
+#define PROCESS_ELEM dstelem = min(srcelem1, srcelem2)
+
+#elif defined OP_MAX
+#define PROCESS_ELEM dstelem = max(srcelem1, srcelem2)
+
+#elif defined OP_MUL
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2)
+
+#elif defined OP_MUL_SCALE
+#undef EXTRA_PARAMS
+#define EXTRA_PARAMS , workT scale
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2 * scale)
+
+// The division variants write zero when the divisor is zero.
+#elif defined OP_DIV
+#define PROCESS_ELEM \
+        workT e2 = srcelem2, zero = (workT)(0); \
+        dstelem = convertToDT(e2 != zero ? srcelem1 / e2 : zero)
+
+#elif defined OP_DIV_SCALE
+#undef EXTRA_PARAMS
+#define EXTRA_PARAMS , workT scale
+#define PROCESS_ELEM \
+        workT e2 = srcelem2, zero = (workT)(0); \
+        dstelem = convertToDT(e2 != zero ? srcelem1 * scale / e2 : zero)
+
+#elif defined OP_RECIP_SCALE
+#undef EXTRA_PARAMS
+#define EXTRA_PARAMS , workT scale
+#define PROCESS_ELEM \
+        workT e1 = srcelem1, zero = (workT)(0); \
+        dstelem = convertToDT(e1 != zero ? scale / e1 : zero)
+
+#elif defined OP_ADDW
+#undef EXTRA_PARAMS
+#define EXTRA_PARAMS , workT alpha, workT beta, workT gamma
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + srcelem2*beta + gamma)
+
+#elif defined OP_MAG
+#define PROCESS_ELEM dstelem = hypot(srcelem1, srcelem2)
+
+// Phase: atan2 result shifted into [0, 2*pi) (radians) or [0, 360) (degrees).
+#elif defined OP_PHASE_RADIANS
+#define PROCESS_ELEM \
+        workT tmp = atan2(srcelem2, srcelem1); \
+        if(tmp < 0) tmp += 6.283185307179586232; \
+        dstelem = tmp
+
+#elif defined OP_PHASE_DEGREES
+#define PROCESS_ELEM \
+        workT tmp = atan2(srcelem2, srcelem1)*57.29577951308232286465; \
+        if(tmp < 0) tmp += 360; \
+        dstelem = tmp
+
+#elif defined OP_EXP
+#define PROCESS_ELEM dstelem = exp(srcelem1)
+
+#elif defined OP_SQRT
+#define PROCESS_ELEM dstelem = sqrt(srcelem1)
+
+// log() takes a floating-point argument; abs() is the integer built-in in
+// OpenCL C, so the floating-point fabs() is used for the magnitude.
+#elif defined OP_LOG
+#define PROCESS_ELEM dstelem = log(fabs(srcelem1))
+
+// CMP_OPERATOR is supplied by the host; the result is 255 or 0.
+#elif defined OP_CMP
+#define PROCESS_ELEM dstelem = convert_uchar(srcelem1 CMP_OPERATOR srcelem2 ? 255 : 0)
+
+#elif defined OP_CONVERT
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1)
+
+#elif defined OP_CONVERT_SCALE
+#undef EXTRA_PARAMS
+#define EXTRA_PARAMS , workT alpha, workT beta
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + beta)
+
+#else
+#error "unknown op type"
+#endif
+
+// Unary (array-with-scalar) variants: the second operand is not read from a
+// buffer, so the srcelem2 accessor macro is removed and, for the two-operand
+// operations listed here, srcelem2 becomes an extra kernel argument instead.
+#if defined UNARY_OP || defined MASK_UNARY_OP
+#undef srcelem2
+// The saturating operations are selected above as OP_ADD_SAT / OP_SUB_SAT /
+// OP_RSUB_SAT, so those spellings must be tested here; the older OP_SAT_*
+// spellings are still accepted for compatibility.
+#if defined OP_AND || defined OP_OR || defined OP_XOR || \
+    defined OP_ADD || defined OP_ADD_SAT || defined OP_SAT_ADD || \
+    defined OP_SUB || defined OP_SUB_SAT || defined OP_SAT_SUB || \
+    defined OP_RSUB || defined OP_RSUB_SAT || defined OP_SAT_RSUB || \
+    defined OP_ABSDIFF || defined OP_CMP || defined OP_MIN || defined OP_MAX
+    #undef EXTRA_PARAMS
+    #define EXTRA_PARAMS , workT srcelem2
+#endif
+#endif
+
+#if defined BINARY_OP
+
+// Two-array variant: one work-item per (x, y) element applies PROCESS_ELEM.
+// Steps and offsets are byte quantities (they are added to uchar pointers).
+__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
+                 __global const uchar* srcptr2, int srcstep2, int srcoffset2,
+                 __global uchar* dstptr, int dststep, int dstoffset,
+                 int rows, int cols EXTRA_PARAMS )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    // The global range may be larger than the image; skip out-of-range items.
+    if (x < cols && y < rows)
+    {
+        // sizeof yields size_t; cast it so all three mad24 operands are int.
+        int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1);
+        int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2) + srcoffset2);
+        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
+
+        PROCESS_ELEM;
+    }
+}
+
+#elif defined MASK_BINARY_OP
+
+// Masked two-array variant: PROCESS_ELEM runs only where the mask byte is
+// non-zero; other destination elements are left as they were.
+// Steps and offsets are byte quantities (they are added to uchar pointers).
+__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
+                 __global const uchar* srcptr2, int srcstep2, int srcoffset2,
+                 __global const uchar* mask, int maskstep, int maskoffset,
+                 __global uchar* dstptr, int dststep, int dstoffset,
+                 int rows, int cols EXTRA_PARAMS )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        // The mask is indexed one byte per element.
+        int mask_index = mad24(y, maskstep, x + maskoffset);
+        if( mask[mask_index] )
+        {
+            // sizeof yields size_t; cast it so all three mad24 operands are int.
+            int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1);
+            int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2) + srcoffset2);
+            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
+
+            PROCESS_ELEM;
+        }
+    }
+}
+
+#elif defined UNARY_OP
+
+// Single-array variant: one work-item per (x, y) element applies PROCESS_ELEM.
+// For two-operand operations the second operand arrives through EXTRA_PARAMS.
+// Steps and offsets are byte quantities (they are added to uchar pointers).
+__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
+                 __global uchar* dstptr, int dststep, int dstoffset,
+                 int rows, int cols EXTRA_PARAMS )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        // sizeof yields size_t; cast it so all three mad24 operands are int.
+        int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1);
+        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
+
+        PROCESS_ELEM;
+    }
+}
+
+#elif defined MASK_UNARY_OP
+
+// Masked single-array variant: PROCESS_ELEM runs only where the mask byte is
+// non-zero; other destination elements are left as they were.
+// Steps and offsets are byte quantities (they are added to uchar pointers).
+__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
+                 __global const uchar* mask, int maskstep, int maskoffset,
+                 __global uchar* dstptr, int dststep, int dstoffset,
+                 int rows, int cols EXTRA_PARAMS )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        // The mask is indexed one byte per element.
+        int mask_index = mad24(y, maskstep, x + maskoffset);
+        if( mask[mask_index] )
+        {
+            // sizeof yields size_t; cast it so all three mad24 operands are int.
+            int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1);
+            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
+
+            PROCESS_ELEM;
+        }
+    }
+}
+
+#else
+
+#error "Unknown operation type"
+
+#endif
+
+
+
+
--- a/modules/core/src/opencl/copyset.cl
+++ b/modules/core/src/opencl/copyset.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+// Writes `value` to every destination element whose mask byte is non-zero.
+// dstT is supplied by the host at build time; steps and offsets are byte
+// quantities (they are added to uchar pointers).
+__kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset,
+                      __global uchar* dstptr, int dststep, int dstoffset,
+                      int rows, int cols, dstT value )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        // The mask is indexed one byte per element.
+        int mask_index = mad24(y, maskstep, x + maskoffset);
+        if( mask[mask_index] )
+        {
+            // sizeof yields size_t; cast it so all three mad24 operands are int.
+            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
+            // dstptr is in __global memory, so the cast keeps that address space.
+            *(__global dstT*)(dstptr + dst_index) = value;
+        }
+    }
+}
+
+// Writes `value` to every element of the rows x cols destination.
+// dstT is supplied by the host at build time; dststep and dstoffset are byte
+// quantities (they are added to a uchar pointer).
+__kernel void set(__global uchar* dstptr, int dststep, int dstoffset,
+                  int rows, int cols, dstT value )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        // sizeof yields size_t; cast it so all three mad24 operands are int.
+        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
+        // dstptr is in __global memory, so the cast keeps that address space.
+        *(__global dstT*)(dstptr + dst_index) = value;
+    }
+}
+
--- a/modules/core/src/opencl/mulspectrums.cl
+++ b/modules/core/src/opencl/mulspectrums.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+// Complex number stored as (x = real part, y = imaginary part).
+typedef float2 cfloat;
+
+// Complex product a * b.
+inline cfloat cmulf(cfloat a, cfloat b)
+{
+    float re = a.x*b.x - a.y*b.y;
+    float im = a.x*b.y + a.y*b.x;
+    return (cfloat)( re, im );
+}
+
+// Complex conjugate: keeps the x component and negates the y component.
+inline cfloat conjf(cfloat a)
+{
+    cfloat result = a;
+    result.y = - a.y;
+    return result;
+}
+
+// dst = (a * b) * scale, element-wise over complex values.
+// mstep is divided by sizeof(cfloat) to obtain the row pitch in elements;
+// the same pitch is used for a, b and dst.
+__kernel void
+mulAndScaleSpectrumsKernel(
+    __global const cfloat* a,
+    __global const cfloat* b,
+    float scale,
+    __global cfloat* dst,
+    uint cols,
+    uint rows,
+    uint mstep
+)
+{
+    const uint x = get_global_id(0);
+    const uint y = get_global_id(1);
+    // sizeof yields size_t; cast it so all three mad24 operands are uint.
+    const uint idx = mad24(y, mstep / (uint)sizeof(cfloat), x);
+    if (x < cols && y < rows)
+    {
+        cfloat v = cmulf(a[idx], b[idx]);
+        dst[idx] = (cfloat)( v.x * scale, v.y * scale );
+    }
+}
+
+// dst = (a * conj(b)) * scale, element-wise over complex values.
+// mstep is divided by sizeof(cfloat) to obtain the row pitch in elements;
+// the same pitch is used for a, b and dst.
+__kernel void
+mulAndScaleSpectrumsKernel_CONJ(
+    __global const cfloat* a,
+    __global const cfloat* b,
+    float scale,
+    __global cfloat* dst,
+    uint cols,
+    uint rows,
+    uint mstep
+)
+{
+    const uint x = get_global_id(0);
+    const uint y = get_global_id(1);
+    // sizeof yields size_t; cast it so all three mad24 operands are uint.
+    const uint idx = mad24(y, mstep / (uint)sizeof(cfloat), x);
+    if (x < cols && y < rows)
+    {
+        cfloat v = cmulf(a[idx], conjf(b[idx]));
+        dst[idx] = (cfloat)( v.x * scale, v.y * scale );
+    }
+}
--- a/modules/core/src/opencl/polarcart.cl
+++ b/modules/core/src/opencl/polarcart.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+// NOTE(review): despite its name, this kernel performs a masked fill (it
+// writes `value` wherever the mask byte is non-zero), exactly like setMask in
+// copyset.cl. It looks like a placeholder for the real polar-to-Cartesian
+// conversion; confirm before relying on it.
+__kernel void polarToCart(__global const uchar* mask, int maskstep, int maskoffset,
+                          __global uchar* dstptr, int dststep, int dstoffset,
+                          int rows, int cols, dstT value )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        // The mask is indexed one byte per element.
+        int mask_index = mad24(y, maskstep, x + maskoffset);
+        if( mask[mask_index] )
+        {
+            // sizeof yields size_t; cast it so all three mad24 operands are int.
+            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
+            // dstptr is in __global memory, so the cast keeps that address space.
+            *(__global dstT*)(dstptr + dst_index) = value;
+        }
+    }
+}
+
+// NOTE(review): despite its name, this kernel performs a plain fill (it
+// writes `value` to every element), exactly like set in copyset.cl. It looks
+// like a placeholder for the real Cartesian-to-polar conversion; confirm
+// before relying on it.
+__kernel void cartToPolar(__global uchar* dstptr, int dststep, int dstoffset,
+                          int rows, int cols, dstT value )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        // sizeof yields size_t; cast it so all three mad24 operands are int.
+        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
+        // dstptr is in __global memory, so the cast keeps that address space.
+        *(__global dstT*)(dstptr + dst_index) = value;
+    }
+}
--- a/modules/core/src/opencl/reductions.cl
+++ b/modules/core/src/opencl/reductions.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Shengen Yan,yanshengen@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+// Enable double precision only when the host asks for it (DOUBLE_SUPPORT)
+// and the compiler advertises one of the fp64 extensions.
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
+
+// FUNC(a, b) folds element a into the running total b. The first of
+// FUNC_SUM (b += a), FUNC_ABS_SUM (b += |a|) or FUNC_SQR_SUM (b += a*a) that
+// evaluates to non-zero selects the accumulation; if none does, the build
+// stops with the #error below. Note that each expansion already ends in ';'.
+#if FUNC_SUM
+#define FUNC(a, b) b += a;
+#elif FUNC_ABS_SUM
+#define FUNC(a, b) b += a >= (dstT)(0) ? a : -a;
+#elif FUNC_SQR_SUM
+#define FUNC(a, b) b += a * a;
+#else
+#error No sum function
+#endif
+
+/**************************************Array buffer SUM**************************************/
+
+// Partial reduction: every work-item accumulates FUNC over a strided subset
+// of src, the work-group combines those totals in local memory, and work-item
+// 0 writes the group's result to dst[gid]. The per-group results in dst are
+// not combined here.
+// NOTE(review): the 128-entry local buffer, the `lid > 127` split and the
+// `groupnum << 8` stride all look like they assume a work-group size of
+// exactly 256 - confirm against the host launch code.
+__kernel void arithm_op_sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum,
+                                __global srcT *src, __global dstT *dst)
+{
+   unsigned int lid = get_local_id(0);
+   unsigned int gid = get_group_id(0);
+   unsigned int id = get_global_id(0);
+   // This initial value is recomputed inside the loop before it is used.
+   unsigned int idx = offset + id + (id / cols) * invalid_cols;
+
+   __local dstT localmem_sum[128];
+   dstT sum = (dstT)(0), temp;
+
+   // Visit elements id, id + grainSize, id + 2*grainSize, ... below elemnum.
+   for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
+   {
+       // id / cols is the row of logical element id; invalid_cols extra
+       // elements are skipped per row (presumably row padding - confirm).
+       idx = offset + id + (id / cols) * invalid_cols;
+       temp = convertToDstT(src[idx]);
+       FUNC(temp, sum);
+   }
+
+   // Work-items 128 and above park their totals in the local buffer...
+   if (lid > 127)
+       localmem_sum[lid - 128] = sum;
+   barrier(CLK_LOCAL_MEM_FENCE);
+
+   // ...and work-items below 128 add their own total to the parked one.
+   if (lid < 128)
+       localmem_sum[lid] = sum + localmem_sum[lid];
+   barrier(CLK_LOCAL_MEM_FENCE);
+
+   // Halving steps: 128 entries -> 64 -> ... -> 1, with a barrier after each.
+   for (int lsize = 64; lsize > 0; lsize >>= 1)
+   {
+       if (lid < lsize)
+       {
+           int lid2 = lsize + lid;
+           localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2];
+       }
+       barrier(CLK_LOCAL_MEM_FENCE);
+   }
+
+   if (lid == 0)
+       dst[gid] = localmem_sum[0];
+}
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -67,6 +67,8 @@
 #define GET_OPTIMIZED(func) (func)
 #endif

+#include "opencl_kernels.hpp"
+
 namespace cv
 {

@@ -205,13 +207,30 @@ enum { BLOCK_SIZE = 1024 };

+// Checks whether sc can be used as a scalar operand for an array of type atype.
+// sc must have at most 2 dimensions, be continuous and have width 1 or
+// height 1. With cn = CV_MAT_CN(atype), its size must be Size(1, 1),
+// Size(1, cn), Size(cn, 1), or Size(1, 4) with type CV_64F and cn <= 4.
+// The check also fails when akind is MATX but sckind is not.
 inline bool checkScalar(const Mat& sc, int atype, int sckind, int akind)
 {
-    if( sc.dims > 2 || (sc.cols != 1 && sc.rows != 1) || !sc.isContinuous() )
+    if( sc.dims > 2 || !sc.isContinuous() )
+        return false;
+    Size sz = sc.size();
+    if(sz.width != 1 && sz.height != 1)
+        return false;
+    int cn = CV_MAT_CN(atype);
+    if( akind == _InputArray::MATX && sckind != _InputArray::MATX )
+        return false;
+    return sz == Size(1, 1) || sz == Size(1, cn) || sz == Size(cn, 1) ||
+           (sz == Size(1, 4) && sc.type() == CV_64F && cn <= 4);
+}
+
+// InputArray overload of the scalar check: the same conditions are tested
+// through the InputArray accessors (dims(), isContinuous(), size(), type()).
+// sc must have at most 2 dimensions, be continuous and have width 1 or
+// height 1. With cn = CV_MAT_CN(atype), its size must be Size(1, 1),
+// Size(1, cn), Size(cn, 1), or Size(1, 4) with type CV_64F and cn <= 4.
+// The check also fails when akind is MATX but sckind is not.
+inline bool checkScalar(InputArray sc, int atype, int sckind, int akind)
+{
+    if( sc.dims() > 2 || !sc.isContinuous() )
+        return false;
+    Size sz = sc.size();
+    if(sz.width != 1 && sz.height != 1)
         return false;
     int cn = CV_MAT_CN(atype);
     if( akind == _InputArray::MATX && sckind != _InputArray::MATX )
         return false;
-    return sc.size() == Size(1, 1) || sc.size() == Size(1, cn) || sc.size() == Size(cn, 1) ||
-           (sc.size() == Size(1, 4) && sc.type() == CV_64F && cn <= 4);
+    return sz == Size(1, 1) || sz == Size(1, cn) || sz == Size(cn, 1) ||
+           (sz == Size(1, 4) && sc.type() == CV_64F && cn <= 4);
 }

 void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize );
@@ -227,7 +246,10 @@ struct TLSData
    static TLSData* get();
 };

-namespace ocl { MatAllocator* getOpenCLAllocator(); }
+namespace ocl
+{
+    MatAllocator* getOpenCLAllocator();
+}

 }


--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@@ -197,6 +197,7 @@ UMat Mat::getUMat(int accessFlags) const
    if(!u)
        return hdr;
    UMat::getStdAllocator()->allocate(u, accessFlags);
+    hdr.flags = flags;
    setSize(hdr, dims, size.p, step.p);
    finalizeHdr(hdr);
    hdr.u = u;
@@ -548,7 +549,8 @@ Mat UMat::getMat(int accessFlags) const
    CV_Assert(u->data != 0);
    Mat hdr(dims, size.p, type(), u->data + offset, step.p);
    hdr.u = u;
-    hdr.datastart = hdr.data = u->data;
+    hdr.datastart = u->data;
+    hdr.data = hdr.datastart + offset;
    hdr.datalimit = hdr.dataend = u->data + u->size;
    CV_XADD(&hdr.u->refcount, 1);
    return hdr;
@@ -617,7 +619,7 @@ void UMat::copyTo(OutputArray _dst) const
        void* dsthandle = dst.handle(ACCESS_WRITE);
        if( srchandle == dsthandle && dst.offset == offset )
            return;
-        ndoffset(dstofs);
+        dst.ndoffset(dstofs);
        CV_Assert(u->currAllocator == dst.u->currAllocator);
        u->currAllocator->copy(u, dst.u, dims, sz, srcofs, step.p, dstofs, dst.step.p, false);
    }
@@ -633,6 +635,50 @@ void UMat::convertTo(OutputArray, int, double, double) const
    CV_Error(Error::StsNotImplemented, "");
 }

+// Assigns _value to every element, or only to the elements selected by _mask
+// when a non-empty mask is given. Returns *this.
+//
+// When OpenCL is in use and the matrix has at most 2 dimensions and at most
+// 4 channels, the "set" / "setMask" kernel from copyset.cl is tried first.
+// If that path is not taken, or the kernel cannot be created or run, the
+// data is accessed through getMat() and Mat::setTo does the work.
+// The scalar is validated with checkScalar() on the OpenCL path; a mask must
+// have the same size as the matrix and type CV_8U.
+UMat& UMat::setTo(InputArray _value, InputArray _mask)
+{
+    bool haveMask = !_mask.empty();
+    int t = type(), cn = CV_MAT_CN(t);
+    if( dims <= 2 && cn <= 4 && ocl::useOpenCL() )
+    {
+        Mat value = _value.getMat();
+        CV_Assert( checkScalar(value, type(), _value.kind(), _InputArray::UMAT) );
+        // buf is handed to the kernel below as CV_ELEM_SIZE(t) bytes,
+        // i.e. as one element of type t.
+        double buf[4];
+        convertAndUnrollScalar(value, t, (uchar*)buf, 1);
+
+        char opts[1024];
+        sprintf(opts, "-D dstT=%s", ocl::memopTypeToStr(t));
+
+        ocl::Kernel setK(haveMask ? "setMask" : "set", ocl::core::copyset_oclsrc, opts);
+        if( !setK.empty() )
+        {
+            ocl::KernelArg scalararg(0, 0, 0, buf, CV_ELEM_SIZE(t));
+            UMat mask;
+
+            if( haveMask )
+            {
+                mask = _mask.getUMat();
+                CV_Assert( mask.size() == size() && mask.type() == CV_8U );
+                ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
+                // Unselected elements keep their old values, hence read-write.
+                ocl::KernelArg dstarg = ocl::KernelArg::ReadWrite(*this);
+                setK.args(maskarg, dstarg, scalararg);
+            }
+            else
+            {
+                ocl::KernelArg dstarg = ocl::KernelArg::WriteOnly(*this);
+                setK.args(dstarg, scalararg);
+            }
+
+            // cols/rows are int: convert explicitly, because an implicit
+            // int -> size_t conversion inside braces is a narrowing conversion.
+            size_t globalsize[] = { (size_t)cols, (size_t)rows };
+            if( setK.run(2, globalsize, 0, false) )
+                return *this;
+        }
+    }
+    // Fallback: let Mat::setTo operate on the data obtained through getMat().
+    Mat m = getMat(haveMask ? ACCESS_RW : ACCESS_WRITE);
+    m.setTo(_value, _mask);
+    return *this;
+}
+
 UMat& UMat::operator = (const Scalar&)
 {
    CV_Error(Error::StsNotImplemented, "");

--- a/modules/core/test/test_umat.cpp
+++ b/modules/core/test/test_umat.cpp
@@ -91,11 +91,11 @@ bool CV_UMatTest::TestUMat()
 {
    try
    {
-        Mat a(100, 100, CV_16S), b;
+        Mat a(100, 100, CV_16SC2), b, c;
        randu(a, Scalar::all(-100), Scalar::all(100));
-        Rect roi(1, 3, 10, 20);
-        Mat ra(a, roi), rb;
-        UMat ua, ura;
+        Rect roi(1, 3, 5, 4);
+        Mat ra(a, roi), rb, rc, rc0;
+        UMat ua, ura, ub, urb, uc, urc;
        a.copyTo(ua);
        ua.copyTo(b);
        CHECK_DIFF(a, b);
@@ -112,6 +112,71 @@ bool CV_UMatTest::TestUMat()
        }
        ra.copyTo(rb);
        CHECK_DIFF(ra, rb);
+
+        b = a.clone();
+        ra = a(roi);
+        rb = b(roi);
+        randu(b, Scalar::all(-100), Scalar::all(100));
+        b.copyTo(ub);
+        urb = ub(roi);
+
+        /*std::cout << "==============================================\nbefore op (CPU):\n";
+        std::cout << "ra: " << ra << std::endl;
+        std::cout << "rb: " << rb << std::endl;*/
+
+        ra.copyTo(ura);
+        rb.copyTo(urb);
+        ra.release();
+        rb.release();
+        ura.copyTo(ra);
+        urb.copyTo(rb);
+
+        /*std::cout << "==============================================\nbefore op (GPU):\n";
+        std::cout << "ra: " << ra << std::endl;
+        std::cout << "rb: " << rb << std::endl;*/
+
+        cv::max(ra, rb, rc);
+        cv::max(ura, urb, urc);
+        urc.copyTo(rc0);
+
+        /*std::cout << "==============================================\nafter op:\n";
+        std::cout << "rc: " << rc << std::endl;
+        std::cout << "rc0: " << rc0 << std::endl;*/
+
+        CHECK_DIFF(rc0, rc);
+
+        {
+        UMat tmp = rc0.getUMat(ACCESS_WRITE);
+        cv::max(ura, urb, tmp);
+        }
+        CHECK_DIFF(rc0, rc);
+
+        ura.copyTo(urc);
+        cv::max(urc, urb, urc);
+        urc.copyTo(rc0);
+        CHECK_DIFF(rc0, rc);
+
+        rc = ra ^ rb;
+        cv::bitwise_xor(ura, urb, urc);
+        urc.copyTo(rc0);
+
+        /*std::cout << "==============================================\nafter op:\n";
+        std::cout << "ra: " << rc0 << std::endl;
+        std::cout << "rc: " << rc << std::endl;*/
+
+        CHECK_DIFF(rc0, rc);
+
+        rc = ra + rb;
+        cv::add(ura, urb, urc);
+        urc.copyTo(rc0);
+
+        CHECK_DIFF(rc0, rc);
+
+        cv::subtract(ra, Scalar::all(5), rc);
+        cv::subtract(ura, Scalar::all(5), urc);
+        urc.copyTo(rc0);
+
+        CHECK_DIFF(rc0, rc);
    }
    catch (const test_excep& e)
    {

--- a/modules/highgui/include/opencv2/highgui.hpp
+++ b/modules/highgui/include/opencv2/highgui.hpp
@@ -511,9 +511,10 @@ public:
    CV_WRAP virtual void release();

    CV_WRAP virtual bool grab();
-    CV_WRAP virtual bool retrieve(CV_OUT Mat& image, int flag = 0);
+    CV_WRAP virtual bool retrieve(OutputArray image, int flag = 0);
    virtual VideoCapture& operator >> (CV_OUT Mat& image);
-    CV_WRAP virtual bool read(CV_OUT Mat& image);
+    virtual VideoCapture& operator >> (CV_OUT UMat& image);
+    CV_WRAP virtual bool read(OutputArray image);

    CV_WRAP virtual bool set(int propId, double value);
    CV_WRAP virtual double get(int propId);

--- a/modules/highgui/src/cap.cpp
+++ b/modules/highgui/src/cap.cpp
@@ -515,7 +515,7 @@ bool VideoCapture::grab()
    return cvGrabFrame(cap) != 0;
 }

-bool VideoCapture::retrieve(Mat& image, int channel)
+bool VideoCapture::retrieve(OutputArray image, int channel)
 {
    IplImage* _img = cvRetrieveFrame(cap, channel);
    if( !_img )
@@ -533,7 +533,7 @@ bool VideoCapture::retrieve(Mat& image, int channel)
    return true;
 }

-bool VideoCapture::read(Mat& image)
+bool VideoCapture::read(OutputArray image)
 {
    if(grab())
        retrieve(image);
@@ -548,6 +548,12 @@ VideoCapture& VideoCapture::operator >> (Mat& image)
    return *this;
 }

+VideoCapture& VideoCapture::operator >> (UMat& image)
+{
+    this->read(image);  // the bool result of read() is not inspected here
+    return *this;       // hand the capture object back so extractions can be chained
+}
+
 bool VideoCapture::set(int propId, double value)
 {
    return cvSetCaptureProperty(cap, propId, value) != 0;

--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -2687,6 +2687,124 @@ struct mRGBA2RGBA
    }
 };

+
+// OpenCL branch of cvtColor(): selects the kernel for `code`, creates _dst and runs the
+// kernel on the UMat data. Returns true only if a kernel was built and ran successfully;
+// false (unsupported depth or code, failed run) lets the caller use the CPU code instead.
+static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
+{
+    UMat src = _src.getUMat(), dst;
+    Size sz = src.size(), dstSz = sz;
+    int scn = src.channels(), depth = src.depth(), bidx;
+    size_t globalsize[] = { src.cols, src.rows }; // [0] spans columns, [1] spans rows
+    ocl::Kernel k;
+
+    if(depth != CV_8U && depth != CV_16U && depth != CV_32F)
+        return false;
+
+    // bidx: 0 for BGR-ordered codes, 2 for RGB-ordered ones; dcn: channel count of _dst.
+    switch (code)
+    {
+    /* conversions that have no OpenCL kernel here:
+     case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2BGR:
+     case COLOR_RGBA2BGR: case COLOR_RGB2BGR: case COLOR_BGRA2RGBA:
+     case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_RGB2BGR565: case COLOR_RGB2BGR555:
+     case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR565: case COLOR_RGBA2BGR555:
+     case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB:
+     case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA:
+     */
+    case COLOR_BGR2GRAY:
+    case COLOR_BGRA2GRAY:
+    case COLOR_RGB2GRAY:
+    case COLOR_RGBA2GRAY:
+    {
+        CV_Assert(scn == 3 || scn == 4);
+        bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2;
+        dcn = 1;
+        k.create("RGB2Gray", ocl::imgproc::cvtcolor_oclsrc,
+                 format("-D depth=%d -D scn=%d -D dcn=1 -D bidx=%d", depth, scn, bidx));
+        break;
+    }
+    case COLOR_GRAY2BGR:
+    case COLOR_GRAY2BGRA:
+    {
+        CV_Assert(scn == 1);
+        dcn = code == COLOR_GRAY2BGRA ? 4 : 3;
+        k.create("Gray2RGB", ocl::imgproc::cvtcolor_oclsrc,
+                 format("-D depth=%d -D scn=1 -D dcn=%d", depth, dcn));
+        break;
+    }
+    case COLOR_BGR2YUV:
+    case COLOR_RGB2YUV:
+    {
+        CV_Assert(scn == 3 || scn == 4);
+        bidx = code == COLOR_BGR2YUV ? 0 : 2;
+        dcn = 3;
+        k.create("RGB2YUV", ocl::imgproc::cvtcolor_oclsrc,
+                 format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx));
+        break;
+    }
+    case COLOR_YUV2BGR:
+    case COLOR_YUV2RGB:
+    {
+        if(dcn <= 0) dcn = 3; // a non-positive dcn is treated as "not specified"
+        CV_Assert(dcn == 3 || dcn == 4);
+        bidx = code == COLOR_YUV2BGR ? 0 : 2;
+        k.create("YUV2RGB", ocl::imgproc::cvtcolor_oclsrc,
+                 format("-D depth=%d -D scn=3 -D dcn=%d -D bidx=%d", depth, dcn, bidx));
+        break;
+    }
+    case COLOR_YUV2RGB_NV12:
+    case COLOR_YUV2BGR_NV12:
+    case COLOR_YUV2RGBA_NV12:
+    case COLOR_YUV2BGRA_NV12:
+    {
+        CV_Assert( scn == 1 );
+        CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
+        dcn  = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2RGBA_NV12 ? 4 : 3;
+        bidx = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2BGR_NV12 ? 0 : 2;
+
+        dstSz = Size(sz.width, sz.height * 2 / 3);
+        globalsize[0] = dstSz.width/2;  // same column/row order as the default above
+        globalsize[1] = dstSz.height/2;
+        k.create("YUV2RGBA_NV12", ocl::imgproc::cvtcolor_oclsrc,
+                 format("-D depth=0 -D scn=1 -D dcn=%d -D bidx=%d", dcn, bidx));
+        break;
+    }
+    case COLOR_BGR2YCrCb:
+    case COLOR_RGB2YCrCb:
+    {
+        CV_Assert(scn == 3 || scn == 4);
+        bidx = code == COLOR_BGR2YCrCb ? 0 : 2;
+        dcn = 3;
+        k.create("RGB2YCrCb", ocl::imgproc::cvtcolor_oclsrc,
+                 format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx));
+        break;
+    }
+    /* conversions that have no OpenCL kernel here:
+     case COLOR_YCrCb2BGR: case COLOR_YCrCb2RGB:
+     case COLOR_BGR5652GRAY: case COLOR_BGR5552GRAY:
+     case COLOR_GRAY2BGR565: case COLOR_GRAY2BGR555:
+     case COLOR_BGR2XYZ: case COLOR_RGB2XYZ:
+     case COLOR_XYZ2BGR: case COLOR_XYZ2RGB:
+     case COLOR_BGR2HSV: case COLOR_RGB2HSV: case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL:
+     case COLOR_BGR2HLS: case COLOR_RGB2HLS: case COLOR_BGR2HLS_FULL: case COLOR_RGB2HLS_FULL:
+     case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL:
+     case COLOR_HLS2BGR: case COLOR_HLS2RGB: case COLOR_HLS2BGR_FULL: case COLOR_HLS2RGB_FULL:
+     */
+    default:
+        break; // k stays empty, which is reported as "not handled" below
+    }
+
+    if( k.empty() )
+        return false; // nothing was converted, so success must not be reported
+
+    _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
+    dst = _dst.getUMat();
+    k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
+    return k.run(2, globalsize, 0, false);
+}
+
 }//namespace cv

 //////////////////////////////////////////////////////////////////////////////////////////
@@ -2695,9 +2813,15 @@ struct mRGBA2RGBA

 void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 {
+    bool use_opencl = ocl::useOpenCL() && _dst.kind() == _InputArray::UMAT;
+    int stype = _src.type();
+    int scn = CV_MAT_CN(stype), depth = CV_MAT_DEPTH(stype), bidx;
+
+    if( use_opencl && ocl_cvtColor(_src, _dst, code, dcn) )
+        return;
+
    Mat src = _src.getMat(), dst;
    Size sz = src.size();
-    int scn = src.channels(), depth = src.depth(), bidx;

    CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32F );


--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -1901,8 +1901,43 @@ private:
 };
 #endif

+// OpenCL path of resize(): nearest-neighbour for any depth, bilinear for 8U/32F, up to 4
+// channels. Uses the size of the existing _dst; returns false if unsupported or on failure.
+static bool ocl_resize( InputArray _src, OutputArray _dst, double fx, double fy, int interpolation)
+{
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    if( !(cn <= 4 &&
+           (interpolation == INTER_NEAREST ||
+           (interpolation == INTER_LINEAR && (depth == CV_8U || depth == CV_32F)))) )
+        return false;
+    UMat src = _src.getUMat(), dst = _dst.getUMat();
+    ocl::Kernel k;
+
+    if (interpolation == INTER_LINEAR)
+    {
+        int wdepth = depth == CV_8U ? CV_32S : CV_32F;
+        int wtype = CV_MAKETYPE(wdepth, cn);
+        char buf[2][32];
+        k.create("resizeLN", ocl::imgproc::resize_oclsrc,
+                 format("-D INTER_LINEAR -D depth=%d -D PIXTYPE=%s -D WORKTYPE=%s -D convertToWT=%s -D convertToDT=%s",
+                        depth, ocl::typeToStr(type), ocl::typeToStr(wtype),
+                        ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
+                        ocl::convertTypeStr(wdepth, depth, cn, buf[1])));
+    }
+    else if (interpolation == INTER_NEAREST)
+    {
+        k.create("resizeNN", ocl::imgproc::resize_oclsrc,
+                 format("-D INTER_NEAREST -D PIXTYPE=%s", ocl::memopTypeToStr(type) ));
+    }
+
+    if( k.empty() )
+        return false;
+    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), (float)(1./fx), (float)(1./fy));
+    size_t globalsize[] = { dst.cols, dst.rows }; // one work-item per destination pixel
+    return k.run(2, globalsize, 0, false);
 }

+}

 //////////////////////////////////////////////////////////////////////////////////////////

@@ -2013,25 +2048,28 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
        resizeArea_<double, double>, 0
    };

-    Mat src = _src.getMat();
-    Size ssize = src.size();
+    Size ssize = _src.size();

    CV_Assert( ssize.area() > 0 );
    CV_Assert( dsize.area() || (inv_scale_x > 0 && inv_scale_y > 0) );
    if( !dsize.area() )
    {
-        dsize = Size(saturate_cast<int>(src.cols*inv_scale_x),
-            saturate_cast<int>(src.rows*inv_scale_y));
+        dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
+                     saturate_cast<int>(ssize.height*inv_scale_y));
        CV_Assert( dsize.area() );
    }
    else
    {
-        inv_scale_x = (double)dsize.width/src.cols;
-        inv_scale_y = (double)dsize.height/src.rows;
+        inv_scale_x = (double)dsize.width/ssize.width;
+        inv_scale_y = (double)dsize.height/ssize.height;
    }
-    _dst.create(dsize, src.type());
-    Mat dst = _dst.getMat();
+    _dst.create(dsize, _src.type());

+    if( ocl::useOpenCL() && _dst.kind() == _InputArray::UMAT &&
+        ocl_resize(_src, _dst, inv_scale_x, inv_scale_y, interpolation) )
+        return;
+
+    Mat src = _src.getMat(), dst = _dst.getMat();

 #ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation))

--- a/modules/imgproc/src/opencl/bilateral.cl
+++ b/modules/imgproc/src/opencl/bilateral.cl
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Rock Li, Rock.li@amd.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+
+// Bilateral filter for 8-bit single-channel data, one work-item per output pixel.
+// Each of the maxk neighbours (addressed through space_ofs) is weighted by
+// space_weight[k] * color_weight[|neighbour - centre|]; the output is the weighted mean,
+// with 0.5 added before the round-toward-zero conversion to uchar.
+__kernel void bilateral_C1_D0(__global uchar *dst,
+        __global const uchar *src,
+        const int dst_rows,
+        const int dst_cols,
+        const int maxk,
+        const int radius,
+        const int dst_step,
+        const int dst_offset,
+        const int src_step,
+        const int src_rows,
+        const int src_cols,
+        __constant float *color_weight,
+        __constant float *space_weight,
+        __constant int *space_ofs)
+{
+    const int gx = get_global_id(0);
+    const int gy = get_global_id(1);
+
+    // work-items that fall outside the destination have nothing to write
+    if (gy >= dst_rows || gx >= dst_cols)
+        return;
+
+    // the source is addressed at (gx + radius, gy + radius)
+    const int centre_idx = mad24(gy + radius, src_step, gx + radius);
+    const int out_idx = mad24(gy, dst_step, gx + dst_offset);
+    const int centre = (int)src[centre_idx];
+
+    float acc = 0.f, norm = 0.f;
+    for (int k = 0; k < maxk; ++k)
+    {
+        const int neighbour = (int)src[centre_idx + space_ofs[k]];
+        const float weight = space_weight[k] * color_weight[abs(neighbour - centre)];
+        acc += (float)(neighbour) * weight;
+        norm += weight;
+    }
+    dst[out_idx] = convert_uchar_rtz(acc / norm + 0.5f);
+}
+
+// Bilateral filter for 8-bit single-channel data that handles four horizontally adjacent
+// pixels per work-item (x = 4 * global id 0). Weights are formed per pixel exactly as in
+// the scalar variant: space_weight[k] * color_weight[|neighbour - centre|].
+__kernel void bilateral2_C1_D0(__global uchar *dst,
+        __global const uchar *src,
+        const int dst_rows,
+        const int dst_cols,
+        const int maxk,
+        const int radius,
+        const int dst_step,
+        const int dst_offset,
+        const int src_step,
+        const int src_rows,
+        const int src_cols,
+        __constant float *color_weight,
+        __constant float *space_weight,
+        __constant int *space_ofs)
+{
+    int x = get_global_id(0) << 2;
+    int y = get_global_id(1);
+
+    if (y < dst_rows && x < dst_cols)
+    {
+        int src_index = mad24(y + radius, src_step, x + radius);
+        int dst_index = mad24(y, dst_step, x + dst_offset);
+        float4 sum = (float4)(0.f), wsum = (float4)(0.f);
+
+        int4 val0 = convert_int4(vload4(0,src + src_index));
+        for(int k = 0; k < maxk; k++ )
+        {
+            int4 val = convert_int4(vload4(0,src+src_index + space_ofs[k]));
+            float4 w = (float4)(space_weight[k]) * (float4)(color_weight[abs(val.x - val0.x)], color_weight[abs(val.y - val0.y)],
+                color_weight[abs(val.z - val0.z)], color_weight[abs(val.w - val0.w)]);
+            sum += convert_float4(val) * w;
+            wsum += w;
+        }
+
+        const uchar4 result = convert_uchar4_rtz(sum/wsum+0.5f);
+        if (x + 3 < dst_cols)
+        {
+            // dst_index is an arbitrary byte offset, so a uchar4 pointer cast could be
+            // misaligned; vstore4 only requires element (byte) alignment.
+            vstore4(result, 0, dst + dst_index);
+        }
+        else
+        {
+            // last, partial group of the row: store only the pixels that are inside
+            // dst_cols (pixel x itself was already checked above)
+            dst[dst_index] = result.x;
+            if (x + 1 < dst_cols)
+                dst[dst_index + 1] = result.y;
+            if (x + 2 < dst_cols)
+                dst[dst_index + 2] = result.z;
+        }
+    }
+}
+
+// Bilateral filter for 8-bit four-component pixels (uchar4), one work-item per output
+// pixel. The colour weight is looked up with the summed absolute differences of the
+// x, y and z components (w does not enter the lookup); all four components are then
+// averaged with that single weight.
+__kernel void bilateral_C4_D0(__global uchar4 *dst,
+        __global const uchar4 *src,
+        const int dst_rows,
+        const int dst_cols,
+        const int maxk,
+        const int radius,
+        const int dst_step,
+        const int dst_offset,
+        const int src_step,
+        const int src_rows,
+        const int src_cols,
+        __constant float *color_weight,
+        __constant float *space_weight,
+        __constant int *space_ofs)
+{
+    const int gx = get_global_id(0);
+    const int gy = get_global_id(1);
+
+    // nothing to do for work-items outside the destination
+    if (gy >= dst_rows || gx >= dst_cols)
+        return;
+
+    const int centre_idx = mad24(gy + radius, src_step, gx + radius);
+    const int out_idx = mad24(gy, dst_step, gx + dst_offset);
+    const int4 centre = convert_int4(src[centre_idx]);
+
+    float4 acc = (float4)0.f;
+    float norm = 0.f;
+    for (int k = 0; k < maxk; ++k)
+    {
+        const int4 neighbour = convert_int4(src[centre_idx + space_ofs[k]]);
+        const float weight = space_weight[k] * color_weight[abs(neighbour.x - centre.x) + abs(neighbour.y - centre.y) + abs(neighbour.z - centre.z)];
+        acc += convert_float4(neighbour) * (float4)weight;
+        norm += weight;
+    }
+
+    // multiply by the reciprocal of the weight sum instead of dividing every component
+    norm = 1.f / norm;
+    dst[out_idx] = convert_uchar4_rtz(acc * (float4)norm + (float4)0.5f);
+}
--- a/modules/imgproc/src/opencl/boxfilter.cl
+++ b/modules/imgproc/src/opencl/boxfilter.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Zhang Ying, zhangying913@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////Macro for border type////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Border-extrapolation helpers, selected by whichever BORDER_* macro is defined
+// (presumably through the kernel build options - TODO confirm on the host side).
+// ADDR_L / ADDR_H remap an index that lies below the low edge; ADDR_R / ADDR_B take that
+// result as `addr` and remap indices at or beyond the high edge instead.
+#ifdef BORDER_REPLICATE
+//BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (t_edge)   :(i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 :(addr))
+#endif
+
+#ifdef BORDER_REFLECT
+//BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
+// (the low-side mirror is computed relative to index 0; l_edge/t_edge only gate it)
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)-1 : (i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
+#endif
+
+#ifdef BORDER_REFLECT_101
+//BORDER_REFLECT_101:   gfedcb|abcdefgh|gfedcba
+// (the low-side mirror is computed relative to index 0; l_edge/t_edge only gate it)
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)                 : (i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
+#endif
+
+//blur function does not support BORDER_WRAP
+#ifdef BORDER_WRAP
+//BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (i)+(b_edge) : (i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
+#endif
+
+// Size of the local-memory row buffers (temp[2][THREADS]) used by the kernels below.
+#define THREADS 256
+// Yields elem1 when l_edge <= i < r_edge, elem2 otherwise. The whole expansion is
+// parenthesised so the macro can be used safely inside a larger expression.
+#define ELEM(i, l_edge, r_edge, elem1, elem2) ((i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2))
+
+// Horizontal pass of the 8UC1 box filter for one output row. `temp` holds vertically
+// summed values, four per work-item; this sums 2*anX+1 shifted four-element windows,
+// divides by alpha converted to an unsigned integer and stores up to four bytes at dst.
+// posY is derived from the group id alone, so the row test does not depend on which row
+// `dst` actually points at. ksX and anX are macros that are not defined in this file part.
+inline void update_dst_C1_D0(__global uchar *dst, __local uint* temp,
+                             int dst_rows, int dst_cols,
+                             int dst_startX, int dst_x_off,
+                             float alpha)
+{
+    const size_t lid = get_local_id(0);
+
+    // only the work-items in [anX, THREADS-ksX+anX] produce output
+    if (lid < anX || lid >= (THREADS-ksX+anX+1))
+        return;
+
+    const int posX = dst_startX - dst_x_off + (lid-anX)*4;
+    const int posY = (get_group_id(1) << 1);
+
+    uint4 window = 0;
+    for (int shift = -anX; shift <= anX; ++shift)
+        window += vload4(lid, temp+shift);
+
+    if (posY >= dst_rows || posX >= dst_cols)
+        return;
+
+    window /= (uint4) alpha;
+
+    // each of the four outputs is stored only if its own column lies inside the row
+    if (posX >= 0 && posX < dst_cols)
+        *(dst) = window.x;
+    if (posX+1 >= 0 && posX+1 < dst_cols)
+        *(dst + 1) = window.y;
+    if (posX+2 >= 0 && posX+2 < dst_cols)
+        *(dst + 2) = window.z;
+    if (posX+3 >= 0 && posX+3 < dst_cols)
+        *(dst + 3) = window.w;
+}
+
+
+// Horizontal pass of the 8UC4 box filter for one output pixel: sums the 2*anX+1 entries
+// of `temp` starting at the work-item's own slot, divides by alpha in float and stores
+// the uchar4 result at dst when (posX, posY) lies inside the destination. posY is
+// derived from the group id alone. ksX and anX are macros not defined in this file part.
+inline void update_dst_C4_D0(__global uchar4 *dst, __local uint4* temp,
+                             int dst_rows, int dst_cols,
+                             int dst_startX, int dst_x_off,
+                             float alpha)
+{
+    const size_t lid = get_local_id(0);
+
+    // the last ksX-1 work-items of the group only contribute input
+    if (lid >= (THREADS-ksX+1))
+        return;
+
+    const int posX = dst_startX - dst_x_off + lid;
+    const int posY = (get_group_id(1) << 1);
+
+    uint4 window = 0;
+    for (int shift = -anX; shift <= anX; ++shift)
+        window += temp[lid + anX + shift];
+
+    const bool inside = posX >= 0 && posX < dst_cols && posY >= 0 && posY < dst_rows;
+    if (inside)
+        *dst = convert_uchar4(convert_float4(window)/alpha);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Box filter for 8-bit single-channel images. Each work-item loads four horizontally
+// adjacent pixels from ksY+1 consecutive source rows, sums them vertically into two
+// partial sums (for output rows 2*gY and 2*gY+1), publishes both in local memory and
+// lets update_dst_C1_D0() add the horizontal neighbours and store the results.
+// ksX, ksY, anX, anY and the BORDER_* mode are not defined in this file part -
+// presumably they are supplied as build options.
+__kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha,
+                              int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
+                              int dst_offset, int dst_rows, int dst_cols, int dst_step
+                             )
+{
+
+    int col = get_local_id(0);
+    const int gX = get_group_id(0);
+    const int gY = get_group_id(1);
+    // split the byte offsets into an in-row part and a row part
+    int src_x_off = src_offset % src_step;
+    int src_y_off = src_offset / src_step;
+    int dst_x_off = dst_offset % dst_step;
+    int dst_y_off = dst_offset / dst_step;
+
+    // head_off shifts the four-pixel groups so that they start on a multiple of 4 in dst
+    int head_off = dst_x_off%4;
+    int startX = ((gX * (THREADS-ksX+1)-anX) * 4) - head_off + src_x_off;
+    int startY = (gY << 1) - anY + src_y_off;
+    int dst_startX = (gX * (THREADS-ksX+1) * 4) - head_off + dst_x_off;
+    int dst_startY = (gY << 1) + dst_y_off;
+
+    uint4 data[ksY+1];
+    __local uint4 temp[2][THREADS];
+
+#ifdef BORDER_CONSTANT
+
+    // constant border: every sample outside the whole source image reads as 0
+    for(int i=0; i < ksY+1; i++)
+    {
+        if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols)
+        {
+            data[i].x = *(src+(startY+i)*src_step + startX + col * 4);
+            data[i].y = *(src+(startY+i)*src_step + startX + col * 4 + 1);
+            data[i].z = *(src+(startY+i)*src_step + startX + col * 4 + 2);
+            data[i].w = *(src+(startY+i)*src_step + startX + col * 4 + 3);
+        }
+        else
+        {
+            // partially outside: test each of the four samples on its own
+            data[i]=0;
+            int con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4<src_whole_cols;
+            if(con)data[i].s0 = *(src+(startY+i)*src_step + startX + col*4);
+            con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1<src_whole_cols;
+            if(con)data[i].s1 = *(src+(startY+i)*src_step + startX + col*4+1) ;
+            con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2<src_whole_cols;
+            if(con)data[i].s2 = *(src+(startY+i)*src_step + startX + col*4+2);
+            con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3<src_whole_cols;
+            if(con)data[i].s3 = *(src+(startY+i)*src_step + startX + col*4+3);
+        }
+    }
+
+#else
+    int not_all_in_range;
+    for(int i=0; i < ksY+1; i++)
+    {
+        not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1)
+                           | (startY+i<0) | (startY+i>src_whole_rows-1);
+        if(not_all_in_range)
+        {
+            // remap row and column indices with the border macros, sample by sample
+            int selected_row;
+            int4 selected_col;
+            selected_row = ADDR_H(startY+i, 0, src_whole_rows);
+            selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
+
+            selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols);
+            selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x);
+
+            selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols);
+            selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y);
+
+            selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols);
+            selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z);
+
+            selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols);
+            selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w);
+
+            data[i].x = *(src + selected_row * src_step + selected_col.x);
+            data[i].y = *(src + selected_row * src_step + selected_col.y);
+            data[i].z = *(src + selected_row * src_step + selected_col.z);
+            data[i].w = *(src + selected_row * src_step + selected_col.w);
+        }
+        else
+        {
+            // fully inside: one four-byte vector load
+            data[i] =  convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX)));
+        }
+    }
+#endif
+    // rows 1..ksY-1 are shared by both output rows
+    uint4 tmp_sum = 0;
+    for(int i=1; i < ksY; i++)
+    {
+        tmp_sum += (data[i]);
+    }
+
+    int index = dst_startY * dst_step + dst_startX + (col-anX)*4;
+
+    temp[0][col] = tmp_sum + (data[0]);
+    temp[1][col] = tmp_sum + (data[ksY]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    update_dst_C1_D0(dst+index, (__local uint *)(temp[0]),
+                     dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
+    // update_dst_C1_D0 only tests row 2*gY, so the second row of the pair has to be
+    // checked here; otherwise an odd dst_rows would be overrun by one row.
+    if((gY << 1) + 1 < dst_rows)
+        update_dst_C1_D0(dst+index+dst_step, (__local uint *)(temp[1]),
+                         dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
+
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Box filter for 8-bit four-component pixels (uchar4). Each work-item loads one pixel
+// from ksY+1 consecutive source rows, sums them vertically into two partial sums (for
+// output rows 2*gY and 2*gY+1), publishes both in local memory and lets
+// update_dst_C4_D0() add the horizontal neighbours and store the results.
+// ksX, ksY, anX, anY and the BORDER_* mode are not defined in this file part -
+// presumably they are supplied as build options.
+__kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, float alpha,
+                              int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
+                              int dst_offset, int dst_rows, int dst_cols, int dst_step
+                             )
+{
+    int col = get_local_id(0);
+    const int gX = get_group_id(0);
+    const int gY = get_group_id(1);
+
+    // offsets and steps are in bytes; >> 2 converts them to uchar4 elements
+    int src_x_off = (src_offset % src_step) >> 2;
+    int src_y_off = src_offset / src_step;
+    int dst_x_off = (dst_offset % dst_step) >> 2;
+    int dst_y_off = dst_offset / dst_step;
+
+    int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
+    int startY = (gY << 1) - anY + src_y_off;
+    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
+    int dst_startY = (gY << 1) + dst_y_off;
+
+    uint4 data[ksY+1];
+    __local uint4 temp[2][THREADS];
+
+#ifdef BORDER_CONSTANT
+    // constant border: samples outside the whole source image read as 0
+    bool con;
+    for(int i=0; i < ksY+1; i++)
+    {
+        con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
+        int cur_col = clamp(startX + col, 0, src_whole_cols);
+
+        data[i].x = con ? src[(startY+i)*(src_step>>2) + cur_col].x : 0;
+        data[i].y = con ? src[(startY+i)*(src_step>>2) + cur_col].y : 0;
+        data[i].z = con ? src[(startY+i)*(src_step>>2) + cur_col].z : 0;
+        data[i].w = con ? src[(startY+i)*(src_step>>2) + cur_col].w : 0;
+    }
+#else
+    for(int i=0; i < ksY+1; i++)
+    {
+        // remap the row and column index with the border macros
+        int selected_row;
+        int selected_col;
+        selected_row = ADDR_H(startY+i, 0, src_whole_rows);
+        selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
+
+        selected_col = ADDR_L(startX+col, 0, src_whole_cols);
+        selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
+
+
+        data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]);
+    }
+
+#endif
+    // rows 1..ksY-1 are shared by both output rows
+    uint4 tmp_sum = 0;
+    for(int i=1; i < ksY; i++)
+    {
+        tmp_sum += (data[i]);
+    }
+
+    int index = dst_startY * (dst_step>>2)+ dst_startX + col;
+
+    temp[0][col] = tmp_sum + (data[0]);
+    temp[1][col] = tmp_sum + (data[ksY]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    update_dst_C4_D0(dst+index, (__local uint4 *)(temp[0]),
+                     dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
+    // update_dst_C4_D0 only tests row 2*gY, so the second row of the pair has to be
+    // checked here; otherwise an odd dst_rows would be overrun by one row.
+    if((gY << 1) + 1 < dst_rows)
+        update_dst_C4_D0(dst+index+(dst_step>>2), (__local uint4 *)(temp[1]),
+                         dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
+
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////32fC1////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Box filter for 32-bit float single-channel images. Each work-item loads one sample
+// from ksY+1 consecutive source rows, forms the vertical sums for output rows 2*gY and
+// 2*gY+1, shares them through local memory and then adds 2*anX+1 horizontal neighbours;
+// the sum divided by alpha is stored when the pixel lies inside the destination.
+// ksX, ksY, anX, anY and the BORDER_* mode are not defined in this file part -
+// presumably they are supplied as build options.
+__kernel void boxFilter_C1_D5(__global const float *restrict src, __global float *dst, float alpha,
+                              int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
+                              int dst_offset, int dst_rows, int dst_cols, int dst_step
+                             )
+{
+    const int lid = get_local_id(0);
+    const int gX = get_group_id(0);
+    const int gY = get_group_id(1);
+
+    // offsets and steps are in bytes; >> 2 converts them to float elements
+    const int src_x_off = (src_offset % src_step) >> 2;
+    const int src_y_off = src_offset / src_step;
+    const int dst_x_off = (dst_offset % dst_step) >> 2;
+    const int dst_y_off = dst_offset / dst_step;
+
+    const int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
+    const int startY = (gY << 1) - anY + src_y_off;
+    const int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
+    const int dst_startY = (gY << 1) + dst_y_off;
+
+    float data[ksY+1];
+    __local float temp[2][THREADS];
+
+#ifdef BORDER_CONSTANT
+    // constant border: samples outside the whole source image read as 0
+    for (int r = 0; r <= ksY; ++r)
+    {
+        const int row = startY + r;
+        const bool inside = startX+lid >= 0 && startX+lid < src_whole_cols && row >= 0 && row < src_whole_rows;
+
+        const int cur_col = clamp(startX + lid, 0, src_whole_cols);
+        const float ss = row<src_whole_rows&&row>=0&&cur_col>=0&&cur_col<src_whole_cols?src[row*(src_step>>2) + cur_col]:(float)0;
+
+        data[r] = inside ? ss : 0.f;
+    }
+#else
+    for (int r = 0; r <= ksY; ++r)
+    {
+        // remap the row and column index with the border macros
+        int selected_row = ADDR_H(startY+r, 0, src_whole_rows);
+        selected_row = ADDR_B(startY+r, src_whole_rows, selected_row);
+
+        int selected_col = ADDR_L(startX+lid, 0, src_whole_cols);
+        selected_col = ADDR_R(startX+lid, src_whole_cols, selected_col);
+
+        data[r] = src[selected_row * (src_step>>2) + selected_col];
+    }
+
+#endif
+    // rows 1..ksY-1 are shared by both output rows
+    float shared_rows = 0.0;
+    for (int r = 1; r < ksY; ++r)
+        shared_rows += (data[r]);
+
+    temp[0][lid] = shared_rows + (data[0]);
+    temp[1][lid] = shared_rows + (data[ksY]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // the last ksX-1 work-items of the group only contribute input
+    if (lid >= (THREADS-(ksX-1)))
+        return;
+
+    const int centre = lid + anX;
+    const int posX = dst_startX - dst_x_off + lid;
+    const int posY = (gY << 1);
+
+    // horizontal sums for both output rows, accumulated left to right
+    float row_sum[2]= {0.0, 0.0};
+    for (int k = 0; k < 2; ++k)
+        for (int shift = -anX; shift <= anX; ++shift)
+            row_sum[k] += temp[k][centre+shift];
+
+    for (int k = 0; k < 2; ++k)
+    {
+        if (posX >= 0 && posX < dst_cols && (posY+k) >= 0 && (posY+k) < dst_rows)
+            dst[(dst_startY+k) * (dst_step>>2)+ dst_startX + lid] = row_sum[k]/alpha;
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////32fC4////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Box filter kernel for 4-channel 32-bit float (float4) images.
+//
+// As written below, one work-group reads THREADS horizontally adjacent source
+// columns and writes (THREADS - ksX + 1) output columns for two consecutive
+// output rows (2*gY and 2*gY + 1). Each work-item first sums a vertical strip
+// of the source, the strip sums are exchanged through local memory, and the
+// first (THREADS - ksX + 1) work-items then add up 2*anX + 1 neighbouring
+// strip sums.
+//
+// THREADS, ksX, ksY, anX, anY and the ADDR_L/ADDR_R/ADDR_H/ADDR_B macros are
+// not defined inside this kernel; presumably they come from build options or
+// from definitions earlier in the file -- TODO confirm.
+//
+// src             source pixels
+// dst             destination pixels
+// alpha           every window sum is divided by this value before being stored
+// src_offset      byte offset into src; split below into a row and a column part
+// src_whole_rows  row limit used for source bounds checks / border addressing
+// src_whole_cols  column limit used for source bounds checks / border addressing
+// src_step        source row pitch in bytes (shifted right by 4 to index float4)
+// dst_offset      byte offset into dst; split below into a row and a column part
+// dst_rows        number of destination rows that may be written
+// dst_cols        number of destination columns that may be written
+// dst_step        destination row pitch in bytes (shifted right by 4 to index float4)
+__kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global float4 *dst, float alpha,
+                              int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
+                              int dst_offset, int dst_rows, int dst_cols, int dst_step
+                             )
+{
+    int col = get_local_id(0);
+    const int gX = get_group_id(0);
+    const int gY = get_group_id(1);
+
+    // Split the byte offsets into (column, row); ">> 4" converts bytes to float4 elements.
+    int src_x_off = (src_offset % src_step) >> 4;
+    int src_y_off = src_offset / src_step;
+    int dst_x_off = (dst_offset % dst_step) >> 4;
+    int dst_y_off = dst_offset / dst_step;
+
+    // Top-left source sample of this group's footprint (shifted back by the anchor)
+    // and the first destination element the group writes.
+    int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
+    int startY = (gY << 1) - anY + src_y_off;
+    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
+    int dst_startY = (gY << 1) + dst_y_off;
+    // ksY + 1 vertically consecutive samples: enough for two overlapping ksY-tall windows.
+    float4 data[ksY+1];
+    // temp[0] / temp[1]: per-column vertical sums for the first / second output row.
+    __local float4 temp[2][THREADS];
+#ifdef BORDER_CONSTANT
+    bool con;
+    float4 ss;
+    for(int i=0; i < ksY+1; i++)
+    {
+        // con is true only when the sample lies inside the source image.
+        con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
+
+        // The load itself is guarded as well, so out-of-range samples are never read.
+        int cur_col = clamp(startX + col, 0, src_whole_cols);
+        ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>4) + cur_col]:(float4)0;
+
+        // Outside the image the sample contributes zero.
+        data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0);
+    }
+#else
+    for(int i=0; i < ksY+1; i++)
+    {
+        // Remap the row and column through the ADDR_* border macros before loading.
+        int selected_row;
+        int selected_col;
+        selected_row = ADDR_H(startY+i, 0, src_whole_rows);
+        selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
+
+        selected_col = ADDR_L(startX+col, 0, src_whole_cols);
+        selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
+
+        data[i] = src[selected_row * (src_step>>4) + selected_col];
+    }
+
+#endif
+    // sum0 covers samples 1..ksY-1, which both output rows share;
+    // sum1 adds sample 0 (first row), sum2 adds sample ksY (second row).
+    float4 sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
+    for(int i=1; i < ksY; i++)
+    {
+        sum0 += (data[i]);
+    }
+    sum1 = sum0 + (data[0]);
+    sum2 = sum0 + (data[ksY]);
+    temp[0][col] = sum1;
+    temp[1][col] = sum2;
+    // All vertical sums must be in local memory before the horizontal pass reads them.
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Only the first THREADS - (ksX - 1) work-items produce output.
+    if(col < (THREADS-(ksX-1)))
+    {
+        col += anX;
+        // Destination coordinates relative to the ROI, used only for the bounds check.
+        int posX = dst_startX - dst_x_off + col - anX;
+        int posY = (gY << 1);
+
+        // Horizontal accumulation over temp[k][col-anX .. col+anX].
+        // NOTE(review): the window has 2*anX + 1 taps, which equals ksX only when
+        // ksX == 2*anX + 1 -- confirm the host guarantees this.
+        float4 tmp_sum[2]= {(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)};
+        for(int k=0; k<2; k++)
+            for(int i=-anX; i<=anX; i++)
+            {
+                tmp_sum[k] += temp[k][col+i];
+            }
+        for(int i=0; i<2; i++)
+        {
+            if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
+                dst[(dst_startY+i) * (dst_step>>4)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
+        }
+
+    }
+}
--- a/modules/imgproc/src/opencl/canny.cl
+++ b/modules/imgproc/src/opencl/canny.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+
+// Gradient magnitude of the integer derivative pair (x, y).
+// With L2GRAD defined the result is sqrt(x*x + y*y); otherwise it is |x| + |y|.
+inline float calc(int x, int y)
+{
+#ifdef L2GRAD
+    return sqrt((float)(x * x + y * y));
+#else
+    return (float)abs(x) + abs(y);
+#endif
+}
+
+// Row (horizontal) pass of the separable 3x3 Sobel operator.
+//
+// Smoothing perpendicular to the derivative direction uses a triangle filter;
+// only the 3x3 Sobel kernel is supported:
+//   h (-1) =  1, h (0) =  2, h (1) =  1
+//   h'(-1) = -1, h'(0) =  0, h'(1) =  1
+// so the 2D operator factorises, e.g. h'(x, y) = h'(x)h(y) for the x direction.
+// This kernel applies the horizontal factor of each operator:
+//   dx_buf = -p(x-1) + p(x+1)            (derivative taps)
+//   dy_buf =  p(x-1) + 2*p(x) + p(x+1)   (smoothing taps)
+//
+// src            input 8-bit single-channel image data
+// dx_buf         output dx buffer
+// dy_buf         output dy buffer
+// rows, cols     image size; work-items outside it only help fill local memory
+// src_step       source row pitch (src is uchar, so bytes == elements)
+// src_offset     offset added to every source index
+// dx_buf_step    dx_buf row pitch in bytes (converted to elements below)
+// dx_buf_offset  dx_buf offset in bytes (converted to elements below)
+// dy_buf_step    dy_buf row pitch in bytes (converted to elements below)
+// dy_buf_offset  dy_buf offset in bytes (converted to elements below)
+__kernel
+void
+__attribute__((reqd_work_group_size(16,16,1)))
+calcSobelRowPass
+(
+    __global const uchar * src,
+    __global int * dx_buf,
+    __global int * dy_buf,
+    int rows,
+    int cols,
+    int src_step,
+    int src_offset,
+    int dx_buf_step,
+    int dx_buf_offset,
+    int dy_buf_step,
+    int dy_buf_offset
+)
+{
+    // Byte pitches/offsets -> element pitches/offsets.
+    dx_buf_step   /= sizeof(*dx_buf);
+    dx_buf_offset /= sizeof(*dx_buf);
+    dy_buf_step   /= sizeof(*dy_buf);
+    dy_buf_offset /= sizeof(*dy_buf);
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    int lidx = get_local_id(0);
+    int lidy = get_local_id(1);
+
+    // 16 rows x (16 columns + one halo column on each side).
+    __local int smem[16][18];
+
+    // Start of the (row-clamped) source row this work-item loads from.
+    const int src_row = min(gidy, rows - 1) * src_step + src_offset;
+
+    // Clamp the column as well: when cols is not a multiple of 16 the padding
+    // work-items would otherwise read past the end of the row, and the value
+    // they load is the right-hand neighbour of the last valid column. Clamping
+    // replicates the border pixel, consistent with the halo loads below.
+    smem[lidy][lidx + 1] =
+        src[min(gidx, cols - 1) + src_row];
+    if(lidx == 0)
+    {
+        // The first work-item of each row also fetches the left and right halo.
+        smem[lidy][0]  =
+            src[max(gidx - 1,  0)        + src_row];
+        smem[lidy][17] =
+            src[min(gidx + 16, cols - 1) + src_row];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(gidy < rows && gidx < cols)
+    {
+        dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset] =
+            -smem[lidy][lidx] + smem[lidy][lidx + 2];
+        dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset] =
+            smem[lidy][lidx] + 2 * smem[lidy][lidx + 1] + smem[lidy][lidx + 2];
+    }
+}
+
+// Column (vertical) pass of the separable 3x3 Sobel operator plus gradient
+// magnitude. This is the buffered version (3x3 Sobel).
+//
+// dx_buf      row-pass dx buffer, calculated by calcSobelRowPass
+// dy_buf      row-pass dy buffer, calculated by calcSobelRowPass
+// dx          output: derivative in the x direction
+// dy          output: derivative in the y direction
+// mag         output: gradient magnitude, written at (x + 1, y + 1)
+// rows, cols  image size
+// *_step      row pitch of the matching buffer in bytes (converted to elements below)
+// *_offset    offset of the matching buffer in bytes (converted to elements below)
+__kernel
+void
+__attribute__((reqd_work_group_size(16,16,1)))
+calcMagnitude_buf
+(
+    __global const int * dx_buf,
+    __global const int * dy_buf,
+    __global int * dx,
+    __global int * dy,
+    __global float * mag,
+    int rows,
+    int cols,
+    int dx_buf_step,
+    int dx_buf_offset,
+    int dy_buf_step,
+    int dy_buf_offset,
+    int dx_step,
+    int dx_offset,
+    int dy_step,
+    int dy_offset,
+    int mag_step,
+    int mag_offset
+)
+{
+    // Byte pitches/offsets -> element pitches/offsets.
+    dx_buf_step    /= sizeof(*dx_buf);
+    dx_buf_offset  /= sizeof(*dx_buf);
+    dy_buf_step    /= sizeof(*dy_buf);
+    dy_buf_offset  /= sizeof(*dy_buf);
+    dx_step    /= sizeof(*dx);
+    dx_offset  /= sizeof(*dx);
+    dy_step    /= sizeof(*dy);
+    dy_offset  /= sizeof(*dy);
+    mag_step   /= sizeof(*mag);
+    mag_offset /= sizeof(*mag);
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    int lidx = get_local_id(0);
+    int lidy = get_local_id(1);
+
+    // (16 rows + one halo row above and below) x 16 columns.
+    __local int sdx[18][16];
+    __local int sdy[18][16];
+
+    // Column used for loads. Work-items with gidx >= cols exist only because the
+    // global size is rounded up to the 16x16 group size; clamping keeps their
+    // loads inside the row. Their values are never consumed by a valid pixel,
+    // because this pass only combines vertical neighbours.
+    const int ld_x = min(gidx, cols - 1);
+
+    sdx[lidy + 1][lidx] =
+        dx_buf[ld_x + min(gidy, rows - 1) * dx_buf_step + dx_buf_offset];
+    sdy[lidy + 1][lidx] =
+        dy_buf[ld_x + min(gidy, rows - 1) * dy_buf_step + dy_buf_offset];
+    if(lidy == 0)
+    {
+        // The first row of the group also fetches the halo rows (clamped to the image).
+        sdx[0][lidx]  =
+            dx_buf[ld_x + min(max(gidy-1,0),rows-1) * dx_buf_step + dx_buf_offset];
+        sdx[17][lidx] =
+            dx_buf[ld_x + min(gidy + 16, rows - 1)  * dx_buf_step + dx_buf_offset];
+
+        sdy[0][lidx]  =
+            dy_buf[ld_x + min(max(gidy-1,0),rows-1) * dy_buf_step + dy_buf_offset];
+        sdy[17][lidx] =
+            dy_buf[ld_x + min(gidy + 16, rows - 1)  * dy_buf_step + dy_buf_offset];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(gidx < cols && gidy < rows)
+    {
+        // Vertical smoothing taps (1, 2, 1) for x; vertical derivative taps (-1, 0, 1) for y.
+        int x =  sdx[lidy][lidx] + 2 * sdx[lidy + 1][lidx] + sdx[lidy + 2][lidx];
+        int y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx];
+
+        dx[gidx + gidy * dx_step + dx_offset] = x;
+        dy[gidx + gidy * dy_step + dy_offset] = y;
+
+        // mag is indexed with a one-element shift in both directions.
+        mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = calc(x, y);
+    }
+}
+
+// Compute the gradient magnitude from already computed derivatives.
+// This is the non-buffered version (non-3x3 Sobel).
+//
+// dx          input: derivative in the x direction
+// dy          input: derivative in the y direction
+// mag         output: gradient magnitude, written at (x + 1, y + 1)
+// rows, cols  image size
+// *_step      row pitch of the matching buffer in bytes (converted to elements below)
+// *_offset    offset of the matching buffer in bytes (converted to elements below)
+__kernel
+void calcMagnitude
+(
+    __global const int * dx,
+    __global const int * dy,
+    __global float * mag,
+    int rows,
+    int cols,
+    int dx_step,
+    int dx_offset,
+    int dy_step,
+    int dy_offset,
+    int mag_step,
+    int mag_offset
+)
+{
+    // Byte pitches/offsets -> element pitches/offsets.
+    dx_step    /= sizeof(*dx);
+    dx_offset  /= sizeof(*dx);
+    dy_step    /= sizeof(*dy);
+    dy_offset  /= sizeof(*dy);
+    mag_step   /= sizeof(*mag);
+    mag_offset /= sizeof(*mag);
+
+    const int col = get_global_id(0);
+    const int row = get_global_id(1);
+
+    // Work-items outside the image have nothing to do.
+    if(row >= rows || col >= cols)
+        return;
+
+    const int grad_x = dx[col + row * dx_step + dx_offset];
+    const int grad_y = dy[col + row * dy_step + dy_offset];
+
+    mag[(col + 1) + (row + 1) * mag_step + mag_offset] = calc(grad_x, grad_y);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// 0.4142135623730950488016887242097 is tan(22.5 degrees)
+#define CANNY_SHIFT 15
+#define TG22        (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
+
+// First pass of edge detection: thresholding and non-maximum suppression.
+// The edge type written for each pixel is:
+// 0 - magnitude not above the low threshold, or not a local maximum: not an edge
+// 1 - local maximum above the low threshold only: maybe an edge
+// 2 - local maximum above the high threshold: is an edge
+//
+// Given the image gradients, the magnitude is tested for being a local maximum
+// along the (quantised) gradient direction:
+//   * |dy| < |dx| * tan(22.5): compared with the left and right neighbours
+//   * |dy| > |dx| * tan(67.5): compared with the upper and lower neighbours
+//   * otherwise:               compared with the two diagonal neighbours selected
+//                              by the sign of dx * dy
+// All comparisons are done in fixed point with CANNY_SHIFT fractional bits.
+//
+// dx, dy        derivatives in the x and y direction
+// mag           magnitudes calculated by calcMagnitude / calcMagnitude_buf
+//               (indexed with a one-element shift, see those kernels)
+// map           output containing the raw edge types, written at (x + 1, y + 1)
+// rows, cols    image size
+// low_thresh    lower hysteresis threshold
+// high_thresh   upper hysteresis threshold
+// *_step        row pitch of the matching buffer in bytes (converted to elements below)
+// *_offset      offset of the matching buffer in bytes (converted to elements below)
+__kernel
+void
+__attribute__((reqd_work_group_size(16,16,1)))
+calcMap
+(
+    __global const int * dx,
+    __global const int * dy,
+    __global const float * mag,
+    __global int * map,
+    int rows,
+    int cols,
+    float low_thresh,
+    float high_thresh,
+    int dx_step,
+    int dx_offset,
+    int dy_step,
+    int dy_offset,
+    int mag_step,
+    int mag_offset,
+    int map_step,
+    int map_offset
+)
+{
+    // Byte pitches/offsets -> element pitches/offsets.
+    dx_step    /= sizeof(*dx);
+    dx_offset  /= sizeof(*dx);
+    dy_step    /= sizeof(*dy);
+    dy_offset  /= sizeof(*dy);
+    mag_step   /= sizeof(*mag);
+    mag_offset /= sizeof(*mag);
+    map_step   /= sizeof(*map);
+    map_offset /= sizeof(*map);
+
+    mag += mag_offset;
+    map += map_offset;
+
+    // 16x16 tile of magnitudes plus a one-element halo on every side.
+    __local float smem[18][18];
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    int lidx = get_local_id(0);
+    int lidy = get_local_id(1);
+
+    // Global coordinates of the group's first work-item (multiples of 16).
+    int grp_idx = get_global_id(0) & 0xFFFFF0;
+    int grp_idy = get_global_id(1) & 0xFFFFF0;
+
+    // The 256 work-items cooperatively fill the 18x18 tile: the flat id is
+    // re-mapped to an 18-wide layout, which covers rows 0..13 in a first step
+    // and rows 14..17 in a second step.
+    int tid = lidx + lidy * 16;
+    int lx = tid % 18;
+    int ly = tid / 18;
+    // NOTE(review): the row clamp uses rows - 1 although mag is written at
+    // (y + 1), and the second load is skipped for grp_idx + lx > cols although
+    // the last column reads lidx + 2; both look too tight for a tile that
+    // touches the bottom/right border -- confirm against the host-side sizes.
+    if(ly < 14)
+    {
+        smem[ly][lx] =
+            mag[grp_idx + lx + min(grp_idy + ly, rows - 1) * mag_step];
+    }
+    if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
+    {
+        smem[ly + 14][lx] =
+            mag[grp_idx + lx + min(grp_idy + ly + 14, rows -1) * mag_step];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(gidy < rows && gidx < cols)
+    {
+        // Honour the dx/dy offsets, as calcMagnitude does; previously they were
+        // converted to elements above but never applied to the indices.
+        int x = dx[gidx + gidy * dx_step + dx_offset];
+        int y = dy[gidx + gidy * dy_step + dy_offset];
+        // s is -1 when dx and dy have opposite signs, +1 otherwise.
+        const int s = (x ^ y) < 0 ? -1 : 1;
+        const float m = smem[lidy + 1][lidx + 1];
+        x = abs(x);
+        y = abs(y);
+
+        // 0 - the pixel can not belong to an edge
+        // 1 - the pixel might belong to an edge
+        // 2 - the pixel does belong to an edge
+        int edge_type = 0;
+        if(m > low_thresh)
+        {
+            // tg67x = x * (tan(22.5) + 2) = x * tan(67.5), in fixed point.
+            const int tg22x = x * TG22;
+            const int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
+            y <<= CANNY_SHIFT;
+            if(y < tg22x)
+            {
+                // Compare with the left and right neighbours.
+                if(m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2])
+                {
+                    edge_type = 1 + (int)(m > high_thresh);
+                }
+            }
+            else if (y > tg67x)
+            {
+                // Compare with the upper and lower neighbours.
+                if(m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1])
+                {
+                    edge_type = 1 + (int)(m > high_thresh);
+                }
+            }
+            else
+            {
+                // Compare with the diagonal neighbours chosen by the sign s.
+                if(m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s])
+                {
+                    edge_type = 1 + (int)(m > high_thresh);
+                }
+            }
+        }
+        map[gidx + 1 + (gidy + 1) * map_step] = edge_type;
+    }
+}
+
+#undef CANNY_SHIFT
+#undef TG22
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// Hysteresis for pixels whose edge type is 1, restricted to one 16x16 tile.
+//
+// If a candidate pixel (edge type 1) has a neighbour in its 3x3 area with type 2, it is
+// considered part of an edge and marked as an edge (type 2). Each work-item repeats this
+// test 16 times so that edges can grow across the tile.
+// Afterwards every pixel of type 2 that still has a type-1 neighbour in its 3x3 area
+// increments the counter and stores its location. These potential continuation points
+// are processed further by the next kernel.
+//
+// map         raw edge type results calculated by calcMap; updated in place
+// st          the potential edge points found by this kernel call, as (x + 1, y + 1)
+// counter     the number of potential edge points stored in st
+// rows, cols  image size
+// map_step    map row pitch in bytes (converted to elements below)
+// map_offset  map offset in bytes (converted to elements below)
+__kernel
+void
+__attribute__((reqd_work_group_size(16,16,1)))
+edgesHysteresisLocal
+(
+    __global int * map,
+    __global ushort2 * st,
+    __global unsigned int * counter,
+    int rows,
+    int cols,
+    int map_step,
+    int map_offset
+)
+{
+    // Byte pitch/offset -> element pitch/offset.
+    map_step   /= sizeof(*map);
+    map_offset /= sizeof(*map);
+
+    map += map_offset;
+
+    // 16x16 tile of edge types plus a one-element halo on every side.
+    __local int smem[18][18];
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    int lidx = get_local_id(0);
+    int lidy = get_local_id(1);
+
+    // Global coordinates of the group's first work-item (multiples of 16).
+    int grp_idx = get_global_id(0) & 0xFFFFF0;
+    int grp_idy = get_global_id(1) & 0xFFFFF0;
+
+    // Cooperative tile load: the flat id is re-mapped to an 18-wide layout that
+    // covers rows 0..13 in a first step and rows 14..17 in a second step.
+    int tid = lidx + lidy * 16;
+    int lx = tid % 18;
+    int ly = tid / 18;
+    if(ly < 14)
+    {
+        smem[ly][lx] =
+            map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step];
+    }
+    if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
+    {
+        smem[ly + 14][lx] =
+            map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(gidy < rows && gidx < cols)
+    {
+        int n;
+
+        // NOTE(review): no barrier separates the iterations, so the order in which
+        // a work-item observes promotions made by its neighbours is not fixed by
+        // this code -- confirm this is acceptable for the intended devices.
+        #pragma unroll
+        for (int k = 0; k < 16; ++k)
+        {
+            n = 0;
+
+            // Count the type-2 neighbours of a type-1 centre pixel.
+            if (smem[lidy + 1][lidx + 1] == 1)
+            {
+                n += smem[lidy    ][lidx    ] == 2;
+                n += smem[lidy    ][lidx + 1] == 2;
+                n += smem[lidy    ][lidx + 2] == 2;
+
+                n += smem[lidy + 1][lidx    ] == 2;
+                n += smem[lidy + 1][lidx + 2] == 2;
+
+                n += smem[lidy + 2][lidx    ] == 2;
+                n += smem[lidy + 2][lidx + 1] == 2;
+                n += smem[lidy + 2][lidx + 2] == 2;
+            }
+
+            // Promote the candidate as soon as it touches a confirmed edge pixel.
+            if (n > 0)
+                smem[lidy + 1][lidx + 1] = 2;
+        }
+
+        // Write the (possibly promoted) edge type back to the map.
+        const int e = smem[lidy + 1][lidx + 1];
+        map[gidx + 1 + (gidy + 1) * map_step] = e;
+
+        // Count the type-1 neighbours of a confirmed edge pixel.
+        n = 0;
+        if(e == 2)
+        {
+            n += smem[lidy    ][lidx    ] == 1;
+            n += smem[lidy    ][lidx + 1] == 1;
+            n += smem[lidy    ][lidx + 2] == 1;
+
+            n += smem[lidy + 1][lidx    ] == 1;
+            n += smem[lidy + 1][lidx + 2] == 1;
+
+            n += smem[lidy + 2][lidx    ] == 1;
+            n += smem[lidy + 2][lidx + 1] == 1;
+            n += smem[lidy + 2][lidx + 2] == 1;
+        }
+
+        // Record this pixel as a continuation point; atomic_inc hands out a unique slot.
+        if(n > 0)
+        {
+            unsigned int ind = atomic_inc(counter);
+            st[ind] = (ushort2)(gidx + 1, gidy + 1);
+        }
+    }
+}
+
+// x / y offsets of the 8 neighbours of a pixel, in row-major order.
+__constant int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
+__constant int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
+
+
+// Global hysteresis step: each work-group takes one seed position from st1,
+// promotes neighbouring candidates (type 1) to edges (type 2) and keeps following
+// them using a stack in local memory. Positions still on the stack when the
+// group stops are appended to st2.
+//
+// map         edge type map; updated in place
+// st1         seed positions to process in this call
+// st2         output: positions left on the local stack
+// counter     number of positions appended to st2
+// rows, cols  image size
+// count       number of valid entries in st1
+// map_step    map row pitch in bytes (converted to elements below)
+// map_offset  map offset in bytes (converted to elements below)
+#define stack_size 512
+__kernel
+void
+__attribute__((reqd_work_group_size(128,1,1)))
+edgesHysteresisGlobal
+(
+    __global int * map,
+    __global ushort2 * st1,
+    __global ushort2 * st2,
+    __global int * counter,
+    int rows,
+    int cols,
+    int count,
+    int map_step,
+    int map_offset
+)
+{
+
+    // Byte pitch/offset -> element pitch/offset.
+    map_step   /= sizeof(*map);
+    map_offset /= sizeof(*map);
+
+    map += map_offset;
+
+    int lidx = get_local_id(0);
+
+    int grp_idx = get_group_id(0);
+    int grp_idy = get_group_id(1);
+
+    __local unsigned int s_counter;   // number of positions on the local stack
+    __local unsigned int s_ind;       // base index reserved in st2 for this group
+
+    __local ushort2 s_st[stack_size]; // the local stack itself
+
+    if(lidx == 0)
+    {
+        s_counter = 0;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Linear index of this work-group in the 2D grid of groups. The row pitch of
+    // that grid is the number of groups in dimension 0, not the work-group size.
+    int ind = mad24(grp_idy, (int)get_num_groups(0), grp_idx);
+
+    // ind and pos are identical for all work-items of a group, so the barriers
+    // inside the following conditionals are reached by the whole group or by none.
+    if(ind < count)
+    {
+        ushort2 pos = st1[ind];
+        if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
+        {
+            // The first 8 work-items each inspect one neighbour of the seed.
+            if (lidx < 8)
+            {
+                pos.x += c_dx[lidx];
+                pos.y += c_dy[lidx];
+
+                if (map[pos.x + pos.y * map_step] == 1)
+                {
+                    map[pos.x + pos.y * map_step] = 2;
+
+                    ind = atomic_inc(&s_counter);
+
+                    s_st[ind] = pos;
+                }
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            // Keep going while there is work and one more round (at most one push
+            // per work-item) cannot overflow the stack.
+            while (s_counter > 0 && s_counter <= stack_size - get_local_size(0))
+            {
+                // 8 work-items cooperate on one stack entry, one per neighbour.
+                const int subTaskIdx = lidx >> 3;
+                const int portion = min(s_counter, (uint)(get_local_size(0)>> 3));
+
+                // (0, 0) fails the range test below and so marks "no work".
+                pos.x = pos.y = 0;
+
+                if (subTaskIdx < portion)
+                    pos = s_st[s_counter - 1 - subTaskIdx];
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                // Pop the entries that were just read.
+                if (lidx == 0)
+                    s_counter -= portion;
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
+                {
+                    pos.x += c_dx[lidx & 7];
+                    pos.y += c_dy[lidx & 7];
+
+                    if (map[pos.x + pos.y * map_step] == 1)
+                    {
+                        map[pos.x + pos.y * map_step] = 2;
+
+                        ind = atomic_inc(&s_counter);
+
+                        s_st[ind] = pos;
+                    }
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+            }
+
+            // Spill whatever is left on the stack to st2.
+            if (s_counter > 0)
+            {
+                if (lidx == 0)
+                {
+                    // atomic_add returns the value held *before* the addition,
+                    // which is exactly the first slot reserved for this group.
+                    s_ind = atomic_add(counter, (int)s_counter);
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                ind = s_ind;
+
+                for (int i = lidx; i < s_counter; i += get_local_size(0))
+                {
+                    st2[ind + i] = s_st[i];
+                }
+            }
+        }
+    }
+}
+#undef stack_size
+
+// Produce the final edge image. A pixel whose edge type is 2 is an edge point and
+// is set to 255; every other pixel is set to 0.
+//
+// map         edge type map, read at (x + 1, y + 1)
+// dst         output edge image (8-bit)
+// rows, cols  image size
+// map_step    map row pitch in bytes (converted to elements below)
+// map_offset  map offset in bytes (converted to elements below)
+// dst_step    dst row pitch (dst is uchar, so bytes == elements)
+// dst_offset  dst offset (dst is uchar, so bytes == elements)
+__kernel
+void getEdges
+(
+    __global const int * map,
+    __global uchar * dst,
+    int rows,
+    int cols,
+    int map_step,
+    int map_offset,
+    int dst_step,
+    int dst_offset
+)
+{
+    // Byte pitch/offset -> element pitch/offset.
+    map_step   /= sizeof(*map);
+    map_offset /= sizeof(*map);
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    if(gidy < rows && gidx < cols)
+    {
+        // type >> 1 is 1 only for type 2; negating gives -1, i.e. 255 as uchar.
+        // dst_offset is applied so that a destination with a non-zero offset is
+        // written at the right place (it was previously ignored).
+        dst[gidx + gidy * dst_step + dst_offset] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step + map_offset] >> 1));
+    }
+}
--- a/modules/imgproc/src/opencl/clahe.cl
+++ b/modules/imgproc/src/opencl/clahe.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Sen Liu, swjtuls1987@126.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
+
+// Inclusive prefix sum over 256 values held in local memory.
+// Every work-item deposits its value at smem[tid]; work-item 0 then accumulates
+// the 256 entries serially, and each work-item returns the running total at
+// its own position.
+int calc_lut(__local int* smem, int val, int tid)
+{
+    smem[tid] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid == 0)
+    {
+        int running = smem[0];
+        for (int bin = 1; bin < 256; ++bin)
+        {
+            running += smem[bin];
+            smem[bin] = running;
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    return smem[tid];
+}
+
+// Sum-reduction of one value per work-item through local memory.
+// Two variants are provided; calcLut reads the result from smem[256] when CPU
+// is defined and from smem[0] otherwise.
+#ifdef CPU
+// CPU variant: a barrier follows every halving step, and the final total is
+// written to smem[256].
+//
+// smem  local scratch buffer; indices 0..256 are accessed
+// val   value contributed by this work-item
+// tid   flat work-item index; the tid + 128 accesses imply 256 work-items
+void reduce(volatile __local int* smem, int val, int tid)
+{
+    smem[tid] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 128)
+        smem[tid] = val += smem[tid + 128];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64)
+        smem[tid] = val += smem[tid + 64];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+        smem[tid] += smem[tid + 32];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 16)
+        smem[tid] += smem[tid + 16];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 8)
+        smem[tid] += smem[tid + 8];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 4)
+        smem[tid] += smem[tid + 4];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 2)
+        smem[tid] += smem[tid + 2];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // The last step stores the total in a slot outside the 256 inputs.
+    if (tid < 1)
+        smem[256] = smem[tid] + smem[tid + 1];
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+#else
+
+// Non-CPU variant: the total ends up in smem[0]. The preprocessor blocks below
+// decide, based on WAVE_SIZE, where a barrier and a narrower "if (tid < N)" are
+// inserted; the remaining steps run without barriers inside a single block.
+// NOTE(review): the barrier-free tail presumably relies on the work-items of one
+// wavefront executing in lock-step (with volatile forcing the memory accesses);
+// even with the default WAVE_SIZE of 1 the last four steps have no barrier
+// between them -- confirm this is intended.
+// No barrier is issued at the end; callers must synchronise before reading smem[0].
+//
+// smem  local scratch buffer; indices 0..255 are accessed
+// val   value contributed by this work-item
+// tid   flat work-item index; the tid + 128 accesses imply 256 work-items
+void reduce(__local volatile int* smem, int val, int tid)
+{
+    smem[tid] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 128)
+        smem[tid] = val += smem[tid + 128];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64)
+        smem[tid] = val += smem[tid + 64];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+    {
+        smem[tid] += smem[tid + 32];
+#if WAVE_SIZE < 32
+    // Close the tid < 32 block, synchronise, and continue with 16 work-items.
+    } barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 16)
+    {
+#endif
+        smem[tid] += smem[tid + 16];
+#if WAVE_SIZE < 16
+    // Close the tid < 16 block, synchronise, and continue with 8 work-items.
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 8)
+    {
+#endif
+        smem[tid] += smem[tid + 8];
+        smem[tid] += smem[tid + 4];
+        smem[tid] += smem[tid + 2];
+        smem[tid] += smem[tid + 1];
+    }
+}
+#endif
+
+// Build the per-tile lookup table for CLAHE.
+//
+// One work-group processes one tile: it builds the 256-bin histogram of the tile
+// in local memory, optionally clips it and redistributes the clipped samples,
+// then converts the histogram to a cumulative table scaled by lutScale.
+// The kernel indexes smem and lut with tid as a bin number, so it expects 256
+// work-items per group.
+//
+// src         source image (8-bit)
+// lut         output lookup tables, one row of 256 entries per tile
+// srcStep     source row pitch
+// dstStep     pitch between consecutive tile rows in lut
+// tileSize    tile width (x) and height (y) in pixels
+// tilesX      number of tiles per image row
+// clipLimit   maximum histogram bin height; values <= 0 disable clipping
+// lutScale    factor applied to the cumulative histogram
+// src_offset  offset added to every source index
+// dst_offset  offset added to every lut index
+__kernel void calcLut(__global __const uchar * src, __global uchar * lut,
+                      const int srcStep, const int dstStep,
+                      const int2 tileSize, const int tilesX,
+                      const int clipLimit, const float lutScale,
+                      const int src_offset, const int dst_offset)
+{
+    __local int smem[512];
+    // Declared at kernel-function scope, as OpenCL C requires for __local
+    // variables (it used to be declared inside the clipping branch).
+    __local int totalClipped;
+
+    const int tx = get_group_id(0);
+    const int ty = get_group_id(1);
+    const unsigned int tid = get_local_id(1) * get_local_size(0)
+                             + get_local_id(0);
+
+    // Clear this work-item's histogram bin.
+    smem[tid] = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Accumulate the tile histogram; work-items stride over the tile.
+    for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1))
+    {
+        __global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset);
+        for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0))
+        {
+            const int data = srcPtr[j];
+            atomic_inc(&smem[data]);
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Each work-item now owns one histogram bin; smem is reused as scratch below.
+    int tHistVal = smem[tid];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // clipLimit is a kernel argument, so this branch (and the barriers inside it)
+    // is taken uniformly by the whole group.
+    if (clipLimit > 0)
+    {
+        // clip histogram bar
+        int clipped = 0;
+        if (tHistVal > clipLimit)
+        {
+            clipped = tHistVal - clipLimit;
+            tHistVal = clipLimit;
+        }
+
+        // find number of overall clipped samples
+        reduce(smem, clipped, tid);
+        barrier(CLK_LOCAL_MEM_FENCE);
+#ifdef CPU
+        clipped = smem[256];
+#else
+        clipped = smem[0];
+#endif
+
+        // broadcast evaluated value
+
+        if (tid == 0)
+            totalClipped = clipped;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // redistribute clipped samples evenly
+
+        int redistBatch = totalClipped / 256;
+        tHistVal += redistBatch;
+
+        // The remainder goes to the first `residual` bins, one sample each.
+        int residual = totalClipped - redistBatch * 256;
+        if (tid < residual)
+            ++tHistVal;
+    }
+
+    // Cumulative histogram, scaled and saturated to 8 bits.
+    const int lutVal = calc_lut(smem, tHistVal, tid);
+    uint ires = (uint)convert_int_rte(lutScale * lutVal);
+    lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] =
+        convert_uchar(clamp(ires, (uint)0, (uint)255));
+}
+
+// Apply CLAHE to one pixel per work-item: the source value is mapped through
+// the lookup tables of the four nearest tiles and the four results are blended
+// bilinearly according to the pixel position relative to the tile centres.
+//
+// src, dst                 source and destination images (8-bit)
+// lut                      per-tile lookup tables, one row per tile
+// srcStep, dstStep         row pitches of src and dst
+// lutStep                  pitch between consecutive tile rows in lut
+// cols, rows               image size
+// tileSize                 tile width (x) and height (y) in pixels
+// tilesX, tilesY           number of tiles per row / per column
+// src_offset, dst_offset,
+// lut_offset               offsets added to the matching indices
+__kernel void transform(__global __const uchar * src,
+                        __global uchar * dst,
+                        __global uchar * lut,
+                        const int srcStep, const int dstStep, const int lutStep,
+                        const int cols, const int rows,
+                        const int2 tileSize,
+                        const int tilesX, const int tilesY,
+                        const int src_offset, const int dst_offset, int lut_offset)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        // Vertical tile pair and blend factor (tile centres sit at +0.5).
+        const float tyf = (convert_float(y) / tileSize.y) - 0.5f;
+        const int tyLow = convert_int_rtn(tyf);
+        const float ya = tyf - tyLow;
+        const int ty1 = max(tyLow, 0);
+        const int ty2 = min(tyLow + 1, tilesY - 1);
+
+        // Horizontal tile pair and blend factor.
+        const float txf = (convert_float(x) / tileSize.x) - 0.5f;
+        const int txLow = convert_int_rtn(txf);
+        const float xa = txf - txLow;
+        const int tx1 = max(txLow, 0);
+        const int tx2 = min(txLow + 1, tilesX - 1);
+
+        const int srcVal = src[mad24(y, srcStep, x + src_offset)];
+
+        // Position of this pixel's entry inside any tile's table.
+        const int lutCol = srcVal + lut_offset;
+
+        // Bilinear weights of the four surrounding tiles.
+        const float wTopLeft     = (1.0f - xa) * (1.0f - ya);
+        const float wTopRight    = (xa) * (1.0f - ya);
+        const float wBottomLeft  = (1.0f - xa) * (ya);
+        const float wBottomRight = (xa) * (ya);
+
+        float res = 0;
+        res += lut[mad24(ty1 * tilesX + tx1, lutStep, lutCol)] * wTopLeft;
+        res += lut[mad24(ty1 * tilesX + tx2, lutStep, lutCol)] * wTopRight;
+        res += lut[mad24(ty2 * tilesX + tx1, lutStep, lutCol)] * wBottomLeft;
+        res += lut[mad24(ty2 * tilesX + tx2, lutStep, lutCol)] * wBottomRight;
+
+        // Round to nearest and saturate to 8 bits.
+        const uint ires = (uint)convert_int_rte(res);
+        dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255));
+    }
+}
--- a/modules/imgproc/src/opencl/convolve.cl
+++ b/modules/imgproc/src/opencl/convolve.cl
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jiang Liyuan, jlyuan001.good@163.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined (__ATI__)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (__NVIDIA__)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+
+/************************************** convolve **************************************/
+
+// 2D filtering of a single-channel float image with a float kernel (`temp1`).
+//
+// Each work-item first copies four source pixels into a 32x32 local tile,
+// reading at offsets of -8 / +8 from its global position in both directions.
+// Source coordinates are clamped to [0, rows-1] x [0, cols-1], so reads past
+// the image edge reuse the nearest edge pixel.  After a local barrier every
+// in-range work-item accumulates the kWidth x kHeight weighted sum centred on
+// its own pixel and writes it to `dst`.
+//
+// Steps and offsets are used directly as element indices into the float
+// buffers.
+// NOTE(review): the `+ 16` tile offsets suggest a 16x16 work-group, and the
+// 8-pixel apron suggests kWidth / 2 and kHeight / 2 must not exceed 8 --
+// confirm against the host-side launch code.
+__kernel void convolve_D5(__global float *src, __global float *temp1, __global float *dst,
+                          int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight,
+                          int src_offset, int dst_offset, int koffset)
+{
+    __local float smem[16 + 2 * 8][16 + 2 * 8];
+
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+    const int gx = get_global_id(0);
+    const int gy = get_global_id(1);
+
+    // Clamped source row starts (already multiplied by the row step) and
+    // clamped source columns for the two halves of the tile in each direction.
+    const int rowUpper = min(max(gy - 8, 0), rows - 1) * src_step;
+    const int rowLower = min(gy + 8, rows - 1) * src_step;
+    const int colLeft  = min(max(gx - 8, 0), cols - 1);
+    const int colRight = min(gx + 8, cols - 1);
+
+    // Fill the four quadrants of the tile: upper-left, upper-right,
+    // lower-left, lower-right.
+    smem[ly][lx]           = src[rowUpper + colLeft + src_offset];
+    smem[ly][lx + 16]      = src[rowUpper + colRight + src_offset];
+    smem[ly + 16][lx]      = src[rowLower + colLeft + src_offset];
+    smem[ly + 16][lx + 16] = src[rowLower + colRight + src_offset];
+
+    // The whole tile must be populated before any work-item reads from it.
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Work-items outside the image only helped with loading the tile.
+    if (gx >= cols || gy >= rows)
+        return;
+
+    // Tile coordinates of the top-left kernel tap for this pixel.
+    const int tileRow0 = ly + 8 - kHeight / 2;
+    const int tileCol0 = lx + 8 - kWidth / 2;
+
+    float res = 0;
+
+    for (int i = 0; i < kHeight; ++i)
+    {
+        for (int j = 0; j < kWidth; ++j)
+        {
+            res += smem[tileRow0 + i][tileCol0 + j] * temp1[i * k_step + j + koffset];
+        }
+    }
+
+    dst[gy * dst_step + gx + dst_offset] = res;
+}
--- a/modules/imgproc/src/opencl/copymakeborder.cl
+++ b/modules/imgproc/src/opencl/copymakeborder.cl
--- a/modules/imgproc/src/opencl/cvtcolor.cl
+++ b/modules/imgproc/src/opencl/cvtcolor.cl
--- a/modules/imgproc/src/opencl/gftt.cl
+++ b/modules/imgproc/src/opencl/gftt.cl
--- a/modules/imgproc/src/opencl/harris.cl
+++ b/modules/imgproc/src/opencl/harris.cl
--- a/modules/imgproc/src/opencl/histogram.cl
+++ b/modules/imgproc/src/opencl/histogram.cl
--- a/modules/imgproc/src/opencl/hough.cl
+++ b/modules/imgproc/src/opencl/hough.cl
--- a/modules/imgproc/src/opencl/integral.cl
+++ b/modules/imgproc/src/opencl/integral.cl
--- a/modules/imgproc/src/opencl/integral_sum.cl
+++ b/modules/imgproc/src/opencl/integral_sum.cl
--- a/modules/imgproc/src/opencl/laplacian.cl
+++ b/modules/imgproc/src/opencl/laplacian.cl
--- a/modules/imgproc/src/opencl/match_template.cl
+++ b/modules/imgproc/src/opencl/match_template.cl
--- a/modules/imgproc/src/opencl/median.cl
+++ b/modules/imgproc/src/opencl/median.cl
--- a/modules/imgproc/src/opencl/mineigenval.cl
+++ b/modules/imgproc/src/opencl/mineigenval.cl
--- a/modules/imgproc/src/opencl/moments.cl
+++ b/modules/imgproc/src/opencl/moments.cl
--- a/modules/imgproc/src/opencl/morph.cl
+++ b/modules/imgproc/src/opencl/morph.cl
--- a/modules/imgproc/src/opencl/pyramid.cl
+++ b/modules/imgproc/src/opencl/pyramid.cl
--- a/modules/imgproc/src/opencl/remap.cl
+++ b/modules/imgproc/src/opencl/remap.cl
--- a/modules/imgproc/src/opencl/resize.cl
+++ b/modules/imgproc/src/opencl/resize.cl
--- a/modules/imgproc/src/opencl/threshold.cl
+++ b/modules/imgproc/src/opencl/threshold.cl
--- a/modules/imgproc/src/opencl/warpaffine.cl
+++ b/modules/imgproc/src/opencl/warpaffine.cl
--- a/modules/imgproc/src/opencl/warpperspective.cl
+++ b/modules/imgproc/src/opencl/warpperspective.cl
--- a/modules/imgproc/src/precomp.hpp
+++ b/modules/imgproc/src/precomp.hpp
@@ -48,6 +48,8 @@

 #include "opencv2/imgproc/imgproc_c.h"
 #include "opencv2/core/private.hpp"
+#include "opencv2/core/ocl.hpp"
+#include "opencl_kernels.hpp"

 #include <math.h>
 #include <assert.h>

--- a/modules/imgproc/test/test_imgproc_umat.cpp
+++ b/modules/imgproc/test/test_imgproc_umat.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include <string>
+
+using namespace cv;
+using namespace std;
+
+// Runs the same cvtColor -> resize -> equalizeHist pipeline once on Mat and
+// once on UMat inputs and checks that the two results are compatible.
+//
+// The test is non-interactive: it opens no windows and never waits for a key
+// press, so it can run unattended and on machines without a display.
+class CV_ImgprocUMatTest : public cvtest::BaseTest
+{
+public:
+    CV_ImgprocUMatTest() {}
+    ~CV_ImgprocUMatTest() {}
+protected:
+    void run(int)
+    {
+        string imgpath = string(ts->get_data_path()) + "shared/lena.png";
+        Mat img = imread(imgpath, 1), gray, smallimg, result;
+
+        // Report missing test data explicitly instead of failing later
+        // inside cvtColor with an unrelated-looking error.
+        if (img.empty())
+        {
+            ts->printf(cvtest::TS::LOG, "Can not read the test image %s\n", imgpath.c_str());
+            ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_TEST_DATA);
+            return;
+        }
+
+        UMat uimg = img.getUMat(ACCESS_READ), ugray, usmallimg, uresult;
+
+        // Reference pipeline on Mat.
+        cvtColor(img, gray, COLOR_BGR2GRAY);
+        resize(gray, smallimg, Size(), 0.75, 0.75, INTER_LINEAR);
+        equalizeHist(smallimg, result);
+
+        // Same pipeline on UMat.
+        cvtColor(uimg, ugray, COLOR_BGR2GRAY);
+        resize(ugray, usmallimg, Size(), 0.75, 0.75, INTER_LINEAR);
+        equalizeHist(usmallimg, uresult);
+
+        // Map the UMat result so it can be compared with the Mat result.
+        Mat uresultOnHost = uresult.getMat(ACCESS_READ);
+
+        if (uresultOnHost.size() != result.size() || uresultOnHost.type() != result.type())
+        {
+            ts->printf(cvtest::TS::LOG, "Mat and UMat results differ in size or type\n");
+            ts->set_failed_test_info(cvtest::TS::FAIL_MISMATCH);
+            return;
+        }
+
+        // NOTE(review): the two paths may round differently, so the maximum
+        // difference is only logged here; pick and enforce a tolerance once
+        // the expected accuracy of the UMat path is known.
+        double maxDiff = cv::norm(result, uresultOnHost, NORM_INF);
+        ts->printf(cvtest::TS::LOG, "max |Mat - UMat| difference: %g\n", maxDiff);
+
+        ts->set_failed_test_info(cvtest::TS::OK);
+    }
+};
+
+// Registers the UMat imgproc check with the test runner; safe_run() executes
+// the test body through the cvtest::BaseTest entry point.
+TEST(Imgproc_UMat, regression)
+{
+    CV_ImgprocUMatTest umatTest;
+    umatTest.safe_run();
+}
--- a/modules/nonfree/src/precomp.hpp
+++ b/modules/nonfree/src/precomp.hpp
--- a/modules/nonfree/src/surf.ocl.cpp
+++ b/modules/nonfree/src/surf.ocl.cpp
--- a/modules/objdetect/include/opencv2/objdetect.hpp
+++ b/modules/objdetect/include/opencv2/objdetect.hpp
--- a/modules/objdetect/src/cascadedetect.cpp
+++ b/modules/objdetect/src/cascadedetect.cpp
--- a/modules/objdetect/src/opencl/haarobjectdetect.cl
+++ b/modules/objdetect/src/opencl/haarobjectdetect.cl
--- a/modules/objdetect/src/opencl/haarobjectdetect_scaled2.cl
+++ b/modules/objdetect/src/opencl/haarobjectdetect_scaled2.cl
--- a/modules/objdetect/src/precomp.hpp
+++ b/modules/objdetect/src/precomp.hpp
--- a/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/util.hpp
--- a/modules/superres/src/btv_l1_ocl.cpp
+++ b/modules/superres/src/btv_l1_ocl.cpp
--- a/modules/superres/src/precomp.hpp
+++ b/modules/superres/src/precomp.hpp
--- a/samples/cpp/ufacedetect.cpp
+++ b/samples/cpp/ufacedetect.cpp
--- a/samples/ocl/facedetect.cpp
+++ b/samples/ocl/facedetect.cpp