提交 d914f20a 编写于 作者: V Vadim Pisarevsky

updated patch to bring in the first functions with "transparent API"

上级 bb4bf7a1
......@@ -501,9 +501,10 @@ macro(ocv_glob_module_sources)
file(GLOB cl_kernels "src/opencl/*.cl")
if(HAVE_opencv_ocl AND cl_kernels)
string(REGEX REPLACE "opencv_" "" the_module_barename "${the_module}")
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp"
COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
COMMAND ${CMAKE_COMMAND} -DMODULE_NAME="${the_module_barename}" -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake")
source_group("OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
......@@ -4,6 +4,15 @@ list(SORT cl_list)
string(REPLACE ".cpp" ".hpp" OUTPUT_HPP "${OUTPUT}")
get_filename_component(OUTPUT_HPP_NAME "${OUTPUT_HPP}" NAME)
set(nested_namespace_start "")
set(nested_namespace_end "")
set(new_mode ON)
set(nested_namespace_start "namespace ${MODULE_NAME}\n{")
set(nested_namespace_end "}")
set(STR_CPP "// This file is auto-generated. Do not edit!
#include \"precomp.hpp\"
......@@ -13,16 +22,19 @@ namespace cv
namespace ocl
set(STR_HPP "// This file is auto-generated. Do not edit!
#include \"opencv2/ocl/private/util.hpp\"
#include \"opencv2/core/ocl_genbase.hpp\"
namespace cv
namespace ocl
......@@ -49,12 +61,19 @@ foreach(cl ${cl_list})
string(MD5 hash "${lines}")
set(STR_CPP "${STR_CPP}const struct ProgramEntry ${cl_filename}={\"${cl_filename}\",\n\"${lines}, \"${hash}\"};\n")
set(STR_HPP "${STR_HPP}extern const struct ProgramEntry ${cl_filename};\n")
set(STR_CPP_DECL "const struct ProgramEntry ${cl_filename}={\"${cl_filename}\",\n\"${lines}, \"${hash}\"};\n")
set(STR_HPP_DECL "extern const struct ProgramEntry ${cl_filename};\n")
set(STR_CPP_DECL "${STR_CPP_DECL}ProgramSource2 ${cl_filename}_oclsrc(${cl_filename}.programStr);\n")
set(STR_HPP_DECL "${STR_HPP_DECL}extern ProgramSource2 ${cl_filename}_oclsrc;\n")
set(STR_CPP "${STR_CPP}}\n}\n")
set(STR_HPP "${STR_HPP}}\n}\n")
set(STR_CPP "${STR_CPP}}\n${nested_namespace_end}}\n")
set(STR_HPP "${STR_HPP}}\n${nested_namespace_end}}\n")
file(WRITE "${OUTPUT}" "${STR_CPP}")
......@@ -47,6 +47,7 @@
#include "opencv2/bioinspired.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/core/private.hpp"
#include "opencv2/core/ocl.hpp"
#include <valarray>
......@@ -56,6 +56,8 @@
namespace cv
static ocl::ProgramEntry retina_kernel = ocl::bioinspired::retina_kernel;
namespace bioinspired
namespace ocl
......@@ -347,6 +347,10 @@ CV_EXPORTS_W void max(InputArray src1, InputArray src2, OutputArray dst);
CV_EXPORTS void min(const Mat& src1, const Mat& src2, Mat& dst);
//! computes per-element maximum of two arrays (dst = max(src1, src2))
CV_EXPORTS void max(const Mat& src1, const Mat& src2, Mat& dst);
//! computes per-element minimum of two arrays (dst = min(src1, src2))
CV_EXPORTS void min(const UMat& src1, const UMat& src2, UMat& dst);
//! computes per-element maximum of two arrays (dst = max(src1, src2))
CV_EXPORTS void max(const UMat& src1, const UMat& src2, UMat& dst);
//! computes square root of each matrix element (dst = src**0.5)
CV_EXPORTS_W void sqrt(InputArray src, OutputArray dst);
......@@ -58,6 +58,8 @@ namespace cv
enum { ACCESS_READ=1<<24, ACCESS_WRITE=1<<25,
class CV_EXPORTS _OutputArray;
//////////////////////// Input/Output Array Arguments /////////////////////////////////
......@@ -116,12 +118,22 @@ public:
void* getObj() const;
virtual int kind() const;
virtual int dims(int i=-1) const;
virtual Size size(int i=-1) const;
virtual int sizend(int* sz, int i=-1) const;
virtual bool sameSize(const _InputArray& arr) const;
virtual size_t total(int i=-1) const;
virtual int type(int i=-1) const;
virtual int depth(int i=-1) const;
virtual int channels(int i=-1) const;
virtual bool isContinuous(int i=-1) const;
virtual bool empty() const;
virtual void copyTo(const _OutputArray& arr) const;
// Kind predicates: each one compares kind() with a single array-kind constant
// (MAT, UMAT, STD_VECTOR_MAT, STD_VECTOR_UMAT and MATX respectively).
bool isMat() const;
bool isUMat() const;
// NOTE(review): "Vectot" is the spelling used by the inline definition too;
// correcting it would change the public interface.
bool isMatVectot() const;
bool isUMatVector() const;
// NOTE(review): not const-qualified, unlike the four predicates above.
bool isMatx();
virtual ~_InputArray();
......@@ -197,8 +209,10 @@ public:
virtual void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
virtual void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
virtual void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
virtual void createSameSize(const _InputArray& arr, int mtype) const;
virtual void release() const;
virtual void clear() const;
virtual void setTo(const _InputArray& value) const;
......@@ -108,6 +108,12 @@ inline _InputArray::_InputArray(const cuda::CudaMem& cuda_mem)
inline _InputArray::~_InputArray() {}
// Kind predicates of _InputArray: each one tests kind() against one array-kind constant.

// True when kind() equals _InputArray::MAT.
inline bool _InputArray::isMat() const
{
    return _InputArray::MAT == kind();
}

// True when kind() equals _InputArray::UMAT.
inline bool _InputArray::isUMat() const
{
    return _InputArray::UMAT == kind();
}

// True when kind() equals _InputArray::STD_VECTOR_MAT.
// The misspelled name ("Vectot") matches the class declaration and is kept as is.
inline bool _InputArray::isMatVectot() const
{
    return _InputArray::STD_VECTOR_MAT == kind();
}

// True when kind() equals _InputArray::STD_VECTOR_UMAT.
inline bool _InputArray::isUMatVector() const
{
    return _InputArray::STD_VECTOR_UMAT == kind();
}

// True when kind() equals _InputArray::MATX.
// Declared non-const, so it cannot be called through a const _InputArray.
inline bool _InputArray::isMatx()
{
    return _InputArray::MATX == kind();
}
inline _OutputArray::_OutputArray() { init(ACCESS_WRITE, 0); }
......@@ -49,13 +49,13 @@ namespace cv { namespace ocl {
CV_EXPORTS bool haveOpenCL();
CV_EXPORTS bool useOpenCL();
CV_EXPORTS void setUseOpenCL(bool flag);
CV_EXPORTS void finish();
CV_EXPORTS void finish2();
class CV_EXPORTS Context;
class CV_EXPORTS Context2;
class CV_EXPORTS Device;
class CV_EXPORTS Kernel;
class CV_EXPORTS Program;
class CV_EXPORTS ProgramSource;
class CV_EXPORTS ProgramSource2;
class CV_EXPORTS Queue;
class CV_EXPORTS Device
......@@ -199,22 +199,22 @@ protected:
class CV_EXPORTS Context
class CV_EXPORTS Context2
explicit Context(int dtype);
Context(const Context& c);
Context& operator = (const Context& c);
explicit Context2(int dtype);
Context2(const Context2& c);
Context2& operator = (const Context2& c);
bool create(int dtype);
size_t ndevices() const;
const Device& device(size_t idx) const;
Program getProg(const ProgramSource& prog,
Program getProg(const ProgramSource2& prog,
const String& buildopt, String& errmsg);
static Context& getDefault();
static Context2& getDefault();
void* ptr() const;
struct Impl;
......@@ -226,12 +226,12 @@ class CV_EXPORTS Queue
explicit Queue(const Context& c, const Device& d=Device());
explicit Queue(const Context2& c, const Device& d=Device());
Queue(const Queue& q);
Queue& operator = (const Queue& q);
bool create(const Context& c=Context(), const Device& d=Device());
bool create(const Context2& c=Context2(), const Device& d=Device());
void finish();
void* ptr() const;
static Queue& getDefault();
......@@ -245,41 +245,55 @@ protected:
class CV_EXPORTS KernelArg
KernelArg(int _flags, UMat* _m, void* _obj=0, size_t _sz=0);
KernelArg(int _flags, UMat* _m, int wscale=1, const void* _obj=0, size_t _sz=0);
static KernelArg Local() { return KernelArg(LOCAL, 0); }
static KernelArg ReadOnly(const UMat& m) { return KernelArg(READ_ONLY, (UMat*)&m); }
static KernelArg WriteOnly(const UMat& m) { return KernelArg(WRITE_ONLY, (UMat*)&m); }
static KernelArg ReadWrite(const UMat& m, int wscale=1)
{ return KernelArg(READ_WRITE, (UMat*)&m, wscale); }
static KernelArg ReadWriteNoSize(const UMat& m, int wscale=1)
{ return KernelArg(READ_WRITE+NO_SIZE, (UMat*)&m, wscale); }
static KernelArg ReadOnly(const UMat& m, int wscale=1)
{ return KernelArg(READ_ONLY, (UMat*)&m, wscale); }
static KernelArg WriteOnly(const UMat& m, int wscale=1)
{ return KernelArg(WRITE_ONLY, (UMat*)&m, wscale); }
static KernelArg ReadOnlyNoSize(const UMat& m, int wscale=1)
{ return KernelArg(READ_ONLY+NO_SIZE, (UMat*)&m, wscale); }
static KernelArg WriteOnlyNoSize(const UMat& m, int wscale=1)
{ return KernelArg(WRITE_ONLY+NO_SIZE, (UMat*)&m, wscale); }
static KernelArg Constant(const Mat& m);
template<typename _Tp> static KernelArg Constant(const _Tp* arr, size_t n)
{ return KernelArg(CONSTANT, 0, (void*)arr, n); }
{ return KernelArg(CONSTANT, 0, 1, (void*)arr, n); }
int flags;
UMat* m;
void* obj;
const void* obj;
size_t sz;
int wscale;
class CV_EXPORTS Kernel
Kernel(const char* kname, const Program& prog);
Kernel(const char* kname, const ProgramSource& prog,
const String& buildopts, String& errmsg);
Kernel(const char* kname, const ProgramSource2& prog,
const String& buildopts, String* errmsg=0);
Kernel(const Kernel& k);
Kernel& operator = (const Kernel& k);
bool empty() const;
bool create(const char* kname, const Program& prog);
bool create(const char* kname, const ProgramSource& prog,
const String& buildopts, String& errmsg);
bool create(const char* kname, const ProgramSource2& prog,
const String& buildopts, String* errmsg=0);
void set(int i, const void* value, size_t sz);
void set(int i, const UMat& m);
void set(int i, const KernelArg& arg);
template<typename _Tp> void set(int i, const _Tp& value)
int set(int i, const void* value, size_t sz);
int set(int i, const UMat& m);
int set(int i, const KernelArg& arg);
template<typename _Tp> int set(int i, const _Tp& value)
{ return set(i, &value, sizeof(value)); }
template<typename _Tp0>
......@@ -291,26 +305,27 @@ public:
template<typename _Tp0, typename _Tp1>
Kernel& args(const _Tp0& a0, const _Tp1& a1)
set(0, a0); set(1, a1); return *this;
int i = set(0, a0); set(i, a1); return *this;
template<typename _Tp0, typename _Tp1, typename _Tp2>
Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2)
set(0, a0); set(1, a1); set(2, a2); return *this;
int i = set(0, a0); i = set(i, a1); set(i, a2); return *this;
template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3>
Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3)
set(0, a0); set(1, a1); set(2, a2); set(3, a3); return *this;
int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); return *this;
template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4>
Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2,
const _Tp3& a3, const _Tp4& a4)
set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); return *this;
int i = set(0, a0); i = set(i, a1); i = set(i, a2);
i = set(i, a3); set(i, a4); return *this;
template<typename _Tp0, typename _Tp1, typename _Tp2,
......@@ -318,8 +333,8 @@ public:
Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2,
const _Tp3& a3, const _Tp4& a4, const _Tp5& a5)
set(0, a0); set(1, a1); set(2, a2);
set(3, a3); set(4, a4); set(5, a5); return *this;
int i = set(0, a0); i = set(i, a1); i = set(i, a2);
i = set(i, a3); i = set(i, a4); set(i, a5); return *this;
template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
......@@ -327,8 +342,8 @@ public:
Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3,
const _Tp4& a4, const _Tp5& a5, const _Tp6& a6)
set(0, a0); set(1, a1); set(2, a2); set(3, a3);
set(4, a4); set(5, a5); set(6, a6); return *this;
int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3);
i = set(i, a4); i = set(i, a5); set(i, a6); return *this;
template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
......@@ -336,8 +351,8 @@ public:
Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3,
const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7)
set(0, a0); set(1, a1); set(2, a2); set(3, a3);
set(4, a4); set(5, a5); set(6, a6); set(7, a7); return *this;
int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3);
i = set(i, a4); i = set(i, a5); i = set(i, a6); set(i, a7); return *this;
template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4,
......@@ -346,8 +361,8 @@ public:
const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
const _Tp8& a8)
set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4);
set(5, a5); set(6, a6); set(7, a7); set(8, a8); return *this;
int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
i = set(i, a5); i = set(i, a6); i = set(i, a7); set(i, a8); return *this;
template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4,
......@@ -356,8 +371,8 @@ public:
const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
const _Tp8& a8, const _Tp9& a9)
set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5);
set(6, a6); set(7, a7); set(8, a8); set(9, a9); return *this;
int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); i = set(i, a5);
i = set(i, a6); i = set(i, a7); i = set(i, a8); set(i, a9); return *this;
template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
......@@ -367,8 +382,8 @@ public:
const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
const _Tp8& a8, const _Tp9& a9, const _Tp10& a10)
set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5);
set(6, a6); set(7, a7); set(8, a8); set(9, a9); set(10, a10); return *this;
int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); i = set(i, a5);
i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); set(i, a10); return *this;
template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
......@@ -378,13 +393,13 @@ public:
const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
const _Tp8& a8, const _Tp9& a9, const _Tp10& a10, const _Tp11& a11)
set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5);
set(6, a6); set(7, a7); set(8, a8); set(9, a9); set(10, a10); set(11, a11); return *this;
int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); i = set(i, a5);
i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); i = set(i, a10); set(i, a11); return *this;
void run(int dims, size_t offset[], size_t globalsize[],
bool run(int dims, size_t globalsize[],
size_t localsize[], bool sync, const Queue& q=Queue());
void runTask(bool sync, const Queue& q=Queue());
bool runTask(bool sync, const Queue& q=Queue());
size_t workGroupSize() const;
bool compileWorkGroupSize(size_t wsz[]) const;
......@@ -401,7 +416,7 @@ class CV_EXPORTS Program
Program(const ProgramSource& src,
Program(const ProgramSource2& src,
const String& buildflags, String& errmsg);
explicit Program(const String& buf);
Program(const Program& prog);
......@@ -409,12 +424,12 @@ public:
Program& operator = (const Program& prog);
bool create(const ProgramSource& src,
bool create(const ProgramSource2& src,
const String& buildflags, String& errmsg);
bool read(const String& buf, const String& buildflags);
bool write(String& buf) const;
const ProgramSource& source() const;
const ProgramSource2& source() const;
void* ptr() const;
String getPrefix() const;
......@@ -426,17 +441,17 @@ protected:
class CV_EXPORTS ProgramSource
class CV_EXPORTS ProgramSource2
typedef uint64 hash_t;
explicit ProgramSource(const String& prog);
explicit ProgramSource(const char* prog);
ProgramSource(const ProgramSource& prog);
ProgramSource& operator = (const ProgramSource& prog);
explicit ProgramSource2(const String& prog);
explicit ProgramSource2(const char* prog);
ProgramSource2(const ProgramSource2& prog);
ProgramSource2& operator = (const ProgramSource2& prog);
const String& source() const;
hash_t hash() const;
......@@ -446,6 +461,10 @@ protected:
Impl* p;
CV_EXPORTS const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf);
CV_EXPORTS const char* typeToStr(int t);
CV_EXPORTS const char* memopTypeToStr(int t);
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
// License Agreement
// For Open Source Computer Vision Library
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the OpenCV Foundation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
namespace cv
namespace ocl
// Plain descriptor of one embedded OpenCL program. Instances are emitted by the
// cl2cpp.cmake generator as "const struct ProgramEntry <name> = {name, text, hash}".
struct ProgramEntry
// Identifier of the program; the generator stores the same token it uses as the variable name.
const char* name;
// Program text produced by the generator from the .cl file contents.
const char* programStr;
// MD5 digest string that the generator computes over the program text.
const char* programHash;
......@@ -220,6 +220,21 @@ void Mat::copyTo( OutputArray _dst ) const
if( _dst.isUMat() )
_dst.create( dims, size.p, type() );
UMat dst = _dst.getUMat();
size_t i, sz[CV_MAX_DIM], dstofs[CV_MAX_DIM], esz = elemSize();
for( i = 0; i < (size_t)dims; i++ )
sz[i] = size.p[i];
sz[dims-1] *= esz;
dstofs[dims-1] *= esz;
dst.u->currAllocator->upload(dst.u, data, dims, sz, dstofs, dst.step.p, step.p);
if( dims <= 2 )
_dst.create( rows, cols, type() );
......@@ -1436,6 +1436,181 @@ Size _InputArray::size(int i) const
// Writes the size of each dimension into sz and returns the number of dimensions written.
//   sz - output buffer; no capacity check is performed here, so the caller must provide
//        enough room (the caller visible in this file passes an int[CV_MAX_DIM]).
//   i  - element index for the vector-of-matrix kinds; must be negative for MAT and UMAT.
// MAT / UMAT and, when i >= 0, elements of STD_VECTOR_MAT / STD_VECTOR_UMAT report their
// own dims and per-dimension sizes; the 2D description {height, width} comes from size(i).
int _InputArray::sizend(int* sz, int i) const
int j, d=0, k = kind();
if( k == NONE )
else if( k == MAT )
CV_Assert( i < 0 );
const Mat& m = *(const Mat*)obj;
d = m.dims;
for(j = 0; j < d; j++)
sz[j] = m.size.p[j];
else if( k == UMAT )
CV_Assert( i < 0 );
const UMat& m = *(const UMat*)obj;
d = m.dims;
for(j = 0; j < d; j++)
sz[j] = m.size.p[j];
else if( k == STD_VECTOR_MAT && i >= 0 )
const std::vector<Mat>& vv = *(const std::vector<Mat>*)obj;
// i is range-checked against the vector length before indexing
CV_Assert( i < (int)vv.size() );
const Mat& m = vv[i];
d = m.dims;
for(j = 0; j < d; j++)
sz[j] = m.size.p[j];
else if( k == STD_VECTOR_UMAT && i >= 0 )
const std::vector<UMat>& vv = *(const std::vector<UMat>*)obj;
CV_Assert( i < (int)vv.size() );
const UMat& m = vv[i];
d = m.dims;
for(j = 0; j < d; j++)
sz[j] = m.size.p[j];
// 2D description: sz[0] receives the height, sz[1] the width
Size sz2d = size(i);
d = 2;
sz[0] = sz2d.height;
sz[1] = sz2d.width;
return d;
// Returns true when this array and arr have the same size.
// When both sides are MAT or UMAT their size objects are compared directly.
// When only this side is MAT/UMAT it must have at most 2 dimensions (otherwise false);
// the comparison then falls back to 2D sizes, and an arr with more than 2 dimensions
// yields false.
bool _InputArray::sameSize(const _InputArray& arr) const
int k1 = kind(), k2 = arr.kind();
Size sz1;
if( k1 == MAT )
const Mat* m = ((const Mat*)obj);
if( k2 == MAT )
return m->size == ((const Mat*)arr.obj)->size;
if( k2 == UMAT )
return m->size == ((const UMat*)arr.obj)->size;
// the other side is neither MAT nor UMAT: only a 2D comparison is possible
if( m->dims > 2 )
return false;
sz1 = m->size();
else if( k1 == UMAT )
const UMat* m = ((const UMat*)obj);
if( k2 == MAT )
return m->size == ((const Mat*)arr.obj)->size;
if( k2 == UMAT )
return m->size == ((const UMat*)arr.obj)->size;
if( m->dims > 2 )
return false;
sz1 = m->size();
sz1 = size();
// final 2D comparison against arr
if( arr.dims() > 2 )
return false;
return sz1 == arr.size();
int _InputArray::dims(int i) const
int k = kind();
if( k == MAT )
CV_Assert( i < 0 );
return ((const Mat*)obj)->dims;
if( k == EXPR )
CV_Assert( i < 0 );
return ((const MatExpr*)obj)->a.dims;
if( k == UMAT )
CV_Assert( i < 0 );
return ((const UMat*)obj)->dims;
if( k == MATX )
CV_Assert( i < 0 );
return 2;
if( k == STD_VECTOR )
CV_Assert( i < 0 );
return 2;
if( k == NONE )
return 0;
const std::vector<std::vector<uchar> >& vv = *(const std::vector<std::vector<uchar> >*)obj;
if( i < 0 )
return 1;
CV_Assert( i < (int)vv.size() );
return 2;
if( k == STD_VECTOR_MAT )
const std::vector<Mat>& vv = *(const std::vector<Mat>*)obj;
if( i < 0 )
return 1;
CV_Assert( i < (int)vv.size() );
return vv[i].dims;
if( k == OPENGL_BUFFER )
CV_Assert( i < 0 );
return 2;
if( k == GPU_MAT )
CV_Assert( i < 0 );
return 2;
if( k == OCL_MAT )
return 2;
CV_Assert( k == CUDA_MEM );
//if( k == CUDA_MEM )
CV_Assert( i < 0 );
return 2;
size_t _InputArray::total(int i) const
int k = kind();
......@@ -1570,6 +1745,61 @@ bool _InputArray::empty() const
return ((const cuda::CudaMem*)obj)->empty();
// Reports whether the wrapped array (or its i-th element for vector-of-matrix kinds)
// is continuous.
//   MAT / UMAT: forwards to the matrix's isContinuous() when i < 0, otherwise returns true.
//   EXPR, MATX, STD_VECTOR, NONE, STD_VECTOR_VECTOR: always true.
//   STD_VECTOR_MAT / STD_VECTOR_UMAT: forwards to element i after a range assertion.
// Any other kind reaches CV_Error with CV_StsNotImplemented.
bool _InputArray::isContinuous(int i) const
int k = kind();
if( k == MAT )
return i < 0 ? ((const Mat*)obj)->isContinuous() : true;
if( k == UMAT )
return i < 0 ? ((const UMat*)obj)->isContinuous() : true;
if( k == EXPR || k == MATX || k == STD_VECTOR || k == NONE || k == STD_VECTOR_VECTOR)
return true;
if( k == STD_VECTOR_MAT )
const std::vector<Mat>& vv = *(const std::vector<Mat>*)obj;
// the cast to size_t turns a negative i into a huge value, so it fails this assertion too
CV_Assert((size_t)i < vv.size());
return vv[i].isContinuous();
if( k == STD_VECTOR_UMAT )
const std::vector<UMat>& vv = *(const std::vector<UMat>*)obj;
CV_Assert((size_t)i < vv.size());
return vv[i].isContinuous();
CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
return false;
void _InputArray::copyTo(const _OutputArray& arr) const
int k = kind();
if( k == NONE )
else if( k == MAT || k == MATX || k == STD_VECTOR )
Mat m = getMat();
else if( k == EXPR )
const MatExpr& e = *((MatExpr*)obj);
if( arr.kind() == MAT )
arr.getMatRef() = e;
else if( k == UMAT )
CV_Error(Error::StsNotImplemented, "");
bool _OutputArray::fixedSize() const
......@@ -1899,6 +2129,12 @@ void _OutputArray::create(int dims, const int* sizes, int mtype, int i,
CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
// Creates this output array with the same dimensions as arr and element type mtype.
//   arr   - array whose per-dimension sizes are queried through sizend().
//   mtype - type forwarded unchanged to create().
void _OutputArray::createSameSize(const _InputArray& arr, int mtype) const
// sizend() fills sz and returns how many dimensions it wrote
int sz[CV_MAX_DIM], d = arr.sizend(sz);
create(d, sz, mtype);
void _OutputArray::release() const
......@@ -2010,6 +2246,23 @@ cuda::CudaMem& _OutputArray::getCudaMemRef() const
return *(cuda::CudaMem*)obj;
void _OutputArray::setTo(const _InputArray& arr) const
int k = kind();
if( k == NONE )
else if( k == MAT || k == MATX || k == STD_VECTOR )
Mat m = getMat();
else if( k == UMAT )
CV_Error(Error::StsNotImplemented, "");
static _InputOutputArray _none;
InputOutputArray noArray() { return _none; }
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
// License Agreement
// For Open Source Computer Vision Library
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the copyright holders or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
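After compiling this program the user gets a single kernel called KF.
The following flags should be passed:
1) the kernel variant, one of "-D BINARY_OP", "-D MASK_BINARY_OP", "-D UNARY_OP" or "-D MASK_UNARY_OP".
2) the actual operation performed, one of "-D OP_...", see below the list of operations.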
2a) "-D dstDepth=<destination depth> [-D cn=<num channels>]"
for some operations, like min/max/and/or/xor it's enough
2b) "-D srcDepth1=<source1 depth> -D srcDepth2=<source2 depth> -D dstDepth=<destination depth>
-D workDepth=<work depth> [-D cn=<num channels>]" - for mixed-type operations
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#define CV_32S 4
#define CV_32F 5
#define dstelem *(dstT*)(dstptr + dst_index)
#define noconvert(x) x
#ifndef workT
#define srcT1 dstT
#define srcT2 dstT
#define workT dstT
#define srcelem1 *(dstT*)(srcptr1 + src1_index)
#define srcelem2 *(dstT*)(srcptr2 + src2_index)
#define convertToDT noconvert
#define srcelem1 convertToWT1(*(srcT1*)(srcptr1 + src1_index))
#define srcelem2 convertToWT2(*(srcT2*)(srcptr2 + src2_index))
#if defined OP_ADD_SAT
#define PROCESS_ELEM dstelem = add_sat(srcelem1, srcelem2)
#elif defined OP_ADD
#define PROCESS_ELEM dstelem = convertToDT(srcelem1 + srcelem2)
#elif defined OP_SUB_SAT
#define PROCESS_ELEM dstelem = sub_sat(srcelem1, srcelem2)
#elif defined OP_SUB
#define PROCESS_ELEM dstelem = convertToDT(srcelem1 - srcelem2)
#elif defined OP_RSUB_SAT
#define PROCESS_ELEM dstelem = sub_sat(srcelem2, srcelem1)
#elif defined OP_RSUB
#define PROCESS_ELEM dstelem = convertToDT(srcelem2 - srcelem1)
#elif defined OP_ABSDIFF
#define PROCESS_ELEM dstelem = abs_diff(srcelem1, srcelem2)
#elif defined OP_AND
#define PROCESS_ELEM dstelem = srcelem1 & srcelem2
#elif defined OP_OR
#define PROCESS_ELEM dstelem = srcelem1 | srcelem2
#elif defined OP_XOR
#define PROCESS_ELEM dstelem = srcelem1 ^ srcelem2
#elif defined OP_NOT
#define PROCESS_ELEM dstelem = ~srcelem1
#elif defined OP_MIN
#define PROCESS_ELEM dstelem = min(srcelem1, srcelem2)
#elif defined OP_MAX
#define PROCESS_ELEM dstelem = max(srcelem1, srcelem2)
#elif defined OP_MUL
#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2)
#elif defined OP_MUL_SCALE
#define EXTRA_PARAMS , workT scale
#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2 * scale)
#elif defined OP_DIV
#define PROCESS_ELEM \
workT e2 = srcelem2, zero = (workT)(0); \
dstelem = convertToDT(e2 != zero ? srcelem1 / e2 : zero)
#elif defined OP_DIV_SCALE
#define EXTRA_PARAMS , workT scale
#define PROCESS_ELEM \
workT e2 = srcelem2, zero = (workT)(0); \
dstelem = convertToDT(e2 != zero ? srcelem1 * scale / e2 : zero)
#elif defined OP_RECIP_SCALE
#define EXTRA_PARAMS , workT scale
#define PROCESS_ELEM \
workT e1 = srcelem1, zero = (workT)(0); \
dstelem = convertToDT(e1 != zero ? scale / e1 : zero)
#elif defined OP_ADDW
#define EXTRA_PARAMS , workT alpha, workT beta, workT gamma
#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + srcelem2*beta + gamma)
#elif defined OP_MAG
#define PROCESS_ELEM dstelem = hypot(srcelem1, srcelem2)
#elif defined OP_PHASE_RADIANS
#define PROCESS_ELEM \
workT tmp = atan2(srcelem2, srcelem1); \
if(tmp < 0) tmp += 6.283185307179586232; \
dstelem = tmp
#elif defined OP_PHASE_DEGREES
#define PROCESS_ELEM \
workT tmp = atan2(srcelem2, srcelem1)*57.29577951308232286465; \
if(tmp < 0) tmp += 360; \
dstelem = tmp
#elif defined OP_EXP
#define PROCESS_ELEM dstelem = exp(srcelem1)
#elif defined OP_SQRT
#define PROCESS_ELEM dstelem = sqrt(srcelem1)
#elif defined OP_LOG
#define PROCESS_ELEM dstelem = log(abs(srcelem1))
#elif defined OP_CMP
#define PROCESS_ELEM dstelem = convert_uchar(srcelem1 CMP_OPERATOR srcelem2 ? 255 : 0)
#elif defined OP_CONVERT
#define PROCESS_ELEM dstelem = convertToDT(srcelem1)
#elif defined OP_CONVERT_SCALE
#define EXTRA_PARAMS , workT alpha, workT beta
#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + beta)
#error "unknown op type"
#if defined UNARY_OP || defined MASK_UNARY_OP
#undef srcelem2
// Operations that take a second operand: when this condition holds, srcelem2 is supplied
// to the kernel as an extra parameter through EXTRA_PARAMS.
// The saturating operations are selected earlier in this file as OP_ADD_SAT / OP_SUB_SAT /
// OP_RSUB_SAT, so those spellings are accepted here; the OP_SAT_* spellings that were
// tested originally are kept for backward compatibility.
#if defined OP_AND || defined OP_OR || defined OP_XOR || \
    defined OP_ADD || defined OP_ADD_SAT || defined OP_SAT_ADD || \
    defined OP_SUB || defined OP_SUB_SAT || defined OP_SAT_SUB || \
    defined OP_RSUB || defined OP_RSUB_SAT || defined OP_SAT_RSUB || \
    defined OP_ABSDIFF || defined OP_CMP || defined OP_MIN || defined OP_MAX
#define EXTRA_PARAMS , workT srcelem2
// Exactly one variant of the kernel KF is compiled, chosen by the BINARY_OP /
// MASK_BINARY_OP / UNARY_OP / MASK_UNARY_OP build flag.
// Every variant maps work-item (get_global_id(0), get_global_id(1)) to element (x, y),
// ignores work-items outside cols x rows, and computes byte indices of the form
// y*step + x*sizeof(T) + offset that the srcelem1 / srcelem2 / dstelem macros dereference.
// EXTRA_PARAMS appends operation-specific parameters to the argument list.
#if defined BINARY_OP
// Two source arrays, one destination array, no mask.
__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
__global const uchar* srcptr2, int srcstep2, int srcoffset2,
__global uchar* dstptr, int dststep, int dstoffset,
int rows, int cols EXTRA_PARAMS )
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
int src1_index = mad24(y, srcstep1, x*sizeof(srcT1) + srcoffset1);
int src2_index = mad24(y, srcstep2, x*sizeof(srcT2) + srcoffset2);
int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset);
//printf("(x=%d, y=%d). %d, %d, %d\n", x, y, (int)srcelem1, (int)srcelem2, (int)dstelem);
#elif defined MASK_BINARY_OP
// Two source arrays plus a mask; the indices are only computed where the mask byte is non-zero.
__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
__global const uchar* srcptr2, int srcstep2, int srcoffset2,
__global const uchar* mask, int maskstep, int maskoffset,
__global uchar* dstptr, int dststep, int dstoffset,
int rows, int cols EXTRA_PARAMS )
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
// the mask is indexed with x unscaled, i.e. one byte per element
int mask_index = mad24(y, maskstep, x + maskoffset);
if( mask[mask_index] )
int src1_index = mad24(y, srcstep1, x*sizeof(srcT1) + srcoffset1);
int src2_index = mad24(y, srcstep2, x*sizeof(srcT2) + srcoffset2);
int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset);
#elif defined UNARY_OP
// One source array and one destination array, no mask.
__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
__global uchar* dstptr, int dststep, int dstoffset,
int rows, int cols EXTRA_PARAMS )
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
int src1_index = mad24(y, srcstep1, x*sizeof(srcT1) + srcoffset1);
int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset);
#elif defined MASK_UNARY_OP
// One source array plus a mask; the indices are only computed where the mask byte is non-zero.
__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
__global const uchar* mask, int maskstep, int maskoffset,
__global uchar* dstptr, int dststep, int dstoffset,
int rows, int cols EXTRA_PARAMS )
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
int mask_index = mad24(y, maskstep, x + maskoffset);
if( mask[mask_index] )
int src1_index = mad24(y, srcstep1, x*sizeof(srcT1) + srcoffset1);
int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset);
#error "Unknown operation type"
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
// License Agreement
// For Open Source Computer Vision Library
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the copyright holders or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
// Masked fill: stores `value` into every destination element whose mask byte is non-zero.
//   mask, maskstep, maskoffset - mask buffer, its row step and starting offset (one byte per element)
//   dstptr, dststep, dstoffset - destination buffer, its row step and starting offset, in bytes
//   rows, cols                 - extent of the region to process
//   value                      - element to store; dstT is supplied through the kernel build options
__kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset,
                      __global uchar* dstptr, int dststep, int dstoffset,
                      int rows, int cols, dstT value )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    // Work-items that fall outside the rows x cols region do nothing.
    if (x < cols && y < rows)
    {
        int mask_index = mad24(y, maskstep, x + maskoffset);
        if( mask[mask_index] )
        {
            int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset);
            // dstptr points into __global memory, so the cast has to keep that
            // address space; an unqualified dstT* would denote __private memory.
            *(__global dstT*)(dstptr + dst_index) = value;
        }
    }
}
// Unmasked fill: stores `value` into every element of the rows x cols region.
//   dstptr, dststep, dstoffset - destination buffer, its row step and starting offset, in bytes
//   rows, cols                 - extent of the region to process
//   value                      - element to store; dstT is supplied through the kernel build options
__kernel void set(__global uchar* dstptr, int dststep, int dstoffset,
                  int rows, int cols, dstT value )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    // Work-items that fall outside the rows x cols region do nothing.
    if (x < cols && y < rows)
    {
        int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset);
        // Keep the __global address space in the cast; an unqualified dstT*
        // would denote __private memory.
        *(__global dstT*)(dstptr + dst_index) = value;
    }
}
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
// License Agreement
// For Open Source Computer Vision Library
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
typedef float2 cfloat;
// Complex product a*b, with .x holding the real part and .y the imaginary part:
// (a.x + i*a.y) * (b.x + i*b.y) = (a.x*b.x - a.y*b.y) + i*(a.x*b.y + a.y*b.x).
inline cfloat cmulf(cfloat a, cfloat b)
return (cfloat)( a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
// Complex conjugate: keeps .x and negates .y.
inline cfloat conjf(cfloat a)
return (cfloat)( a.x, - a.y );
__kernel void
__global const cfloat* a,
__global const cfloat* b,
float scale,
__global cfloat* dst,
uint cols,
uint rows,
uint mstep
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint idx = mad24(y, mstep / sizeof(cfloat), x);
if (x < cols && y < rows)
cfloat v = cmulf(a[idx], b[idx]);
dst[idx] = (cfloat)( v.x * scale, v.y * scale );
__kernel void
__global const cfloat* a,
__global const cfloat* b,
float scale,
__global cfloat* dst,
uint cols,
uint rows,
uint mstep
const uint x = get_global_id(0);
const uint y = get_global_id(1);
const uint idx = mad24(y, mstep / sizeof(cfloat), x);
if (x < cols && y < rows)
cfloat v = cmulf(a[idx], conjf(b[idx]));
dst[idx] = (cfloat)( v.x * scale, v.y * scale );
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
// License Agreement
// For Open Source Computer Vision Library
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the copyright holders or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
// NOTE(review): despite its name, this kernel performs a masked fill - it stores
// `value` into every destination element whose mask byte is non-zero and does no
// polar/cartesian arithmetic. It looks like a placeholder; confirm the intent.
//   mask, maskstep, maskoffset - mask buffer, its row step and starting offset (one byte per element)
//   dstptr, dststep, dstoffset - destination buffer, its row step and starting offset, in bytes
//   rows, cols                 - extent of the region to process
//   value                      - element to store; dstT is not defined in this file and is
//                                presumably supplied through the build options - TODO confirm
__kernel void polarToCart(__global const uchar* mask, int maskstep, int maskoffset,
                          __global uchar* dstptr, int dststep, int dstoffset,
                          int rows, int cols, dstT value )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    // Work-items that fall outside the rows x cols region do nothing.
    if (x < cols && y < rows)
    {
        int mask_index = mad24(y, maskstep, x + maskoffset);
        if( mask[mask_index] )
        {
            int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset);
            // Keep the __global address space in the cast; an unqualified dstT*
            // would denote __private memory.
            *(__global dstT*)(dstptr + dst_index) = value;
        }
    }
}
// NOTE(review): despite its name, this kernel performs an unmasked fill - it stores
// `value` into every element of the rows x cols region and does no polar/cartesian
// arithmetic. It looks like a placeholder; confirm the intent.
//   dstptr, dststep, dstoffset - destination buffer, its row step and starting offset, in bytes
//   rows, cols                 - extent of the region to process
//   value                      - element to store; dstT is not defined in this file and is
//                                presumably supplied through the build options - TODO confirm
__kernel void cartToPolar(__global uchar* dstptr, int dststep, int dstoffset,
                          int rows, int cols, dstT value )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    // Work-items that fall outside the rows x cols region do nothing.
    if (x < cols && y < rows)
    {
        int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset);
        // Keep the __global address space in the cast; an unqualified dstT*
        // would denote __private memory.
        *(__global dstT*)(dstptr + dst_index) = value;
    }
}
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
// License Agreement
// For Open Source Computer Vision Library
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
// @Authors
// Shengen Yan,yanshengen@gmail.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#define FUNC(a, b) b += a;
#define FUNC(a, b) b += a >= (dstT)(0) ? a : -a;
#define FUNC(a, b) b += a * a;
#error No sum function
/**************************************Array buffer SUM**************************************/
// Partial reduction of src into one value per work-group, written to dst[group id].
// Each work-item accumulates elements with FUNC (selected by the #define variants
// above), then the per-item sums are combined through local memory.
//   cols, invalid_cols - used to turn a linear element id into a buffer index:
//                        index = offset + id + (id / cols) * invalid_cols
//   offset             - index of the first element in src
//   elemnum            - number of elements to accumulate
//   groupnum           - number of work-groups; the per-item stride is groupnum << 8
//   src, dst           - input elements and per-group output sums
// NOTE(review): no barrier() calls are visible between the local-memory phases in
// this excerpt; confirm against the complete kernel source.
__kernel void arithm_op_sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum,
__global srcT *src, __global dstT *dst)
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
unsigned int id = get_global_id(0);
unsigned int idx = offset + id + (id / cols) * invalid_cols;
// Scratch space for 128 partial sums.
__local dstT localmem_sum[128];
dstT sum = (dstT)(0), temp;
// Strided accumulation: every work-item visits id, id + grainSize, id + 2*grainSize, ...
for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
idx = offset + id + (id / cols) * invalid_cols;
temp = convertToDstT(src[idx]);
FUNC(temp, sum);
// Work-items with lid 128 and above deposit their sums in slot lid - 128 ...
if (lid > 127)
localmem_sum[lid - 128] = sum;
// ... and work-items with lid below 128 add their own sum to the matching slot.
if (lid < 128)
localmem_sum[lid] = sum + localmem_sum[lid];
// Reduction over the 128 slots, halving the active range each pass (64, 32, ..., 1).
for (int lsize = 64; lsize > 0; lsize >>= 1)
if (lid < lsize)
int lid2 = lsize + lid;
localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2];
// Work-item 0 publishes the group's result.
if (lid == 0)
dst[gid] = localmem_sum[0];
......@@ -67,6 +67,8 @@
#define GET_OPTIMIZED(func) (func)
#include "opencl_kernels.hpp"
namespace cv
......@@ -205,13 +207,30 @@ enum { BLOCK_SIZE = 1024 };
inline bool checkScalar(const Mat& sc, int atype, int sckind, int akind)
if( sc.dims > 2 || (sc.cols != 1 && sc.rows != 1) || !sc.isContinuous() )
if( sc.dims > 2 || !sc.isContinuous() )
return false;
Size sz = sc.size();
if(sz.width != 1 && sz.height != 1)
return false;
int cn = CV_MAT_CN(atype);
if( akind == _InputArray::MATX && sckind != _InputArray::MATX )
return false;
return sz == Size(1, 1) || sz == Size(1, cn) || sz == Size(cn, 1) ||
(sz == Size(1, 4) && sc.type() == CV_64F && cn <= 4);
inline bool checkScalar(InputArray sc, int atype, int sckind, int akind)
if( sc.dims() > 2 || !sc.isContinuous() )
return false;
Size sz = sc.size();
if(sz.width != 1 && sz.height != 1)
return false;
int cn = CV_MAT_CN(atype);
if( akind == _InputArray::MATX && sckind != _InputArray::MATX )
return false;
return sc.size() == Size(1, 1) || sc.size() == Size(1, cn) || sc.size() == Size(cn, 1) ||
(sc.size() == Size(1, 4) && sc.type() == CV_64F && cn <= 4);
return sz == Size(1, 1) || sz == Size(1, cn) || sz == Size(cn, 1) ||
(sz == Size(1, 4) && sc.type() == CV_64F && cn <= 4);
void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize );
......@@ -227,7 +246,10 @@ struct TLSData
static TLSData* get();
namespace ocl { MatAllocator* getOpenCLAllocator(); }
namespace ocl
MatAllocator* getOpenCLAllocator();
......@@ -197,6 +197,7 @@ UMat Mat::getUMat(int accessFlags) const
return hdr;
UMat::getStdAllocator()->allocate(u, accessFlags);
hdr.flags = flags;
setSize(hdr, dims, size.p, step.p);
hdr.u = u;
......@@ -548,7 +549,8 @@ Mat UMat::getMat(int accessFlags) const
CV_Assert(u->data != 0);
Mat hdr(dims, size.p, type(), u->data + offset, step.p);
hdr.u = u;
hdr.datastart = hdr.data = u->data;
hdr.datastart = u->data;
hdr.data = hdr.datastart + offset;
hdr.datalimit = hdr.dataend = u->data + u->size;
CV_XADD(&hdr.u->refcount, 1);
return hdr;
......@@ -617,7 +619,7 @@ void UMat::copyTo(OutputArray _dst) const
void* dsthandle = dst.handle(ACCESS_WRITE);
if( srchandle == dsthandle && dst.offset == offset )
CV_Assert(u->currAllocator == dst.u->currAllocator);
u->currAllocator->copy(u, dst.u, dims, sz, srcofs, step.p, dstofs, dst.step.p, false);
......@@ -633,6 +635,50 @@ void UMat::convertTo(OutputArray, int, double, double) const
CV_Error(Error::StsNotImplemented, "");
// Sets the elements of this UMat to _value, restricted to the non-zero elements of
// _mask when a mask is given. Tries the "set"/"setMask" OpenCL kernels first and
// otherwise performs the operation through a Mat view of the same data.
// Returns *this.
// NOTE(review): braces and the `else` between the masked and unmasked argument
// setup are not visible in this excerpt.
UMat& UMat::setTo(InputArray _value, InputArray _mask)
bool haveMask = !_mask.empty();
int t = type(), cn = CV_MAT_CN(t);
// OpenCL path: only for arrays of at most 2 dimensions and at most 4 channels.
if( dims <= 2 && cn <= 4 && ocl::useOpenCL() )
Mat value = _value.getMat();
CV_Assert( checkScalar(value, type(), _value.kind(), _InputArray::UMAT) );
// Scalar converted to the destination type; 4 doubles = 32 bytes of storage.
double buf[4];
convertAndUnrollScalar(value, t, (uchar*)buf, 1);
// The destination element type is passed to the kernel as the dstT build option.
char opts[1024];
sprintf(opts, "-D dstT=%s", ocl::memopTypeToStr(t));
ocl::Kernel setK(haveMask ? "setMask" : "set", ocl::core::copyset_oclsrc, opts);
if( !setK.empty() )
// Raw kernel argument holding one destination element.
ocl::KernelArg scalararg(0, 0, 0, buf, CV_ELEM_SIZE(t));
UMat mask;
if( haveMask )
mask = _mask.getUMat();
// The mask must match this array's size and be single-channel 8-bit.
CV_Assert( mask.size() == size() && mask.type() == CV_8U );
ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
ocl::KernelArg dstarg = ocl::KernelArg::ReadWrite(*this);
setK.args(maskarg, dstarg, scalararg);
ocl::KernelArg dstarg = ocl::KernelArg::WriteOnly(*this);
setK.args(dstarg, scalararg);
// One work-item per element: cols along dimension 0, rows along dimension 1.
size_t globalsize[] = { cols, rows };
if( setK.run(2, globalsize, 0, false) )
return *this;
// Fallback: a masked set needs read-write access, an unmasked one only write access.
Mat m = getMat(haveMask ? ACCESS_RW : ACCESS_WRITE);
m.setTo(_value, _mask);
return *this;
UMat& UMat::operator = (const Scalar&)
CV_Error(Error::StsNotImplemented, "");
......@@ -91,11 +91,11 @@ bool CV_UMatTest::TestUMat()
Mat a(100, 100, CV_16S), b;
Mat a(100, 100, CV_16SC2), b, c;
randu(a, Scalar::all(-100), Scalar::all(100));
Rect roi(1, 3, 10, 20);
Mat ra(a, roi), rb;
UMat ua, ura;
Rect roi(1, 3, 5, 4);
Mat ra(a, roi), rb, rc, rc0;
UMat ua, ura, ub, urb, uc, urc;
......@@ -112,6 +112,71 @@ bool CV_UMatTest::TestUMat()
CHECK_DIFF(ra, rb);
b = a.clone();
ra = a(roi);
rb = b(roi);
randu(b, Scalar::all(-100), Scalar::all(100));
urb = ub(roi);
/*std::cout << "==============================================\nbefore op (CPU):\n";
std::cout << "ra: " << ra << std::endl;
std::cout << "rb: " << rb << std::endl;*/
/*std::cout << "==============================================\nbefore op (GPU):\n";
std::cout << "ra: " << ra << std::endl;
std::cout << "rb: " << rb << std::endl;*/
cv::max(ra, rb, rc);
cv::max(ura, urb, urc);
/*std::cout << "==============================================\nafter op:\n";
std::cout << "rc: " << rc << std::endl;
std::cout << "rc0: " << rc0 << std::endl;*/
CHECK_DIFF(rc0, rc);
UMat tmp = rc0.getUMat(ACCESS_WRITE);
cv::max(ura, urb, tmp);
CHECK_DIFF(rc0, rc);
cv::max(urc, urb, urc);
CHECK_DIFF(rc0, rc);
rc = ra ^ rb;
cv::bitwise_xor(ura, urb, urc);
/*std::cout << "==============================================\nafter op:\n";
std::cout << "ra: " << rc0 << std::endl;
std::cout << "rc: " << rc << std::endl;*/
CHECK_DIFF(rc0, rc);
rc = ra + rb;
cv::add(ura, urb, urc);
CHECK_DIFF(rc0, rc);
cv::subtract(ra, Scalar::all(5), rc);
cv::subtract(ura, Scalar::all(5), urc);
CHECK_DIFF(rc0, rc);
catch (const test_excep& e)
......@@ -511,9 +511,10 @@ public:
CV_WRAP virtual void release();
CV_WRAP virtual bool grab();
CV_WRAP virtual bool retrieve(CV_OUT Mat& image, int flag = 0);
CV_WRAP virtual bool retrieve(OutputArray image, int flag = 0);
virtual VideoCapture& operator >> (CV_OUT Mat& image);
CV_WRAP virtual bool read(CV_OUT Mat& image);
virtual VideoCapture& operator >> (CV_OUT UMat& image);
CV_WRAP virtual bool read(OutputArray image);
CV_WRAP virtual bool set(int propId, double value);
CV_WRAP virtual double get(int propId);
......@@ -515,7 +515,7 @@ bool VideoCapture::grab()
return cvGrabFrame(cap) != 0;
bool VideoCapture::retrieve(Mat& image, int channel)
bool VideoCapture::retrieve(OutputArray image, int channel)
IplImage* _img = cvRetrieveFrame(cap, channel);
if( !_img )
......@@ -533,7 +533,7 @@ bool VideoCapture::retrieve(Mat& image, int channel)
return true;
bool VideoCapture::read(Mat& image)
bool VideoCapture::read(OutputArray image)
......@@ -548,6 +548,12 @@ VideoCapture& VideoCapture::operator >> (Mat& image)
return *this;
VideoCapture& VideoCapture::operator >> (UMat& image)
return *this;
bool VideoCapture::set(int propId, double value)
return cvSetCaptureProperty(cap, propId, value) != 0;
......@@ -2687,6 +2687,124 @@ struct mRGBA2RGBA
// OpenCL implementation of cvtColor. Selects a kernel from cvtcolor_oclsrc based on
// `code`, creates _dst and runs the kernel over a 2D range.
// Returns false for depths other than CV_8U, CV_16U and CV_32F; otherwise returns
// `ok`, which starts as true and is overwritten by the kernel run result when a
// kernel was created.
// NOTE(review): most `case` labels, `break`s and the `default` branch of the switch
// are not visible in this excerpt, so the code-to-kernel mapping below cannot be
// read off reliably - consult the full source.
static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
bool ok = true;
UMat src = _src.getUMat(), dst;
Size sz = src.size(), dstSz = sz;
int scn = src.channels(), depth = src.depth(), bidx, dtype;
// Default launch size: one work-item per source pixel.
size_t globalsize[] = { src.cols, src.rows };
ocl::Kernel k;
if(depth != CV_8U && depth != CV_16U && depth != CV_32F)
return false;
switch (code)
case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_RGB2BGR565: case COLOR_RGB2BGR555:
case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB:
// 3- or 4-channel colour input to single-channel gray output.
CV_Assert(scn == 3 || scn == 4);
bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2;
dtype = depth;
k.create("RGB2Gray", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D scn=%d -D dcn=1 -D bidx=%d", depth, scn, bidx));
// Single-channel gray input to 3- or 4-channel output.
CV_Assert(scn == 1);
dcn = code == COLOR_GRAY2BGRA ? 4 : 3;
dtype = CV_MAKETYPE(depth, dcn);
k.create("Gray2RGB", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D scn=1 -D dcn=%d", depth, dcn));
// Colour input to 3-channel YUV output.
CV_Assert(scn == 3 || scn == 4);
bidx = code == COLOR_RGB2YUV ? 0 : 2;
k.create("RGB2YUV", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx));
// YUV input to colour output; a negative dcn selects 3 output channels.
if(dcn < 0) dcn = 3;
CV_Assert(dcn == 3 || dcn == 4);
bidx = code == COLOR_YUV2RGB ? 0 : 2;
k.create("YUV2RGB", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D scn=3 -D dcn=%d -D bidx=%d", depth, dcn, bidx));
// NV12 input: single-channel 8-bit image whose height is 3/2 of the output height.
CV_Assert( scn == 1 );
CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
dcn = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2RGBA_NV12 ? 4 : 3;
bidx = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2BGR_NV12 ? 0 : 2;
dstSz = Size(sz.width, sz.height * 2 / 3);
// NOTE(review): dimension 0 is sized from the height and dimension 1 from the
// width here, the opposite of the default above - confirm against the kernel.
globalsize[0] = dstSz.height/2;
globalsize[1] = dstSz.width/2;
k.create("YUV2RGBA_NV12", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=0 -D scn=1 -D dcn=%d -D bidx=%d", dcn, bidx));
// Colour input to 3-channel YCrCb output.
CV_Assert(scn == 3 || scn == 4);
bidx = code == COLOR_BGR2YCrCb ? 0 : 2;
k.create("RGB2YCrCb", ocl::imgproc::cvtcolor_oclsrc,
format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx));
// Allocate the output and launch only when a kernel was actually created.
if( !k.empty() )
_dst.create(dstSz, dtype);
dst = _dst.getUMat();
k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
ok = k.run(2, globalsize, 0, false);
return ok;
}//namespace cv
......@@ -2695,9 +2813,15 @@ struct mRGBA2RGBA
void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
bool use_opencl = ocl::useOpenCL() && _dst.kind() == _InputArray::UMAT;
int stype = _src.type();
int scn = CV_MAT_CN(stype), depth = CV_MAT_DEPTH(stype), bidx;
if( use_opencl && ocl_cvtColor(_src, _dst, code, dcn) )
Mat src = _src.getMat(), dst;
Size sz = src.size();
int scn = src.channels(), depth = src.depth(), bidx;
CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32F );
......@@ -1901,8 +1901,43 @@ private:
// OpenCL implementation of resize for nearest-neighbour and bilinear interpolation.
//   _src, _dst     - input array and already-created output array
//   fx, fy         - scale factors along x and y; the kernel receives 1/fx and 1/fy as floats
//   interpolation  - INTER_NEAREST (any depth) or INTER_LINEAR (CV_8U or CV_32F only)
// Returns false when the configuration is unsupported (more than 4 channels, another
// interpolation mode or depth) or when the kernel could not be created; otherwise
// returns the result of running the kernel.
static bool ocl_resize( InputArray _src, OutputArray _dst,
                        double fx, double fy, int interpolation)
{
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    if( !(cn <= 4 &&
          (interpolation == INTER_NEAREST ||
           (interpolation == INTER_LINEAR && (depth == CV_8U || depth == CV_32F)))) )
        return false;

    UMat src = _src.getUMat(), dst = _dst.getUMat();
    ocl::Kernel k;

    if (interpolation == INTER_LINEAR)
    {
        // Intermediate type used by the kernel: 32-bit int for 8-bit input, float otherwise.
        int wdepth = depth == CV_8U ? CV_32S : CV_32F;
        int wtype = CV_MAKETYPE(wdepth, cn);
        char buf[2][32];
        // `depth` is an int, so it must be formatted with %d; passing it to %s
        // would make format() read it as a string pointer.
        k.create("resizeLN", ocl::imgproc::resize_oclsrc,
                 format("-D INTER_LINEAR -D depth=%d -D PIXTYPE=%s -D WORKTYPE=%s -D convertToWT=%s -D convertToDT=%s",
                        depth, ocl::typeToStr(type), ocl::typeToStr(wtype),
                        ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
                        ocl::convertTypeStr(wdepth, depth, cn, buf[1])));
    }
    else if (interpolation == INTER_NEAREST)
    {
        k.create("resizeNN", ocl::imgproc::resize_oclsrc,
                 format("-D INTER_NEAREST -D PIXTYPE=%s", ocl::memopTypeToStr(type) ));
    }

    if( k.empty() )
        return false;

    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
           (float)(1./fx), (float)(1./fy));

    // One work-item per destination pixel.
    size_t globalsize[] = { dst.cols, dst.rows };
    return k.run(2, globalsize, 0, false);
}
......@@ -2013,25 +2048,28 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
resizeArea_<double, double>, 0
Mat src = _src.getMat();
Size ssize = src.size();
Size ssize = _src.size();
CV_Assert( ssize.area() > 0 );
CV_Assert( dsize.area() || (inv_scale_x > 0 && inv_scale_y > 0) );
if( !dsize.area() )
dsize = Size(saturate_cast<int>(src.cols*inv_scale_x),
dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
CV_Assert( dsize.area() );
inv_scale_x = (double)dsize.width/src.cols;
inv_scale_y = (double)dsize.height/src.rows;
inv_scale_x = (double)dsize.width/ssize.width;
inv_scale_y = (double)dsize.height/ssize.height;
_dst.create(dsize, src.type());
Mat dst = _dst.getMat();
_dst.create(dsize, _src.type());
if( ocl::useOpenCL() && _dst.kind() == _InputArray::UMAT &&
ocl_resize(_src, _dst, inv_scale_x, inv_scale_y, interpolation) )
Mat src = _src.getMat(), dst = _dst.getMat();
if (tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation))
// License Agreement
// For Open Source Computer Vision Library
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
// @Authors
// Rock Li, Rock.li@amd.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
// Bilateral filter for single-channel 8-bit data; one work-item produces one output byte.
//   dst, dst_step, dst_offset - output buffer, its row step and starting offset
//   src, src_step             - input buffer and its row step; the centre pixel is read at
//                               (x + radius, y + radius)
//   dst_rows, dst_cols        - output extent
//   maxk                      - number of neighbourhood taps
//   radius                    - shift applied to x and y when addressing src
//   color_weight              - weight table indexed by absolute intensity difference
//   space_weight, space_ofs   - per-tap spatial weight and src index offset
//   src_rows, src_cols        - not referenced in the lines visible here
__kernel void bilateral_C1_D0(__global uchar *dst,
__global const uchar *src,
const int dst_rows,
const int dst_cols,
const int maxk,
const int radius,
const int dst_step,
const int dst_offset,
const int src_step,
const int src_rows,
const int src_cols,
__constant float *color_weight,
__constant float *space_weight,
__constant int *space_ofs)
int x = get_global_id(0);
int y = get_global_id(1);
if (y < dst_rows && x < dst_cols)
int src_index = mad24(y + radius, src_step, x + radius);
int dst_index = mad24(y, dst_step, x + dst_offset);
float sum = 0.f, wsum = 0.f;
// Centre pixel value, against which every tap is compared.
int val0 = (int)src[src_index];
for(int k = 0; k < maxk; k++ )
int val = (int)src[src_index + space_ofs[k]];
// Tap weight = spatial weight * weight looked up by intensity difference.
float w = space_weight[k] * color_weight[abs(val - val0)];
sum += (float)(val) * w;
wsum += w;
// Normalised weighted average; +0.5f followed by truncation toward zero.
dst[dst_index] = convert_uchar_rtz(sum / wsum + 0.5f);
// Bilateral filter for single-channel 8-bit data, vectorised: one work-item produces
// four horizontally adjacent output bytes (x = global id * 4). Parameters have the
// same meaning as in bilateral_C1_D0.
// NOTE(review): only x itself is checked against dst_cols, yet four bytes are
// loaded and stored - presumably the host guarantees suitable width/padding; confirm.
__kernel void bilateral2_C1_D0(__global uchar *dst,
__global const uchar *src,
const int dst_rows,
const int dst_cols,
const int maxk,
const int radius,
const int dst_step,
const int dst_offset,
const int src_step,
const int src_rows,
const int src_cols,
__constant float *color_weight,
__constant float *space_weight,
__constant int *space_ofs)
int x = get_global_id(0) << 2;
int y = get_global_id(1);
if (y < dst_rows && x < dst_cols)
int src_index = mad24(y + radius, src_step, x + radius);
int dst_index = mad24(y, dst_step, x + dst_offset);
float4 sum = (float4)(0.f), wsum = (float4)(0.f);
// Four centre pixel values, one per output lane.
int4 val0 = convert_int4(vload4(0,src + src_index));
for(int k = 0; k < maxk; k++ )
int4 val = convert_int4(vload4(0,src+src_index + space_ofs[k]));
// Per-lane weight: shared spatial weight times a per-lane intensity-difference lookup.
float4 w = (float4)(space_weight[k]) * (float4)(color_weight[abs(val.x - val0.x)], color_weight[abs(val.y - val0.y)],
color_weight[abs(val.z - val0.z)], color_weight[abs(val.w - val0.w)]);
sum += convert_float4(val) * w;
wsum += w;
// Normalise each lane, add 0.5f, truncate toward zero and store four bytes at once.
*(__global uchar4*)(dst+dst_index) = convert_uchar4_rtz(sum/wsum+0.5f);
// Bilateral filter for 4-channel 8-bit data; one work-item produces one uchar4 pixel.
// src and dst are indexed in uchar4 elements. Parameters otherwise have the same
// meaning as in bilateral_C1_D0.
__kernel void bilateral_C4_D0(__global uchar4 *dst,
__global const uchar4 *src,
const int dst_rows,
const int dst_cols,
const int maxk,
const int radius,
const int dst_step,
const int dst_offset,
const int src_step,
const int src_rows,
const int src_cols,
__constant float *color_weight,
__constant float *space_weight,
__constant int *space_ofs)
int x = get_global_id(0);
int y = get_global_id(1);
if (y < dst_rows && x < dst_cols)
int src_index = mad24(y + radius, src_step, x + radius);
int dst_index = mad24(y, dst_step, x + dst_offset);
float4 sum = (float4)0.f;
float wsum = 0.f;
// Centre pixel, against which every tap is compared.
int4 val0 = convert_int4(src[src_index]);
for(int k = 0; k < maxk; k++ )
int4 val = convert_int4(src[src_index + space_ofs[k]]);
// One weight per tap; the lookup index sums the absolute differences of the
// first three components only (.w does not contribute).
float w = space_weight[k] * color_weight[abs(val.x - val0.x) + abs(val.y - val0.y) + abs(val.z - val0.z)];
sum += convert_float4(val) * (float4)w;
wsum += w;
// Multiply by the reciprocal of the weight sum instead of dividing per component.
wsum = 1.f / wsum;
dst[dst_index] = convert_uchar4_rtz(sum * (float4)wsum + (float4)0.5f);
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
// License Agreement
// For Open Source Computer Vision Library
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
// @Authors
// Zhang Ying, zhangying913@gmail.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
/////////////////////////////////Macro for border type////////////////////////////////////////////
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
//blur function does not support BORDER_WRAP
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
#define THREADS 256
#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2)
// Horizontal accumulation and store for the single-channel box filter: sums uint4
// groups from `temp`, scales the sum using `alpha` and writes up to four bytes to dst.
// anX and ksX are not defined in this excerpt; presumably they are build-time
// constants describing the kernel anchor and width - TODO confirm.
// NOTE(review): the statement guarded by the first `if` (and any braces) is not
// visible here; in context it presumably makes edge work-items leave early.
inline void update_dst_C1_D0(__global uchar *dst, __local uint* temp,
int dst_rows, int dst_cols,
int dst_startX, int dst_x_off,
float alpha)
if(get_local_id(0) < anX || get_local_id(0) >= (THREADS-ksX+anX+1))
uint4 tmp_sum = 0;
// Column of the first of the four outputs and row handled by this work-item.
int posX = dst_startX - dst_x_off + (get_local_id(0)-anX)*4;
int posY = (get_group_id(1) << 1);
// Accumulate 2*anX + 1 uint4 loads, each shifted by i uints from this item's slot.
for(int i=-anX; i<=anX; i++)
tmp_sum += vload4(get_local_id(0), temp+i);
if(posY < dst_rows && posX < dst_cols)
// NOTE(review): alpha is converted to uint4 before the integer division.
tmp_sum /= (uint4) alpha;
// Store each of the four results only when its column lies inside the destination.
if(posX >= 0 && posX < dst_cols)
*(dst) = tmp_sum.x;
if(posX+1 >= 0 && posX+1 < dst_cols)
*(dst + 1) = tmp_sum.y;
if(posX+2 >= 0 && posX+2 < dst_cols)
*(dst + 2) = tmp_sum.z;
if(posX+3 >= 0 && posX+3 < dst_cols)
*(dst + 3) = tmp_sum.w;
// Horizontal accumulation and store for the 4-channel box filter: sums 2*anX + 1
// uint4 entries of `temp` around this work-item and writes one uchar4 pixel to dst.
// anX and ksX are not defined in this excerpt; presumably they are build-time
// constants describing the kernel anchor and width - TODO confirm.
// NOTE(review): the statement guarded by the first `if` (and any braces) is not
// visible here; in context it presumably makes edge work-items leave early.
inline void update_dst_C4_D0(__global uchar4 *dst, __local uint4* temp,
int dst_rows, int dst_cols,
int dst_startX, int dst_x_off,
float alpha)
if(get_local_id(0) >= (THREADS-ksX+1))
// Column and row of the output pixel handled by this work-item.
int posX = dst_startX - dst_x_off + get_local_id(0);
int posY = (get_group_id(1) << 1);
uint4 temp_sum = 0;
for(int i=-anX; i<=anX; i++)
temp_sum += temp[get_local_id(0) + anX + i];
// Write only when the pixel lies inside the destination.
if(posX >= 0 && posX < dst_cols && posY >= 0 && posY < dst_rows)
// The sum is divided by alpha in floating point, then converted back to uchar4.
*dst = convert_uchar4(convert_float4(temp_sum)/alpha);
// Box filter kernel for uchar data, processing four adjacent bytes per work-item.
// Each work-group covers two output rows: every work-item loads ksY+1 source
// rows, builds two vertical sums (rows 0..ksY-1 and rows 1..ksY) in local
// memory, and update_dst_C1_D0 performs the horizontal pass for each row.
//   src, dst                       - source / destination buffers
//   alpha                          - divisor forwarded to update_dst_C1_D0
//   src_offset, src_step           - byte offset and row pitch of src (split into x/y offsets below)
//   src_whole_rows, src_whole_cols - extent used for the source bounds checks
//   dst_offset, dst_step           - byte offset and row pitch of dst
//   dst_rows, dst_cols             - output extent forwarded to update_dst_C1_D0
// anX, anY, ksX and ksY are not defined in this excerpt; presumably build-time macros - TODO confirm.
__kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha,
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
int col = get_local_id(0);
const int gX = get_group_id(0);
const int gY = get_group_id(1);
int src_x_off = src_offset % src_step;
int src_y_off = src_offset / src_step;
int dst_x_off = dst_offset % dst_step;
int dst_y_off = dst_offset / dst_step;
// head_off shifts the start so that dst_startX is a multiple of four.
int head_off = dst_x_off%4;
// Each group produces (THREADS-ksX+1)*4 output bytes per row; the source
// window starts anX four-byte groups and anY rows earlier.
int startX = ((gX * (THREADS-ksX+1)-anX) * 4) - head_off + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = (gX * (THREADS-ksX+1) * 4) - head_off + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
uint4 data[ksY+1];
__local uint4 temp[2][THREADS];
// NOTE(review): several alternative load sequences for data[i] follow one after
// another. No preprocessor or control-flow selection between them is visible
// in this excerpt - confirm against the complete file.
for(int i=0; i < ksY+1; i++)
// Variant: all four bytes are loaded when the whole 4-byte group is inside the image.
if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols)
data[i].x = *(src+(startY+i)*src_step + startX + col * 4);
data[i].y = *(src+(startY+i)*src_step + startX + col * 4 + 1);
data[i].z = *(src+(startY+i)*src_step + startX + col * 4 + 2);
data[i].w = *(src+(startY+i)*src_step + startX + col * 4 + 3);
// Variant: each byte is bounds-checked and loaded individually.
int con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4<src_whole_cols;
if(con)data[i].s0 = *(src+(startY+i)*src_step + startX + col*4);
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1<src_whole_cols;
if(con)data[i].s1 = *(src+(startY+i)*src_step + startX + col*4+1) ;
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2<src_whole_cols;
if(con)data[i].s2 = *(src+(startY+i)*src_step + startX + col*4+2);
con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3<src_whole_cols;
if(con)data[i].s3 = *(src+(startY+i)*src_step + startX + col*4+3);
int not_all_in_range;
for(int i=0; i < ksY+1; i++)
// Non-zero when any of the four bytes, or the row, falls outside the source image.
not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1)
| (startY+i<0) | (startY+i>src_whole_rows-1);
int selected_row;
int4 selected_col;
// Variant: remap row and column indices through the ADDR_* border macros, then load.
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols);
selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x);
selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols);
selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y);
selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols);
selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z);
selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols);
selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w);
data[i].x = *(src + selected_row * src_step + selected_col.x);
data[i].y = *(src + selected_row * src_step + selected_col.y);
data[i].z = *(src + selected_row * src_step + selected_col.z);
data[i].w = *(src + selected_row * src_step + selected_col.w);
// Variant: unconditional vector load of the four bytes.
data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX)));
// Rows 1..ksY-1 are shared by both output rows, so they are summed once.
uint4 tmp_sum = 0;
for(int i=1; i < ksY; i++)
tmp_sum += (data[i]);
int index = dst_startY * dst_step + dst_startX + (col-anX)*4;
// temp[0]: vertical sum for the first output row; temp[1]: for the second.
temp[0][col] = tmp_sum + (data[0]);
temp[1][col] = tmp_sum + (data[ksY]);
// NOTE(review): no barrier between the local-memory writes above and the reads
// inside update_dst_C1_D0 is visible in this excerpt - confirm one exists.
update_dst_C1_D0(dst+index, (__local uint *)(temp[0]),
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
update_dst_C1_D0(dst+index+dst_step, (__local uint *)(temp[1]),
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
// Box filter kernel for uchar4 data, one uchar4 element per work-item.
// Each work-group covers two output rows: every work-item loads ksY+1 source
// elements of its column, builds two vertical sums in local memory, and
// update_dst_C4_D0 performs the horizontal pass for each row.
// Offsets and steps arrive in bytes; `>> 2` converts them to uchar4 elements.
// anX, anY, ksX and ksY are not defined in this excerpt; presumably build-time macros - TODO confirm.
__kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, float alpha,
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
int col = get_local_id(0);
const int gX = get_group_id(0);
const int gY = get_group_id(1);
int src_x_off = (src_offset % src_step) >> 2;
int src_y_off = src_offset / src_step;
int dst_x_off = (dst_offset % dst_step) >> 2;
int dst_y_off = dst_offset / dst_step;
// Each group produces THREADS-ksX+1 outputs per row; the source window starts
// anX columns and anY rows earlier.
int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
uint4 data[ksY+1];
__local uint4 temp[2][THREADS];
bool con;
// NOTE(review): two alternative load loops follow; no selection between them
// is visible in this excerpt - confirm against the complete file.
// Variant: out-of-image elements contribute zero.
for(int i=0; i < ksY+1; i++)
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
// NOTE(review): the clamp upper bound is src_whole_cols, not src_whole_cols-1;
// the read is still guarded by `con` - confirm the bound is intended.
int cur_col = clamp(startX + col, 0, src_whole_cols);
data[i].x = con ? src[(startY+i)*(src_step>>2) + cur_col].x : 0;
data[i].y = con ? src[(startY+i)*(src_step>>2) + cur_col].y : 0;
data[i].z = con ? src[(startY+i)*(src_step>>2) + cur_col].z : 0;
data[i].w = con ? src[(startY+i)*(src_step>>2) + cur_col].w : 0;
// Variant: indices are remapped through the ADDR_* border macros before loading.
for(int i=0; i < ksY+1; i++)
int selected_row;
int selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]);
// Rows 1..ksY-1 are shared by both output rows, so they are summed once.
uint4 tmp_sum = 0;
for(int i=1; i < ksY; i++)
tmp_sum += (data[i]);
int index = dst_startY * (dst_step>>2)+ dst_startX + col;
// temp[0]: vertical sum for the first output row; temp[1]: for the second.
temp[0][col] = tmp_sum + (data[0]);
temp[1][col] = tmp_sum + (data[ksY]);
// NOTE(review): no barrier between the local-memory writes above and the reads
// inside update_dst_C4_D0 is visible in this excerpt - confirm one exists.
update_dst_C4_D0(dst+index, (__local uint4 *)(temp[0]),
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
update_dst_C4_D0(dst+index+(dst_step>>2), (__local uint4 *)(temp[1]),
dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
// Box filter kernel for float data, one float per work-item.
// Each work-group covers two output rows: vertical sums for both rows are
// written to local memory, then each eligible work-item sums 2*anX+1
// neighbouring entries and stores tmp_sum/alpha for both rows.
// Offsets and steps arrive in bytes; `>> 2` converts them to float elements.
// anX, anY, ksX and ksY are not defined in this excerpt; presumably build-time macros - TODO confirm.
__kernel void boxFilter_C1_D5(__global const float *restrict src, __global float *dst, float alpha,
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
int col = get_local_id(0);
const int gX = get_group_id(0);
const int gY = get_group_id(1);
int src_x_off = (src_offset % src_step) >> 2;
int src_y_off = src_offset / src_step;
int dst_x_off = (dst_offset % dst_step) >> 2;
int dst_y_off = dst_offset / dst_step;
int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
float data[ksY+1];
__local float temp[2][THREADS];
bool con;
float ss;
// NOTE(review): two alternative load loops follow; no selection between them
// is visible in this excerpt - confirm against the complete file.
// Variant: out-of-image elements contribute zero.
for(int i=0; i < ksY+1; i++)
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
int cur_col = clamp(startX + col, 0, src_whole_cols);
// The read is guarded on the clamped column and the row; `con` then masks
// positions whose unclamped column was outside the image.
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>2) + cur_col]:(float)0;
data[i] = con ? ss : 0.f;
// Variant: indices are remapped through the ADDR_* border macros before loading.
for(int i=0; i < ksY+1; i++)
int selected_row;
int selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = src[selected_row * (src_step>>2) + selected_col];
// sum0: rows shared by both output rows; sum1 / sum2: totals for the first / second row.
float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
for(int i=1; i < ksY; i++)
sum0 += (data[i]);
sum1 = sum0 + (data[0]);
sum2 = sum0 + (data[ksY]);
temp[0][col] = sum1;
temp[1][col] = sum2;
// NOTE(review): no barrier between the local-memory writes above and the reads
// below is visible in this excerpt - confirm one exists.
// Only the first THREADS-(ksX-1) work-items have a complete horizontal window.
if(col < (THREADS-(ksX-1)))
// Re-centre col on the window so that temp[k][col+i] spans the 2*anX+1 taps.
col += anX;
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gY << 1);
float tmp_sum[2]= {0.0, 0.0};
for(int k=0; k<2; k++)
for(int i=-anX; i<=anX; i++)
tmp_sum[k] += temp[k][col+i];
// Store both rows, each with its own row bounds check.
for(int i=0; i<2; i++)
if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
// Box filter kernel for float4 data, one float4 element per work-item.
// Same structure as the scalar float kernel: two vertical sums per column are
// written to local memory, then each eligible work-item sums 2*anX+1
// neighbouring entries and stores tmp_sum/alpha for both rows.
// Offsets and steps arrive in bytes; `>> 4` converts them to float4 elements.
// anX, anY, ksX and ksY are not defined in this excerpt; presumably build-time macros - TODO confirm.
__kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global float4 *dst, float alpha,
int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
int dst_offset, int dst_rows, int dst_cols, int dst_step
int col = get_local_id(0);
const int gX = get_group_id(0);
const int gY = get_group_id(1);
int src_x_off = (src_offset % src_step) >> 4;
int src_y_off = src_offset / src_step;
int dst_x_off = (dst_offset % dst_step) >> 4;
int dst_y_off = dst_offset / dst_step;
int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
int startY = (gY << 1) - anY + src_y_off;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
float4 data[ksY+1];
__local float4 temp[2][THREADS];
bool con;
float4 ss;
// NOTE(review): two alternative load loops follow; no selection between them
// is visible in this excerpt - confirm against the complete file.
// Variant: out-of-image elements contribute zero.
for(int i=0; i < ksY+1; i++)
con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
int cur_col = clamp(startX + col, 0, src_whole_cols);
// The read is guarded on the clamped column and the row; `con` then masks
// positions whose unclamped column was outside the image.
ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>4) + cur_col]:(float4)0;
data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0);
// Variant: indices are remapped through the ADDR_* border macros before loading.
for(int i=0; i < ksY+1; i++)
int selected_row;
int selected_col;
selected_row = ADDR_H(startY+i, 0, src_whole_rows);
selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
selected_col = ADDR_L(startX+col, 0, src_whole_cols);
selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
data[i] = src[selected_row * (src_step>>4) + selected_col];
// sum0: rows shared by both output rows; sum1 / sum2: totals for the first / second row.
float4 sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
for(int i=1; i < ksY; i++)
sum0 += (data[i]);
sum1 = sum0 + (data[0]);
sum2 = sum0 + (data[ksY]);
temp[0][col] = sum1;
temp[1][col] = sum2;
// NOTE(review): no barrier between the local-memory writes above and the reads
// below is visible in this excerpt - confirm one exists.
// Only the first THREADS-(ksX-1) work-items have a complete horizontal window.
if(col < (THREADS-(ksX-1)))
// Re-centre col on the window so that temp[k][col+i] spans the 2*anX+1 taps.
col += anX;
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gY << 1);
float4 tmp_sum[2]= {(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)};
for(int k=0; k<2; k++)
for(int i=-anX; i<=anX; i++)
tmp_sum[k] += temp[k][col+i];
// Store both rows, each with its own row bounds check.
for(int i=0; i<2; i++)
if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
dst[(dst_startY+i) * (dst_step>>4)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
// License Agreement
// For Open Source Computer Vision Library
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
// Enable 32-bit atomics on global and local memory (atomic_inc / atomic_add
// are used by the hysteresis kernels further down).
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
// Gradient magnitude from the two derivative components.
// Two definitions are given: the Euclidean norm sqrt(x*x + y*y) and the
// L1 norm |x| + |y|.
// NOTE(review): no `#else` is visible between the two definitions in this
// excerpt; presumably the first is selected by L2GRAD and the second is the
// fallback - confirm against the complete file.
#ifdef L2GRAD
inline float calc(int x, int y)
return sqrt((float)(x * x + y * y));
inline float calc(int x, int y)
return (float)abs(x) + abs(y);
#endif //
// Smoothing perpendicular to the derivative direction with a triangle filter
// only support 3x3 Sobel kernel
// h (-1) = 1, h (0) = 2, h (1) = 1
// h'(-1) = -1, h'(0) = 0, h'(1) = 1
// thus sobel 2D operator can be calculated as:
// h'(x, y) = h'(x)h(y) for x direction
// src input 8bit single channel image data
// dx_buf output dx buffer
// dy_buf output dy buffer
// Parameter list and body of the row-pass kernel described by the comment
// block above. The kernel's name line is not visible in this excerpt; later
// comments refer to it as calcSobelRowPass.
// For every pixel it writes a horizontal difference (right - left) to dx_buf
// and a horizontal 1-2-1 smoothing to dy_buf.
__global const uchar * src,
__global int * dx_buf,
__global int * dy_buf,
int rows,
int cols,
int src_step,
int src_offset,
int dx_buf_step,
int dx_buf_offset,
int dy_buf_step,
int dy_buf_offset
// Steps and offsets of the int buffers arrive in bytes; convert them to elements.
dx_buf_step /= sizeof(*dx_buf);
dx_buf_offset /= sizeof(*dx_buf);
dy_buf_step /= sizeof(*dy_buf);
dy_buf_offset /= sizeof(*dy_buf);
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
// 16 rows by 16 columns plus one halo column on each side;
// assumes a 16x16 work-group - TODO confirm against the host code.
__local int smem[16][18];
// Centre pixel; the row index is clamped to the last row.
smem[lidy][lidx + 1] =
src[gidx + min(gidy, rows - 1) * src_step + src_offset];
// The first work-item of each row also loads the left and right halo columns,
// clamping the column index to the image.
if(lidx == 0)
smem[lidy][0] =
src[max(gidx - 1, 0) + min(gidy, rows - 1) * src_step + src_offset];
smem[lidy][17] =
src[min(gidx + 16, cols - 1) + min(gidy, rows - 1) * src_step + src_offset];
// NOTE(review): no barrier between the local-memory writes above and the reads
// below is visible in this excerpt - confirm one exists.
if(gidy < rows && gidx < cols)
dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset] =
-smem[lidy][lidx] + smem[lidy][lidx + 2];
dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset] =
smem[lidy][lidx] + 2 * smem[lidy][lidx + 1] + smem[lidy][lidx + 2];
// calculate the magnitude of the filter pass combining both x and y directions
// This is the buffered version(3x3 sobel)
// dx_buf dx buffer, calculated from calcSobelRowPass
// dy_buf dy buffer, calculated from calcSobelRowPass
// dx derivative in x direction output
// dy derivative in y direction output
// mag magnitude derivative of xy output
// Parameter list and body of the buffered magnitude kernel described by the
// comment block above (its name line is not visible in this excerpt).
// Completes the separable 3x3 operator vertically: dx is a vertical 1-2-1
// smoothing of dx_buf, dy is a vertical difference (below - above) of dy_buf,
// and mag receives calc(dx, dy).
__global const int * dx_buf,
__global const int * dy_buf,
__global int * dx,
__global int * dy,
__global float * mag,
int rows,
int cols,
int dx_buf_step,
int dx_buf_offset,
int dy_buf_step,
int dy_buf_offset,
int dx_step,
int dx_offset,
int dy_step,
int dy_offset,
int mag_step,
int mag_offset
// All steps and offsets arrive in bytes; convert them to elements.
dx_buf_step /= sizeof(*dx_buf);
dx_buf_offset /= sizeof(*dx_buf);
dy_buf_step /= sizeof(*dy_buf);
dy_buf_offset /= sizeof(*dy_buf);
dx_step /= sizeof(*dx);
dx_offset /= sizeof(*dx);
dy_step /= sizeof(*dy);
dy_offset /= sizeof(*dy);
mag_step /= sizeof(*mag);
mag_offset /= sizeof(*mag);
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
// 16 rows plus one halo row above and below;
// assumes a 16x16 work-group - TODO confirm against the host code.
__local int sdx[18][16];
__local int sdy[18][16];
// Centre rows; the row index is clamped to the last row.
sdx[lidy + 1][lidx] =
dx_buf[gidx + min(gidy, rows - 1) * dx_buf_step + dx_buf_offset];
sdy[lidy + 1][lidx] =
dy_buf[gidx + min(gidy, rows - 1) * dy_buf_step + dy_buf_offset];
// The first row of work-items also loads the top and bottom halo rows,
// clamping the row index to the image.
if(lidy == 0)
sdx[0][lidx] =
dx_buf[gidx + min(max(gidy-1,0),rows-1) * dx_buf_step + dx_buf_offset];
sdx[17][lidx] =
dx_buf[gidx + min(gidy + 16, rows - 1) * dx_buf_step + dx_buf_offset];
sdy[0][lidx] =
dy_buf[gidx + min(max(gidy-1,0),rows-1) * dy_buf_step + dy_buf_offset];
sdy[17][lidx] =
dy_buf[gidx + min(gidy + 16, rows - 1) * dy_buf_step + dy_buf_offset];
// NOTE(review): no barrier between the local-memory writes above and the reads
// below is visible in this excerpt - confirm one exists.
if(gidx < cols && gidy < rows)
int x = sdx[lidy][lidx] + 2 * sdx[lidy + 1][lidx] + sdx[lidy + 2][lidx];
int y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx];
dx[gidx + gidy * dx_step + dx_offset] = x;
dy[gidx + gidy * dy_step + dy_offset] = y;
// mag is written at (gidx+1, gidy+1): the magnitude buffer carries a one-element border.
mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = calc(x, y);
// calculate the magnitude of the filter pass combining both x and y directions
// This is the non-buffered version(non-3x3 sobel)
// dx_buf dx buffer, calculated from calcSobelRowPass
// dy_buf dy buffer, calculated from calcSobelRowPass
// dx derivative in x direction output
// dy derivative in y direction output
// mag magnitude derivative of xy output
// Non-buffered magnitude kernel: reads already computed dx / dy values and
// writes one magnitude per pixel into mag at (gidx+1, gidy+1).
//   dx, dy   - input derivative buffers
//   mag      - output magnitude buffer (written with a one-element border offset)
//   rows, cols - extent used for the bounds check
//   *_step, *_offset - byte pitches / offsets, converted to elements below
void calcMagnitude
__global const int * dx,
__global const int * dy,
__global float * mag,
int rows,
int cols,
int dx_step,
int dx_offset,
int dy_step,
int dy_offset,
int mag_step,
int mag_offset
dx_step /= sizeof(*dx);
dx_offset /= sizeof(*dx);
dy_step /= sizeof(*dy);
dy_offset /= sizeof(*dy);
mag_step /= sizeof(*mag);
mag_offset /= sizeof(*mag);
int gidx = get_global_id(0);
int gidy = get_global_id(1);
if(gidy < rows && gidx < cols)
// NOTE(review): the call that wraps the two operands below is not visible in
// this excerpt; presumably calc(dx, dy) as in the buffered kernel - confirm.
mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] =
dx[gidx + gidy * dx_step + dx_offset],
dy[gidx + gidy * dy_step + dy_offset]
// 0.4142135623730950488016887242097 is tan(22.5)
// CANNY_SHIFT is the number of fractional bits of the fixed-point scale.
// TG22 is tan(22.5 degrees) scaled by 2^CANNY_SHIFT and rounded to the nearest
// integer, so the direction tests in the edge-map kernel can use integer arithmetic.
#define CANNY_SHIFT 15
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
//First pass of edge detection and non-maximum suppression
// the edge type of each pixel is set to:
// 0 - below low thres, not an edge
// 1 - maybe an edge
// 2 - is an edge, either magnitude is greater than high thres, or
// Given estimates of the image gradients, a search is then carried out
// to determine if the gradient magnitude assumes a local maximum in the gradient direction.
// if the rounded gradient angle is zero degrees (i.e. the edge is in the north-south direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the west and east directions,
// if the rounded gradient angle is 90 degrees (i.e. the edge is in the east-west direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north and south directions,
// if the rounded gradient angle is 135 degrees (i.e. the edge is in the north east-south west direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north west and south east directions,
// if the rounded gradient angle is 45 degrees (i.e. the edge is in the north west-south east direction)the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north east and south west directions.
// dx, dy derivatives in the x and y directions
// mag magnitudes calculated from calcMagnitude function
// map output containing raw edge types
// Parameter list and body of the edge-map kernel described by the comment
// block above (its name line is not visible in this excerpt).
// Classifies each pixel as 0 (not an edge), 1 (maybe an edge) or 2 (edge) by
// comparing its magnitude with the two neighbours along the quantised gradient
// direction and with the low / high thresholds.
__global const int * dx,
__global const int * dy,
__global const float * mag,
__global int * map,
int rows,
int cols,
float low_thresh,
float high_thresh,
int dx_step,
int dx_offset,
int dy_step,
int dy_offset,
int mag_step,
int mag_offset,
int map_step,
int map_offset
// All steps and offsets arrive in bytes; convert them to elements.
dx_step /= sizeof(*dx);
dx_offset /= sizeof(*dx);
dy_step /= sizeof(*dy);
dy_offset /= sizeof(*dy);
mag_step /= sizeof(*mag);
mag_offset /= sizeof(*mag);
map_step /= sizeof(*map);
map_offset /= sizeof(*map);
mag += mag_offset;
map += map_offset;
// NOTE(review): dx_offset and dy_offset are converted above but are not used
// in the dx / dy index expressions below - confirm whether that is intended.
// 16x16 tile plus a one-element halo on every side.
__local float smem[18][18];
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
// Clear the low four bits: origin of the current 16x16 tile in global coordinates.
int grp_idx = get_global_id(0) & 0xFFFFF0;
int grp_idy = get_global_id(1) & 0xFFFFF0;
// Re-linearise the work-items (assumes a work-group 16 wide) into an 18-wide
// layout so they can cooperatively fill the 18x18 tile in two passes.
int tid = lidx + lidy * 16;
int lx = tid % 18;
int ly = tid / 18;
// First pass: tile rows 0..13.
if(ly < 14)
smem[ly][lx] =
mag[grp_idx + lx + min(grp_idy + ly, rows - 1) * mag_step];
// Second pass: tile rows 14..17, restricted to positions inside the bordered buffer.
if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
smem[ly + 14][lx] =
mag[grp_idx + lx + min(grp_idy + ly + 14, rows -1) * mag_step];
// NOTE(review): no barrier between the local-memory writes above and the reads
// below is visible in this excerpt - confirm one exists.
if(gidy < rows && gidx < cols)
int x = dx[gidx + gidy * dx_step];
int y = dy[gidx + gidy * dy_step];
// s is -1 when dx and dy have opposite signs, +1 otherwise; it picks the diagonal.
const int s = (x ^ y) < 0 ? -1 : 1;
const float m = smem[lidy + 1][lidx + 1];
x = abs(x);
y = abs(y);
// 0 - the pixel can not belong to an edge
// 1 - the pixel might belong to an edge
// 2 - the pixel does belong to an edge
int edge_type = 0;
if(m > low_thresh)
// Fixed-point bounds: tg22x = |dx|*tan(22.5), tg67x = tg22x + 2*|dx|,
// both scaled by 2^CANNY_SHIFT.
// NOTE(review): y is compared with these values below; no scaling of y by
// 2^CANNY_SHIFT is visible in this excerpt - confirm against the complete file.
const int tg22x = x * TG22;
const int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
// Compare with the left / right neighbours.
if(y < tg22x)
if(m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2])
edge_type = 1 + (int)(m > high_thresh);
// Compare with the upper / lower neighbours.
else if (y > tg67x)
if(m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1])
edge_type = 1 + (int)(m > high_thresh);
// Compare with the diagonal neighbours selected by s.
// NOTE(review): the `else` introducing this branch is not visible in this excerpt.
if(m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s])
edge_type = 1 + (int)(m > high_thresh);
// map is written at (gidx+1, gidy+1): the map buffer carries a one-element border.
map[gidx + 1 + (gidy + 1) * map_step] = edge_type;
#undef TG22
// do Hysteresis for pixel whose edge type is 1
// If candidate pixel (edge type is 1) has a neighbour pixel (in 3x3 area) with type 2, it is believed to be part of an edge and
// marked as edge. Each thread will iterate for 16 times to connect local edges.
// A candidate pixel identified as an edge is then tested for nearby potential edge points. If there are any, the counter will
// be incremented by 1 and the point location is stored. These potential candidates will be processed further in the next kernel.
// map raw edge type results calculated from calcMap.
// st the potential edge points found in this kernel call
// counter the number of potential edge points
// Parameter list and body of the local hysteresis kernel described by the
// comment block above (its name line is not visible in this excerpt).
// Within one 16x16 tile it promotes type-1 pixels that touch a type-2 pixel,
// repeats that 16 times, writes the tile back, and records promoted pixels
// that still have type-1 neighbours in `st`.
__global int * map,
__global ushort2 * st,
__global unsigned int * counter,
int rows,
int cols,
int map_step,
int map_offset
// Step and offset arrive in bytes; convert them to int elements.
map_step /= sizeof(*map);
map_offset /= sizeof(*map);
map += map_offset;
// 16x16 tile plus a one-element halo on every side.
__local int smem[18][18];
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
// Clear the low four bits: origin of the current 16x16 tile in global coordinates.
int grp_idx = get_global_id(0) & 0xFFFFF0;
int grp_idy = get_global_id(1) & 0xFFFFF0;
// Re-linearise the work-items (assumes a work-group 16 wide) into an 18-wide
// layout so they can cooperatively fill the 18x18 tile in two passes.
int tid = lidx + lidy * 16;
int lx = tid % 18;
int ly = tid / 18;
// First pass: tile rows 0..13.
if(ly < 14)
smem[ly][lx] =
map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step];
// Second pass: tile rows 14..17, restricted to positions inside the bordered buffer.
if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
smem[ly + 14][lx] =
map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step];
// NOTE(review): no barriers are visible in this excerpt, neither after the tile
// load nor between the iterations below - confirm they exist in the complete file.
if(gidy < rows && gidx < cols)
int n;
#pragma unroll
for (int k = 0; k < 16; ++k)
n = 0;
// Count the type-2 pixels among the eight neighbours of a candidate.
if (smem[lidy + 1][lidx + 1] == 1)
n += smem[lidy ][lidx ] == 2;
n += smem[lidy ][lidx + 1] == 2;
n += smem[lidy ][lidx + 2] == 2;
n += smem[lidy + 1][lidx ] == 2;
n += smem[lidy + 1][lidx + 2] == 2;
n += smem[lidy + 2][lidx ] == 2;
n += smem[lidy + 2][lidx + 1] == 2;
n += smem[lidy + 2][lidx + 2] == 2;
// Promote the candidate when at least one neighbour is a confirmed edge.
if (n > 0)
smem[lidy + 1][lidx + 1] = 2;
const int e = smem[lidy + 1][lidx + 1];
map[gidx + 1 + (gidy + 1) * map_step] = e;
n = 0;
// For a confirmed edge, count the neighbours that are still only candidates.
if(e == 2)
n += smem[lidy ][lidx ] == 1;
n += smem[lidy ][lidx + 1] == 1;
n += smem[lidy ][lidx + 2] == 1;
n += smem[lidy + 1][lidx ] == 1;
n += smem[lidy + 1][lidx + 2] == 1;
n += smem[lidy + 2][lidx ] == 1;
n += smem[lidy + 2][lidx + 1] == 1;
n += smem[lidy + 2][lidx + 2] == 1;
// Reserve a slot in `st` atomically and store the position in bordered-map coordinates.
if(n > 0)
unsigned int ind = atomic_inc(counter);
st[ind] = (ushort2)(gidx + 1, gidy + 1);
// Offsets of the eight neighbours of a pixel, in row-major order.
__constant int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
// Capacity of the per-work-group stack kept in local memory.
#define stack_size 512
// Parameter list and body of the global hysteresis kernel (its name line is
// not visible in this excerpt). Each work-group takes one seed position from
// st1, promotes neighbouring type-1 pixels to type 2 while keeping a
// local-memory stack of newly promoted positions, and finally appends what is
// left on the stack to st2.
__global int * map,
__global ushort2 * st1,
__global ushort2 * st2,
__global int * counter,
int rows,
int cols,
int count,
int map_step,
int map_offset
// Step and offset arrive in bytes; convert them to int elements.
map_step /= sizeof(*map);
map_offset /= sizeof(*map);
map += map_offset;
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int grp_idx = get_group_id(0);
int grp_idy = get_group_id(1);
__local unsigned int s_counter;
__local unsigned int s_ind;
__local ushort2 s_st[stack_size];
if(lidx == 0)
s_counter = 0;
// Index of the seed handled by this work-group.
// NOTE(review): the row multiplier is get_local_size(0), not the number of
// groups per row - confirm this matches the host-side launch geometry.
int ind = mad24(grp_idy, (int)get_local_size(0), grp_idx);
if(ind < count)
ushort2 pos = st1[ind];
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
// The first eight work-items each inspect one neighbour of the seed.
if (lidx < 8)
pos.x += c_dx[lidx];
pos.y += c_dy[lidx];
if (map[pos.x + pos.y * map_step] == 1)
map[pos.x + pos.y * map_step] = 2;
ind = atomic_inc(&s_counter);
s_st[ind] = pos;
// Process the stack while it is non-empty and one more round cannot overflow it.
while (s_counter > 0 && s_counter <= stack_size - get_local_size(0))
// Eight work-items cooperate on each stack entry, one per neighbour.
const int subTaskIdx = lidx >> 3;
const int portion = min(s_counter, (uint)(get_local_size(0)>> 3));
pos.x = pos.y = 0;
if (subTaskIdx < portion)
pos = s_st[s_counter - 1 - subTaskIdx];
if (lidx == 0)
s_counter -= portion;
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
pos.x += c_dx[lidx & 7];
pos.y += c_dy[lidx & 7];
if (map[pos.x + pos.y * map_step] == 1)
map[pos.x + pos.y * map_step] = 2;
ind = atomic_inc(&s_counter);
s_st[ind] = pos;
// Flush the remaining stack entries to st2 for the next kernel launch.
if (s_counter > 0)
if (lidx == 0)
// NOTE(review): atomic_add returns the value held before the addition, so
// `ind - s_counter` points below the range just reserved - confirm whether
// `s_ind = ind` was intended.
ind = atomic_add(counter, s_counter);
s_ind = ind - s_counter;
ind = s_ind;
for (int i = lidx; i < s_counter; i += get_local_size(0))
st2[ind + i] = s_st[i];
#undef stack_size
//Get the edge result. Pixels with edge type 2 are marked as edge points and set to 255; all others are set to 0.
// map edge type mappings
// dst edge output
// Converts the edge-type map into the final 8-bit edge image.
//   map        - edge-type map, read at (gidx+1, gidy+1), i.e. with a one-element border
//   dst        - output image, one byte per pixel
//   rows, cols - extent used for the bounds check
//   map_step, map_offset - byte pitch / offset of map, converted to int elements below
//   dst_step, dst_offset - pitch / offset of dst
void getEdges
__global const int * map,
__global uchar * dst,
int rows,
int cols,
int map_step,
int map_offset,
int dst_step,
int dst_offset
map_step /= sizeof(*map);
map_offset /= sizeof(*map);
int gidx = get_global_id(0);
int gidy = get_global_id(1);
if(gidy < rows && gidx < cols)
// map >> 1 is 1 only for type 2; negating and truncating to uchar gives 255, otherwise 0.
// NOTE(review): dst_offset is not used in the dst index below - confirm
// whether dst can ever be a sub-region with a non-zero offset.
dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step + map_offset] >> 1));
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
// License Agreement
// For Open Source Computer Vision Library
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
// @Authors
// Sen Liu, swjtuls1987@126.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
// Default for WAVE_SIZE when the build does not define it.
// NOTE(review): the matching #endif is not visible in this excerpt.
#ifndef WAVE_SIZE
#define WAVE_SIZE 1
// Builds a cumulative sum over 256 entries in local memory.
//   smem - local scratch array with at least 256 ints
//   val  - this work-item's value
//   tid  - this work-item's linear index (expected to be in 0..255)
// Returns the cumulative sum up to and including entry `tid`.
// Work-item 0 performs the whole prefix sum serially.
// NOTE(review): no barriers around the serial loop are visible in this
// excerpt - confirm they exist in the complete file.
int calc_lut(__local int* smem, int val, int tid)
smem[tid] = val;
if (tid == 0)
for (int i = 1; i < 256; ++i)
smem[i] += smem[i - 1];
return smem[tid];
// Sum reduction of 256 values through local memory, in two variants.
// NOTE(review): the #else / #endif lines separating the variants and closing
// the nested #if blocks are not visible in this excerpt, and neither are any
// barriers between the steps - confirm against the complete file.
#ifdef CPU
// CPU variant: every halving step is explicitly guarded by tid, and the final
// total is stored in smem[256] (so smem needs at least 257 entries).
void reduce(volatile __local int* smem, int val, int tid)
smem[tid] = val;
if (tid < 128)
smem[tid] = val += smem[tid + 128];
if (tid < 64)
smem[tid] = val += smem[tid + 64];
if (tid < 32)
smem[tid] += smem[tid + 32];
if (tid < 16)
smem[tid] += smem[tid + 16];
if (tid < 8)
smem[tid] += smem[tid + 8];
if (tid < 4)
smem[tid] += smem[tid + 4];
if (tid < 2)
smem[tid] += smem[tid + 2];
if (tid < 1)
smem[256] = smem[tid] + smem[tid + 1];
// Second variant: the final total ends up in smem[0]. The WAVE_SIZE tests make
// parts of the tail conditional at compile time; presumably the steps that fit
// inside one wavefront rely on lock-step execution - TODO confirm.
void reduce(__local volatile int* smem, int val, int tid)
smem[tid] = val;
if (tid < 128)
smem[tid] = val += smem[tid + 128];
if (tid < 64)
smem[tid] = val += smem[tid + 64];
if (tid < 32)
smem[tid] += smem[tid + 32];
#if WAVE_SIZE < 32
if (tid < 16)
smem[tid] += smem[tid + 16];
#if WAVE_SIZE < 16
if (tid < 8)
smem[tid] += smem[tid + 8];
smem[tid] += smem[tid + 4];
smem[tid] += smem[tid + 2];
smem[tid] += smem[tid + 1];
// Builds the 256-entry lookup table of one tile: histogram of the tile,
// optional clipping with even redistribution of the clipped samples,
// cumulative sum, then scaling by lutScale and saturation to uchar.
//   src, srcStep, src_offset - source image, its pitch and offset
//   lut, dstStep, dst_offset - output tables, one row of dstStep entries per tile
//   tileSize                 - tile width (x) and height (y) in pixels
//   tilesX                   - number of tiles per row, used to index the tile's table
//   clipLimit                - histogram clip limit; values <= 0 disable clipping
//   lutScale                 - scale applied to the cumulative histogram
// One work-group handles tile (tx, ty); tid doubles as the histogram bin index,
// so the work-group is presumably 256 work-items - TODO confirm.
__kernel void calcLut(__global __const uchar * src, __global uchar * lut,
const int srcStep, const int dstStep,
const int2 tileSize, const int tilesX,
const int clipLimit, const float lutScale,
const int src_offset, const int dst_offset)
__local int smem[512];
const int tx = get_group_id(0);
const int ty = get_group_id(1);
const unsigned int tid = get_local_id(1) * get_local_size(0)
+ get_local_id(0);
// Clear this work-item's histogram bin.
smem[tid] = 0;
// Walk the tile: rows are strided by local size 1, columns by local size 0.
for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1))
__global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset);
for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0))
// NOTE(review): the statement that uses `data` (presumably an atomic increment
// of the histogram bin) is not visible in this excerpt.
const int data = srcPtr[j];
int tHistVal = smem[tid];
if (clipLimit > 0)
// clip histogram bar
int clipped = 0;
if (tHistVal > clipLimit)
clipped = tHistVal - clipLimit;
tHistVal = clipLimit;
// find number of overall clipped samples
reduce(smem, clipped, tid);
// The CPU variant of reduce leaves the total in smem[256], the other in smem[0].
// NOTE(review): the #else / #endif around these two reads are not visible in this excerpt.
#ifdef CPU
clipped = smem[256];
clipped = smem[0];
// broadcast evaluated value
__local int totalClipped;
if (tid == 0)
totalClipped = clipped;
// redistribute clipped samples evenly
int redistBatch = totalClipped / 256;
tHistVal += redistBatch;
int residual = totalClipped - redistBatch * 256;
// NOTE(review): the statement guarded by this test (presumably one extra sample
// for the first `residual` bins) is not visible in this excerpt.
if (tid < residual)
const int lutVal = calc_lut(smem, tHistVal, tid);
uint ires = (uint)convert_int_rte(lutScale * lutVal);
lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] =
convert_uchar(clamp(ires, (uint)0, (uint)255));
// Applies the per-tile lookup tables to the image: every output pixel is the
// bilinear blend of the table values of the four surrounding tiles.
//   src, srcStep, src_offset - source image, its pitch and offset
//   dst, dstStep, dst_offset - destination image, its pitch and offset
//   lut, lutStep, lut_offset - tables, one row of lutStep entries per tile
//   cols, rows               - image extent
//   tileSize                 - tile width (x) and height (y) in pixels
//   tilesX, tilesY           - number of tiles per row / column
__kernel void transform(__global __const uchar * src,
__global uchar * dst,
__global uchar * lut,
const int srcStep, const int dstStep, const int lutStep,
const int cols, const int rows,
const int2 tileSize,
const int tilesX, const int tilesY,
const int src_offset, const int dst_offset, int lut_offset)
const int x = get_global_id(0);
const int y = get_global_id(1);
// Out-of-image guard; the guarded statement is not visible in this excerpt
// (presumably an early return) - TODO confirm.
if (x >= cols || y >= rows)
// Position in tile units, shifted by half a tile so that weights are measured
// from tile centres.
const float tyf = (convert_float(y) / tileSize.y) - 0.5f;
int ty1 = convert_int_rtn(tyf);
int ty2 = ty1 + 1;
// The weight is taken before the indices are clamped to the valid tile range.
const float ya = tyf - ty1;
ty1 = max(ty1, 0);
ty2 = min(ty2, tilesY - 1);
const float txf = (convert_float(x) / tileSize.x) - 0.5f;
int tx1 = convert_int_rtn(txf);
int tx2 = tx1 + 1;
const float xa = txf - tx1;
tx1 = max(tx1, 0);
tx2 = min(tx2, tilesX - 1);
const int srcVal = src[mad24(y, srcStep, x + src_offset)];
// Weighted sum of the four tiles' table entries for srcVal.
float res = 0;
res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (1.0f - ya));
res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (1.0f - ya));
res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (ya));
res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (ya));
// Round to nearest even and saturate to the uchar range.
uint ires = (uint)convert_int_rte(res);
dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255));
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
// License Agreement
// For Open Source Computer Vision Library
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
// @Authors
// Jiang Liyuan, jlyuan001.good@163.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
// Enable the vendor-specific double-precision extension.
// NOTE(review): the closing #endif is not visible in this excerpt.
#if defined (__ATI__)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (__NVIDIA__)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
/************************************** convolve **************************************/
// 2D convolution of a float image with a kWidth x kHeight kernel held in temp1.
// Each work-group stages a 32x32 tile (16x16 plus a halo of 8 on every side)
// in local memory; source coordinates are clamped to the image.
//   src, src_step, src_offset - source image, its pitch and offset (used as element indices here)
//   temp1, k_step, koffset    - kernel coefficients, their pitch and offset
//   dst, dst_step, dst_offset - destination image, its pitch and offset
//   rows, cols                - image extent
//   kWidth, kHeight           - kernel size
// Assumes a 16x16 work-group - TODO confirm against the host code.
__kernel void convolve_D5(__global float *src, __global float *temp1, __global float *dst,
int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight,
int src_offset, int dst_offset, int koffset)
__local float smem[16 + 2 * 8][16 + 2 * 8];
int x = get_local_id(0);
int y = get_local_id(1);
int gx = get_global_id(0);
int gy = get_global_id(1);
// Each work-item loads four tile elements, one per quadrant; the diagrams mark
// the quadrant being filled with `x`.
// x | x 0 | 0
// -----------
// x | x 0 | 0
// 0 | 0 0 | 0
// -----------
// 0 | 0 0 | 0
smem[y][x] = src[min(max(gy - 8, 0), rows - 1) * src_step + min(max(gx - 8, 0), cols - 1) + src_offset];
// 0 | 0 x | x
// -----------
// 0 | 0 x | x
// 0 | 0 0 | 0
// -----------
// 0 | 0 0 | 0
smem[y][x + 16] = src[min(max(gy - 8, 0), rows - 1) * src_step + min(gx + 8, cols - 1) + src_offset];
// 0 | 0 0 | 0
// -----------
// 0 | 0 0 | 0
// x | x 0 | 0
// -----------
// x | x 0 | 0
smem[y + 16][x] = src[min(gy + 8, rows - 1) * src_step + min(max(gx - 8, 0), cols - 1) + src_offset];
// 0 | 0 0 | 0
// -----------
// 0 | 0 0 | 0
// 0 | 0 x | x
// -----------
// 0 | 0 x | x
smem[y + 16][x + 16] = src[min(gy + 8, rows - 1) * src_step + min(gx + 8, cols - 1) + src_offset];
// NOTE(review): no barrier between the local-memory writes above and the reads
// below is visible in this excerpt - confirm one exists.
if (gx < cols && gy < rows)
float res = 0;
// The window is centred on tile position (y+8, x+8); the tile indices stay in
// range only while the kernel fits inside the 8-element halo.
for (int i = 0; i < kHeight; ++i)
for (int j = 0; j < kWidth; ++j)
res += smem[y + 8 - kHeight / 2 + i][x + 8 - kWidth / 2 + j] * temp1[i * k_step + j + koffset];
dst[gy * dst_step + gx + dst_offset] = res;
......@@ -48,6 +48,8 @@
#include "opencv2/imgproc/imgproc_c.h"
#include "opencv2/core/private.hpp"
#include "opencv2/core/ocl.hpp"
#include "opencl_kernels.hpp"
#include <math.h>
#include <assert.h>
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
// License Agreement
// For Open Source Computer Vision Library
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
#include "test_precomp.hpp"
#include <string>
using namespace cv;
using namespace std;
class CV_ImgprocUMatTest : public cvtest::BaseTest
CV_ImgprocUMatTest() {}
~CV_ImgprocUMatTest() {}
void run(int)
string imgpath = string(ts->get_data_path()) + "shared/lena.png";
Mat img = imread(imgpath, 1), gray, smallimg, result;
UMat uimg = img.getUMat(ACCESS_READ), ugray, usmallimg, uresult;
cvtColor(img, gray, COLOR_BGR2GRAY);
resize(gray, smallimg, Size(), 0.75, 0.75, INTER_LINEAR);
equalizeHist(smallimg, result);
cvtColor(uimg, ugray, COLOR_BGR2GRAY);
resize(ugray, usmallimg, Size(), 0.75, 0.75, INTER_LINEAR);
equalizeHist(usmallimg, uresult);
imshow("orig", uimg);
imshow("small", usmallimg);
imshow("equalized gray", uresult);
destroyWindow("equalized gray");
// Test entry point: instantiate the UMat imgproc check and execute it via
// safe_run().
TEST(Imgproc_UMat, regression)
{
    CV_ImgprocUMatTest umat_test;
    umat_test.safe_run();
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
想要评论请 注册