From 485d36d3c0182fe7a4cecb094bf319cfc4597b3c Mon Sep 17 00:00:00 2001 From: Vadim Pisarevsky Date: Fri, 25 Oct 2013 16:46:03 +0400 Subject: [PATCH] changed InputArray's enumeration constant for UMat, since it may potentially conflict with existing OpenCL module. refactored Kernel's setArg API (now UMat is passed to a kernel as a structure, as Alexander A suggested). removed Kernel's cleanup callback from the external API; now each kernel keeps track of used matrices and they are dereferenced after it's complete. --- modules/core/include/opencv2/core/mat.hpp | 6 +- modules/core/include/opencv2/core/ocl.hpp | 152 ++++++++++------------ modules/core/src/ocl.cpp | 125 +++++++++++++----- 3 files changed, 170 insertions(+), 113 deletions(-) diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp index 81694b69e2..4df2432aeb 100644 --- a/modules/core/include/opencv2/core/mat.hpp +++ b/modules/core/include/opencv2/core/mat.hpp @@ -83,9 +83,9 @@ public: CUDA_MEM = 8 << KIND_SHIFT, GPU_MAT = 9 << KIND_SHIFT, OCL_MAT =10 << KIND_SHIFT, - UMAT =OCL_MAT, - STD_VECTOR_UMAT =11 << KIND_SHIFT, - UEXPR =12 << KIND_SHIFT + UMAT =11 << KIND_SHIFT, + STD_VECTOR_UMAT =12 << KIND_SHIFT, + UEXPR =13 << KIND_SHIFT }; _InputArray(); diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp index 5f5483480a..419ccffd5b 100644 --- a/modules/core/include/opencv2/core/ocl.hpp +++ b/modules/core/include/opencv2/core/ocl.hpp @@ -264,13 +264,6 @@ public: class CV_EXPORTS Kernel { public: - class CV_EXPORTS Callback - { - public: - virtual ~Callback() {} - virtual void operator()() = 0; - }; - Kernel(); Kernel(const char* kname, const Program& prog); Kernel(const char* kname, const ProgramSource& prog, @@ -283,118 +276,115 @@ public: bool create(const char* kname, const ProgramSource& prog, const String& buildopts, String& errmsg); - int set(int i, const void* value, size_t sz); - int set(int i, const UMat& m); - int set(int i, const KernelArg& arg); - template int set(int i, const _Tp& value) + void set(int i, const void* value, size_t sz); + void set(int i, const UMat& m); + void set(int i, const KernelArg& arg); + template void set(int i, const _Tp& value) { return set(i, &value, sizeof(value)); } - template - Kernel& args(_Tp1 a1) + template + Kernel& args(const _Tp0& a0) { - set(0, a1); return *this; + set(0, a0); return *this; } - template - Kernel& args(_Tp1 a1, _Tp2 a2) + template + Kernel& args(const _Tp0& a0, const _Tp1& a1) { - int i = set(0, a1); set(i, a2); return *this; + set(0, a0); set(1, a1); return *this; } - template - Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3) + template + Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2) { - int i = set(0, a1); i = set(i, a2); set(i, a3); return *this; + set(0, a0); set(1, a1); set(2, a2); return *this; } - template - Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4) + template + Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3) { - int i = set(0, a1); i = set(i, a2); i = set(i, a3); set(i, a4); - return *this; + set(0, a0); set(1, a1); set(2, a2); set(3, a3); return *this; } - template - Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5) + template + Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, + const _Tp3& a3, const _Tp4& a4) { - int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); set(i, a5); - return *this; + set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); return *this; } - template - Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6) + template + Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, + const _Tp3& a3, const _Tp4& a4, const _Tp5& a5) { - int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); - i = set(i, a5); set(i, a6); return *this; + set(0, a0); set(1, a1); set(2, a2); + set(3, a3); set(4, a4); set(5, a5); return *this; } - template - Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6, _Tp7 a7) + template + Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3, + const _Tp4& a4, const _Tp5& a5, const _Tp6& a6) { - int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); - i = set(i, a5); i = set(i, a6); set(i, a7); return *this; + set(0, a0); set(1, a1); set(2, a2); set(3, a3); + set(4, a4); set(5, a5); set(6, a6); return *this; } - template - Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6, _Tp7 a7, _Tp8 a8) + template + Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3, + const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7) { - int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); - i = set(i, a5); i = set(i, a6); i = set(i, a7); set(i, a8); - return *this; + set(0, a0); set(1, a1); set(2, a2); set(3, a3); + set(4, a4); set(5, a5); set(6, a6); set(7, a7); return *this; } - template - Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6, _Tp7 a7, _Tp8 a8, _Tp9 a9) + template + Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3, + const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7, + const _Tp8& a8) { - int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); - i = set(i, a5); i = set(i, a6); i = set(i, a7); i = set(i, a8); - set(i, a9); return *this; + set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); + set(5, a5); set(6, a6); set(7, a7); set(8, a8); return *this; } - template - Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6, _Tp7 a7, - _Tp8 a8, _Tp9 a9, _Tp10 a10) + template + Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3, + const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7, + const _Tp8& a8, const _Tp9& a9) { - int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); - i = set(i, a5); i = set(i, a6); i = set(i, a7); i = set(i, a8); - i = set(i, a9); set(i, a10); return *this; + set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5); + set(6, a6); set(7, a7); set(8, a8); set(9, a9); return *this; } - template - Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6, _Tp7 a7, - _Tp8 a8, _Tp9 a9, _Tp10 a10, _Tp11 a11) + template + Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3, + const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7, + const _Tp8& a8, const _Tp9& a9, const _Tp10& a10) { - int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); - i = set(i, a5); i = set(i, a6); i = set(i, a7); i = set(i, a8); - i = set(i, a9); i = set(i, a10); set(i, a11); return *this; + set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5); + set(6, a6); set(7, a7); set(8, a8); set(9, a9); set(10, a10); return *this; } - template - Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6, _Tp7 a7, - _Tp8 a8, _Tp9 a9, _Tp10 a10, _Tp11 a11, _Tp12 a12) + template + Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3, + const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7, + const _Tp8& a8, const _Tp9& a9, const _Tp10& a10, const _Tp11& a11) { - int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); - i = set(i, a5); i = set(i, a6); i = set(i, a7); i = set(i, a8); - i = set(i, a9); i = set(i, a10); i = set(i, a11); set(i, a12); - return *this; + set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5); + set(6, a6); set(7, a7); set(8, a8); set(9, a9); set(10, a10); set(11, a11); return *this; } - void run(int dims, size_t offset[], - size_t globalsize[], size_t localsize[], bool sync, - const Ptr& cleanupCallback=Ptr(), - const Queue& q=Queue()); - void runTask(bool sync, - const Ptr& cleanupCallback=Ptr(), - const Queue& q=Queue()); + void run(int dims, size_t offset[], size_t globalsize[], + size_t localsize[], bool sync, const Queue& q=Queue()); + void runTask(bool sync, const Queue& q=Queue()); size_t workGroupSize() const; bool compileWorkGroupSize(size_t wsz[]) const; diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index 3ab3db7c3f..094a80d974 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -1210,6 +1210,46 @@ OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event)) namespace cv { namespace ocl { +struct UMat2D +{ + UMat2D(const UMat& m, int accessFlags) + { + CV_Assert(m.dims == 2); + data = (cl_mem)m.handle(accessFlags); + offset = m.offset; + step = m.step; + rows = m.rows; + cols = m.cols; + } + cl_mem data; + size_t offset; + size_t step; + int rows; + int cols; +}; + +struct UMat3D +{ + UMat3D(const UMat& m, int accessFlags) + { + CV_Assert(m.dims == 3); + data = (cl_mem)m.handle(accessFlags); + offset = m.offset; + step = m.step.p[1]; + slicestep = m.step.p[0]; + slices = m.size.p[0]; + rows = m.size.p[1]; + cols = m.size.p[2]; + } + cl_mem data; + size_t offset; + size_t slicestep; + size_t step; + int slices; + int rows; + int cols; +}; + // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 ) { @@ -1266,6 +1306,15 @@ bool useOpenCL() return data->useOpenCL > 0; } +void setUseOpenCL(bool flag) +{ + if( haveOpenCL() ) + { + TLSData* data = TLSData::get(); + data->useOpenCL = flag ? 1 : 0; + } +} + void finish() { Queue::getDefault().finish(); @@ -1980,10 +2029,33 @@ struct Kernel::Impl cl_int retval = 0; handle = ph != 0 ? clCreateKernel(ph, kname, &retval) : 0; + for( int i = 0; i < MAX_ARRS; i++ ) + u[i] = 0; + } + + void cleanupUMats() + { + for( int i = 0; i < MAX_ARRS; i++ ) + if( u[i] ) + { + if( CV_XADD(&u[i]->urefcount, -1) == 1 ) + u[i]->currAllocator->deallocate(u[i]); + u[i] = 0; + } + nu = 0; + } + + void addUMat(const UMat& m) + { + CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0); + u[nu] = m.u; + CV_XADD(&m.u->urefcount, 1); + nu++; } + void finit() { - if(!f.empty()) f->operator()(); + cleanupUMats(); if(e) { clReleaseEvent(e); e = 0; } release(); } @@ -1998,7 +2070,9 @@ struct Kernel::Impl cl_kernel handle; cl_event e; - Ptr f; + enum { MAX_ARRS = 16 }; + UMatData* u[MAX_ARRS]; + int nu; }; }} @@ -2086,51 +2160,48 @@ void* Kernel::ptr() const return p ? p->handle : 0; } -int Kernel::set(int i, const void* value, size_t sz) +void Kernel::set(int i, const void* value, size_t sz) { CV_Assert( p && clSetKernelArg(p->handle, (cl_uint)i, sz, value) >= 0 ); - return i+1; + if( i == 0 ) + p->cleanupUMats(); } -int Kernel::set(int i, const UMat& m) +void Kernel::set(int i, const UMat& m) { - return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0)); + set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0)); } -int Kernel::set(int i, const KernelArg& arg) +void Kernel::set(int i, const KernelArg& arg) { CV_Assert( p && p->handle ); + if( i == 0 ) + p->cleanupUMats(); if( arg.m ) { - int dims = arg.m->dims; - void* h = arg.m->handle(((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) + - ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0)); - clSetKernelArg(p->handle, (cl_uint)i, sizeof(cl_mem), &h); - clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(size_t), &arg.m->offset); - if( dims <= 2 ) + int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) + + ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0); + if( arg.m->dims <= 2 ) { - clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(size_t), &arg.m->step.p[0]); - clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(arg.m->rows), &arg.m->rows); - clSetKernelArg(p->handle, (cl_uint)(i+4), sizeof(arg.m->cols), &arg.m->cols); - return i + 5; + UMat2D u2d(*arg.m, accessFlags); + clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d), &u2d); } else { - clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(size_t)*(dims-1), &arg.m->step.p[0]); - clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(cl_int)*dims, &arg.m->size.p[0]); - return i + 4; + UMat3D u3d(*arg.m, accessFlags); + clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d), &u3d); } + p->addUMat(*arg.m); } else { clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj); - return i+1; } } void Kernel::run(int dims, size_t offset[], size_t globalsize[], size_t localsize[], - bool sync, const Ptr& cleanupCallback, const Queue& q) + bool sync, const Queue& q) { CV_Assert(p && p->handle && p->e == 0); cl_command_queue qq = getQueue(q); @@ -2140,18 +2211,16 @@ void Kernel::run(int dims, size_t offset[], size_t globalsize[], size_t localsiz if( sync ) { clFinish(qq); - if( !cleanupCallback.empty() ) - cleanupCallback->operator()(); + p->cleanupUMats(); } else { - p->f = cleanupCallback; p->addref(); clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p); } } -void Kernel::runTask(bool sync, const Ptr& cleanupCallback, const Queue& q) +void Kernel::runTask(bool sync, const Queue& q) { CV_Assert(p && p->handle && p->e == 0); cl_command_queue qq = getQueue(q); @@ -2159,12 +2228,10 @@ void Kernel::runTask(bool sync, const Ptr& cleanupCallback, const Queu if( sync ) { clFinish(qq); - if( !cleanupCallback.empty() ) - cleanupCallback->operator()(); + p->cleanupUMats(); } else { - p->f = cleanupCallback; p->addref(); clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p); } -- GitLab