From 485d36d3c0182fe7a4cecb094bf319cfc4597b3c Mon Sep 17 00:00:00 2001
From: Vadim Pisarevsky <vadim.pisarevsky@gmail.com>
Date: Fri, 25 Oct 2013 16:46:03 +0400
Subject: [PATCH] Changed InputArray's enumeration constant for UMat, since it
 could conflict with the existing OpenCL module. Refactored Kernel's
 setArg API (a UMat is now passed to a kernel as a structure, as Alexander A
 suggested). Removed Kernel's cleanup callback from the external API; each
 kernel now keeps track of the matrices it uses and releases its references
 to them once it has completed.

---
 modules/core/include/opencv2/core/mat.hpp |   6 +-
 modules/core/include/opencv2/core/ocl.hpp | 152 ++++++++++------------
 modules/core/src/ocl.cpp                  | 125 +++++++++++++-----
 3 files changed, 170 insertions(+), 113 deletions(-)
diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index 81694b69e2..4df2432aeb 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -83,9 +83,9 @@ public:
         CUDA_MEM          = 8 << KIND_SHIFT,
         GPU_MAT           = 9 << KIND_SHIFT,
         OCL_MAT           =10 << KIND_SHIFT,
-        UMAT              =OCL_MAT,
-        STD_VECTOR_UMAT   =11 << KIND_SHIFT,
-        UEXPR             =12 << KIND_SHIFT
+        UMAT              =11 << KIND_SHIFT,
+        STD_VECTOR_UMAT   =12 << KIND_SHIFT,
+        UEXPR             =13 << KIND_SHIFT
     };
 
     _InputArray();
diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp
index 5f5483480a..419ccffd5b 100644
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@@ -264,13 +264,6 @@ public:
 class CV_EXPORTS Kernel
 {
 public:
-    class CV_EXPORTS Callback
-    {
-    public:
-        virtual ~Callback() {}
-        virtual void operator()() = 0;
-    };
-
     Kernel();
     Kernel(const char* kname, const Program& prog);
     Kernel(const char* kname, const ProgramSource& prog,
@@ -283,118 +276,115 @@ public:
     bool create(const char* kname, const ProgramSource& prog,
                 const String& buildopts, String& errmsg);
 
-    int set(int i, const void* value, size_t sz);
-    int set(int i, const UMat& m);
-    int set(int i, const KernelArg& arg);
-    template<typename _Tp> int set(int i, const _Tp& value)
+    void set(int i, const void* value, size_t sz);
+    void set(int i, const UMat& m);
+    void set(int i, const KernelArg& arg);
+    template<typename _Tp> void set(int i, const _Tp& value)
     { return set(i, &value, sizeof(value)); }
 
-    template<typename _Tp1>
-    Kernel& args(_Tp1 a1)
+    template<typename _Tp0>
+    Kernel& args(const _Tp0& a0)
     {
-        set(0, a1); return *this;
+        set(0, a0); return *this;
     }
 
-    template<typename _Tp1, typename _Tp2>
-    Kernel& args(_Tp1 a1, _Tp2 a2)
+    template<typename _Tp0, typename _Tp1>
+    Kernel& args(const _Tp0& a0, const _Tp1& a1)
     {
-        int i = set(0, a1); set(i, a2); return *this;
+        set(0, a0); set(1, a1); return *this;
     }
 
-    template<typename _Tp1, typename _Tp2, typename _Tp3>
-    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3)
+    template<typename _Tp0, typename _Tp1, typename _Tp2>
+    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2)
     {
-        int i = set(0, a1); i = set(i, a2); set(i, a3); return *this;
+        set(0, a0); set(1, a1); set(2, a2); return *this;
     }
 
-    template<typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4>
-    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4)
+    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3>
+    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3)
     {
-        int i = set(0, a1); i = set(i, a2); i = set(i, a3); set(i, a4);
-        return *this;
+        set(0, a0); set(1, a1); set(2, a2); set(3, a3); return *this;
     }
 
-    template<typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4, typename _Tp5>
-    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5)
+    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4>
+    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2,
+                 const _Tp3& a3, const _Tp4& a4)
     {
-        int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); set(i, a5);
-        return *this;
+        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); return *this;
     }
 
-    template<typename _Tp1, typename _Tp2, typename _Tp3,
-             typename _Tp4, typename _Tp5, typename _Tp6>
-    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6)
+    template<typename _Tp0, typename _Tp1, typename _Tp2,
+             typename _Tp3, typename _Tp4, typename _Tp5>
+    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2,
+                 const _Tp3& a3, const _Tp4& a4, const _Tp5& a5)
     {
-        int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
-        i = set(i, a5); set(i, a6); return *this;
+        set(0, a0); set(1, a1); set(2, a2);
+        set(3, a3); set(4, a4); set(5, a5); return *this;
     }
 
-    template<typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4,
-             typename _Tp5, typename _Tp6, typename _Tp7>
-    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6, _Tp7 a7)
+    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
+             typename _Tp4, typename _Tp5, typename _Tp6>
+    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3,
+                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6)
     {
-        int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
-        i = set(i, a5); i = set(i, a6); set(i, a7); return *this;
+        set(0, a0); set(1, a1); set(2, a2); set(3, a3);
+        set(4, a4); set(5, a5); set(6, a6); return *this;
     }
 
-    template<typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4,
-             typename _Tp5, typename _Tp6, typename _Tp7, typename _Tp8>
-    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6, _Tp7 a7, _Tp8 a8)
+    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
+             typename _Tp4, typename _Tp5, typename _Tp6, typename _Tp7>
+    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3,
+                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7)
     {
-        int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
-        i = set(i, a5); i = set(i, a6); i = set(i, a7); set(i, a8);
-        return *this;
+        set(0, a0); set(1, a1); set(2, a2); set(3, a3);
+        set(4, a4); set(5, a5); set(6, a6); set(7, a7); return *this;
     }
 
-    template<typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4, typename _Tp5,
-             typename _Tp6, typename _Tp7, typename _Tp8, typename _Tp9>
-    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6, _Tp7 a7, _Tp8 a8, _Tp9 a9)
+    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4,
+             typename _Tp5, typename _Tp6, typename _Tp7, typename _Tp8>
+    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3,
+                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
+                 const _Tp8& a8)
     {
-        int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
-        i = set(i, a5); i = set(i, a6); i = set(i, a7); i = set(i, a8);
-        set(i, a9); return *this;
+        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4);
+        set(5, a5); set(6, a6); set(7, a7); set(8, a8); return *this;
     }
 
-    template<typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4, typename _Tp5,
-             typename _Tp6, typename _Tp7, typename _Tp8, typename _Tp9, typename _Tp10>
-    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6, _Tp7 a7,
-                 _Tp8 a8, _Tp9 a9, _Tp10 a10)
+    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4,
+             typename _Tp5, typename _Tp6, typename _Tp7, typename _Tp8, typename _Tp9>
+    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3,
+                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
+                 const _Tp8& a8, const _Tp9& a9)
     {
-        int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
-        i = set(i, a5); i = set(i, a6); i = set(i, a7); i = set(i, a8);
-        i = set(i, a9); set(i, a10); return *this;
+        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5);
+        set(6, a6); set(7, a7); set(8, a8); set(9, a9); return *this;
     }
 
-    template<typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4, typename _Tp5,
-             typename _Tp6, typename _Tp7, typename _Tp8, typename _Tp9,
-             typename _Tp10, typename _Tp11>
-    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6, _Tp7 a7,
-                 _Tp8 a8, _Tp9 a9, _Tp10 a10, _Tp11 a11)
+    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
+             typename _Tp4, typename _Tp5, typename _Tp6, typename _Tp7,
+             typename _Tp8, typename _Tp9, typename _Tp10>
+    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3,
+                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
+                 const _Tp8& a8, const _Tp9& a9, const _Tp10& a10)
     {
-        int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
-        i = set(i, a5); i = set(i, a6); i = set(i, a7); i = set(i, a8);
-        i = set(i, a9); i = set(i, a10); set(i, a11); return *this;
+        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5);
+        set(6, a6); set(7, a7); set(8, a8); set(9, a9); set(10, a10); return *this;
     }
 
-    template<typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4, typename _Tp5,
-             typename _Tp6, typename _Tp7, typename _Tp8, typename _Tp9,
-             typename _Tp10, typename _Tp11, typename _Tp12>
-    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6, _Tp7 a7,
-                 _Tp8 a8, _Tp9 a9, _Tp10 a10, _Tp11 a11, _Tp12 a12)
+    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
+             typename _Tp4, typename _Tp5, typename _Tp6, typename _Tp7,
+             typename _Tp8, typename _Tp9, typename _Tp10, typename _Tp11>
+    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3,
+                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
+                 const _Tp8& a8, const _Tp9& a9, const _Tp10& a10, const _Tp11& a11)
     {
-        int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
-        i = set(i, a5); i = set(i, a6); i = set(i, a7); i = set(i, a8);
-        i = set(i, a9); i = set(i, a10); i = set(i, a11); set(i, a12);
-        return *this;
+        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5);
+        set(6, a6); set(7, a7); set(8, a8); set(9, a9); set(10, a10); set(11, a11); return *this;
     }
 
-    void run(int dims, size_t offset[],
-             size_t globalsize[], size_t localsize[], bool sync,
-             const Ptr<Callback>& cleanupCallback=Ptr<Callback>(),
-             const Queue& q=Queue());
-    void runTask(bool sync,
-                 const Ptr<Callback>& cleanupCallback=Ptr<Callback>(),
-                 const Queue& q=Queue());
+    void run(int dims, size_t offset[], size_t globalsize[],
+             size_t localsize[], bool sync, const Queue& q=Queue());
+    void runTask(bool sync, const Queue& q=Queue());
 
     size_t workGroupSize() const;
     bool compileWorkGroupSize(size_t wsz[]) const;
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
index 3ab3db7c3f..094a80d974 100644
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -1210,6 +1210,46 @@ OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event))
 
 namespace cv { namespace ocl {
 
+struct UMat2D // Flat snapshot of a 2-D UMat (buffer handle, offset, step, rows, cols); Kernel::set() hands the whole struct to clSetKernelArg as one argument
+{
+    UMat2D(const UMat& m, int accessFlags) // accessFlags is forwarded unchanged to m.handle()
+    {
+        CV_Assert(m.dims == 2); // NOTE(review): Kernel::set() selects this struct whenever dims <= 2, so a UMat with dims < 2 would trip this assert - confirm such UMats cannot reach here
+        data = (cl_mem)m.handle(accessFlags); // handle returned for the requested access mode, reinterpreted as cl_mem
+        offset = m.offset;
+        step = m.step; // relies on m.step converting to size_t; presumably the row stride - confirm units
+        rows = m.rows;
+        cols = m.cols;
+    }
+    cl_mem data; // NOTE(review): OpenCL documents buffer arguments as set via a pointer to a cl_mem; confirm a cl_mem embedded in a by-value struct is usable on the device
+    size_t offset; // NOTE(review): size_t width is host-dependent; the kernel-side struct layout must match - confirm
+    size_t step;
+    int rows;
+    int cols;
+};
+
+struct UMat3D // Flat snapshot of a 3-D UMat (buffer handle, offset, two steps, three sizes); Kernel::set() hands the whole struct to clSetKernelArg as one argument
+{
+    UMat3D(const UMat& m, int accessFlags) // accessFlags is forwarded unchanged to m.handle()
+    {
+        CV_Assert(m.dims == 3); // NOTE(review): Kernel::set() selects this struct for every dims > 2, so a UMat with dims > 3 would trip this assert - confirm that is intended
+        data = (cl_mem)m.handle(accessFlags); // handle returned for the requested access mode, reinterpreted as cl_mem
+        offset = m.offset;
+        step = m.step.p[1]; // step of dimension 1 (the one stored as rows below)
+        slicestep = m.step.p[0]; // step of dimension 0 (the one stored as slices below)
+        slices = m.size.p[0];
+        rows = m.size.p[1];
+        cols = m.size.p[2];
+    }
+    cl_mem data; // NOTE(review): OpenCL documents buffer arguments as set via a pointer to a cl_mem; confirm a cl_mem embedded in a by-value struct is usable on the device
+    size_t offset; // NOTE(review): size_t width is host-dependent; the kernel-side struct layout must match - confirm
+    size_t slicestep; // declared before step, unlike the assignment order in the constructor
+    size_t step;
+    int slices;
+    int rows;
+    int cols;
+};
+
 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
 {
@@ -1266,6 +1306,15 @@ bool useOpenCL()
     return data->useOpenCL > 0;
 }
 
+void setUseOpenCL(bool flag) // Stores flag (as 1 or 0) in the TLSData object returned by TLSData::get(); does nothing when haveOpenCL() is false
+{
+    if( haveOpenCL() ) // the request is silently ignored otherwise, so useOpenCL is left untouched
+    {
+        TLSData* data = TLSData::get(); // presumably per-thread storage, judging by the name - confirm
+        data->useOpenCL = flag ? 1 : 0; // same field that useOpenCL() reads via "data->useOpenCL > 0"
+    }
+}
+
 void finish()
 {
     Queue::getDefault().finish();
@@ -1980,10 +2029,33 @@ struct Kernel::Impl
         cl_int retval = 0;
         handle = ph != 0 ?
             clCreateKernel(ph, kname, &retval) : 0;
+        for( int i = 0; i < MAX_ARRS; i++ )
+            u[i] = 0;
+    }
+
+    void cleanupUMats() // Drops the reference addUMat() took on every tracked UMatData, clears the slots and resets nu to 0
+    {
+        for( int i = 0; i < MAX_ARRS; i++ ) // scans every slot rather than only the first nu
+            if( u[i] )
+            {
+                if( CV_XADD(&u[i]->urefcount, -1) == 1 ) // presumably CV_XADD yields the pre-decrement value, so 1 means this was the last reference - confirm
+                    u[i]->currAllocator->deallocate(u[i]); // hand the data back to the allocator recorded in the UMatData
+                u[i] = 0; // mark the slot empty so a repeated cleanup does not decrement again
+            }
+        nu = 0;
+    }
+
+    void addUMat(const UMat& m) // Records m.u in slot nu and increments its urefcount; cleanupUMats() later undoes the increment
+    {
+        CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0); // NOTE(review): the constructor lines visible in this patch zero u[] but never assign nu; it is only set in cleanupUMats() - confirm nu is initialized before the first call
+        u[nu] = m.u;
+        CV_XADD(&m.u->urefcount, 1); // extra reference held on behalf of the kernel
+        nu++;
     }
+
     void finit()
     {
-        if(!f.empty()) f->operator()();
+        cleanupUMats();
         if(e) { clReleaseEvent(e); e = 0; }
         release();
     }
@@ -1998,7 +2070,9 @@ struct Kernel::Impl
 
     cl_kernel handle;
     cl_event e;
-    Ptr<Kernel::Callback> f;
+    enum { MAX_ARRS = 16 };
+    UMatData* u[MAX_ARRS];
+    int nu;
 };
 
 }}
@@ -2086,51 +2160,48 @@ void* Kernel::ptr() const
     return p ? p->handle : 0;
 }
 
-int Kernel::set(int i, const void* value, size_t sz)
+void Kernel::set(int i, const void* value, size_t sz) // Sets kernel argument i from sz raw bytes at value; asserts if p is null or clSetKernelArg returns a negative code. No longer returns the next index.
 {
     CV_Assert( p && clSetKernelArg(p->handle, (cl_uint)i, sz, value) >= 0 );
-    return i+1;
+    if( i == 0 ) // index 0 is treated as the start of a fresh argument list
+        p->cleanupUMats(); // so UMat references tracked for the previous list are released
 }
 
-int Kernel::set(int i, const UMat& m)
+void Kernel::set(int i, const UMat& m) // Convenience overload: wraps m in a KernelArg flagged READ_WRITE and forwards to set(int, const KernelArg&)
 {
-    return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0));
+    set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0)); // the cast removes const because a UMat* is passed to KernelArg here
 }
 
-int Kernel::set(int i, const KernelArg& arg)
+void Kernel::set(int i, const KernelArg& arg) // Sets kernel argument i: a UMat-backed arg is packed into a UMat2D/UMat3D struct and its data tracked via addUMat(); any other arg is passed as arg.sz bytes at arg.obj
 {
     CV_Assert( p && p->handle );
+    if( i == 0 ) // index 0 is treated as the start of a fresh argument list
+        p->cleanupUMats(); // release UMat references tracked for the previous list
     if( arg.m )
     {
-        int dims = arg.m->dims;
-        void* h = arg.m->handle(((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
-                                ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0));
-        clSetKernelArg(p->handle, (cl_uint)i, sizeof(cl_mem), &h);
-        clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(size_t), &arg.m->offset);
-        if( dims <= 2 )
+        int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
+                          ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0); // maps the KernelArg READ_ONLY/WRITE_ONLY bits onto ACCESS_READ/ACCESS_WRITE
+        if( arg.m->dims <= 2 ) // NOTE(review): UMat2D's constructor asserts dims == 2, which is narrower than this test - confirm
         {
-            clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(size_t), &arg.m->step.p[0]);
-            clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(arg.m->rows), &arg.m->rows);
-            clSetKernelArg(p->handle, (cl_uint)(i+4), sizeof(arg.m->cols), &arg.m->cols);
-            return i + 5;
+            UMat2D u2d(*arg.m, accessFlags);
+            clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d), &u2d); // the matrix now occupies one argument slot instead of five; result is not checked, unlike set(int, const void*, size_t)
         }
         else
         {
-            clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(size_t)*(dims-1), &arg.m->step.p[0]);
-            clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(cl_int)*dims, &arg.m->size.p[0]);
-            return i + 4;
+            UMat3D u3d(*arg.m, accessFlags); // NOTE(review): UMat3D's constructor asserts dims == 3, but this branch is taken for any dims > 2 - confirm
+            clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d), &u3d); // the matrix now occupies one argument slot instead of four; result is not checked
         }
+        p->addUMat(*arg.m); // take a reference on the UMat's data; cleanupUMats() releases it
     }
     else
     {
         clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
-        return i+1;
     }
 }
 
 
 void Kernel::run(int dims, size_t offset[], size_t globalsize[], size_t localsize[],
-                 bool sync, const Ptr<Callback>& cleanupCallback, const Queue& q)
+                 bool sync, const Queue& q)
 {
     CV_Assert(p && p->handle && p->e == 0);
     cl_command_queue qq = getQueue(q);
@@ -2140,18 +2211,16 @@ void Kernel::run(int dims, size_t offset[], size_t globalsize[], size_t localsiz
     if( sync )
     {
         clFinish(qq);
-        if( !cleanupCallback.empty() )
-            cleanupCallback->operator()();
+        p->cleanupUMats();
     }
     else
     {
-        p->f = cleanupCallback;
         p->addref();
         clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p);
     }
 }
 
-void Kernel::runTask(bool sync, const Ptr<Callback>& cleanupCallback, const Queue& q)
+void Kernel::runTask(bool sync, const Queue& q)
 {
     CV_Assert(p && p->handle && p->e == 0);
     cl_command_queue qq = getQueue(q);
@@ -2159,12 +2228,10 @@ void Kernel::runTask(bool sync, const Ptr<Callback>& cleanupCallback, const Queu
     if( sync )
     {
         clFinish(qq);
-        if( !cleanupCallback.empty() )
-            cleanupCallback->operator()();
+        p->cleanupUMats();
     }
     else
     {
-        p->f = cleanupCallback;
         p->addref();
         clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p);
     }
-- 
GitLab