From fd4a6f0af05a0c4e0461b05f2bd20c85bfcbe73c Mon Sep 17 00:00:00 2001 From: yao Date: Wed, 3 Apr 2013 13:23:04 +0800 Subject: [PATCH] make the sparse method give correct results on CPU ocl Add CL_CPU to supportsFeature check simplify the logic of pyrlk --- modules/ocl/include/opencv2/ocl/ocl.hpp | 2 +- modules/ocl/src/initialization.cpp | 6 + modules/ocl/src/matrix_operations.cpp | 2 +- modules/ocl/src/opencl/pyrlk.cl | 278 ++++++++++++- modules/ocl/src/pyrlk.cpp | 520 ++---------------------- 5 files changed, 307 insertions(+), 501 deletions(-) diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index da7ca27aeb..7b79cb5b27 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -155,7 +155,7 @@ namespace cv static Context* getContext(); static void setContext(Info &oclinfo); - enum {CL_DOUBLE, CL_UNIFIED_MEM}; + enum {CL_DOUBLE, CL_UNIFIED_MEM, CL_CPU}; bool supportsFeature(int ftype); size_t computeUnits(); void* oclContext(); diff --git a/modules/ocl/src/initialization.cpp b/modules/ocl/src/initialization.cpp index b582f1ce3e..78263d86ae 100644 --- a/modules/ocl/src/initialization.cpp +++ b/modules/ocl/src/initialization.cpp @@ -979,6 +979,12 @@ namespace cv return impl->double_support == 1; case CL_UNIFIED_MEM: return impl->unified_memory == 1; + case CL_CPU: + cl_device_type devicetype; + clGetDeviceInfo(impl->devices[impl->devnum], + CL_DEVICE_TYPE, sizeof(cl_device_type), + &devicetype, NULL); + return devicetype == CVCL_DEVICE_TYPE_CPU; default: return false; } diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp index ce96e3a9e3..87d1d375ef 100644 --- a/modules/ocl/src/matrix_operations.cpp +++ b/modules/ocl/src/matrix_operations.cpp @@ -394,7 +394,7 @@ void cv::ocl::oclMat::convertTo( oclMat &dst, int rtype, double alpha, double be if( rtype < 0 ) rtype = type(); else - rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels()); + rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), oclchannels()); //int scn = channels(); int sdepth = depth(), ddepth = CV_MAT_DEPTH(rtype); diff --git a/modules/ocl/src/opencl/pyrlk.cl b/modules/ocl/src/opencl/pyrlk.cl index c772be78ac..1043b8410b 100644 --- a/modules/ocl/src/opencl/pyrlk.cl +++ b/modules/ocl/src/opencl/pyrlk.cl @@ -184,6 +184,209 @@ float linearFilter_float(__global const float* src, int srcStep, int cn, float2 } #define BUFFER 64 + +#ifdef CPU +void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) +{ + smem1[tid] = val1; + smem2[tid] = val2; + smem3[tid] = val3; + barrier(CLK_LOCAL_MEM_FENCE); + +#if BUFFER > 128 + if (tid < 128) + { + smem1[tid] = val1 += smem1[tid + 128]; + smem2[tid] = val2 += smem2[tid + 128]; + smem3[tid] = val3 += smem3[tid + 128]; + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if BUFFER > 64 + if (tid < 64) + { + smem1[tid] = val1 += smem1[tid + 64]; + smem2[tid] = val2 += smem2[tid + 64]; + smem3[tid] = val3 += smem3[tid + 64]; + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + if (tid < 32) + { + smem1[tid] = val1 += smem1[tid + 32]; + smem2[tid] = val2 += smem2[tid + 32]; + smem3[tid] = val3 += smem3[tid + 32]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + { + smem1[tid] = val1 += smem1[tid + 16]; + smem2[tid] = val2 += smem2[tid + 16]; + smem3[tid] = val3 += smem3[tid + 16]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { + smem1[tid] = val1 += smem1[tid + 8]; + smem2[tid] = val2 += smem2[tid + 8]; + smem3[tid] = val3 += smem3[tid + 8]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 4) + { + smem1[tid] = val1 += smem1[tid + 4]; + smem2[tid] = val2 += smem2[tid + 4]; + smem3[tid] = val3 += smem3[tid + 4]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 2) + { + smem1[tid] = val1 += smem1[tid + 2]; + smem2[tid] = val2 += smem2[tid + 2]; + smem3[tid] = val3 += smem3[tid + 2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 1) + { + smem1[BUFFER] = val1 += smem1[tid + 1]; + smem2[BUFFER] = val2 += smem2[tid + 1]; + smem3[BUFFER] = val3 += smem3[tid + 1]; + } + barrier(CLK_LOCAL_MEM_FENCE); +} + +void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid) +{ + smem1[tid] = val1; + smem2[tid] = val2; + barrier(CLK_LOCAL_MEM_FENCE); + +#if BUFFER > 128 + if (tid < 128) + { + smem1[tid] = (val1 += smem1[tid + 128]); + smem2[tid] = (val2 += smem2[tid + 128]); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if BUFFER > 64 + if (tid < 64) + { + smem1[tid] = (val1 += smem1[tid + 64]); + smem2[tid] = (val2 += smem2[tid + 64]); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + if (tid < 32) + { + smem1[tid] = (val1 += smem1[tid + 32]); + smem2[tid] = (val2 += smem2[tid + 32]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + { + smem1[tid] = (val1 += smem1[tid + 16]); + smem2[tid] = (val2 += smem2[tid + 16]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { + smem1[tid] = (val1 += smem1[tid + 8]); + smem2[tid] = (val2 += smem2[tid + 8]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 4) + { + smem1[tid] = (val1 += smem1[tid + 4]); + smem2[tid] = (val2 += smem2[tid + 4]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 2) + { + smem1[tid] = (val1 += smem1[tid + 2]); + smem2[tid] = (val2 += smem2[tid + 2]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 1) + { + smem1[BUFFER] = (val1 += smem1[tid + 1]); + smem2[BUFFER] = (val2 += smem2[tid + 1]); + } + barrier(CLK_LOCAL_MEM_FENCE); +} + +void reduce1(float val1, volatile __local float* smem1, int tid) +{ + smem1[tid] = val1; + barrier(CLK_LOCAL_MEM_FENCE); + +#if BUFFER > 128 + if (tid < 128) + { + smem1[tid] = (val1 += smem1[tid + 128]); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + +#if BUFFER > 64 + if (tid < 64) + { + smem1[tid] = (val1 += smem1[tid + 64]); + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + if (tid < 32) + { + smem1[tid] = (val1 += smem1[tid + 32]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 16) + { + smem1[tid] = (val1 += smem1[tid + 16]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 8) + { + smem1[tid] = (val1 += smem1[tid + 8]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 4) + { + smem1[tid] = (val1 += smem1[tid + 4]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 2) + { + smem1[tid] = (val1 += smem1[tid + 2]); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 1) + { + smem1[BUFFER] = (val1 += smem1[tid + 1]); + } + barrier(CLK_LOCAL_MEM_FENCE); +} +#else void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) { smem1[tid] = val1; @@ -325,6 +528,7 @@ void reduce1(float val1, __local float* smem1, int tid) vmem1[tid] = val1 += vmem1[tid + 1]; } } +#endif #define SCALE (1.0f / (1 << 20)) #define THRESHOLD 0.01f @@ -411,14 +615,20 @@ void GetError4(image2d_t J, const float x, const float y, const float4* Pch, flo *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z); } - +#define GRIDSIZE 3 __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err, const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr) { +#ifdef CPU + __local float smem1[BUFFER+1]; + __local float smem2[BUFFER+1]; + __local float smem3[BUFFER+1]; +#else __local float smem1[BUFFER]; __local float smem2[BUFFER]; __local float smem3[BUFFER]; +#endif unsigned int xid=get_local_id(0); unsigned int yid=get_local_id(1); @@ -431,7 +641,7 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, const int tid = mad24(yid, xsize, xid); - float2 prevPt = prevPts[gid] / (1 << level); + float2 prevPt = prevPts[gid] / (float2)(1 << level); if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows) { @@ -450,9 +660,9 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, float A12 = 0; float A22 = 0; - float I_patch[3][3]; - float dIdx_patch[3][3]; - float dIdy_patch[3][3]; + float I_patch[GRIDSIZE][GRIDSIZE]; + float dIdx_patch[GRIDSIZE][GRIDSIZE]; + float dIdy_patch[GRIDSIZE][GRIDSIZE]; yBase=yid; { @@ -512,12 +722,19 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2], &A11, &A12, &A22); } + reduce3(A11, A12, A22, smem1, smem2, smem3, tid); barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + A11 = smem1[BUFFER]; + A12 = smem2[BUFFER]; + A22 = smem3[BUFFER]; +#else A11 = smem1[0]; A12 = smem2[0]; A22 = smem3[0]; +#endif float D = A11 * A22 - A12 * A12; @@ -609,8 +826,13 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, reduce2(b1, b2, smem1, smem2, tid); barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + b1 = smem1[BUFFER]; + b2 = smem2[BUFFER]; +#else b1 = smem1[0]; b2 = smem2[0]; +#endif float2 delta; delta.x = A12 * b2 - A22 * b1; @@ -685,18 +907,28 @@ __kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, nextPts[gid] = prevPt; if (calcErr) - err[gid] = smem1[0] / (c_winSize_x * c_winSize_y); +#ifdef CPU + err[gid] = smem1[BUFFER] / (float)(c_winSize_x * c_winSize_y); +#else + err[gid] = smem1[0] / (float)(c_winSize_x * c_winSize_y); +#endif } - } + __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err, const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr) { - __local float smem1[BUFFER]; - __local float smem2[BUFFER]; - __local float smem3[BUFFER]; +#ifdef CPU + __local float smem1[BUFFER+1]; + __local float smem2[BUFFER+1]; + __local float smem3[BUFFER+1]; +#else + __local float smem1[BUFFER]; + __local float smem2[BUFFER]; + __local float smem3[BUFFER]; +#endif unsigned int xid=get_local_id(0); unsigned int yid=get_local_id(1); @@ -709,7 +941,7 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, const int tid = mad24(yid, xsize, xid); - float2 nextPt = prevPts[gid]/(1<= cols || nextPt.y < 0 || nextPt.y >= rows) { @@ -725,9 +957,9 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, // extract the patch from the first image, compute covariation matrix of derivatives - float A11 = 0; - float A12 = 0; - float A22 = 0; + float A11 = 0.0f; + float A12 = 0.0f; + float A22 = 0.0f; float4 I_patch[8]; float4 dIdx_patch[8]; @@ -797,9 +1029,15 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, reduce3(A11, A12, A22, smem1, smem2, smem3, tid); barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + A11 = smem1[BUFFER]; + A12 = smem2[BUFFER]; + A22 = smem3[BUFFER]; +#else A11 = smem1[0]; A12 = smem2[0]; A22 = smem3[0]; +#endif float D = A11 * A22 - A12 * A12; @@ -888,12 +1126,16 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, &b1, &b2); } - reduce2(b1, b2, smem1, smem2, tid); barrier(CLK_LOCAL_MEM_FENCE); +#ifdef CPU + b1 = smem1[BUFFER]; + b2 = smem2[BUFFER]; +#else b1 = smem1[0]; b2 = smem2[0]; +#endif float2 delta; delta.x = A12 * b2 - A22 * b1; @@ -967,7 +1209,11 @@ __kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, nextPts[gid] = nextPt; if (calcErr) - err[gid] = smem1[0] / (3 * c_winSize_x * c_winSize_y); +#ifdef CPU + err[gid] = smem1[BUFFER] / (float)(3 * c_winSize_x * c_winSize_y); +#else + err[gid] = smem1[0] / (float)(3 * c_winSize_x * c_winSize_y); +#endif } } diff --git a/modules/ocl/src/pyrlk.cpp b/modules/ocl/src/pyrlk.cpp index c8d4b52deb..374134c1cd 100644 --- a/modules/ocl/src/pyrlk.cpp +++ b/modules/ocl/src/pyrlk.cpp @@ -16,7 +16,7 @@ // // @Authors // Dachuan Zhao, dachuan@multicorewareinc.com -// Yao Wang, yao@multicorewareinc.com +// Yao Wang, bitwangyaoyao@gmail.com // Nathan, liujun@multicorewareinc.com // // Redistribution and use in source and binary forms, with or without modification, @@ -47,6 +47,7 @@ #include "precomp.hpp" + using namespace std; using namespace cv; using namespace cv::ocl; @@ -58,11 +59,7 @@ namespace ocl ///////////////////////////OpenCL kernel strings/////////////////////////// extern const char *pyrlk; extern const char *pyrlk_no_image; -extern const char *operator_setTo; -extern const char *operator_convertTo; -extern const char *operator_copyToM; extern const char *arithm_mul; -extern const char *pyr_down; } } @@ -105,364 +102,7 @@ void calcPatchSize(cv::Size winSize, int cn, dim3 &block, dim3 &patch, bool isDe } } -inline int divUp(int total, int grain) -{ - return (total + grain - 1) / grain; -} - -/////////////////////////////////////////////////////////////////////////// -//////////////////////////////// ConvertTo //////////////////////////////// -/////////////////////////////////////////////////////////////////////////// -static void convert_run_cus(const oclMat &src, oclMat &dst, double alpha, double beta) -{ - string kernelName = "convert_to_S"; - stringstream idxStr; - idxStr << src.depth(); - kernelName += idxStr.str(); - float alpha_f = (float)alpha, beta_f = (float)beta; - CV_DbgAssert(src.rows == dst.rows && src.cols == dst.cols); - vector > args; - size_t localThreads[3] = {16, 16, 1}; - size_t globalThreads[3]; - globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; - globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1]; - globalThreads[2] = 1; - int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize(); - int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize(); - if(dst.type() == CV_8UC1) - { - globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0]) / localThreads[0] * localThreads[0]; - } - args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel )); - args.push_back( make_pair( sizeof(cl_float) , (void *)&alpha_f )); - args.push_back( make_pair( sizeof(cl_float) , (void *)&beta_f )); - openCLExecuteKernel2(dst.clCxt , &operator_convertTo, kernelName, globalThreads, - localThreads, args, dst.oclchannels(), dst.depth(), CLFLUSH); -} -void convertTo( const oclMat &src, oclMat &m, int rtype, double alpha = 1, double beta = 0 ); -void convertTo( const oclMat &src, oclMat &dst, int rtype, double alpha, double beta ) -{ - //cout << "cv::ocl::oclMat::convertTo()" << endl; - - bool noScale = fabs(alpha - 1) < std::numeric_limits::epsilon() - && fabs(beta) < std::numeric_limits::epsilon(); - - if( rtype < 0 ) - rtype = src.type(); - else - rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), src.oclchannels()); - - int sdepth = src.depth(), ddepth = CV_MAT_DEPTH(rtype); - if( sdepth == ddepth && noScale ) - { - src.copyTo(dst); - return; - } - - oclMat temp; - const oclMat *psrc = &src; - if( sdepth != ddepth && psrc == &dst ) - psrc = &(temp = src); - - dst.create( src.size(), rtype ); - convert_run_cus(*psrc, dst, alpha, beta); -} - -/////////////////////////////////////////////////////////////////////////// -//////////////////////////////// setTo //////////////////////////////////// -/////////////////////////////////////////////////////////////////////////// -//oclMat &operator = (const Scalar &s) -//{ -// //cout << "cv::ocl::oclMat::=" << endl; -// setTo(s); -// return *this; -//} -static void set_to_withoutmask_run_cus(const oclMat &dst, const Scalar &scalar, string kernelName) -{ - vector > args; - - size_t localThreads[3] = {16, 16, 1}; - size_t globalThreads[3]; - globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; - globalThreads[1] = (dst.rows + localThreads[1] - 1) / localThreads[1] * localThreads[1]; - globalThreads[2] = 1; - int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize(); - if(dst.type() == CV_8UC1) - { - globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; - } - char compile_option[32]; - union sc - { - cl_uchar4 uval; - cl_char4 cval; - cl_ushort4 usval; - cl_short4 shval; - cl_int4 ival; - cl_float4 fval; - cl_double4 dval; - } val; - switch(dst.depth()) - { - case 0: - val.uval.s[0] = saturate_cast(scalar.val[0]); - val.uval.s[1] = saturate_cast(scalar.val[1]); - val.uval.s[2] = saturate_cast(scalar.val[2]); - val.uval.s[3] = saturate_cast(scalar.val[3]); - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=uchar"); - args.push_back( make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=uchar4"); - args.push_back( make_pair( sizeof(cl_uchar4) , (void *)&val.uval )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - case 1: - val.cval.s[0] = saturate_cast(scalar.val[0]); - val.cval.s[1] = saturate_cast(scalar.val[1]); - val.cval.s[2] = saturate_cast(scalar.val[2]); - val.cval.s[3] = saturate_cast(scalar.val[3]); - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=char"); - args.push_back( make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=char4"); - args.push_back( make_pair( sizeof(cl_char4) , (void *)&val.cval )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - case 2: - val.usval.s[0] = saturate_cast(scalar.val[0]); - val.usval.s[1] = saturate_cast(scalar.val[1]); - val.usval.s[2] = saturate_cast(scalar.val[2]); - val.usval.s[3] = saturate_cast(scalar.val[3]); - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=ushort"); - args.push_back( make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=ushort4"); - args.push_back( make_pair( sizeof(cl_ushort4) , (void *)&val.usval )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - case 3: - val.shval.s[0] = saturate_cast(scalar.val[0]); - val.shval.s[1] = saturate_cast(scalar.val[1]); - val.shval.s[2] = saturate_cast(scalar.val[2]); - val.shval.s[3] = saturate_cast(scalar.val[3]); - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=short"); - args.push_back( make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=short4"); - args.push_back( make_pair( sizeof(cl_short4) , (void *)&val.shval )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - case 4: - val.ival.s[0] = saturate_cast(scalar.val[0]); - val.ival.s[1] = saturate_cast(scalar.val[1]); - val.ival.s[2] = saturate_cast(scalar.val[2]); - val.ival.s[3] = saturate_cast(scalar.val[3]); - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=int"); - args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] )); - break; - case 2: - sprintf(compile_option, "-D GENTYPE=int2"); - cl_int2 i2val; - i2val.s[0] = val.ival.s[0]; - i2val.s[1] = val.ival.s[1]; - args.push_back( make_pair( sizeof(cl_int2) , (void *)&i2val )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=int4"); - args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - case 5: - val.fval.s[0] = (float)scalar.val[0]; - val.fval.s[1] = (float)scalar.val[1]; - val.fval.s[2] = (float)scalar.val[2]; - val.fval.s[3] = (float)scalar.val[3]; - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=float"); - args.push_back( make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=float4"); - args.push_back( make_pair( sizeof(cl_float4) , (void *)&val.fval )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - case 6: - val.dval.s[0] = scalar.val[0]; - val.dval.s[1] = scalar.val[1]; - val.dval.s[2] = scalar.val[2]; - val.dval.s[3] = scalar.val[3]; - switch(dst.oclchannels()) - { - case 1: - sprintf(compile_option, "-D GENTYPE=double"); - args.push_back( make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] )); - break; - case 4: - sprintf(compile_option, "-D GENTYPE=double4"); - args.push_back( make_pair( sizeof(cl_double4) , (void *)&val.dval )); - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unsupported channels"); - } - break; - default: - CV_Error(CV_StsUnsupportedFormat, "unknown depth"); - } -#ifdef CL_VERSION_1_2 - if(dst.offset == 0 && dst.cols == dst.wholecols) - { - clEnqueueFillBuffer((cl_command_queue)dst.clCxt->oclCommandQueue(), (cl_mem)dst.data, args[0].second, args[0].first, 0, dst.step * dst.rows, 0, NULL, NULL); - } - else - { - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&step_in_pixel )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&offset_in_pixel)); - openCLExecuteKernel2(dst.clCxt , &operator_setTo, kernelName, globalThreads, - localThreads, args, -1, -1, compile_option, CLFLUSH); - } -#else - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&step_in_pixel )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&offset_in_pixel)); - openCLExecuteKernel2(dst.clCxt , &operator_setTo, kernelName, globalThreads, - localThreads, args, -1, -1, compile_option, CLFLUSH); -#endif -} - -static oclMat &setTo(oclMat &src, const Scalar &scalar) -{ - CV_Assert( src.depth() >= 0 && src.depth() <= 6 ); - CV_DbgAssert( !src.empty()); - - if(src.type() == CV_8UC1) - { - set_to_withoutmask_run_cus(src, scalar, "set_to_without_mask_C1_D0"); - } - else - { - set_to_withoutmask_run_cus(src, scalar, "set_to_without_mask"); - } - - return src; -} - -/////////////////////////////////////////////////////////////////////////// -////////////////////////////////// CopyTo ///////////////////////////////// -/////////////////////////////////////////////////////////////////////////// -// static void copy_to_with_mask_cus(const oclMat &src, oclMat &dst, const oclMat &mask, string kernelName) -// { -// CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols && -// src.rows == dst.rows && src.cols == dst.cols -// && mask.type() == CV_8UC1); - -// vector > args; - -// std::string string_types[4][7] = {{"uchar", "char", "ushort", "short", "int", "float", "double"}, -// {"uchar2", "char2", "ushort2", "short2", "int2", "float2", "double2"}, -// {"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"}, -// {"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"} -// }; -// char compile_option[32]; -// sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str()); -// size_t localThreads[3] = {16, 16, 1}; -// size_t globalThreads[3]; - -// globalThreads[0] = divUp(dst.cols, localThreads[0]) * localThreads[0]; -// globalThreads[1] = divUp(dst.rows, localThreads[1]) * localThreads[1]; -// globalThreads[2] = 1; - -// int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize(); -// int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize(); - -// args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data )); -// args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); -// args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.step )); -// args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset )); - -// openCLExecuteKernel2(dst.clCxt , &operator_copyToM, kernelName, globalThreads, -// localThreads, args, -1, -1, compile_option, CLFLUSH); -// } - -static void copyTo(const oclMat &src, oclMat &m ) -{ - CV_DbgAssert(!src.empty()); - m.create(src.size(), src.type()); - openCLCopyBuffer2D(src.clCxt, m.data, m.step, m.offset, - src.data, src.step, src.cols * src.elemSize(), src.rows, src.offset); -} - -// static void copyTo(const oclMat &src, oclMat &mat, const oclMat &mask) -// { -// if (mask.empty()) -// { -// copyTo(src, mat); -// } -// else -// { -// mat.create(src.size(), src.type()); -// copy_to_with_mask_cus(src, mat, mask, "copy_to_with_mask"); -// } -// } - -static void arithmetic_run(const oclMat &src1, oclMat &dst, string kernelName, const char **kernelString, void *_scalar) +static void multiply_cus(const oclMat &src1, oclMat &dst, float scalar) { if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F) { @@ -470,9 +110,6 @@ static void arithmetic_run(const oclMat &src1, oclMat &dst, string kernelName, c return; } - //dst.create(src1.size(), src1.type()); - //CV_Assert(src1.cols == src2.cols && src2.cols == dst.cols && - // src1.rows == src2.rows && src2.rows == dst.rows); CV_Assert(src1.cols == dst.cols && src1.rows == dst.rows); @@ -480,24 +117,8 @@ static void arithmetic_run(const oclMat &src1, oclMat &dst, string kernelName, c CV_Assert(src1.depth() != CV_8S); Context *clCxt = src1.clCxt; - //int channels = dst.channels(); - //int depth = dst.depth(); - - //int vector_lengths[4][7] = {{4, 0, 4, 4, 1, 1, 1}, - // {4, 0, 4, 4, 1, 1, 1}, - // {4, 0, 4, 4, 1, 1, 1}, - // {4, 0, 4, 4, 1, 1, 1} - //}; - - //size_t vector_length = vector_lengths[channels-1][depth]; - //int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1); - //int cols = divUp(dst.cols * channels + offset_cols, vector_length); size_t localThreads[3] = { 16, 16, 1 }; - //size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], - // divUp(dst.rows, localThreads[1]) * localThreads[1], - // 1 - // }; size_t globalThreads[3] = { src1.cols, src1.rows, 1 @@ -508,67 +129,20 @@ static void arithmetic_run(const oclMat &src1, oclMat &dst, string kernelName, c args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset )); - //args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); - //args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step )); - //args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset )); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); + args.push_back( make_pair( sizeof(float), (float *)&scalar )); - //if(_scalar != NULL) - //{ - float scalar1 = *((float *)_scalar); - args.push_back( make_pair( sizeof(float), (float *)&scalar1 )); - //} - - openCLExecuteKernel2(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, src1.depth(), CLFLUSH); -} - -static void multiply_cus(const oclMat &src1, oclMat &dst, float scalar) -{ - arithmetic_run(src1, dst, "arithm_muls", &arithm_mul, (void *)(&scalar)); -} - -static void pyrdown_run_cus(const oclMat &src, const oclMat &dst) -{ - - CV_Assert(src.type() == dst.type()); - CV_Assert(src.depth() != CV_8S); - - Context *clCxt = src.clCxt; - - string kernelName = "pyrDown"; - - size_t localThreads[3] = { 256, 1, 1 }; - size_t globalThreads[3] = { src.cols, dst.rows, 1}; - - vector > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols)); - - openCLExecuteKernel2(clCxt, &pyr_down, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth(), CLFLUSH); -} - -static void pyrDown_cus(const oclMat &src, oclMat &dst) -{ - CV_Assert(src.depth() <= CV_32F && src.channels() <= 4); - - dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type()); - - pyrdown_run_cus(src, dst); + openCLExecuteKernel(clCxt, &arithm_mul, "arithm_muls", globalThreads, localThreads, args, -1, src1.depth()); } static void lkSparse_run(oclMat &I, oclMat &J, - const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat& err, bool /*GET_MIN_EIGENVALS*/, int ptcount, - int level, /*dim3 block, */dim3 patch, Size winSize, int iters) + const oclMat &prevPts, oclMat &nextPts, oclMat &status, oclMat& err, bool /*GET_MIN_EIGENVALS*/, int ptcount, + int level, dim3 patch, Size winSize, int iters) { Context *clCxt = I.clCxt; int elemCntPerRow = I.step / I.elemSize(); @@ -603,7 +177,7 @@ static void lkSparse_run(oclMat &I, oclMat &J, args.push_back( make_pair( sizeof(cl_int), (void *)&level )); args.push_back( make_pair( sizeof(cl_int), (void *)&I.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&I.cols )); - if (!isImageSupported) + if (!isImageSupported) args.push_back( make_pair( sizeof(cl_int), (void *)&elemCntPerRow ) ); args.push_back( make_pair( sizeof(cl_int), (void *)&patch.x )); args.push_back( make_pair( sizeof(cl_int), (void *)&patch.y )); @@ -613,15 +187,24 @@ static void lkSparse_run(oclMat &I, oclMat &J, args.push_back( make_pair( sizeof(cl_int), (void *)&iters )); args.push_back( make_pair( sizeof(cl_char), (void *)&calcErr )); - if(isImageSupported) + if (clCxt->supportsFeature(Context::CL_CPU)) { - openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); + openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), (char*)" -D CPU"); releaseTexture(ITex); releaseTexture(JTex); } else { - openCLExecuteKernel2(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); + if(isImageSupported) + { + openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth()); + releaseTexture(ITex); + releaseTexture(JTex); + } + else + { + openCLExecuteKernel(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth()); + } } } @@ -631,7 +214,7 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next { nextPts.release(); status.release(); - //if (err) err->release(); + if (err) err->release(); return; } @@ -657,13 +240,11 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next oclMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1); oclMat temp2 = nextPts.reshape(1); - //oclMat scalar(temp1.rows, temp1.cols, temp1.type(), Scalar(1.0f / (1 << maxLevel) / 2.0f)); multiply_cus(temp1, temp2, 1.0f / (1 << maxLevel) / 2.0f); //::multiply(temp1, 1.0f / (1 << maxLevel) / 2.0f, temp2); ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status); - //status.setTo(Scalar::all(1)); - setTo(status, Scalar::all(1)); + status.setTo(Scalar::all(1)); bool errMat = false; if (!err) @@ -673,7 +254,6 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next } else ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err); - //ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, err); // build the image pyramids. @@ -682,25 +262,14 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next if (cn == 1 || cn == 4) { - //prevImg.convertTo(prevPyr_[0], CV_32F); - //nextImg.convertTo(nextPyr_[0], CV_32F); - convertTo(prevImg, prevPyr_[0], CV_32F); - convertTo(nextImg, nextPyr_[0], CV_32F); - } - else - { - //oclMat buf_; - // cvtColor(prevImg, buf_, COLOR_BGR2BGRA); - // buf_.convertTo(prevPyr_[0], CV_32F); - - // cvtColor(nextImg, buf_, COLOR_BGR2BGRA); - // buf_.convertTo(nextPyr_[0], CV_32F); + prevImg.convertTo(prevPyr_[0], CV_32F); + nextImg.convertTo(nextPyr_[0], CV_32F); } for (int level = 1; level <= maxLevel; ++level) { - pyrDown_cus(prevPyr_[level - 1], prevPyr_[level]); - pyrDown_cus(nextPyr_[level - 1], nextPyr_[level]); + pyrDown(prevPyr_[level - 1], prevPyr_[level]); + pyrDown(nextPyr_[level - 1], nextPyr_[level]); } // dI/dx ~ Ix, dI/dy ~ Iy @@ -709,17 +278,15 @@ void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat &prevImg, const oclMat &next { lkSparse_run(prevPyr_[level], nextPyr_[level], prevPts, nextPts, status, *err, getMinEigenVals, prevPts.cols, - level, /*block, */patch, winSize, iters); + level, patch, winSize, iters); } - clFinish((cl_command_queue)prevImg.clCxt->oclCommandQueue()); - if(errMat) delete err; } static void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v, - oclMat &prevU, oclMat &prevV, oclMat *err, Size winSize, int iters) + oclMat &prevU, oclMat &prevV, oclMat *err, Size winSize, int iters) { Context *clCxt = I.clCxt; bool isImageSupported = support_image2d(); @@ -754,11 +321,6 @@ static void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v, JTex = (cl_mem)J.data; } - //int2 halfWin = {(winSize.width - 1) / 2, (winSize.height - 1) / 2}; - //const int patchWidth = 16 + 2 * halfWin.x; - //const int patchHeight = 16 + 2 * halfWin.y; - //size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int); - vector > args; args.push_back( make_pair( sizeof(cl_mem), (void *)&ITex )); @@ -787,15 +349,14 @@ static void lkDense_run(oclMat &I, oclMat &J, oclMat &u, oclMat &v, if (isImageSupported) { - openCLExecuteKernel2(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); + openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth()); releaseTexture(ITex); releaseTexture(JTex); } else { - //printf("Warning: The image2d_t is not supported by the device. Using alternative method!\n"); - openCLExecuteKernel2(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth(), CLFLUSH); + openCLExecuteKernel(clCxt, &pyrlk_no_image, kernelName, globalThreads, localThreads, args, I.oclchannels(), I.depth()); } } @@ -813,23 +374,20 @@ void cv::ocl::PyrLKOpticalFlow::dense(const oclMat &prevImg, const oclMat &nextI nextPyr_.resize(maxLevel + 1); prevPyr_[0] = prevImg; - //nextImg.convertTo(nextPyr_[0], CV_32F); - convertTo(nextImg, nextPyr_[0], CV_32F); + nextImg.convertTo(nextPyr_[0], CV_32F); for (int level = 1; level <= maxLevel; ++level) { - pyrDown_cus(prevPyr_[level - 1], prevPyr_[level]); - pyrDown_cus(nextPyr_[level - 1], nextPyr_[level]); + pyrDown(prevPyr_[level - 1], prevPyr_[level]); + pyrDown(nextPyr_[level - 1], nextPyr_[level]); } ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]); ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]); ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]); ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]); - //uPyr_[1].setTo(Scalar::all(0)); - //vPyr_[1].setTo(Scalar::all(0)); - setTo(uPyr_[1], Scalar::all(0)); - setTo(vPyr_[1], Scalar::all(0)); + uPyr_[1].setTo(Scalar::all(0)); + vPyr_[1].setTo(Scalar::all(0)); Size winSize2i(winSize.width, winSize.height); @@ -846,10 +404,6 @@ void cv::ocl::PyrLKOpticalFlow::dense(const oclMat &prevImg, const oclMat &nextI idx = idx2; } - //uPyr_[idx].copyTo(u); - //vPyr_[idx].copyTo(v); - copyTo(uPyr_[idx], u); - copyTo(vPyr_[idx], v); - - clFinish((cl_command_queue)prevImg.clCxt->oclCommandQueue()); + uPyr_[idx].copyTo(u); + vPyr_[idx].copyTo(v); } -- GitLab