From 803f86b9ca16a92d72ba951bbb0e8cc139435170 Mon Sep 17 00:00:00 2001 From: Tomoaki Teshima Date: Mon, 15 Oct 2018 22:57:08 +0900 Subject: [PATCH] fix test failure of PyrLKOpticalFlow.Mat * remove race condition * upload _prevPts to OpenCL device explicitly before calling "sparse" --- modules/video/src/lkpyramid.cpp | 25 ++------ modules/video/src/opencl/pyrlk.cl | 97 ++++++++++++++++--------------- 2 files changed, 56 insertions(+), 66 deletions(-) diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp index 40026cd3c1..db13082145 100644 --- a/modules/video/src/lkpyramid.cpp +++ b/modules/video/src/lkpyramid.cpp @@ -814,7 +814,7 @@ namespace double minEigThreshold_ = 1e-4) : winSize(winSize_), maxLevel(maxLevel_), criteria(criteria_), flags(flags_), minEigThreshold(minEigThreshold_) #ifdef HAVE_OPENCL - , iters(criteria_.maxCount), derivLambda(criteria_.epsilon), useInitialFlow(0 != (flags_ & OPTFLOW_LK_GET_MIN_EIGENVALS)), waveSize(0) + , iters(criteria_.maxCount), derivLambda(criteria_.epsilon), useInitialFlow(0 != (flags_ & OPTFLOW_LK_GET_MIN_EIGENVALS)) #endif { } @@ -856,8 +856,6 @@ namespace calcPatchSize(); if (patch.x <= 0 || patch.x >= 6 || patch.y <= 0 || patch.y >= 6) return false; - if (!initWaveSize()) - return false; return true; } @@ -926,19 +924,6 @@ namespace int iters; double derivLambda; bool useInitialFlow; - int waveSize; - bool initWaveSize() - { - waveSize = 1; - if (isDeviceCPU()) - return true; - - ocl::Kernel kernel; - if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, "")) - return false; - waveSize = (int)kernel.preferedWorkGroupSizeMultiple(); - return true; - } dim3 patch; void calcPatchSize() { @@ -977,8 +962,8 @@ namespace if (isDeviceCPU()) build_options = " -D CPU"; else - build_options = cv::format("-D WAVE_SIZE=%d -D WSX=%d -D WSY=%d", - waveSize, wsx, wsy); + build_options = cv::format("-D WSX=%d -D WSY=%d", + wsx, wsy); ocl::Kernel kernel; if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, build_options)) @@ -1064,7 +1049,9 @@ namespace _status.create((int)npoints, 1, CV_8UC1); UMat umatNextPts = _nextPts.getUMat(); UMat umatStatus = _status.getUMat(); - return sparse(_prevImg.getUMat(), _nextImg.getUMat(), _prevPts.getUMat(), umatNextPts, umatStatus, umatErr); + UMat umatPrevPts; + _prevPts.getMat().copyTo(umatPrevPts); + return sparse(_prevImg.getUMat(), _nextImg.getUMat(), umatPrevPts, umatNextPts, umatStatus, umatErr); } #endif diff --git a/modules/video/src/opencl/pyrlk.cl b/modules/video/src/opencl/pyrlk.cl index e1d934a779..c960171ead 100644 --- a/modules/video/src/opencl/pyrlk.cl +++ b/modules/video/src/opencl/pyrlk.cl @@ -53,9 +53,6 @@ #define LM_H (LSy*GRIDSIZE+2) #define BUFFER (LSx*LSy) #define BUFFER2 BUFFER>>1 -#ifndef WAVE_SIZE -#define WAVE_SIZE 1 -#endif #ifdef CPU @@ -78,7 +75,7 @@ inline void reduce3(float val1, float val2, float val3, __local float* smem1, } } -inline void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid) +inline void reduce2(float val1, float val2, __local float* smem1, __local float* smem2, int tid) { smem1[tid] = val1; smem2[tid] = val2; @@ -95,7 +92,7 @@ inline void reduce2(float val1, float val2, volatile __local float* smem1, volat } } -inline void reduce1(float val1, volatile __local float* smem1, int tid) +inline void reduce1(float val1, __local float* smem1, int tid) { smem1[tid] = val1; barrier(CLK_LOCAL_MEM_FENCE); @@ -111,7 +108,7 @@ inline void reduce1(float val1, volatile __local float* smem1, int tid) } #else inline void reduce3(float val1, float val2, float val3, - __local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid) + __local float* smem1, __local float* smem2, __local float* smem3, int tid) { smem1[tid] = val1; smem2[tid] = val2; @@ -123,38 +120,39 @@ inline void reduce3(float val1, float val2, float val3, smem1[tid] += smem1[tid + 32]; smem2[tid] += smem2[tid + 32]; smem3[tid] += smem3[tid + 32]; -#if WAVE_SIZE < 32 } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 16) { -#endif smem1[tid] += smem1[tid + 16]; smem2[tid] += smem2[tid + 16]; smem3[tid] += smem3[tid + 16]; -#if WAVE_SIZE <16 } barrier(CLK_LOCAL_MEM_FENCE); - if (tid<1) + if (tid < 8) { -#endif - local float8* m1 = (local float8*)smem1; - local float8* m2 = (local float8*)smem2; - local float8* m3 = (local float8*)smem3; - float8 t1 = m1[0]+m1[1]; - float8 t2 = m2[0]+m2[1]; - float8 t3 = m3[0]+m3[1]; - float4 t14 = t1.lo + t1.hi; - float4 t24 = t2.lo + t2.hi; - float4 t34 = t3.lo + t3.hi; - smem1[0] = t14.x+t14.y+t14.z+t14.w; - smem2[0] = t24.x+t24.y+t24.z+t24.w; - smem3[0] = t34.x+t34.y+t34.z+t34.w; + smem1[tid] += smem1[tid + 8]; + smem2[tid] += smem2[tid + 8]; + smem3[tid] += smem3[tid + 8]; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 4) + { + smem1[tid] += smem1[tid + 4]; + smem2[tid] += smem2[tid + 4]; + smem3[tid] += smem3[tid + 4]; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid == 0) + { + smem1[0] = (smem1[0] + smem1[1]) + (smem1[2] + smem1[3]); + smem2[0] = (smem2[0] + smem2[1]) + (smem2[2] + smem2[3]); + smem3[0] = (smem3[0] + smem3[1]) + (smem3[2] + smem3[3]); } barrier(CLK_LOCAL_MEM_FENCE); } -inline void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid) +inline void reduce2(float val1, float val2, __local float* smem1, __local float* smem2, int tid) { smem1[tid] = val1; smem2[tid] = val2; @@ -164,33 +162,35 @@ inline void reduce2(float val1, float val2, __local volatile float* smem1, __loc { smem1[tid] += smem1[tid + 32]; smem2[tid] += smem2[tid + 32]; -#if WAVE_SIZE < 32 } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 16) { -#endif smem1[tid] += smem1[tid + 16]; smem2[tid] += smem2[tid + 16]; -#if WAVE_SIZE <16 } barrier(CLK_LOCAL_MEM_FENCE); - if (tid<1) + if (tid < 8) { -#endif - local float8* m1 = (local float8*)smem1; - local float8* m2 = (local float8*)smem2; - float8 t1 = m1[0]+m1[1]; - float8 t2 = m2[0]+m2[1]; - float4 t14 = t1.lo + t1.hi; - float4 t24 = t2.lo + t2.hi; - smem1[0] = t14.x+t14.y+t14.z+t14.w; - smem2[0] = t24.x+t24.y+t24.z+t24.w; + smem1[tid] += smem1[tid + 8]; + smem2[tid] += smem2[tid + 8]; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 4) + { + smem1[tid] += smem1[tid + 4]; + smem2[tid] += smem2[tid + 4]; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid == 0) + { + smem1[0] = (smem1[0] + smem1[1]) + (smem1[2] + smem1[3]); + smem2[0] = (smem2[0] + smem2[1]) + (smem2[2] + smem2[3]); } barrier(CLK_LOCAL_MEM_FENCE); } -inline void reduce1(float val1, __local volatile float* smem1, int tid) +inline void reduce1(float val1, __local float* smem1, int tid) { smem1[tid] = val1; barrier(CLK_LOCAL_MEM_FENCE); @@ -198,23 +198,26 @@ inline void reduce1(float val1, __local volatile float* smem1, int tid) if (tid < 32) { smem1[tid] += smem1[tid + 32]; -#if WAVE_SIZE < 32 } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 16) { -#endif smem1[tid] += smem1[tid + 16]; -#if WAVE_SIZE <16 } barrier(CLK_LOCAL_MEM_FENCE); - if (tid<1) + if (tid < 8) { -#endif - local float8* m1 = (local float8*)smem1; - float8 t1 = m1[0]+m1[1]; - float4 t14 = t1.lo + t1.hi; - smem1[0] = t14.x+t14.y+t14.z+t14.w; + smem1[tid] += smem1[tid + 8]; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid < 4) + { + smem1[tid] += smem1[tid + 4]; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (tid == 0) + { + smem1[0] = (smem1[0] + smem1[1]) + (smem1[2] + smem1[3]); } barrier(CLK_LOCAL_MEM_FENCE); } -- GitLab