From 78e89890b0b4be6827dc8123316799856dd28eaa Mon Sep 17 00:00:00 2001 From: yao Date: Mon, 17 Sep 2012 09:48:34 +0800 Subject: [PATCH] add PyrLK to ocl module --- .../include/opencv2/ocl/matrix_operations.hpp | 12 + modules/ocl/include/opencv2/ocl/ocl.hpp | 60 ++ modules/ocl/src/kernels/pyrlk.cl | 831 ++++++++++++++++++ modules/ocl/src/matrix_operations.cpp | 1 + modules/ocl/src/pyrlk.cpp | 597 +++++++++++++ modules/ocl/test/test_pyrlk.cpp | 167 ++++ 6 files changed, 1668 insertions(+) create mode 100644 modules/ocl/src/kernels/pyrlk.cl create mode 100644 modules/ocl/src/pyrlk.cpp create mode 100644 modules/ocl/test/test_pyrlk.cpp diff --git a/modules/ocl/include/opencv2/ocl/matrix_operations.hpp b/modules/ocl/include/opencv2/ocl/matrix_operations.hpp index 06eec380a4..89fce5a643 100644 --- a/modules/ocl/include/opencv2/ocl/matrix_operations.hpp +++ b/modules/ocl/include/opencv2/ocl/matrix_operations.hpp @@ -460,6 +460,18 @@ namespace cv a.swap(b); } + inline void ensureSizeIsEnough(int rows, int cols, int type, oclMat& m) + { + if (m.type() == type && m.rows >= rows && m.cols >= cols) + m = m(Rect(0, 0, cols, rows)); + else + m.create(rows, cols, type); + } + + inline void ensureSizeIsEnough(Size size, int type, oclMat& m) + { + ensureSizeIsEnough(size.height, size.width, type, m); + } } /* end of namespace ocl */ } /* end of namespace cv */ diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index 8ac49373cc..b0d6d83c93 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -1125,6 +1125,66 @@ namespace cv explicit BruteForceMatcher_OCL(Hamming /*d*/) : BruteForceMatcher_OCL_base(HammingDist) {} }; + /////////////////////////////// PyrLKOpticalFlow ///////////////////////////////////// + class CV_EXPORTS PyrLKOpticalFlow + { + public: + PyrLKOpticalFlow() + { + winSize = Size(21, 21); + maxLevel = 3; + iters = 30; + derivLambda = 0.5; + useInitialFlow = false; + minEigThreshold = 1e-4f; + getMinEigenVals = false; + isDeviceArch11_ = false; + } + + void sparse(const oclMat& prevImg, const oclMat& nextImg, const oclMat& prevPts, oclMat& nextPts, + oclMat& status, oclMat* err = 0); + + void dense(const oclMat& prevImg, const oclMat& nextImg, oclMat& u, oclMat& v, oclMat* err = 0); + + Size winSize; + int maxLevel; + int iters; + double derivLambda; + bool useInitialFlow; + float minEigThreshold; + bool getMinEigenVals; + + void releaseMemory() + { + dx_calcBuf_.release(); + dy_calcBuf_.release(); + + prevPyr_.clear(); + nextPyr_.clear(); + + dx_buf_.release(); + dy_buf_.release(); + } + + private: + void calcSharrDeriv(const oclMat& src, oclMat& dx, oclMat& dy); + + void buildImagePyramid(const oclMat& img0, vector& pyr, bool withBorder); + + oclMat dx_calcBuf_; + oclMat dy_calcBuf_; + + vector prevPyr_; + vector nextPyr_; + + oclMat dx_buf_; + oclMat dy_buf_; + + oclMat uPyr_[2]; + oclMat vPyr_[2]; + + bool isDeviceArch11_; + }; } } diff --git a/modules/ocl/src/kernels/pyrlk.cl b/modules/ocl/src/kernels/pyrlk.cl new file mode 100644 index 0000000000..15469f8703 --- /dev/null +++ b/modules/ocl/src/kernels/pyrlk.cl @@ -0,0 +1,831 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. 
+// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// @Authors +// Dachuan Zhao, dachuan@multicorewareinc.com +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other oclMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +//#pragma OPENCL EXTENSION cl_amd_printf : enable + + +__kernel void calcSharrDeriv_vertical_C1_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (y < rows && x < cols * cn) + { + const uchar src_val0 = (src + (y > 0 ? y-1 : rows > 1 ? 1 : 0) * srcStep)[x]; + const uchar src_val1 = (src + y * srcStep)[x]; + const uchar src_val2 = (src + (y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0) * srcStep)[x]; + + ((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10; + ((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0; + } +} + +__kernel void calcSharrDeriv_vertical_C4_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (y < rows && x < cols * cn) + { + const uchar src_val0 = (src + (y > 0 ? y - 1 : 1) * srcStep)[x]; + const uchar src_val1 = (src + y * srcStep)[x]; + const uchar src_val2 = (src + (y < rows - 1 ? 
y + 1 : rows - 2) * srcStep)[x]; + + ((__global short*)((__global char*)dx_buf + y * dx_bufStep / 2))[x] = (src_val0 + src_val2) * 3 + src_val1 * 10; + ((__global short*)((__global char*)dy_buf + y * dy_bufStep / 2))[x] = src_val2 - src_val0; + } +} + +__kernel void calcSharrDeriv_horizontal_C1_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + const int colsn = cols * cn; + + if (y < rows && x < colsn) + { + __global const short* dx_buf_row = dx_buf + y * dx_bufStep; + __global const short* dy_buf_row = dy_buf + y * dy_bufStep; + + const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn; + const int xl = x - cn >= 0 ? x - cn : cn + x; + + ((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl]; + ((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10; + } +} + +__kernel void calcSharrDeriv_horizontal_C4_D0(int rows, int cols, int cn, __global const short* dx_buf, int dx_bufStep, __global const short* dy_buf, int dy_bufStep, __global short* dIdx, int dIdxStep, __global short* dIdy, int dIdyStep) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + const int colsn = cols * cn; + + if (y < rows && x < colsn) + { + __global const short* dx_buf_row = dx_buf + y * dx_bufStep; + __global const short* dy_buf_row = dy_buf + y * dy_bufStep; + + const int xr = x + cn < colsn ? x + cn : (cols - 2) * cn + x + cn - colsn; + const int xl = x - cn >= 0 ? x - cn : cn + x; + + ((__global short*)((__global char*)dIdx + y * dIdxStep / 2))[x] = dx_buf_row[xr] - dx_buf_row[xl]; + ((__global short*)((__global char*)dIdy + y * dIdyStep / 2))[x] = (dy_buf_row[xr] + dy_buf_row[xl]) * 3 + dy_buf_row[x] * 10; + } +} + +#define W_BITS 14 +#define W_BITS1 14 + +#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n)) + +int linearFilter_uchar(__global const uchar* src, int srcStep, int cn, float2 pt, int x, int y) +{ + int2 ipt; + ipt.x = convert_int_sat_rtn(pt.x); + ipt.y = convert_int_sat_rtn(pt.y); + + float a = pt.x - ipt.x; + float b = pt.y - ipt.y; + + int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS)); + int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS)); + int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + __global const uchar* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn; + __global const uchar* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn; + + return CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1 - 5); +} + +int linearFilter_short(__global const short* src, int srcStep, int cn, float2 pt, int x, int y) +{ + int2 ipt; + ipt.x = convert_int_sat_rtn(pt.x); + ipt.y = convert_int_sat_rtn(pt.y); + + float a = pt.x - ipt.x; + float b = pt.y - ipt.y; + + int iw00 = convert_int_sat_rte((1.0f - a) * (1.0f - b) * (1 << W_BITS)); + int iw01 = convert_int_sat_rte(a * (1.0f - b) * (1 << W_BITS)); + int iw10 = convert_int_sat_rte((1.0f - a) * b * (1 << W_BITS)); + int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + __global const short* src_row = src + (ipt.y + y) * srcStep + ipt.x * cn; + __global const short* src_row1 = src + (ipt.y + y + 1) * srcStep + ipt.x * cn; + + return 
CV_DESCALE(src_row[x] * iw00 + src_row[x + cn] * iw01 + src_row1[x] * iw10 + src_row1[x + cn] * iw11, W_BITS1); +} + +float linearFilter_float(__global const float* src, int srcStep, int cn, float2 pt, float x, float y) +{ + int2 ipt; + ipt.x = convert_int_sat_rtn(pt.x); + ipt.y = convert_int_sat_rtn(pt.y); + + float a = pt.x - ipt.x; + float b = pt.y - ipt.y; + + float iw00 = ((1.0f - a) * (1.0f - b) * (1 << W_BITS)); + float iw01 = (a * (1.0f - b) * (1 << W_BITS)); + float iw10 = ((1.0f - a) * b * (1 << W_BITS)); + float iw11 = (1 << W_BITS) - iw00 - iw01 - iw10; + + __global const float* src_row = src + (int)(ipt.y + y) * srcStep / 4 + ipt.x * cn; + __global const float* src_row1 = src + (int)(ipt.y + y + 1) * srcStep / 4 + ipt.x * cn; + + return src_row[(int)x] * iw00 + src_row[(int)x + cn] * iw01 + src_row1[(int)x] * iw10 + src_row1[(int)x + cn] * iw11, W_BITS1 - 5; +} + +void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid) +{ + smem1[tid] = val1; + smem2[tid] = val2; + smem3[tid] = val3; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 128) + { + smem1[tid] = val1 += smem1[tid + 128]; + smem2[tid] = val2 += smem2[tid + 128]; + smem3[tid] = val3 += smem3[tid + 128]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 64) + { + smem1[tid] = val1 += smem1[tid + 64]; + smem2[tid] = val2 += smem2[tid + 64]; + smem3[tid] = val3 += smem3[tid + 64]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 32) + { + volatile __local float* vmem1 = smem1; + volatile __local float* vmem2 = smem2; + volatile __local float* vmem3 = smem3; + + vmem1[tid] = val1 += vmem1[tid + 32]; + vmem2[tid] = val2 += vmem2[tid + 32]; + vmem3[tid] = val3 += vmem3[tid + 32]; + + vmem1[tid] = val1 += vmem1[tid + 16]; + vmem2[tid] = val2 += vmem2[tid + 16]; + vmem3[tid] = val3 += vmem3[tid + 16]; + + vmem1[tid] = val1 += vmem1[tid + 8]; + vmem2[tid] = val2 += vmem2[tid + 8]; + vmem3[tid] = val3 += vmem3[tid + 8]; + + vmem1[tid] = val1 += vmem1[tid + 4]; + vmem2[tid] = val2 += vmem2[tid + 4]; + vmem3[tid] = val3 += vmem3[tid + 4]; + + vmem1[tid] = val1 += vmem1[tid + 2]; + vmem2[tid] = val2 += vmem2[tid + 2]; + vmem3[tid] = val3 += vmem3[tid + 2]; + + vmem1[tid] = val1 += vmem1[tid + 1]; + vmem2[tid] = val2 += vmem2[tid + 1]; + vmem3[tid] = val3 += vmem3[tid + 1]; + } +} + +void reduce2(float val1, float val2, __local float* smem1, __local float* smem2, int tid) +{ + smem1[tid] = val1; + smem2[tid] = val2; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 128) + { + smem1[tid] = val1 += smem1[tid + 128]; + smem2[tid] = val2 += smem2[tid + 128]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 64) + { + smem1[tid] = val1 += smem1[tid + 64]; + smem2[tid] = val2 += smem2[tid + 64]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 32) + { + volatile __local float* vmem1 = smem1; + volatile __local float* vmem2 = smem2; + + vmem1[tid] = val1 += vmem1[tid + 32]; + vmem2[tid] = val2 += vmem2[tid + 32]; + + vmem1[tid] = val1 += vmem1[tid + 16]; + vmem2[tid] = val2 += vmem2[tid + 16]; + + vmem1[tid] = val1 += vmem1[tid + 8]; + vmem2[tid] = val2 += vmem2[tid + 8]; + + vmem1[tid] = val1 += vmem1[tid + 4]; + vmem2[tid] = val2 += vmem2[tid + 4]; + + vmem1[tid] = val1 += vmem1[tid + 2]; + vmem2[tid] = val2 += vmem2[tid + 2]; + + vmem1[tid] = val1 += vmem1[tid + 1]; + vmem2[tid] = val2 += vmem2[tid + 1]; + } +} + +void reduce1(float val1, __local float* smem1, int tid) +{ + smem1[tid] = val1; + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 128) + { + smem1[tid] = val1 += 
smem1[tid + 128]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 64) + { + smem1[tid] = val1 += smem1[tid + 64]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (tid < 32) + { + volatile __local float* vmem1 = smem1; + + vmem1[tid] = val1 += vmem1[tid + 32]; + vmem1[tid] = val1 += vmem1[tid + 16]; + vmem1[tid] = val1 += vmem1[tid + 8]; + vmem1[tid] = val1 += vmem1[tid + 4]; + vmem1[tid] = val1 += vmem1[tid + 2]; + vmem1[tid] = val1 += vmem1[tid + 1]; + } +} + +#define SCALE (1.0f / (1 << 20)) + +// Image read mode +__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR; + +__kernel void lkSparse_C1_D5(image2d_t I, image2d_t J, + __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status/*, __global float* err*/, const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr, char GET_MIN_EIGENVALS) +{ + __local float smem1[256]; + __local float smem2[256]; + __local float smem3[256]; + + int c_halfWin_x = (c_winSize_x - 1) / 2; + int c_halfWin_y = (c_winSize_y - 1) / 2; + + const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); + + float2 prevPt = prevPts[get_group_id(0)]; + prevPt.x *= (1.0f / (1 << level)); + prevPt.y *= (1.0f / (1 << level)); + + if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows) + { + if (level == 0 && tid == 0) + { + status[get_group_id(0)] = 0; + + //if (calcErr) + // err[get_group_id(0)] = 0; + } + + return; + } + + prevPt.x -= c_halfWin_x; + prevPt.y -= c_halfWin_y; + + // extract the patch from the first image, compute covariation matrix of derivatives + + float A11 = 0; + float A12 = 0; + float A22 = 0; + + float I_patch[21][21]; + float dIdx_patch[21][21]; + float dIdy_patch[21][21]; + + for (int yBase = get_local_id(1), i = 0; yBase < c_winSize_y; yBase += get_local_size(1), ++i) + { + for (int xBase = get_local_id(0), j = 0; xBase < c_winSize_x; xBase += get_local_size(0), ++j) + { + float x = (prevPt.x + xBase + 0.5f); + float y = (prevPt.y + yBase + 0.5f); + + I_patch[i][j] = read_imagef(I, sampler, (float2)(x, y)).x; + + float dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x - + (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x); + + float dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x - + (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x); + + dIdx_patch[i][j] = dIdx; + dIdy_patch[i][j] = dIdy; + + A11 += dIdx * dIdx; + A12 += dIdx * dIdy; + A22 += dIdy * dIdy; + } + } + + reduce3(A11, A12, A22, smem1, smem2, smem3, tid); + barrier(CLK_LOCAL_MEM_FENCE); + + A11 = smem1[0]; + A12 = smem2[0]; + A22 = smem3[0]; + + float D = A11 * A22 - A12 * A12; + + //if (calcErr && GET_MIN_EIGENVALS && tid == 0) + // err[get_group_id(0)] = minEig; + + if (D < 1.192092896e-07f) + { + if (level == 0 && tid == 0) + status[get_group_id(0)] = 0; + + return; + } + + D = 1.f / D; + + A11 *= D; + A12 *= D; + A22 *= D; + + float2 nextPt = 
nextPts[get_group_id(0)]; + nextPt.x *= 2.0f; + nextPt.y *= 2.0f; + + nextPt.x -= c_halfWin_x; + nextPt.y -= c_halfWin_y; + + for (int k = 0; k < c_iters; ++k) + { + if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows) + { + if (tid == 0 && level == 0) + status[get_group_id(0)] = 0; + return; + } + + float b1 = 0; + float b2 = 0; + + for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i) + { + for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j) + { + float a = (nextPt.x + x + 0.5f); + float b = (nextPt.y + y + 0.5f); + + float I_val = I_patch[i][j]; + float J_val = read_imagef(J, sampler, (float2)(a, b)).x; + + float diff = (J_val - I_val) * 32.0f; + + b1 += diff * dIdx_patch[i][j]; + b2 += diff * dIdy_patch[i][j]; + } + } + + reduce2(b1, b2, smem1, smem2, tid); + barrier(CLK_LOCAL_MEM_FENCE); + + b1 = smem1[0]; + b2 = smem2[0]; + + float2 delta; + delta.x = A12 * b2 - A22 * b1; + delta.y = A12 * b1 - A11 * b2; + + nextPt.x += delta.x; + nextPt.y += delta.y; + + if (fabs(delta.x) < 0.01f && fabs(delta.y) < 0.01f) + break; + } + + float errval = 0.0f; + if (calcErr) + { + for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i) + { + for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j) + { + float a = (nextPt.x + x + 0.5f); + float b = (nextPt.y + y + 0.5f); + + float I_val = I_patch[i][j]; + float J_val = read_imagef(J, sampler, (float2)(a, b)).x; + + float diff = J_val - I_val; + + errval += fabs((float)diff); + } + } + + reduce1(errval, smem1, tid); + } + + if (tid == 0) + { + nextPt.x += c_halfWin_x; + nextPt.y += c_halfWin_y; + + nextPts[get_group_id(0)] = nextPt; + + //if (calcErr && !GET_MIN_EIGENVALS) + // err[get_group_id(0)] = errval; + } +} +__kernel void lkSparse_C4_D5(image2d_t I, image2d_t J, + __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status/*, __global float* err*/, const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int cn, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr, char GET_MIN_EIGENVALS) +{ + __local float smem1[256]; + __local float smem2[256]; + __local float smem3[256]; + + int c_halfWin_x = (c_winSize_x - 1) / 2; + int c_halfWin_y = (c_winSize_y - 1) / 2; + + const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0); + + float2 prevPt = prevPts[get_group_id(0)]; + prevPt.x *= (1.0f / (1 << level)); + prevPt.y *= (1.0f / (1 << level)); + + if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows) + { + if (level == 0 && tid == 0) + { + status[get_group_id(0)] = 0; + + //if (calcErr) + // err[get_group_id(0)] = 0; + } + + return; + } + + prevPt.x -= c_halfWin_x; + prevPt.y -= c_halfWin_y; + + // extract the patch from the first image, compute covariation matrix of derivatives + + float A11 = 0; + float A12 = 0; + float A22 = 0; + + float4 I_patch[21][21]; + float4 dIdx_patch[21][21]; + float4 dIdy_patch[21][21]; + + for (int yBase = get_local_id(1), i = 0; yBase < c_winSize_y; yBase += get_local_size(1), ++i) + { + for (int xBase = get_local_id(0), j = 0; xBase < c_winSize_x; xBase += get_local_size(0), ++j) + { + float x = (prevPt.x + xBase + 0.5f); + float y = (prevPt.y + yBase + 0.5f); + + I_patch[i][j] = read_imagef(I, sampler, (float2)(x, y)).x; + + float4 dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 
3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x - + (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x); + + float4 dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x - + (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x); + + dIdx_patch[i][j] = dIdx; + dIdy_patch[i][j] = dIdy; + + A11 += (dIdx * dIdx).x + (dIdx * dIdx).y + (dIdx * dIdx).z; + A12 += (dIdx * dIdy).x + (dIdx * dIdy).y + (dIdx * dIdy).z; + A22 += (dIdy * dIdy).x + (dIdy * dIdy).y + (dIdy * dIdy).z; + } + } + + reduce3(A11, A12, A22, smem1, smem2, smem3, tid); + barrier(CLK_LOCAL_MEM_FENCE); + + A11 = smem1[0]; + A12 = smem2[0]; + A22 = smem3[0]; + + float D = A11 * A22 - A12 * A12; + + //if (calcErr && GET_MIN_EIGENVALS && tid == 0) + // err[get_group_id(0)] = minEig; + + if (D < 1.192092896e-07f) + { + if (level == 0 && tid == 0) + status[get_group_id(0)] = 0; + + return; + } + + D = 1.f / D; + + A11 *= D; + A12 *= D; + A22 *= D; + + float2 nextPt = nextPts[get_group_id(0)]; + nextPt.x *= 2.0f; + nextPt.y *= 2.0f; + + nextPt.x -= c_halfWin_x; + nextPt.y -= c_halfWin_y; + + for (int k = 0; k < c_iters; ++k) + { + if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows) + { + if (tid == 0 && level == 0) + status[get_group_id(0)] = 0; + return; + } + + float b1 = 0; + float b2 = 0; + + for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i) + { + for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j) + { + float a = (nextPt.x + x + 0.5f); + float b = (nextPt.y + y + 0.5f); + + float4 I_val = I_patch[i][j]; + float4 J_val = read_imagef(J, sampler, (float2)(a, b)).x; + + float4 diff = (J_val - I_val) * 32.0f; + + b1 += (diff * dIdx_patch[i][j]).x + (diff * dIdx_patch[i][j]).y + (diff * dIdx_patch[i][j]).z; + b2 += (diff * dIdy_patch[i][j]).x + (diff * dIdy_patch[i][j]).y + (diff * dIdy_patch[i][j]).z; + } + } + + reduce2(b1, b2, smem1, smem2, tid); + barrier(CLK_LOCAL_MEM_FENCE); + + b1 = smem1[0]; + b2 = smem2[0]; + + float2 delta; + delta.x = A12 * b2 - A22 * b1; + delta.y = A12 * b1 - A11 * b2; + + nextPt.x += delta.x; + nextPt.y += delta.y; + + if (fabs(delta.x) < 0.01f && fabs(delta.y) < 0.01f) + break; + } + + float errval = 0.0f; + if (calcErr) + { + for (int y = get_local_id(1), i = 0; y < c_winSize_y; y += get_local_size(1), ++i) + { + for (int x = get_local_id(0), j = 0; x < c_winSize_x; x += get_local_size(0), ++j) + { + float a = (nextPt.x + x + 0.5f); + float b = (nextPt.y + y + 0.5f); + + float4 I_val = I_patch[i][j]; + float4 J_val = read_imagef(J, sampler, (float2)(a, b)).x; + + float4 diff = J_val - I_val; + + errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z); + } + } + + reduce1(errval, smem1, tid); + } + + if (tid == 0) + { + nextPt.x += c_halfWin_x; + nextPt.y += c_halfWin_y; + + nextPts[get_group_id(0)] = nextPt; + + //if (calcErr && !GET_MIN_EIGENVALS) + // err[get_group_id(0)] = errval; + } +} + +__kernel void lkDense_C1_D0(image2d_t I, image2d_t J, __global float* u, int uStep, __global float* v, int vStep, __global const float* prevU, int prevUStep, __global const float* prevV, int prevVStep, + const int rows, const int cols, 
/*__global float* err, int errStep, int cn,*/ int c_winSize_x, int c_winSize_y, int c_iters, char calcErr) +{ + int c_halfWin_x = (c_winSize_x - 1) / 2; + int c_halfWin_y = (c_winSize_y - 1) / 2; + + const int patchWidth = get_local_size(0) + 2 * c_halfWin_x; + const int patchHeight = get_local_size(1) + 2 * c_halfWin_y; + + __local int smem[8192]; + + __local int* I_patch = smem; + __local int* dIdx_patch = I_patch + patchWidth * patchHeight; + __local int* dIdy_patch = dIdx_patch + patchWidth * patchHeight; + + const int xBase = get_group_id(0) * get_local_size(0); + const int yBase = get_group_id(1) * get_local_size(1); + + sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; + + for (int i = get_local_id(1); i < patchHeight; i += get_local_size(1)) + { + for (int j = get_local_id(0); j < patchWidth; j += get_local_size(0)) + { + float x = xBase - c_halfWin_x + j + 0.5f; + float y = yBase - c_halfWin_y + i + 0.5f; + + I_patch[i * patchWidth + j] = read_imagei(I, sampleri, (float2)(x, y)).x; + + // Sharr Deriv + + dIdx_patch[i * patchWidth + j] = 3 * read_imagei(I, sampleri, (float2)(x+1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x+1, y)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y+1)).x - + (3 * read_imagei(I, sampleri, (float2)(x-1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x-1, y)).x + 3 * read_imagei(I, sampleri, (float2)(x-1, y+1)).x); + + dIdy_patch[i * patchWidth + j] = 3 * read_imagei(I, sampleri, (float2)(x-1, y+1)).x + 10 * read_imagei(I, sampleri, (float2)(x, y+1)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y+1)).x - + (3 * read_imagei(I, sampleri, (float2)(x-1, y-1)).x + 10 * read_imagei(I, sampleri, (float2)(x, y-1)).x + 3 * read_imagei(I, sampleri, (float2)(x+1, y-1)).x); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + // extract the patch from the first image, compute covariation matrix of derivatives + + const int x = get_global_id(0); + const int y = get_global_id(1); + + if (x >= cols || y >= rows) + return; + + int A11i = 0; + int A12i = 0; + int A22i = 0; + + for (int i = 0; i < c_winSize_y; ++i) + { + for (int j = 0; j < c_winSize_x; ++j) + { + int dIdx = dIdx_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)]; + int dIdy = dIdy_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)]; + + A11i += dIdx * dIdx; + A12i += dIdx * dIdy; + A22i += dIdy * dIdy; + } + } + + float A11 = A11i; + float A12 = A12i; + float A22 = A22i; + + float D = A11 * A22 - A12 * A12; + + //if (calcErr && GET_MIN_EIGENVALS) + // (err + y * errStep)[x] = minEig; + + if (D < 1.192092896e-07f) + { + //if (calcErr) + // err(y, x) = 3.402823466e+38f; + + return; + } + + D = 1.f / D; + + A11 *= D; + A12 *= D; + A22 *= D; + + float2 nextPt; + nextPt.x = x + prevU[y/2 * prevUStep / 4 + x/2] * 2.0f; + nextPt.y = y + prevV[y/2 * prevVStep / 4 + x/2] * 2.0f; + + for (int k = 0; k < c_iters; ++k) + { + if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows) + { + //if (calcErr) + // err(y, x) = 3.402823466e+38f; + + return; + } + + int b1 = 0; + int b2 = 0; + + for (int i = 0; i < c_winSize_y; ++i) + { + for (int j = 0; j < c_winSize_x; ++j) + { + int iI = I_patch[(get_local_id(1) + i) * patchWidth + get_local_id(0) + j]; + int iJ = read_imagei(J, sampler, (float2)(nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f)).x; + + int diff = (iJ - iI) * 32; + + int dIdx = dIdx_patch[(get_local_id(1) + i) * patchWidth + (get_local_id(0) + j)]; + int dIdy = dIdy_patch[(get_local_id(1) + i) 
* patchWidth + (get_local_id(0) + j)]; + + b1 += diff * dIdx; + b2 += diff * dIdy; + } + } + + float2 delta; + delta.x = A12 * b2 - A22 * b1; + delta.y = A12 * b1 - A11 * b2; + + nextPt.x += delta.x; + nextPt.y += delta.y; + + if (fabs(delta.x) < 0.01f && fabs(delta.y) < 0.01f) + break; + } + + u[y * uStep / 4 + x] = nextPt.x - x; + v[y * vStep / 4 + x] = nextPt.y - y; + + if (calcErr) + { + int errval = 0; + + for (int i = 0; i < c_winSize_y; ++i) + { + for (int j = 0; j < c_winSize_x; ++j) + { + int iI = I_patch[(get_local_id(1) + i) * patchWidth + get_local_id(0) + j]; + int iJ = read_imagei(J, sampler, (float2)(nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f)).x; + + errval += abs(iJ - iI); + } + } + + //err[y * errStep / 4 + x] = static_cast(errval) / (c_winSize_x * c_winSize_y); + } +} diff --git a/modules/ocl/src/matrix_operations.cpp b/modules/ocl/src/matrix_operations.cpp index a4ef54f940..0603cc64f4 100644 --- a/modules/ocl/src/matrix_operations.cpp +++ b/modules/ocl/src/matrix_operations.cpp @@ -916,6 +916,7 @@ oclMat cv::ocl::oclMat::reshape(int new_cn, int new_rows) const CV_Error(CV_BadNumChannels, "The total width is not divisible by the new number of channels"); hdr.cols = new_width; + hdr.wholecols = new_width; hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT); return hdr; diff --git a/modules/ocl/src/pyrlk.cpp b/modules/ocl/src/pyrlk.cpp new file mode 100644 index 0000000000..5cf15c60eb --- /dev/null +++ b/modules/ocl/src/pyrlk.cpp @@ -0,0 +1,597 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other GpuMaterials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or bpied warranties, including, but not limited to, the bpied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +using namespace std; +using namespace cv; +using namespace cv::ocl; + +#if !defined (HAVE_OPENCL) + +void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat&, const oclMat&, const oclMat&, oclMat&, oclMat&, oclMat*) { } +void cv::ocl::PyrLKOpticalFlow::dense(const oclMat&, const oclMat&, oclMat&, oclMat&, oclMat*) { } + +#else /* !defined (HAVE_OPENCL) */ + +namespace cv +{ + namespace ocl + { + ///////////////////////////OpenCL kernel strings/////////////////////////// + extern const char *pyrlk; + + } +} + +struct dim3 +{ + unsigned int x, y, z; +}; + +struct float2 +{ + float x, y; +}; + +struct int2 +{ + int x, y; +}; + +void calcSharrDeriv_run(const oclMat& src, oclMat& dx_buf, oclMat& dy_buf, oclMat& dIdx, oclMat& dIdy, int cn) +{ + Context *clCxt = src.clCxt; + + string kernelName = "calcSharrDeriv_vertical"; + + size_t localThreads[3] = { 32, 8, 1 }; + size_t globalThreads[3] = { src.cols, src.rows, 1}; + + vector > args; + args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.step )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows )); + args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols )); + args.push_back( make_pair( sizeof(cl_int), (void *)&cn )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&dx_buf.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dx_buf.step )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&dy_buf.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&dy_buf.step )); + + openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, src.channels(), src.depth()); + + kernelName = "calcSharrDeriv_horizontal"; + + vector > args2; + args2.push_back( make_pair( sizeof(cl_int), (void *)&src.rows )); + args2.push_back( make_pair( sizeof(cl_int), (void *)&src.cols )); + args2.push_back( make_pair( sizeof(cl_int), (void *)&cn )); + args2.push_back( make_pair( sizeof(cl_mem), (void *)&dx_buf.data )); + args2.push_back( make_pair( sizeof(cl_int), (void *)&dx_buf.step )); + args2.push_back( make_pair( sizeof(cl_mem), (void *)&dy_buf.data )); + args2.push_back( make_pair( sizeof(cl_int), (void *)&dy_buf.step )); + args2.push_back( make_pair( sizeof(cl_mem), (void *)&dIdx.data )); + args2.push_back( make_pair( sizeof(cl_int), (void *)&dIdx.step )); + args2.push_back( make_pair( sizeof(cl_mem), (void *)&dIdy.data )); + args2.push_back( make_pair( sizeof(cl_int), (void *)&dIdy.step )); + + openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args2, src.channels(), src.depth()); +} + + +void cv::ocl::PyrLKOpticalFlow::calcSharrDeriv(const oclMat& src, oclMat& dIdx, oclMat& dIdy) +{ + CV_Assert(src.rows > 1 && src.cols > 1); + CV_Assert(src.depth() == CV_8U); + + const int cn = src.channels(); + + ensureSizeIsEnough(src.size(), CV_MAKETYPE(CV_16S, cn), dx_calcBuf_); + ensureSizeIsEnough(src.size(), CV_MAKETYPE(CV_16S, cn), dy_calcBuf_); 
+ + calcSharrDeriv_run(src, dx_calcBuf_, dy_calcBuf_, dIdx, dIdy, cn); +} + +void cv::ocl::PyrLKOpticalFlow::buildImagePyramid(const oclMat& img0, vector& pyr, bool withBorder) +{ + pyr.resize(maxLevel + 1); + + Size sz = img0.size(); + + Mat img0Temp; + img0.download(img0Temp); + + Mat pyrTemp; + oclMat o; + + for (int level = 0; level <= maxLevel; ++level) + { + oclMat temp; + + if (withBorder) + { + temp.create(sz.height + winSize.height * 2, sz.width + winSize.width * 2, img0.type()); + } + else + { + ensureSizeIsEnough(sz, img0.type(), pyr[level]); + } + + if (level == 0) + pyr[level] = img0Temp; + else + pyrDown(pyr[level - 1], pyr[level]); + + if (withBorder) + copyMakeBorder(pyr[level], temp, winSize.height, winSize.height, winSize.width, winSize.width, BORDER_REFLECT_101); + + sz = Size((sz.width + 1) / 2, (sz.height + 1) / 2); + + if (sz.width <= winSize.width || sz.height <= winSize.height) + { + maxLevel = level; + break; + } + } +} + +namespace +{ + void calcPatchSize(cv::Size winSize, int cn, dim3& block, dim3& patch, bool isDeviceArch11) + { + winSize.width *= cn; + + if (winSize.width > 32 && winSize.width > 2 * winSize.height) + { + block.x = isDeviceArch11 ? 16 : 32; + block.y = 8; + } + else + { + block.x = 16; + block.y = isDeviceArch11 ? 8 : 16; + } + + patch.x = (winSize.width + block.x - 1) / block.x; + patch.y = (winSize.height + block.y - 1) / block.y; + + block.z = patch.z = 1; + } +} + +struct MultiplyScalar +{ + MultiplyScalar(double val_, double scale_) : val(val_), scale(scale_) {} + double operator ()(double a) const + { + return (scale * a * val); + } + const double val; + const double scale; +}; + +void callF(const oclMat& src, oclMat& dst, MultiplyScalar op, int mask) +{ + Mat srcTemp; + Mat dstTemp; + src.download(srcTemp); + dst.download(dstTemp); + + int i; + int j; + int k; + for(i = 0; i < srcTemp.rows; i++) + { + for(j = 0; j < srcTemp.cols; j++) + { + for(k = 0; k < srcTemp.channels(); k++) + { + ((float*)dstTemp.data)[srcTemp.channels() * (i * srcTemp.rows + j) + k] = (float)op(((float*)srcTemp.data)[srcTemp.channels() * (i * srcTemp.rows + j) + k]); + } + } + } + + dst = dstTemp; +} + +static inline bool isAligned(const unsigned char* ptr, size_t size) +{ + return reinterpret_cast(ptr) % size == 0; +} + +static inline bool isAligned(size_t step, size_t size) +{ + return step % size == 0; +} + +void callT(const oclMat& src, oclMat& dst, MultiplyScalar op, int mask) +{ + if (!isAligned(src.data, 4 * sizeof(double)) || !isAligned(src.step, 4 * sizeof(double)) || + !isAligned(dst.data, 4 * sizeof(double)) || !isAligned(dst.step, 4 * sizeof(double))) + { + callF(src, dst, op, mask); + return; + } + + Mat srcTemp; + Mat dstTemp; + src.download(srcTemp); + dst.download(dstTemp); + + int x_shifted; + + int i; + int j; + for(i = 0; i < srcTemp.rows; i++) + { + const double* srcRow = (const double*)srcTemp.data + i * srcTemp.rows; + double* dstRow = (double*)dstTemp.data + i * dstTemp.rows;; + + for(j = 0; j < srcTemp.cols; j++) + { + x_shifted = j * 4; + + if(x_shifted + 4 - 1 < srcTemp.cols) + { + dstRow[x_shifted ] = op(srcRow[x_shifted ]); + dstRow[x_shifted + 1] = op(srcRow[x_shifted + 1]); + dstRow[x_shifted + 2] = op(srcRow[x_shifted + 2]); + dstRow[x_shifted + 3] = op(srcRow[x_shifted + 3]); + } + else + { + for (int real_x = x_shifted; real_x < srcTemp.cols; ++real_x) + { + ((float*)dstTemp.data)[i * srcTemp.rows + real_x] = op(((float*)srcTemp.data)[i * srcTemp.rows + real_x]); + } + } + } + } +} + +void multiply(const oclMat& src1, double val, 
oclMat& dst, double scale = 1.0f); +void multiply(const oclMat& src1, double val, oclMat& dst, double scale) +{ + MultiplyScalar op(val, scale); + //if(src1.channels() == 1 && dst.channels() == 1) + //{ + // callT(src1, dst, op, 0); + //} + //else + //{ + callF(src1, dst, op, 0); + //} +} + +cl_mem bindTexture(const oclMat& mat, int depth, int channels) +{ + cl_mem texture; + cl_image_format format; + int err; + if(depth == 0) + { + format.image_channel_data_type = CL_UNSIGNED_INT8; + } + else if(depth == 5) + { + format.image_channel_data_type = CL_FLOAT; + } + if(channels == 1) + { + format.image_channel_order = CL_R; + } + else if(channels == 3) + { + format.image_channel_order = CL_RGB; + } + else if(channels == 4) + { + format.image_channel_order = CL_RGBA; + } +#if CL_VERSION_1_2 + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + desc.image_width = mat.cols; + desc.image_height = mat.rows; + desc.image_depth = NULL; + desc.image_array_size = 1; + desc.image_row_pitch = 0; + desc.image_slice_pitch= 0; + desc.buffer = NULL; + desc.num_mip_levels = 0; + desc.num_samples = 0; + texture = clCreateImage(mat.clCxt->impl->clContext, CL_MEM_READ_WRITE, &format, &desc, NULL, &err); +#else + texture = clCreateImage2D( + mat.clCxt->impl->clContext, + CL_MEM_READ_WRITE, + &format, + mat.cols, + mat.rows, + 0, + NULL, + &err); +#endif + size_t origin[] = { 0, 0, 0 }; + size_t region[] = { mat.cols, mat.rows, 1 }; + clEnqueueCopyBufferToImage(mat.clCxt->impl->clCmdQueue, (cl_mem)mat.data, texture, 0, origin, region, 0, NULL, 0); + openCLSafeCall(err); + + return texture; +} + +void lkSparse_run(oclMat& I, oclMat& J, + const oclMat& prevPts, oclMat& nextPts, oclMat& status, oclMat* err, bool GET_MIN_EIGENVALS, int ptcount, + int level, dim3 block, dim3 patch, Size winSize, int iters) +{ + Context *clCxt = I.clCxt; + + string kernelName = "lkSparse"; + + size_t localThreads[3] = { 16, 16, 1 }; + size_t globalThreads[3] = { 16 * ptcount, 16, 1}; + + int cn = I.channels(); + + bool calcErr; + if (err) + { + calcErr = true; + } + else + { + calcErr = false; + } + calcErr = true; + + cl_mem ITex = bindTexture(I, I.depth(), cn); + cl_mem JTex = bindTexture(J, J.depth(), cn); + + vector > args; + + args.push_back( make_pair( sizeof(cl_mem), (void *)&ITex )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&JTex )); + + args.push_back( make_pair( sizeof(cl_mem), (void *)&prevPts.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&prevPts.step )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&nextPts.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&nextPts.step )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&status.data )); + //args.push_back( make_pair( sizeof(cl_mem), (void *)&(err->data) )); + args.push_back( make_pair( sizeof(cl_int), (void *)&level )); + args.push_back( make_pair( sizeof(cl_int), (void *)&I.rows )); + args.push_back( make_pair( sizeof(cl_int), (void *)&I.cols )); + args.push_back( make_pair( sizeof(cl_int), (void *)&patch.x )); + args.push_back( make_pair( sizeof(cl_int), (void *)&patch.y )); + args.push_back( make_pair( sizeof(cl_int), (void *)&cn )); + args.push_back( make_pair( sizeof(cl_int), (void *)&winSize.width )); + args.push_back( make_pair( sizeof(cl_int), (void *)&winSize.height )); + args.push_back( make_pair( sizeof(cl_int), (void *)&iters )); + args.push_back( make_pair( sizeof(cl_char), (void *)&calcErr )); + args.push_back( make_pair( sizeof(cl_char), (void *)&GET_MIN_EIGENVALS )); + + 
openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.channels(), I.depth()); +} + +void cv::ocl::PyrLKOpticalFlow::sparse(const oclMat& prevImg, const oclMat& nextImg, const oclMat& prevPts, oclMat& nextPts, oclMat& status, oclMat* err) +{ + if (prevPts.empty()) + { + nextPts.release(); + status.release(); + if (err) err->release(); + return; + } + + derivLambda = std::min(std::max(derivLambda, 0.0), 1.0); + + iters = std::min(std::max(iters, 0), 100); + + const int cn = prevImg.channels(); + + dim3 block, patch; + calcPatchSize(winSize, cn, block, patch, isDeviceArch11_); + + CV_Assert(derivLambda >= 0); + CV_Assert(maxLevel >= 0 && winSize.width > 2 && winSize.height > 2); + CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type()); + CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6); + CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2); + + if (useInitialFlow) + CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == CV_32FC2); + else + ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts); + + oclMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1); + oclMat temp2 = nextPts.reshape(1); + //oclMat scalar(temp1.rows, temp1.cols, temp1.type(), Scalar(1.0f / (1 << maxLevel) / 2.0f)); + //ocl::multiply(temp1, scalar, temp2); + ::multiply(temp1, 1.0f / (1 << maxLevel) / 2.0f, temp2); + + ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status); + status.setTo(Scalar::all(1)); + + if (err) + ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err); + + // build the image pyramids. + + prevPyr_.resize(maxLevel + 1); + nextPyr_.resize(maxLevel + 1); + + if (cn == 1 || cn == 4) + { + prevImg.convertTo(prevPyr_[0], CV_32F); + nextImg.convertTo(nextPyr_[0], CV_32F); + } + else + { + oclMat buf_; + cvtColor(prevImg, buf_, COLOR_BGR2BGRA); + buf_.convertTo(prevPyr_[0], CV_32F); + + cvtColor(nextImg, buf_, COLOR_BGR2BGRA); + buf_.convertTo(nextPyr_[0], CV_32F); + } + + for (int level = 1; level <= maxLevel; ++level) + { + pyrDown(prevPyr_[level - 1], prevPyr_[level]); + pyrDown(nextPyr_[level - 1], nextPyr_[level]); + } + + // dI/dx ~ Ix, dI/dy ~ Iy + + for (int level = maxLevel; level >= 0; level--) + { + lkSparse_run(prevPyr_[level], nextPyr_[level], + prevPts, nextPts, status, level == 0 && err ? 
err : 0, getMinEigenVals, prevPts.cols, + level, block, patch, winSize, iters); + } +} + +void lkDense_run(oclMat& I, oclMat& J, oclMat& u, oclMat& v, + oclMat& prevU, oclMat& prevV, oclMat* err, Size winSize, int iters) +{ + Context *clCxt = I.clCxt; + + string kernelName = "lkDense"; + + size_t localThreads[3] = { 16, 16, 1 }; + size_t globalThreads[3] = { I.cols, I.rows, 1}; + + int cn = I.channels(); + + bool calcErr; + if (err) + { + calcErr = true; + } + else + { + calcErr = false; + } + + cl_mem ITex = bindTexture(I, I.depth(), cn); + cl_mem JTex = bindTexture(J, J.depth(), cn); + + int2 halfWin = {(winSize.width - 1) / 2, (winSize.height - 1) / 2}; + const int patchWidth = 16 + 2 * halfWin.x; + const int patchHeight = 16 + 2 * halfWin.y; + size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int); + + vector > args; + + args.push_back( make_pair( sizeof(cl_mem), (void *)&ITex )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&JTex )); + + args.push_back( make_pair( sizeof(cl_mem), (void *)&u.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&u.step )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&v.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&v.step )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&prevU.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&prevU.step )); + args.push_back( make_pair( sizeof(cl_mem), (void *)&prevV.data )); + args.push_back( make_pair( sizeof(cl_int), (void *)&prevV.step )); + args.push_back( make_pair( sizeof(cl_int), (void *)&I.rows )); + args.push_back( make_pair( sizeof(cl_int), (void *)&I.cols )); + //args.push_back( make_pair( sizeof(cl_mem), (void *)&(*err).data )); + //args.push_back( make_pair( sizeof(cl_int), (void *)&(*err).step )); + args.push_back( make_pair( sizeof(cl_int), (void *)&winSize.width )); + args.push_back( make_pair( sizeof(cl_int), (void *)&winSize.height )); + args.push_back( make_pair( sizeof(cl_int), (void *)&iters )); + args.push_back( make_pair( sizeof(cl_char), (void *)&calcErr )); + + openCLExecuteKernel(clCxt, &pyrlk, kernelName, globalThreads, localThreads, args, I.channels(), I.depth()); +} + +void cv::ocl::PyrLKOpticalFlow::dense(const oclMat& prevImg, const oclMat& nextImg, oclMat& u, oclMat& v, oclMat* err) +{ + CV_Assert(prevImg.type() == CV_8UC1); + CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type()); + CV_Assert(maxLevel >= 0); + CV_Assert(winSize.width > 2 && winSize.height > 2); + + if (err) + err->create(prevImg.size(), CV_32FC1); + + prevPyr_.resize(maxLevel + 1); + nextPyr_.resize(maxLevel + 1); + + prevPyr_[0] = prevImg; + nextImg.convertTo(nextPyr_[0], CV_32F); + + for (int level = 1; level <= maxLevel; ++level) + { + pyrDown(prevPyr_[level - 1], prevPyr_[level]); + pyrDown(nextPyr_[level - 1], nextPyr_[level]); + } + + ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]); + ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]); + ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]); + ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]); + uPyr_[1].setTo(Scalar::all(0)); + vPyr_[1].setTo(Scalar::all(0)); + + Size winSize2i(winSize.width, winSize.height); + + int idx = 0; + + for (int level = maxLevel; level >= 0; level--) + { + int idx2 = (idx + 1) & 1; + + lkDense_run(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2], + level == 0 ? 
err : 0, winSize2i, iters); + + if (level > 0) + idx = idx2; + } + + uPyr_[idx].copyTo(u); + vPyr_[idx].copyTo(v); +} + +#endif /* !defined (HAVE_CUDA) */ diff --git a/modules/ocl/test/test_pyrlk.cpp b/modules/ocl/test/test_pyrlk.cpp new file mode 100644 index 0000000000..e194642c22 --- /dev/null +++ b/modules/ocl/test/test_pyrlk.cpp @@ -0,0 +1,167 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// Intel License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000, Intel Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of Intel Corporation may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" +#include + +#ifdef HAVE_OPENCL + +using namespace cv; +using namespace cv::ocl; +using namespace cvtest; +using namespace testing; +using namespace std; + +//#define DUMP + +///////////////////////////////////////////////////////////////////////////////////////////////// +// BroxOpticalFlow + +#define BROX_OPTICAL_FLOW_DUMP_FILE "opticalflow/brox_optical_flow.bin" +#define BROX_OPTICAL_FLOW_DUMP_FILE_CC20 "opticalflow/brox_optical_flow_cc20.bin" + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// PyrLKOpticalFlow + +//IMPLEMENT_PARAM_CLASS(UseGray, bool) + +PARAM_TEST_CASE(Sparse, bool, bool) +{ + bool useGray; + bool UseSmart; + + virtual void SetUp() + { + UseSmart = GET_PARAM(0); + useGray = GET_PARAM(0); + } +}; + +TEST_P(Sparse, Mat) +{ + cv::Mat frame0 = readImage("../../../samples/gpu/rubberwhale1.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR); + ASSERT_FALSE(frame0.empty()); + + cv::Mat frame1 = readImage("../../../samples/gpu/rubberwhale2.png", useGray ? 
cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR); + ASSERT_FALSE(frame1.empty()); + + cv::Mat gray_frame; + if (useGray) + gray_frame = frame0; + else + cv::cvtColor(frame0, gray_frame, cv::COLOR_BGR2GRAY); + + std::vector pts; + cv::goodFeaturesToTrack(gray_frame, pts, 1000, 0.01, 0.0); + + cv::ocl::oclMat d_pts; + cv::Mat pts_mat(1, (int)pts.size(), CV_32FC2, (void*)&pts[0]); + d_pts.upload(pts_mat); + + cv::ocl::PyrLKOpticalFlow pyrLK; + + cv::ocl::oclMat oclFrame0; + cv::ocl::oclMat oclFrame1; + cv::ocl::oclMat d_nextPts; + cv::ocl::oclMat d_status; + cv::ocl::oclMat d_err; + + oclFrame0 = frame0; + oclFrame1 = frame1; + + pyrLK.sparse(oclFrame0, oclFrame1, d_pts, d_nextPts, d_status, &d_err); + + std::vector nextPts(d_nextPts.cols); + cv::Mat nextPts_mat(1, d_nextPts.cols, CV_32FC2, (void*)&nextPts[0]); + d_nextPts.download(nextPts_mat); + + std::vector status(d_status.cols); + cv::Mat status_mat(1, d_status.cols, CV_8UC1, (void*)&status[0]); + d_status.download(status_mat); + + std::vector err(d_err.cols); + cv::Mat err_mat(1, d_err.cols, CV_32FC1, (void*)&err[0]); + d_err.download(err_mat); + + std::vector nextPts_gold; + std::vector status_gold; + std::vector err_gold; + cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts_gold, status_gold, err_gold); + + ASSERT_EQ(nextPts_gold.size(), nextPts.size()); + ASSERT_EQ(status_gold.size(), status.size()); + + size_t mistmatch = 0; + for (size_t i = 0; i < nextPts.size(); ++i) + { + if (status[i] != status_gold[i]) + { + ++mistmatch; + continue; + } + + if (status[i]) + { + cv::Point2i a = nextPts[i]; + cv::Point2i b = nextPts_gold[i]; + + bool eq = std::abs(a.x - b.x) < 1 && std::abs(a.y - b.y) < 1; + //float errdiff = std::abs(err[i] - err_gold[i]); + float errdiff = 0.0f; + + if (!eq || errdiff > 1e-1) + ++mistmatch; + } + } + + double bad_ratio = static_cast(mistmatch) / (nextPts.size() * 2); + + ASSERT_LE(bad_ratio, 0.05f); + +} + +INSTANTIATE_TEST_CASE_P(Video, Sparse, Combine( + Values(false, true), + Values(false))); + +#endif // HAVE_OPENCL + -- GitLab
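
A minimal host-side usage sketch of the new cv::ocl::PyrLKOpticalFlow::sparse API, assembled from the class declaration added to ocl.hpp and the test in modules/ocl/test/test_pyrlk.cpp above. The image paths, the goodFeaturesToTrack parameters, and the assumption that an OpenCL device has already been selected (as the module's test setup does) are placeholders, not part of the patch.

// Usage sketch, assuming an OpenCL device/context has been initialized elsewhere.
// Mirrors the flow of modules/ocl/test/test_pyrlk.cpp.
#include "opencv2/ocl/ocl.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/imgproc/imgproc.hpp"

int main()
{
    // Hypothetical input frames; any pair of same-size 8-bit images works.
    cv::Mat frame0 = cv::imread("frame0.png", cv::IMREAD_GRAYSCALE);
    cv::Mat frame1 = cv::imread("frame1.png", cv::IMREAD_GRAYSCALE);

    // Features to track, packed as a 1 x N CV_32FC2 row, which is what sparse() expects.
    std::vector<cv::Point2f> pts;
    cv::goodFeaturesToTrack(frame0, pts, 1000, 0.01, 0.0);
    cv::Mat pts_mat(1, (int)pts.size(), CV_32FC2, (void*)&pts[0]);

    cv::ocl::oclMat d_frame0, d_frame1, d_prevPts, d_nextPts, d_status, d_err;
    d_frame0 = frame0;          // upload frames to the device
    d_frame1 = frame1;
    d_prevPts.upload(pts_mat);

    cv::ocl::PyrLKOpticalFlow pyrLK;   // defaults: winSize 21x21, maxLevel 3, iters 30

    pyrLK.sparse(d_frame0, d_frame1, d_prevPts, d_nextPts, d_status, &d_err);

    // Results: nextPts is 1 x N CV_32FC2, status is 1 x N CV_8UC1, err is 1 x N CV_32FC1.
    cv::Mat nextPts, status;
    d_nextPts.download(nextPts);
    d_status.download(status);
    return 0;
}

The dense variant follows the same pattern but takes a pair of CV_8UC1 frames and fills same-size CV_32FC1 flow fields u and v, i.e. pyrLK.dense(d_frame0, d_frame1, d_u, d_v).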