提交 09359982 编写于 作者: B bitwangyaoyao

some optimizations to ocl::pyrDown, PyrLK and Canny

上级 494ae156
......@@ -45,6 +45,7 @@
#include <iomanip>
#include "precomp.hpp"
#include "mcwutil.hpp"
using namespace cv;
using namespace cv::ocl;
......@@ -237,7 +238,7 @@ void canny::calcSobelRowPass_gpu(const oclMat& src, oclMat& dx_buf, oclMat& dy_b
size_t globalThreads[3] = {cols, rows, 1};
size_t localThreads[3] = {16, 16, 1};
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
}
void canny::calcMagnitude_gpu(const oclMat& dx_buf, const oclMat& dy_buf, oclMat& dx, oclMat& dy, oclMat& mag, int rows, int cols, bool L2Grad)
......@@ -272,7 +273,7 @@ void canny::calcMagnitude_gpu(const oclMat& dx_buf, const oclMat& dy_buf, oclMat
{
strcat(build_options, "-D L2GRAD");
}
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
}
void canny::calcMagnitude_gpu(const oclMat& dx, const oclMat& dy, oclMat& mag, int rows, int cols, bool L2Grad)
{
......@@ -300,7 +301,7 @@ void canny::calcMagnitude_gpu(const oclMat& dx, const oclMat& dy, oclMat& mag, i
{
strcat(build_options, "-D L2GRAD");
}
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
}
void canny::calcMap_gpu(oclMat& dx, oclMat& dy, oclMat& mag, oclMat& map, int rows, int cols, float low_thresh, float high_thresh)
......@@ -331,7 +332,7 @@ void canny::calcMap_gpu(oclMat& dx, oclMat& dy, oclMat& mag, oclMat& map, int ro
string kernelName = "calcMap";
size_t localThreads[3] = {16, 16, 1};
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
}
void canny::edgesHysteresisLocal_gpu(oclMat& map, oclMat& st1, void * counter, int rows, int cols)
......@@ -351,7 +352,7 @@ void canny::edgesHysteresisLocal_gpu(oclMat& map, oclMat& st1, void * counter, i
size_t globalThreads[3] = {cols, rows, 1};
size_t localThreads[3] = {16, 16, 1};
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
}
void canny::edgesHysteresisGlobal_gpu(oclMat& map, oclMat& st1, oclMat& st2, void * counter, int rows, int cols)
......@@ -381,7 +382,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat& map, oclMat& st1, oclMat& st2, voi
args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, DISABLE);
openCLSafeCall(clEnqueueReadBuffer(Context::getContext()->impl->clCmdQueue, (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
std::swap(st1, st2);
}
......@@ -406,7 +407,7 @@ void canny::getEdges_gpu(oclMat& map, oclMat& dst, int rows, int cols)
size_t globalThreads[3] = {cols, rows, 1};
size_t localThreads[3] = {16, 16, 1};
openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
}
#endif // HAVE_OPENCL
......@@ -44,7 +44,7 @@
//M*/
#include "precomp.hpp"
#include "mcwutil.hpp"
using namespace cv;
using namespace cv::ocl;
using namespace std;
......@@ -1613,7 +1613,7 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc
args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
args.push_back( make_pair( smem, (void *)NULL));
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
}
void cv::ocl::device::hog::normalize_hists(int nbins, int block_stride_x, int block_stride_y,
......@@ -1641,7 +1641,7 @@ void cv::ocl::device::hog::normalize_hists(int nbins, int block_stride_x, int bl
args.push_back( make_pair( sizeof(cl_float), (void *)&threshold));
args.push_back( make_pair( nthreads * sizeof(float), (void *)NULL));
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
}
void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int block_stride_y,
......@@ -1675,7 +1675,7 @@ void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int blo
args.push_back( make_pair( sizeof(cl_float), (void *)&threshold));
args.push_back( make_pair( sizeof(cl_mem), (void *)&labels.data));
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
}
void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
......@@ -1706,7 +1706,7 @@ void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
}
void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
......@@ -1738,7 +1738,7 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
}
static inline int divUp(int total, int grain)
......@@ -1772,7 +1772,7 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const c
args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma));
args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins));
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
}
void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat& img,
......@@ -1802,7 +1802,7 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const c
args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma));
args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins));
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
}
void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz)
......@@ -1834,7 +1834,7 @@ void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz
args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
args.push_back( make_pair(sizeof(cl_float), (void *)&ify));
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
}
#endif
此差异已折叠。
......@@ -45,6 +45,25 @@
//#pragma OPENCL EXTENSION cl_amd_printf : enable
__kernel void arithm_muls_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, float scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float data1 = *((__global float *)((__global char *)src1 + src1_index));
float tmp = data1 * scalar;
*((__global float *)((__global char *)dst + dst_index)) = tmp;
}
}
__kernel void calcSharrDeriv_vertical_C1_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep)
{
......
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "mcwutil.hpp"
#if defined (HAVE_OPENCL)
using namespace std;
namespace cv
{
namespace ocl
{
inline int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
// provide additional methods for the user to interact with the command queue after a task is fired
void openCLExecuteKernel_2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels,
int depth, char *build_options, FLUSH_MODE finish_mode)
{
//construct kernel name
//The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
//for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char)
stringstream idxStr;
if(channels != -1)
idxStr << "_C" << channels;
if(depth != -1)
idxStr << "_D" << depth;
kernelName += idxStr.str();
cl_kernel kernel;
kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options);
if ( localThreads != NULL)
{
globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0];
globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
cv::ocl::openCLVerifyKernel(clCxt, kernel, &blockSize, globalThreads, localThreads);
}
for(int i = 0; i < args.size(); i ++)
openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
localThreads, 0, NULL, NULL));
switch(finish_mode)
{
case CLFINISH:
clFinish(clCxt->impl->clCmdQueue);
case CLFLUSH:
clFlush(clCxt->impl->clCmdQueue);
break;
case DISABLE:
default:
break;
}
openCLSafeCall(clReleaseKernel(kernel));
}
void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName,
size_t globalThreads[3], size_t localThreads[3],
vector< pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode)
{
openCLExecuteKernel2(clCxt, source, kernelName, globalThreads, localThreads, args,
channels, depth, NULL, finish_mode);
}
void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName,
size_t globalThreads[3], size_t localThreads[3],
vector< pair<size_t, const void *> > &args, int channels, int depth, char *build_options, FLUSH_MODE finish_mode)
{
openCLExecuteKernel_2(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth,
build_options, finish_mode);
}
}//namespace ocl
}//namespace cv
#endif
\ No newline at end of file
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef _OPENCV_MCWUTIL_
#define _OPENCV_MCWUTIL_
#include "precomp.hpp"
#if defined (HAVE_OPENCL)
using namespace std;
namespace cv
{
namespace ocl
{
enum FLUSH_MODE
{
CLFINISH = 0,
CLFLUSH,
DISABLE
};
void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode = DISABLE);
void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels,
int depth, char *build_options, FLUSH_MODE finish_mode = DISABLE);
}//namespace ocl
}//namespace cv
#endif // HAVE_OPENCL
#endif //_OPENCV_MCWUTIL_
......@@ -66,7 +66,6 @@ namespace cv
//////////////////////////////////////////////////////////////////////////////
/////////////////////// add subtract multiply divide /////////////////////////
//////////////////////////////////////////////////////////////////////////////
template<typename T>
void pyrdown_run(const oclMat &src, const oclMat &dst)
{
......@@ -95,52 +94,14 @@ void pyrdown_run(const oclMat &src, const oclMat &dst)
vector<pair<size_t , const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols));
openCLExecuteKernel(clCxt, &pyr_down, kernelName, globalThreads, localThreads, args, src.channels(), src.depth());
}
void pyrdown_run(const oclMat &src, const oclMat &dst)
{
switch(src.depth())
{
case 0:
pyrdown_run<unsigned char>(src, dst);
break;
case 1:
pyrdown_run<char>(src, dst);
break;
case 2:
pyrdown_run<unsigned short>(src, dst);
break;
case 3:
pyrdown_run<short>(src, dst);
break;
case 4:
pyrdown_run<int>(src, dst);
break;
case 5:
pyrdown_run<float>(src, dst);
break;
case 6:
pyrdown_run<double>(src, dst);
break;
default:
break;
}
}
//////////////////////////////////////////////////////////////////////////////
// pyrDown
......@@ -148,11 +109,9 @@ void cv::ocl::pyrDown(const oclMat& src, oclMat& dst)
{
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
//src.step = src.rows;
dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
dst.download_channels = src.download_channels;
dst.download_channels=src.download_channels;
pyrdown_run(src, dst);
}
......
此差异已折叠。
......@@ -118,9 +118,9 @@ TEST_P(Sparse, Mat)
cv::Mat status_mat(1, d_status.cols, CV_8UC1, (void*)&status[0]);
d_status.download(status_mat);
std::vector<float> err(d_err.cols);
cv::Mat err_mat(1, d_err.cols, CV_32FC1, (void*)&err[0]);
d_err.download(err_mat);
//std::vector<float> err(d_err.cols);
//cv::Mat err_mat(1, d_err.cols, CV_32FC1, (void*)&err[0]);
//d_err.download(err_mat);
std::vector<cv::Point2f> nextPts_gold;
std::vector<unsigned char> status_gold;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册