提交 270b2c79 编写于 作者: Y yao

generating the lut table instead of hard coding one

上级 f1060ac5
......@@ -56,98 +56,6 @@ using namespace std;
static oclMat gauss_w_lut;
static bool hog_device_cpu;
/* pre-compute gaussian and interp_weight lookup tables if sigma is 4.0f */
static const float gaussian_interp_lut[] =
{
/* gaussian lut */
0.01831564f, 0.02926831f, 0.04393693f, 0.06196101f, 0.08208500f, 0.10215643f,
0.11943297f, 0.13117145f, 0.13533528f, 0.13117145f, 0.11943297f, 0.10215643f,
0.08208500f, 0.06196101f, 0.04393693f, 0.02926831f, 0.02926831f, 0.04677062f,
0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f, 0.19085334f, 0.20961139f,
0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f, 0.13117145f, 0.09901341f,
0.07021102f, 0.04677062f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f,
0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f,
0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f,
0.06196101f, 0.09901341f, 0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f,
0.40403652f, 0.44374731f, 0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f,
0.27768996f, 0.20961139f, 0.14863673f, 0.09901341f, 0.08208500f, 0.13117145f,
0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f, 0.53526145f, 0.58786964f,
0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f, 0.36787945f, 0.27768996f,
0.19691168f, 0.13117145f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f,
0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f,
0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f,
0.11943297f, 0.19085334f, 0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f,
0.77880079f, 0.85534531f, 0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f,
0.53526145f, 0.40403652f, 0.28650481f, 0.19085334f, 0.13117145f, 0.20961139f,
0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f, 0.85534531f, 0.93941307f,
0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f, 0.58786964f, 0.44374731f,
0.31466395f, 0.20961139f, 0.13533528f, 0.21626517f, 0.32465246f, 0.45783335f,
0.60653067f, 0.75483960f, 0.88249689f, 0.96923321f, 1.00000000f, 0.96923321f,
0.88249689f, 0.75483960f, 0.60653067f, 0.45783335f, 0.32465246f, 0.21626517f,
0.13117145f, 0.20961139f, 0.31466395f, 0.44374731f, 0.58786964f, 0.73161560f,
0.85534531f, 0.93941307f, 0.96923321f, 0.93941307f, 0.85534531f, 0.73161560f,
0.58786964f, 0.44374731f, 0.31466395f, 0.20961139f, 0.11943297f, 0.19085334f,
0.28650481f, 0.40403652f, 0.53526145f, 0.66614360f, 0.77880079f, 0.85534531f,
0.88249689f, 0.85534531f, 0.77880079f, 0.66614360f, 0.53526145f, 0.40403652f,
0.28650481f, 0.19085334f, 0.10215643f, 0.16324551f, 0.24506053f, 0.34559074f,
0.45783335f, 0.56978285f, 0.66614360f, 0.73161560f, 0.75483960f, 0.73161560f,
0.66614360f, 0.56978285f, 0.45783335f, 0.34559074f, 0.24506053f, 0.16324551f,
0.08208500f, 0.13117145f, 0.19691168f, 0.27768996f, 0.36787945f, 0.45783335f,
0.53526145f, 0.58786964f, 0.60653067f, 0.58786964f, 0.53526145f, 0.45783335f,
0.36787945f, 0.27768996f, 0.19691168f, 0.13117145f, 0.06196101f, 0.09901341f,
0.14863673f, 0.20961139f, 0.27768996f, 0.34559074f, 0.40403652f, 0.44374731f,
0.45783335f, 0.44374731f, 0.40403652f, 0.34559074f, 0.27768996f, 0.20961139f,
0.14863673f, 0.09901341f, 0.04393693f, 0.07021102f, 0.10539922f, 0.14863673f,
0.19691168f, 0.24506053f, 0.28650481f, 0.31466395f, 0.32465246f, 0.31466395f,
0.28650481f, 0.24506053f, 0.19691168f, 0.14863673f, 0.10539922f, 0.07021102f,
0.02926831f, 0.04677062f, 0.07021102f, 0.09901341f, 0.13117145f, 0.16324551f,
0.19085334f, 0.20961139f, 0.21626517f, 0.20961139f, 0.19085334f, 0.16324551f,
0.13117145f, 0.09901341f, 0.07021102f, 0.04677062f,
/* interp_weight lut */
0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f,
0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f,
0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f, 0.01171875f, 0.03515625f,
0.05859375f, 0.08203125f, 0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f,
0.17578125f, 0.15234375f, 0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f,
0.03515625f, 0.01171875f, 0.01953125f, 0.05859375f, 0.09765625f, 0.13671875f,
0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f, 0.29296875f, 0.25390625f,
0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f, 0.05859375f, 0.01953125f,
0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f,
0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f,
0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.03515625f, 0.10546875f,
0.17578125f, 0.24609375f, 0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f,
0.52734375f, 0.45703125f, 0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f,
0.10546875f, 0.03515625f, 0.04296875f, 0.12890625f, 0.21484375f, 0.30078125f,
0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f, 0.64453125f, 0.55859375f,
0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f, 0.12890625f, 0.04296875f,
0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f,
0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f,
0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.05859375f, 0.17578125f,
0.29296875f, 0.41015625f, 0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f,
0.87890625f, 0.76171875f, 0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f,
0.17578125f, 0.05859375f, 0.05859375f, 0.17578125f, 0.29296875f, 0.41015625f,
0.52734375f, 0.64453125f, 0.76171875f, 0.87890625f, 0.87890625f, 0.76171875f,
0.64453125f, 0.52734375f, 0.41015625f, 0.29296875f, 0.17578125f, 0.05859375f,
0.05078125f, 0.15234375f, 0.25390625f, 0.35546875f, 0.45703125f, 0.55859375f,
0.66015625f, 0.76171875f, 0.76171875f, 0.66015625f, 0.55859375f, 0.45703125f,
0.35546875f, 0.25390625f, 0.15234375f, 0.05078125f, 0.04296875f, 0.12890625f,
0.21484375f, 0.30078125f, 0.38671875f, 0.47265625f, 0.55859375f, 0.64453125f,
0.64453125f, 0.55859375f, 0.47265625f, 0.38671875f, 0.30078125f, 0.21484375f,
0.12890625f, 0.04296875f, 0.03515625f, 0.10546875f, 0.17578125f, 0.24609375f,
0.31640625f, 0.38671875f, 0.45703125f, 0.52734375f, 0.52734375f, 0.45703125f,
0.38671875f, 0.31640625f, 0.24609375f, 0.17578125f, 0.10546875f, 0.03515625f,
0.02734375f, 0.08203125f, 0.13671875f, 0.19140625f, 0.24609375f, 0.30078125f,
0.35546875f, 0.41015625f, 0.41015625f, 0.35546875f, 0.30078125f, 0.24609375f,
0.19140625f, 0.13671875f, 0.08203125f, 0.02734375f, 0.01953125f, 0.05859375f,
0.09765625f, 0.13671875f, 0.17578125f, 0.21484375f, 0.25390625f, 0.29296875f,
0.29296875f, 0.25390625f, 0.21484375f, 0.17578125f, 0.13671875f, 0.09765625f,
0.05859375f, 0.01953125f, 0.01171875f, 0.03515625f, 0.05859375f, 0.08203125f,
0.10546875f, 0.12890625f, 0.15234375f, 0.17578125f, 0.17578125f, 0.15234375f,
0.12890625f, 0.10546875f, 0.08203125f, 0.05859375f, 0.03515625f, 0.01171875f,
0.00390625f, 0.01171875f, 0.01953125f, 0.02734375f, 0.03515625f, 0.04296875f,
0.05078125f, 0.05859375f, 0.05859375f, 0.05078125f, 0.04296875f, 0.03515625f,
0.02734375f, 0.01953125f, 0.01171875f, 0.00390625f
};
namespace cv
{
......@@ -180,7 +88,7 @@ namespace cv
int nblocks_win_x, int nblocks_win_y);
void compute_hists(int nbins, int block_stride_x, int blovck_stride_y,
int height, int width, float sigma, const cv::ocl::oclMat &grad,
int height, int width, const cv::ocl::oclMat &grad,
const cv::ocl::oclMat &qangle,
const cv::ocl::oclMat &gauss_w_lut, cv::ocl::oclMat &block_hists);
......@@ -328,10 +236,18 @@ void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride)
Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
labels.create(1, wins_per_img.area(), CV_8U);
vector<float> v_lut = vector<float>(gaussian_interp_lut, gaussian_interp_lut +
sizeof(gaussian_interp_lut) / sizeof(gaussian_interp_lut[0]));
Mat m_lut(v_lut);
gauss_w_lut.upload(m_lut.reshape(1,1));
float sigma = getWinSigma();
float scale = 1.f / (2.f * sigma * sigma);
Mat gaussian_lut(1, 512, CV_32FC1);
int idx = 0;
for(int i=-8; i<8; i++)
for(int j=-8; j<8; j++)
gaussian_lut.at<float>(idx++) = std::exp(-(j * j + i * i) * scale);
for(int i=-8; i<8; i++)
for(int j=-8; j<8; j++)
gaussian_lut.at<float>(idx++) = (8.f - fabs(j + 0.5f)) * (8.f - fabs(i + 0.5f)) / 64.f;
gauss_w_lut.upload(gaussian_lut);
}
void cv::ocl::HOGDescriptor::computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle)
......@@ -358,7 +274,7 @@ void cv::ocl::HOGDescriptor::computeBlockHistograms(const oclMat &img)
computeGradient(img, this->grad, this->qangle);
hog::compute_hists(nbins, block_stride.width, block_stride.height, effect_size.height,
effect_size.width, (float)getWinSigma(), grad, qangle, gauss_w_lut, block_hists);
effect_size.width, grad, qangle, gauss_w_lut, block_hists);
hog::normalize_hists(nbins, block_stride.width, block_stride.height, effect_size.height,
effect_size.width, block_hists, (float)threshold_L2hys);
......@@ -1708,7 +1624,7 @@ void cv::ocl::device::hog::set_up_constants(int nbins,
void cv::ocl::device::hog::compute_hists(int nbins,
int block_stride_x, int block_stride_y,
int height, int width, float sigma,
int height, int width,
const cv::ocl::oclMat &grad,
const cv::ocl::oclMat &qangle,
const cv::ocl::oclMat &gauss_w_lut,
......@@ -1716,8 +1632,7 @@ void cv::ocl::device::hog::compute_hists(int nbins,
{
Context *clCxt = Context::getContext();
vector< pair<size_t, const void *> > args;
string kernelName = (sigma == 4.0f) ? "compute_hists_lut_kernel" :
"compute_hists_kernel";
string kernelName = "compute_hists_lut_kernel";
int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x)
/ block_stride_x;
......@@ -1728,9 +1643,6 @@ void cv::ocl::device::hog::compute_hists(int nbins,
int grad_quadstep = grad.step >> 2;
int qangle_step = qangle.step;
// Precompute gaussian spatial window parameter
float scale = 1.f / (2.f * sigma * sigma);
int blocks_in_group = 4;
size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
size_t globalThreads[3] = {
......@@ -1751,14 +1663,10 @@ void cv::ocl::device::hog::compute_hists(int nbins,
args.push_back( make_pair( sizeof(cl_int), (void *)&qangle_step));
args.push_back( make_pair( sizeof(cl_mem), (void *)&grad.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&qangle.data));
if (kernelName.compare("compute_hists_lut_kernel") == 0)
args.push_back( make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data));
else
args.push_back( make_pair( sizeof(cl_float), (void *)&scale));
args.push_back( make_pair( sizeof(cl_mem), (void *)&gauss_w_lut.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
args.push_back( make_pair( smem, (void *)NULL));
if(hog_device_cpu)
{
openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads,
......
......@@ -53,7 +53,7 @@
//----------------------------------------------------------------------------
// Histogram computation
// 12 threads for a cell, 12x4 threads per block
// Use pre-computed gaussian and interp_weight lookup tables if sigma is 4.0f
// Use pre-computed gaussian and interp_weight lookup tables
__kernel void compute_hists_lut_kernel(
const int cblock_stride_x, const int cblock_stride_y,
const int cnbins, const int cblock_hist_size, const int img_block_width,
......@@ -146,99 +146,6 @@ __kernel void compute_hists_lut_kernel(
}
}
//----------------------------------------------------------------------------
// Histogram computation
// 12 threads for a cell, 12x4 threads per block
__kernel void compute_hists_kernel(
const int cblock_stride_x, const int cblock_stride_y,
const int cnbins, const int cblock_hist_size, const int img_block_width,
const int blocks_in_group, const int blocks_total,
const int grad_quadstep, const int qangle_step,
__global const float* grad, __global const uchar* qangle,
const float scale, __global float* block_hists, __local float* smem)
{
const int lx = get_local_id(0);
const int lp = lx / 24; /* local group id */
const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
const int gidY = gid / img_block_width;
const int gidX = gid - gidY * img_block_width;
const int lidX = lx - lp * 24;
const int lidY = get_local_id(1);
const int cell_x = lidX / 12;
const int cell_y = lidY;
const int cell_thread_x = lidX - cell_x * 12;
__local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X *
CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
__local float* final_hist = hists + cnbins *
(CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
__global const float* grad_ptr = (gid < blocks_total) ?
grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
__global const uchar* qangle_ptr = (gid < blocks_total) ?
qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
__local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
cell_thread_x;
for (int bin_id = 0; bin_id < cnbins; ++bin_id)
hist[bin_id * 48] = 0.f;
const int dist_x = -4 + cell_thread_x - 4 * cell_x;
const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
const int dist_y_begin = -4 - 4 * lidY;
for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
{
float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
grad_ptr += grad_quadstep;
qangle_ptr += qangle_step;
int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
float gaussian = exp(-(dist_center_y * dist_center_y + dist_center_x *
dist_center_x) * scale);
float interp_weight = (8.f - fabs(dist_y + 0.5f)) *
(8.f - fabs(dist_x + 0.5f)) / 64.f;
hist[bin.x * 48] += gaussian * interp_weight * vote.x;
hist[bin.y * 48] += gaussian * interp_weight * vote.y;
}
barrier(CLK_LOCAL_MEM_FENCE);
volatile __local float* hist_ = hist;
for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
{
if (cell_thread_x < 6)
hist_[0] += hist_[6];
barrier(CLK_LOCAL_MEM_FENCE);
if (cell_thread_x < 3)
hist_[0] += hist_[3];
#ifdef CPU
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (cell_thread_x == 0)
final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
hist_[0] + hist_[1] + hist_[2];
}
#ifdef CPU
barrier(CLK_LOCAL_MEM_FENCE);
#endif
int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
if ((tid < cblock_hist_size) && (gid < blocks_total))
{
__global float* block_hist = block_hists +
(gidY * img_block_width + gidX) * cblock_hist_size;
block_hist[tid] = final_hist[tid];
}
}
//-------------------------------------------------------------
// Normalization of histograms via L2Hys_norm
// optimized for the case of 9 bins
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册