提交 9851c008 编写于 作者: N NazgulLee 提交者: Yanzhan Yang

optimize conv7*7, compute 2 point in 1 thread. test=develop (#2028)

上级 8ab543ff
......@@ -32,6 +32,135 @@ void WinogradConv3x3<4, 3>(framework::CLHelper *cl_helper,
const framework::CLImage *new_scale,
const framework::CLImage *new_bias) {}
void ConvAddBnReluPt1x2(framework::CLHelper *cl_helper,
const ConvParam<GPU_CL> &param, bool ifRelu,
const framework::CLImage *biase,
const framework::CLImage *new_scale,
const framework::CLImage *new_bias) {
auto kernel = cl_helper->KernelAt(0);
auto default_work_size = cl_helper->DefaultWorkSize(*param.Output());
default_work_size[1] = (default_work_size[1] + 1) / 2;
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
auto input = param.Input()->GetCLImage();
auto filter = param.Filter()->GetCLImage();
auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
param.Input()->Converter())
->GetCBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->dims()[3];
int input_height = param.Input()->dims()[2];
int output_width = param.Output()->dims()[3];
int output_height = param.Output()->dims()[2];
int filter_channel = param.Filter()->dims()[1];
int input_channel = param.Input()->dims()[1];
//
// DLOG << " c block " << c_block;
// DLOG << " w " << w;
// DLOG << " nh " << nh;
// DLOG << " stride " << stride;
// DLOG << " offset " << offset;
// DLOG << " input_c " << input_c;
// DLOG << " dilation " << dilation;
// DLOG << " input width " << input_width;
// DLOG << " input height " << input_height;
// DLOG << " output width " << output_width;
// DLOG << " output height " << output_height;
// DLOG << " input dim " << param.Input()->dims();
// DLOG << " output dim " << param.Output()->dims();
// DLOG << " filter dim " << param.Filter()->dims();
cl_int status;
int index = 0;
status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
if (biase) {
auto bias_mem = biase->GetCLImage();
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem);
CL_CHECK_ERRORS(status);
}
if (new_scale && new_bias) {
auto new_scale_mem = new_scale->GetCLImage();
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem);
CL_CHECK_ERRORS(status);
auto new_bias_mem = new_bias->GetCLImage();
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem);
CL_CHECK_ERRORS(status);
}
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
if (param.Filter()->dims()[2] == 3 && param.Filter()->dims()[3] == 3) {
if (filter_channel != input_channel) {
if (filter_channel != 1) {
status = clSetKernelArg(kernel, index++, sizeof(int), &filter_channel);
CL_CHECK_ERRORS(status);
int has_group = 1;
status = clSetKernelArg(kernel, index++, sizeof(int), &has_group);
CL_CHECK_ERRORS(status);
}
} else {
status = clSetKernelArg(kernel, index++, sizeof(int), &filter_channel);
CL_CHECK_ERRORS(status);
int has_group = 0;
status = clSetKernelArg(kernel, index++, sizeof(int), &has_group);
CL_CHECK_ERRORS(status);
}
}
// DLOG<<"default_work_size"<<default_work_size[0]<<"
// "<<default_work_size[1]<<" "<<default_work_size[2];
status = clEnqueueNDRangeKernel(
cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
void ConvAddBnRelu(framework::CLHelper *cl_helper,
const ConvParam<GPU_CL> &param, bool ifRelu,
const framework::CLImage *biase,
......
......@@ -41,6 +41,12 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
const framework::CLImage *new_scale = nullptr,
const framework::CLImage *new_bias = nullptr);
void ConvAddBnReluPt1x2(framework::CLHelper *cl_helper,
const ConvParam<GPU_CL> &param, bool ifRelu = false,
const framework::CLImage *biase = nullptr,
const framework::CLImage *new_scale = nullptr,
const framework::CLImage *new_bias = nullptr);
void DWConvAddBnRelu(framework::CLHelper *cl_helper,
const ConvParam<GPU_CL> &param, bool ifRelu = false,
const framework::CLImage *biase = nullptr,
......
......@@ -2111,6 +2111,157 @@ __kernel void conv_7x7(__private const int global_size_dim0,
write_imageh(output_image, output_pos, output);
}
__kernel void conv_7x7Pt1x2(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w1 = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= global_size_dim0 ||
out_w1 >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const int out_w = out_w1 * 2;
int2 output_pos = (int2)(out_c * output_width + out_w, out_nh);
const int filter_n0 = 4 * out_c + 0;
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
half4 output0 = 0.0f;
half4 output1 = 0.0f;
#ifdef BIASE_CH
output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
output1 = output0;
#elif defined(BIASE_ELE)
output0 = read_imageh(bias, sampler, output_pos);
output1 = read_imageh(bias, sampler, (int2)(output_pos.x + 1, output_pos.y));
#else
output0 = 0.0f;
output1 = 0.0f;
#endif
half4 input[8];
half4 filter0[4];
half4 filter1[4];
half4 filter2[4];
half4 filter3[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
for(int k = 0; k < 7; k++){
for (int j = 0; j < 8; j++) {
input[j] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + (j - 3) * dilation, pos_in.y + (k - 3) * dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 || in_pos_in_one_block.y + (k - 3) * dilation < 0 || in_pos_in_one_block.x + (j - 3) * dilation >= input_width || in_pos_in_one_block.y + (k - 3) * dilation >= input_height) << 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
if (j < 7) {
filter_pos0.x = filter_c * 7 + filter_w;
filter_pos0.y = filter_n0 * 7 + filter_h;
filter_pos1.x = filter_c * 7 + filter_w;
filter_pos1.y = filter_n1 * 7 + filter_h;
filter_pos2.x = filter_c * 7 + filter_w;
filter_pos2.y = filter_n2 * 7 + filter_h;
filter_pos3.x = filter_c * 7 + filter_w;
filter_pos3.y = filter_n3 * 7 + filter_h;
filter0[0] = read_imageh(filter_image, sampler, filter_pos0);
filter0[1] = read_imageh(filter_image, sampler, filter_pos1);
filter0[2] = read_imageh(filter_image, sampler, filter_pos2);
filter0[3] = read_imageh(filter_image, sampler, filter_pos3);
output0.x += dot(input[j], filter0[0]);
output0.y += dot(input[j], filter0[1]);
output0.z += dot(input[j], filter0[2]);
output0.w += dot(input[j], filter0[3]);
}
if (j > 0) {
output1.x += dot(input[j], filter1[0]);
output1.y += dot(input[j], filter1[1]);
output1.z += dot(input[j], filter1[2]);
output1.w += dot(input[j], filter1[3]);
}
filter1[0] = filter0[0];
filter1[1] = filter0[1];
filter1[2] = filter0[2];
filter1[3] = filter0[3];
}
}
}
#ifdef BATCH_NORM
half s = read_imageh(new_scale, sampler, (int2)(out_c, 0));
half b = read_imageh(new_biase, sampler, (int2)(out_c, 0));
output0 = output0 * s + b;
output1 = output1 * s + b;
#endif
#ifdef RELU
output0 = activation(output0);
output1 = activation(output1);
#endif
write_imageh(output_image, output_pos, output0);
if ((output_pos.x + 1) % output_width != 0) {
write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output1);
}
}
__kernel void conv_5x5(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
......
......@@ -95,7 +95,8 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_7x7", conv_kernel_file, build_options);
this->cl_helper_.AddKernel("conv_7x7Pt1x2", conv_kernel_file,
build_options);
} else if (param->Filter()->dims()[2] == 5 &&
param->Filter()->dims()[3] == 5) {
......@@ -118,10 +119,12 @@ void ConvAddKernel<GPU_CL, float>::Compute(
break;
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
break;
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
ConvAddBnReluPt1x2(&this->cl_helper_, param, false, param.Bias());
break;
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
DWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
break;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册