提交 1ab1fe02 编写于 作者: Z zhaojiaying01

add OpenCL depthwise3x3s1 kernel

上级 00a81db2
......@@ -233,5 +233,113 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
}
}
void DWConvAddBnRelu(framework::CLHelper *cl_helper,
const ConvParam<GPU_CL> &param, bool ifRelu,
const framework::CLImage *biase,
const framework::CLImage *new_scale,
const framework::CLImage *new_bias) {
auto kernel = cl_helper->KernelAt(0);
auto default_work_size = cl_helper->DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
int w_blk_size = 2;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
default_work_size[1] = w_blk;
auto input = param.Input()->GetCLImage();
auto filter = param.Filter()->GetCLImage();
auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0];
int pad = param.Paddings()[0];
int dilation = param.Dilations()[0];
int input_channel = param.Input()->dims()[1];
int input_height = param.Input()->dims()[2];
int input_width = param.Input()->dims()[3];
int output_height = param.Output()->dims()[2];
int output_width = param.Output()->dims()[3];
// DLOG << " w " << w;
// DLOG << " nh " << nh;
// DLOG << " stride " << stride;
// DLOG << " dilation " << dilation;
// DLOG << " input width " << input_width;
// DLOG << " input height " << input_height;
// DLOG << " output width " << output_width;
// DLOG << " output height " << output_height;
// DLOG << " input dim " << param.Input()->dims();
// DLOG << " output dim " << param.Output()->dims();
// DLOG << " filter dim " << param.Filter()->dims();
cl_int status;
int index = 0;
status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &w_blk);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
if (biase) {
auto bias_mem = biase->GetCLImage();
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem);
CL_CHECK_ERRORS(status);
}
if (new_scale && new_bias) {
auto new_scale_mem = new_scale->GetCLImage();
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem);
CL_CHECK_ERRORS(status);
auto new_bias_mem = new_bias->GetCLImage();
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem);
CL_CHECK_ERRORS(status);
}
status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &pad);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &input_channel);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
} // namespace operators
} // namespace paddle_mobile
......@@ -41,6 +41,12 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
const framework::CLImage *new_scale = nullptr,
const framework::CLImage *new_bias = nullptr);
void DWConvAddBnRelu(framework::CLHelper *cl_helper,
const ConvParam<GPU_CL> &param, bool ifRelu = false,
const framework::CLImage *biase = nullptr,
const framework::CLImage *new_scale = nullptr,
const framework::CLImage *new_bias = nullptr);
} // namespace operators
} // namespace paddle_mobile
......
......@@ -583,6 +583,155 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0,
}
__kernel void depth_conv_3x3s1(__private const int ou_ch_blk,
__private const int ou_w_blk,
__private const int ou_nh,
__read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int in_ch,
__private const int in_w,/* of one block */
__private const int in_h, /* of one block */
__private const int ou_w,
__private const int ou_h) {
const int ou_ch_blk_id = get_global_id(0);
const int ou_w_blk_id = get_global_id(1);
const int ou_nh_id = get_global_id(2);
const int w_blk_size = 2;
const int batch_id = ou_nh_id / ou_h;
int ou_col_id = ou_w_blk_id * w_blk_size;
int ou_row_id = ou_nh_id % ou_h;
int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id);
// input pos in one block and on batch
int col_id = ou_col_id - pad;
int row_id = ou_row_id - pad;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
#ifdef BIASE_CH
half4 output[2];
output[0] = read_imageh(bias, sampler, (int2)(ou_ch_blk_id, 0));
output[1] = output[0];
#elif defined(BIASE_ELE)
half4 output[2];
output[0] = read_imageh(bias, sampler, (int2)(ou_x, ou_nh_id));
if (ou_col_id + 1 < ou_w) {
output[1] = read_imageh(bias, sampler, (int2)(ou_x + 1, ou_nh_id));
}
#else
half4 output[2] = {0.0f};
#endif
half4 inputs[12];
int filter_x = ou_ch_blk_id * 3;
int filter_y = 0;
half4 filters[9];
filters[0] = read_imageh(filter, sampler,(int2)(filter_x,filter_y));
filters[1] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y));
filters[2] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y));
int in_x = mad24(ou_ch_blk_id, in_w, col_id);
int in_y = mad24(batch_id, in_h, row_id);
int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h);
int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w);
inputs[0] = read_imageh(input, sampler, (int2)(x0, y0));
int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w);
inputs[1] = read_imageh(input, sampler, (int2)(x1, y0));
int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w);
inputs[2] = read_imageh(input, sampler, (int2)(x2, y0));
int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w);
inputs[3] = read_imageh(input, sampler, (int2)(x3, y0));
output[0] = mad(inputs[0], filters[0], output[0]);
output[1] = mad(inputs[1], filters[0], output[1]);
output[0] = mad(inputs[1], filters[1], output[0]);
output[1] = mad(inputs[2], filters[1], output[1]);
output[0] = mad(inputs[2], filters[2], output[0]);
output[1] = mad(inputs[3], filters[2], output[1]);
filters[3] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 1));
filters[4] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 1));
filters[5] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 1));
int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h);
inputs[4] = read_imageh(input, sampler, (int2)(x0, y1));
inputs[5] = read_imageh(input, sampler, (int2)(x1, y1));
inputs[6] = read_imageh(input, sampler, (int2)(x2, y1));
inputs[7] = read_imageh(input, sampler, (int2)(x3, y1));
output[0] = mad(inputs[4], filters[3], output[0]);
output[1] = mad(inputs[5], filters[3], output[1]);
output[0] = mad(inputs[5], filters[4], output[0]);
output[1] = mad(inputs[6], filters[4], output[1]);
output[0] = mad(inputs[6], filters[5], output[0]);
output[1] = mad(inputs[7], filters[5], output[1]);
filters[6] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 2));
filters[7] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 2));
filters[8] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 2));
int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h);
inputs[8] = read_imageh(input, sampler, (int2)(x0, y2));
inputs[9] = read_imageh(input, sampler, (int2)(x1, y2));
inputs[10] = read_imageh(input, sampler, (int2)(x2, y2));
inputs[11] = read_imageh(input, sampler, (int2)(x3, y2));
output[0] = mad(inputs[8], filters[6], output[0]);
output[1] = mad(inputs[9], filters[6], output[1]);
output[0] = mad(inputs[9], filters[7], output[0]);
output[1] = mad(inputs[10], filters[7], output[1]);
output[0] = mad(inputs[10], filters[8], output[0]);
output[1] = mad(inputs[11], filters[8], output[1]);
#ifdef BATCH_NORM
half4 scale = read_imageh(new_scale, sampler, (int2)(ou_ch_blk_id, 0));
half4 biase = read_imageh(new_biase, sampler, (int2)(ou_ch_blk_id, 0));
output[0] = mad(scale, output[0], biase);
if (ou_col_id + 1 < ou_w) {
output[1] = mad(scale, output[1], biase);
}
#endif
#ifdef RELU
output[0] = activation(output[0]);
output[1] = activation(output[1]);
#endif
write_imageh(output_image, (int2)(ou_x, ou_nh_id), output[0]);
if (ou_col_id + 1 < ou_w) {
write_imageh(output_image, (int2)(ou_x + 1, ou_nh_id), output[1]);
}
}
__kernel void conv_1x1(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
......
......@@ -157,12 +157,17 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] == 3) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT;
this->cl_helper_.AddKernel("depth_conv_3x3s1", conv_kernel_file,
build_options);
} else {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
build_options);
}
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
......@@ -207,6 +212,10 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(),
param.NewScale(), param.NewBias());
break;
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
DWConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(),
param.NewScale(), param.NewBias());
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
......
......@@ -54,12 +54,17 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] == 3) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT;
this->cl_helper_.AddKernel("depth_conv_3x3s1", conv_kernel_file,
build_options);
} else {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
build_options);
}
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
......@@ -118,6 +123,9 @@ void ConvAddKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
break;
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
DWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
......
......@@ -55,12 +55,17 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] == 3) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT;
this->cl_helper_.AddKernel("depth_conv_3x3s1", conv_kernel_file,
build_options);
} else {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
build_options);
}
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
......@@ -122,6 +127,9 @@ void ConvAddReluKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias());
break;
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
DWConvAddBnRelu(&this->cl_helper_, param, true, param.Bias());
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
......
......@@ -112,12 +112,17 @@ bool ConvBNReluKernel<GPU_CL, float>::Init(
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] == 3) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT;
this->cl_helper_.AddKernel("depth_conv_3x3s1", conv_kernel_file,
build_options);
} else {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
build_options);
}
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
......@@ -161,6 +166,10 @@ void ConvBNReluKernel<GPU_CL, float>::Compute(
ConvAddBnRelu(&this->cl_helper_, param, true, nullptr, param.NewScale(),
param.NewBias());
break;
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
DWConvAddBnRelu(&this->cl_helper_, param, true, nullptr, param.NewScale(),
param.NewBias());
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
......
......@@ -51,11 +51,15 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] == 3) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT;
this->cl_helper_.AddKernel("depth_conv_3x3s1", conv_kernel_file);
} else {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file);
}
DLOG << "depth_conv 3x3";
} else if (param->Filter()->dims()[2] == 3 &&
......@@ -100,6 +104,9 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param);
break;
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
DWConvAddBnRelu(&this->cl_helper_, param);
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
......
......@@ -52,12 +52,18 @@ bool ConvReluKernel<GPU_CL, float>::Init(FusionConvReluParam<GPU_CL> *param) {
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] == 3) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT;
this->cl_helper_.AddKernel("depth_conv_3x3s1", conv_kernel_file,
build_options);
} else {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT;
this->cl_helper_.AddKernel("depth_conv_3x3", conv_kernel_file,
build_options);
}
DLOG << "depth_conv 3x3";
} else if (param->Filter()->dims()[2] == 3 &&
......@@ -103,6 +109,9 @@ void ConvReluKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
ConvAddBnRelu(&this->cl_helper_, param, true);
break;
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
DWConvAddBnRelu(&this->cl_helper_, param, true);
break;
default:
PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
param.ExecMode());
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册