提交 b50f52cd 编写于 作者: S StarryRain 提交者: Yanzhan Yang

Optimal OPENCL conv_3x3 Performance (#1994)

* add opencl  depthwise_conv_trans_op

* test=develop

* Optimal OPENCL conv_3x3S2 Performance

* test=develop
上级 58ab6c29
......@@ -424,8 +424,8 @@ __kernel void conv_3x3(__private const int global_size_dim0,
write_imageh(output_image, output_pos, output);
}
// dilation == 1 && stride == 1 && ou_nh == ou_h
__kernel void conv_3x3s1(__private const int item_ch,
// dilation == 1
__kernel void conv_3x3spl(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__read_only image2d_t input_image,
......@@ -456,14 +456,8 @@ __read_only image2d_t new_scale,
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// in_width_id_per_blk
int in_w_id0 = item_w_id - pad;
int in_w_id1 = in_w_id0 + item_w;
int in_w_id2 = in_w_id1 + item_w;
int in_w_id3 = in_w_id2 + item_w;
int in_w_id4 = in_w_id3 + item_w;
// out_width_id_per_blk
// out_width_id_per_blk and out_batch_id
int out_batch_id = item_h_id / in_h;
int out_w_base_id = item_ch_id * out_w;
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
......@@ -471,6 +465,14 @@ __read_only image2d_t new_scale,
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;
// in_width_id_per_blk and in_height_id_per_batch
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
#ifdef BIASE_CH
half4 output[5];
......@@ -518,8 +520,8 @@ __read_only image2d_t new_scale,
for (int h = 0; h < 3; h++) {
int in_h_val = select(item_h_id + h - pad, -1,
(item_h_id + h - pad < 0 || item_h_id + h - pad >= in_h));
int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
(out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h));
for (int w = 0; w < 3; w++) {
......@@ -539,7 +541,6 @@ __read_only image2d_t new_scale,
filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z); // in_ch:2,out_ch:0-3
......
......@@ -82,17 +82,11 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
// winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter());
//
// } else {
if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3S1_FLOAT;
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_3x3s1", conv_kernel_file, build_options);
} else {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options);
}
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_3x3spl", conv_kernel_file, build_options);
// }
} else if (param->Filter()->dims()[2] == 7 &&
......@@ -123,7 +117,6 @@ void ConvAddKernel<GPU_CL, float>::Compute(
WinogradConv3x3<4, 3>(&this->cl_helper_, param, false, param.Bias());
break;
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
......@@ -132,7 +125,7 @@ void ConvAddKernel<GPU_CL, float>::Compute(
case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
DWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
break;
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3S1_FLOAT:
case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
SWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
break;
default:
......
......@@ -86,7 +86,8 @@ bool ConvReluKernel<GPU_CL, float>::Init(FusionConvReluParam<GPU_CL> *param) {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3S1_FLOAT;
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_3x3s1", conv_kernel_file, build_options);
this->cl_helper_.AddKernel("conv_3x3spl", conv_kernel_file,
build_options);
} else {
param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
param->Filter()->InitCLImage(cl_helper_.CLContext(),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册