Commit 5d9b0471 authored by StarryRain, committed by Yanzhan Yang

Optimize OpenCL conv_3x3 performance (#1994)

* add opencl  depthwise_conv_trans_op

* test=develop

* Optimize OpenCL conv_3x3S2 performance

* test=develop
Parent f38a741a
@@ -424,8 +424,8 @@ __kernel void conv_3x3(__private const int global_size_dim0,
     write_imageh(output_image, output_pos, output);
 }
-// dilation == 1 && stride == 1 && ou_nh == ou_h
-__kernel void conv_3x3s1(__private const int item_ch,
+// dilation == 1
+__kernel void conv_3x3spl(__private const int item_ch,
                          __private const int item_w,
                          __private const int item_h,
                          __read_only image2d_t input_image,
@@ -456,14 +456,8 @@ __read_only image2d_t new_scale,
   const int item_w_id = get_global_id(1);
   const int item_h_id = get_global_id(2);
-  // in_width_id_per_blk
-  int in_w_id0 = item_w_id - pad;
-  int in_w_id1 = in_w_id0 + item_w;
-  int in_w_id2 = in_w_id1 + item_w;
-  int in_w_id3 = in_w_id2 + item_w;
-  int in_w_id4 = in_w_id3 + item_w;
-  // out_width_id_per_blk
+  // out_width_id_per_blk and out_batch_id
+  int out_batch_id = item_h_id / in_h;
   int out_w_base_id = item_ch_id * out_w;
   int out_w_id0 = item_w_id;
   int out_w_id1 = out_w_id0 + item_w;
@@ -471,6 +465,14 @@ __read_only image2d_t new_scale,
   int out_w_id3 = out_w_id2 + item_w;
   int out_w_id4 = out_w_id3 + item_w;
+  // in_width_id_per_blk and in_height_id_per_batch
+  int in_h_id = (item_h_id % out_h) * stride - pad;
+  int in_w_id0 = item_w_id * stride - pad;
+  int in_w_id1 = in_w_id0 + item_w * stride;
+  int in_w_id2 = in_w_id1 + item_w * stride;
+  int in_w_id3 = in_w_id2 + item_w * stride;
+  int in_w_id4 = in_w_id3 + item_w * stride;
 #ifdef BIASE_CH
   half4 output[5];
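Reviewer note: the hunk above swaps the stride-1 width indices for stride-aware ones and adds a per-batch height index. Below is a minimal, self-contained C++ sketch of that index arithmetic (the struct and function names are illustrative, not part of this patch); it only mirrors the formulas in the added lines, where each work item covers five output columns spaced item_w apart.

```cpp
// Illustrative sketch only (not project code): mirrors the index arithmetic
// added above for the conv_3x3spl kernel. Each work item handles 5 output
// pixels along the width, spaced item_w apart in the output.
#include <array>
#include <cstdio>

struct BlockCoords {
  int out_batch_id;            // which image of the batch (as computed above)
  int in_h_id;                 // top input row of the 3x3 window, may be < 0
  std::array<int, 5> in_w_id;  // leftmost input column of each of the 5 outputs
};

BlockCoords MapOutputToInput(int item_w_id, int item_h_id, int item_w,
                             int stride, int pad, int out_h, int in_h) {
  BlockCoords c{};
  c.out_batch_id = item_h_id / in_h;               // as in the added line above
  c.in_h_id = (item_h_id % out_h) * stride - pad;  // row within this image
  c.in_w_id[0] = item_w_id * stride - pad;
  for (int i = 1; i < 5; ++i) {
    // consecutive outputs of one work item are item_w columns apart in the
    // output, so their input windows are item_w * stride columns apart
    c.in_w_id[i] = c.in_w_id[i - 1] + item_w * stride;
  }
  return c;
}

int main() {
  // Hypothetical sizes: stride 2, pad 1, 16 input rows, 8 output rows per image.
  BlockCoords c = MapOutputToInput(/*item_w_id=*/3, /*item_h_id=*/5,
                                   /*item_w=*/4, /*stride=*/2, /*pad=*/1,
                                   /*out_h=*/8, /*in_h=*/16);
  std::printf("batch %d  in_h_id %d  in_w_id0 %d\n", c.out_batch_id, c.in_h_id,
              c.in_w_id[0]);
  return 0;
}
```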
@@ -518,8 +520,8 @@ __read_only image2d_t new_scale,
   for (int h = 0; h < 3; h++) {
-    int in_h_val = select(item_h_id + h - pad, -1,
-                          (item_h_id + h - pad < 0 || item_h_id + h - pad >= in_h));
+    int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
+                          (out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h));
     for (int w = 0; w < 3; w++) {
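Reviewer note: the select() change above folds the batch offset into the row coordinate. For readers unfamiliar with OpenCL's scalar select(a, b, c), which returns b when c is non-zero and a otherwise, here is a small illustrative C++ sketch of the clamping idea; it assumes a clamp-to-border sampler (not shown in this excerpt), so a coordinate of -1 reads border zeros.

```cpp
// Illustrative sketch only: scalar behaviour of OpenCL's select(a, b, c),
// which returns b when c is non-zero and a otherwise. The kernel uses it to
// force out-of-range rows to -1 so the image read lands on the sampler's
// border (zero for a clamp-to-border sampler, assumed here).
#include <iostream>

int select_scalar(int a, int b, int c) { return c ? b : a; }

int main() {
  const int in_h = 8;  // hypothetical input height
  for (int row = -1; row <= in_h; ++row) {
    int in_h_val = select_scalar(row, -1, row < 0 || row >= in_h);
    std::cout << "requested row " << row << " -> sampled row " << in_h_val
              << '\n';
  }
  return 0;
}
```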
@@ -539,7 +541,6 @@ __read_only image2d_t new_scale,
   filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2
   filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3
   filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x); // in_ch:0,out_ch:0-3
   filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y); // in_ch:1,out_ch:0-3
   filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z); // in_ch:2,out_ch:0-3
......
@@ -82,17 +82,11 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
     //      winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter());
     //
     //    } else {
-    if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
-      param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3S1_FLOAT;
-      param->Filter()->InitCLImage(cl_helper_.CLContext(),
-                                   cl_helper_.CLCommandQueue());
-      this->cl_helper_.AddKernel("conv_3x3s1", conv_kernel_file, build_options);
-    } else {
-      param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
-      param->Filter()->InitCLImage(cl_helper_.CLContext(),
-                                   cl_helper_.CLCommandQueue());
-      this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options);
-    }
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
+    param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("conv_3x3spl", conv_kernel_file, build_options);
     //    }
   } else if (param->Filter()->dims()[2] == 7 &&
@@ -123,7 +117,6 @@ void ConvAddKernel<GPU_CL, float>::Compute(
       WinogradConv3x3<4, 3>(&this->cl_helper_, param, false, param.Bias());
       break;
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
-    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
     case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
@@ -132,7 +125,7 @@ void ConvAddKernel<GPU_CL, float>::Compute(
     case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
      DWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
       break;
-    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3S1_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
       SWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
       break;
     default:
......
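Reviewer note: with the kernel generalized, ConvAddKernel no longer needs a stride-1 special case. The sketch below is a self-contained paraphrase with simplified, hypothetical types (not the project's real classes) of the resulting dispatch: every non-depthwise 3x3 convolution now maps to EXEC_SLIDINGWINDOW3x3_FLOAT and the conv_3x3spl kernel regardless of stride.

```cpp
// Self-contained paraphrase (simplified, hypothetical types) of the 3x3
// dispatch after this commit: the stride-specific branch is gone and every
// non-depthwise 3x3 convolution uses the single conv_3x3spl kernel.
#include <iostream>
#include <string>

enum class ExecMode { SlidingWindow1x1, SlidingWindow3x3, SlidingWindow5x5 };

struct Dispatch {
  ExecMode mode;
  std::string kernel_name;
};

Dispatch Select3x3(int stride, int dilation) {
  // Before: stride == 1 && dilation == 1 picked "conv_3x3s1", everything else
  // fell back to "conv_3x3". Both cases now collapse into one path.
  (void)stride;
  (void)dilation;
  return {ExecMode::SlidingWindow3x3, "conv_3x3spl"};
}

int main() {
  const int strides[] = {1, 2};
  for (int stride : strides) {
    Dispatch d = Select3x3(stride, /*dilation=*/1);
    std::cout << "stride " << stride << " -> " << d.kernel_name << '\n';
  }
  return 0;
}
```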
@@ -86,7 +86,8 @@ bool ConvReluKernel<GPU_CL, float>::Init(FusionConvReluParam<GPU_CL> *param) {
       param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3S1_FLOAT;
       param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                    cl_helper_.CLCommandQueue());
-      this->cl_helper_.AddKernel("conv_3x3s1", conv_kernel_file, build_options);
+      this->cl_helper_.AddKernel("conv_3x3spl", conv_kernel_file,
+                                 build_options);
     } else {
       param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
       param->Filter()->InitCLImage(cl_helper_.CLContext(),
......