From 5d9b047190afcf5967c019b1d8bffa2f4ff1a0cf Mon Sep 17 00:00:00 2001 From: StarryRain <36948762+StarryRain@users.noreply.github.com> Date: Tue, 10 Sep 2019 10:32:54 +0800 Subject: [PATCH] Optimal OPENCL conv_3x3 Performance (#1994) * add opencl depthwise_conv_trans_op * test=develop * Optimal OPENCL conv_3x3S2 Performance * test=develop --- .../kernel/cl/cl_kernel/conv_kernel.inc.cl | 27 ++++++++++--------- .../operators/kernel/cl/conv_add_kernel.cpp | 19 +++++-------- .../operators/kernel/cl/conv_relu_kernel.cpp | 3 ++- 3 files changed, 22 insertions(+), 27 deletions(-) diff --git a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl index b91be321a6..a292869c6f 100755 --- a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -424,8 +424,8 @@ __kernel void conv_3x3(__private const int global_size_dim0, write_imageh(output_image, output_pos, output); } - // dilation == 1 && stride == 1 && ou_nh == ou_h -__kernel void conv_3x3s1(__private const int item_ch, + // dilation == 1 +__kernel void conv_3x3spl(__private const int item_ch, __private const int item_w, __private const int item_h, __read_only image2d_t input_image, @@ -456,14 +456,8 @@ __read_only image2d_t new_scale, const int item_w_id = get_global_id(1); const int item_h_id = get_global_id(2); - // in_width_id_per_blk - int in_w_id0 = item_w_id - pad; - int in_w_id1 = in_w_id0 + item_w; - int in_w_id2 = in_w_id1 + item_w; - int in_w_id3 = in_w_id2 + item_w; - int in_w_id4 = in_w_id3 + item_w; - - // out_width_id_per_blk + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; int out_w_base_id = item_ch_id * out_w; int out_w_id0 = item_w_id; int out_w_id1 = out_w_id0 + item_w; @@ -471,6 +465,14 @@ __read_only image2d_t new_scale, int out_w_id3 = out_w_id2 + item_w; int out_w_id4 = out_w_id3 + item_w; + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + #ifdef BIASE_CH half4 output[5]; @@ -518,8 +520,8 @@ __read_only image2d_t new_scale, for (int h = 0; h < 3; h++) { - int in_h_val = select(item_h_id + h - pad, -1, - (item_h_id + h - pad < 0 || item_h_id + h - pad >= in_h)); + int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1, + (out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h)); for (int w = 0; w < 3; w++) { @@ -539,7 +541,6 @@ __read_only image2d_t new_scale, filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2 filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3 - filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x); // in_ch:0,out_ch:0-3 filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y); // in_ch:1,out_ch:0-3 filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z); // in_ch:2,out_ch:0-3 diff --git a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp index 8e21480b41..dc74cf0a6f 100644 --- a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp @@ -82,17 +82,11 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { // winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter()); // // } else { - if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT; - param->Filter()->InitCLImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("conv_3x3s1", conv_kernel_file, build_options); - } else { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT; - param->Filter()->InitCLImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options); - } + + param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT; + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("conv_3x3spl", conv_kernel_file, build_options); // } } else if (param->Filter()->dims()[2] == 7 && @@ -123,7 +117,6 @@ void ConvAddKernel::Compute( WinogradConv3x3<4, 3>(&this->cl_helper_, param, false, param.Bias()); break; case ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT: - case ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT: case ConvParam::EXEC_SLIDINGWINDOW5x5_FLOAT: case ConvParam::EXEC_SLIDINGWINDOW7x7_FLOAT: case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: @@ -132,7 +125,7 @@ void ConvAddKernel::Compute( case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: DWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias()); break; - case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: + case ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT: SWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias()); break; default: diff --git a/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp index 585b68f532..1aedbeec7a 100644 --- a/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp @@ -86,7 +86,8 @@ bool ConvReluKernel::Init(FusionConvReluParam *param) { param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT; param->Filter()->InitCLImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("conv_3x3s1", conv_kernel_file, build_options); + this->cl_helper_.AddKernel("conv_3x3spl", conv_kernel_file, + build_options); } else { param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT; param->Filter()->InitCLImage(cl_helper_.CLContext(), -- GitLab