From 9b5b32a168a24c3807d5ba11d9551ccd39e530e4 Mon Sep 17 00:00:00 2001 From: xiebaiyuan Date: Wed, 6 Nov 2019 20:24:36 +0800 Subject: [PATCH] optimise conv 1x1 ... ,test=develop (#2388) optimise conv 1x1 ... ,test=develop (#2388) --- .../kernel/cl/cl_kernel/conv_kernel.inc.cl | 252 +++++++++++++----- .../kernel/cl/conv_add_bn_relu_kernel.cpp | 9 +- .../operators/kernel/cl/conv_add_kernel.cpp | 9 +- .../kernel/cl/conv_add_relu_kernel.cpp | 9 +- .../kernel/cl/conv_bn_relu_kernel.cpp | 9 +- .../src/operators/kernel/cl/conv_kernel.cpp | 6 +- .../operators/kernel/cl/conv_relu_kernel.cpp | 8 +- 7 files changed, 227 insertions(+), 75 deletions(-) mode change 100755 => 100644 mobile/src/operators/kernel/cl/conv_kernel.cpp diff --git a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl index 2232cdc0a4..bdace5b540 100755 --- a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -1006,8 +1006,160 @@ __kernel void conv_1x1(__private const int global_size_dim0, write_imageh(output_image, output_pos, output); } +__kernel void conv_1x1_simple( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c,__private const int input_c_origin, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height, + __private const int old_w +) { + half zero = 0.0f; + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int out_w0 = out_w; + int out_w1 = out_w + global_size_dim1; + int out_w2 = out_w + global_size_dim1 * 2; + int out_w3 = out_w + global_size_dim1 * 3; + + int outpos_main = mul24(out_c , old_w); + int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); + int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); + int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); + int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 stride_xy = (int2)(stride, stride); + + int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh); + int2 in_pos_in_one_block0 = + ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh); + int2 in_pos_in_one_block1 = + ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh); + int2 in_pos_in_one_block2 = + ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset); -__kernel void conv_1x1_spl( + int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh); + int2 in_pos_in_one_block3 = + ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); + +#ifdef BIASE_CH + half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output1 = output0; + half4 output2 = output0; + half4 output3 = output0; +#elif defined(BIASE_ELE) + half4 output0 = read_imageh(bias, sampler, output_pos0); + half4 output1 = output0; + half4 output2 = output0; + half4 output3 = output0; + +#else + half4 output0 = 0.0f; + half4 output1 = 0.0f; + half4 output2 = 0.0f; + half4 output3 = 0.0f; +#endif + + for (int i = 0; i < input_c; ++i) { + // ------------0--------------- + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); + half4 input0 = read_imageh(input_image, sampler, pos_in); + + half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); + half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); + half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); + half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); + + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(input0.w, weight3, output0); + // -------------1-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); + half4 input1 = read_imageh(input_image, sampler, pos_in); + + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(input1.w, weight3, output1); + + // -------------2-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); + half4 input2 = read_imageh(input_image, sampler, pos_in); + + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(input2.w, weight3, output2); + + // -------------3-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); + half4 input3 = read_imageh(input_image, sampler, pos_in); + + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(input3.w, weight3, output3); + } + +#ifdef BATCH_NORM + output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + + output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + + output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + + output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef RELU + output0 = activation(output0); + output1 = activation(output1); + output2 = activation(output2); + output3 = activation(output3); +#endif + + if (out_w0 < old_w) { + write_imageh(output_image, output_pos0, output0); + } + + if (out_w1 < old_w){ + write_imageh(output_image, output_pos1, output1); + } + + if (out_w2 < old_w){ + write_imageh(output_image, output_pos2, output2); + } + + if (out_w3 < old_w){ + write_imageh(output_image, output_pos3, output3); + } +} +__kernel void conv_1x1_wrapped( __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, @@ -1026,7 +1178,7 @@ __kernel void conv_1x1_spl( __private const int output_height, __private const int old_w ) { - + half zero = 0.0f; const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -1065,14 +1217,14 @@ __kernel void conv_1x1_spl( #ifdef BIASE_CH half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output1 = output0; + half4 output2 = output0; + half4 output3 = output0; #elif defined(BIASE_ELE) half4 output0 = read_imageh(bias, sampler, output_pos0); - half4 output1 = read_imageh(bias, sampler, output_pos1); - half4 output2 = read_imageh(bias, sampler, output_pos2); - half4 output3 = read_imageh(bias, sampler, output_pos3); + half4 output1 = output0; + half4 output2 = output0; + half4 output3 = output0; #else half4 output0 = 0.0f; @@ -1083,6 +1235,10 @@ __kernel void conv_1x1_spl( int max_w_bound = input_c * input_width; int burndary_index = input_c * 4 - input_c_origin; + bool burndary_index_w = burndary_index==1||burndary_index==2||burndary_index==3; + bool burndary_index_z = burndary_index==2||burndary_index==3; + bool burndary_index_y = burndary_index==3; + for (int i = 0; i < input_c; ++i) { // ------------0--------------- int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); @@ -1092,22 +1248,13 @@ __kernel void conv_1x1_spl( half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); - int bound_gap = max_w_bound - pos_in.x - 1; - if (bound_gap < input_width && bound_gap >= 0){ - if (burndary_index==0){ - // do nothing - } else if (burndary_index==1){ - input0.w = 0.0f; - } else if (burndary_index==2){ - input0.z = 0.0f; - input0.w = 0.0f; - } else if (burndary_index==3){ - input0.y = 0.0f; - input0.z = 0.0f; - input0.w = 0.0f; - } - } + + bool outof_bound = bound_gap < input_width && bound_gap >= 0; + input0.w = select(input0.w,zero,outof_bound && burndary_index_w); + input0.z = select(input0.z,zero,outof_bound && burndary_index_z); + input0.y = select(input0.y,zero,outof_bound && burndary_index_y); + output0 = mad(input0.x, weight0, output0); output0 = mad(input0.y, weight1, output0); output0 = mad(input0.z, weight2, output0); @@ -1117,20 +1264,12 @@ __kernel void conv_1x1_spl( half4 input1 = read_imageh(input_image, sampler, pos_in); bound_gap = max_w_bound - pos_in.x - 1; - if (bound_gap < input_width && bound_gap >= 0){ - if (burndary_index==0){ - // do nothing - } else if (burndary_index==1){ - input1.w = 0.0f; - } else if (burndary_index==2){ - input1.z = 0.0f; - input1.w = 0.0f; - } else if (burndary_index==3){ - input1.y = 0.0f; - input1.z = 0.0f; - input1.w = 0.0f; - } - } + + outof_bound = bound_gap < input_width && bound_gap >= 0; + input1.w = select(input1.w,zero,outof_bound && burndary_index_w); + input1.z = select(input1.z,zero,outof_bound && burndary_index_z); + input1.y = select(input1.y,zero,outof_bound && burndary_index_y); + output1 = mad(input1.x, weight0, output1); output1 = mad(input1.y, weight1, output1); output1 = mad(input1.z, weight2, output1); @@ -1141,20 +1280,12 @@ __kernel void conv_1x1_spl( half4 input2 = read_imageh(input_image, sampler, pos_in); bound_gap = max_w_bound - pos_in.x - 1; - if (bound_gap < input_width && bound_gap >= 0){ - if (burndary_index==0){ - // do nothing - } else if (burndary_index==1){ - input2.w = 0.0f; - } else if (burndary_index==2){ - input2.z = 0.0f; - input2.w = 0.0f; - } else if (burndary_index==3){ - input2.y = 0.0f; - input2.z = 0.0f; - input2.w = 0.0f; - } - } + + outof_bound = bound_gap < input_width && bound_gap >= 0; + input2.w = select(input2.w,zero,outof_bound && burndary_index_w); + input2.z = select(input2.z,zero,outof_bound && burndary_index_z); + input2.y = select(input2.y,zero,outof_bound && burndary_index_y); + output2 = mad(input2.x, weight0, output2); output2 = mad(input2.y, weight1, output2); output2 = mad(input2.z, weight2, output2); @@ -1164,20 +1295,11 @@ __kernel void conv_1x1_spl( pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); half4 input3 = read_imageh(input_image, sampler, pos_in); bound_gap = max_w_bound - pos_in.x - 1; - if (bound_gap < input_width && bound_gap >= 0){ - if (burndary_index==0){ - // do nothing - } else if (burndary_index==1){ - input3.w = 0.0f; - } else if (burndary_index==2){ - input3.z = 0.0f; - input3.w = 0.0f; - } else if (burndary_index==3){ - input3.y = 0.0f; - input3.z = 0.0f; - input3.w = 0.0f; - } - } + + outof_bound = bound_gap < input_width && bound_gap >= 0; + input3.w = select(input3.w,zero,outof_bound && (burndary_index==1||burndary_index==2||burndary_index==3)); + input3.z = select(input3.z,zero,outof_bound && (burndary_index==2||burndary_index==3)); + input3.y = select(input3.y,zero,outof_bound && burndary_index==3); output3 = mad(input3.x, weight0, output3); output3 = mad(input3.y, weight1, output3); diff --git a/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp index 66e839c0fb..a3a469dc86 100644 --- a/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -152,8 +152,13 @@ bool ConvAddBNReluKernel::Init( param->Filter()->InitNImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file, build_options); - + if (param->Input()->dims()[1] % 4 == 0) { + this->cl_helper_.AddKernel("conv_1x1_simple", conv_kernel_file, + build_options); + } else { + this->cl_helper_.AddKernel("conv_1x1_wrapped", conv_kernel_file, + build_options); + } } else if (param->Filter()->dims()[1] == 1 && param->Input()->dims()[1] == param->Output()->dims()[1] && param->Filter()->dims()[2] == 3) { diff --git a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp index 7422514228..a0e890a70b 100644 --- a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp @@ -48,8 +48,13 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT; param->Filter()->InitNImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); - - this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file, build_options); + if (param->Input()->dims()[1] % 4 == 0) { + this->cl_helper_.AddKernel("conv_1x1_simple", conv_kernel_file, + build_options); + } else { + this->cl_helper_.AddKernel("conv_1x1_wrapped", conv_kernel_file, + build_options); + } } else if (param->Filter()->dims()[1] == 1 && param->Input()->dims()[1] == param->Output()->dims()[1] && diff --git a/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp index a6b4af7231..77738fe34c 100644 --- a/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp @@ -50,8 +50,13 @@ bool ConvAddReluKernel::Init( param->Filter()->InitNImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file, build_options); - + if (param->Input()->dims()[1] % 4 == 0) { + this->cl_helper_.AddKernel("conv_1x1_simple", conv_kernel_file, + build_options); + } else { + this->cl_helper_.AddKernel("conv_1x1_wrapped", conv_kernel_file, + build_options); + } } else if (param->Filter()->dims()[1] == 1 && param->Input()->dims()[1] == param->Output()->dims()[1] && param->Filter()->dims()[2] == 3) { diff --git a/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp index a69280a32b..c8cb97c2e2 100644 --- a/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp @@ -107,8 +107,13 @@ bool ConvBNReluKernel::Init( param->Filter()->InitNImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file, build_options); - + if (param->Input()->dims()[1] % 4 == 0) { + this->cl_helper_.AddKernel("conv_1x1_simple", conv_kernel_file, + build_options); + } else { + this->cl_helper_.AddKernel("conv_1x1_wrapped", conv_kernel_file, + build_options); + } } else if (param->Filter()->dims()[1] == 1 && param->Input()->dims()[1] == param->Output()->dims()[1] && param->Filter()->dims()[2] == 3) { diff --git a/mobile/src/operators/kernel/cl/conv_kernel.cpp b/mobile/src/operators/kernel/cl/conv_kernel.cpp old mode 100755 new mode 100644 index 71c67d59ea..2859715b9c --- a/mobile/src/operators/kernel/cl/conv_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_kernel.cpp @@ -45,7 +45,11 @@ bool ConvKernel::Init(ConvParam *param) { param->Filter()->InitNImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file); + if (param->Input()->dims()[1] % 4 == 0) { + this->cl_helper_.AddKernel("conv_1x1_simple", conv_kernel_file); + } else { + this->cl_helper_.AddKernel("conv_1x1_wrapped", conv_kernel_file); + } DLOG << "conv 1x1"; } else if (param->Filter()->dims()[1] == 1 && diff --git a/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp index 1aedbeec7a..0e63ccb095 100644 --- a/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp @@ -46,7 +46,13 @@ bool ConvReluKernel::Init(FusionConvReluParam *param) { param->Filter()->InitNImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file, build_options); + if (param->Input()->dims()[1] % 4 == 0) { + this->cl_helper_.AddKernel("conv_1x1_simple", conv_kernel_file, + build_options); + } else { + this->cl_helper_.AddKernel("conv_1x1_wrapped", conv_kernel_file, + build_options); + } DLOG << "conv 1x1"; } else if (param->Filter()->dims()[1] == 1 && -- GitLab