From 8f6ba157b3366a4cf707d0c9b330edd176c0890a Mon Sep 17 00:00:00 2001 From: StarryRain <36948762+StarryRain@users.noreply.github.com> Date: Fri, 16 Aug 2019 15:14:41 +0800 Subject: [PATCH] add opencl slidingwindow3*3s1 (#1799) * add CPU_ARCH info, improve the performance of GEMM1*1s1 * improve the performance of gemm1*1s1_conv_add and gemm1*1s1_conv_add_bn_relu * improve the performance of slidingwindow_bn_relu,slidingwindow_add,slidingwindow_add_bn_relu,gemm1*1s1_bn_relu,gemm1*1s1_add_relu * add faster sgemv_notrans_mx1, fix test_fusion_op * add opencl slidingwindow3*3s1 --- .../kernel/cl/cl-kernel-func/conv_func.cpp | 139 ++++++++++++- .../kernel/cl/cl-kernel-func/conv_func.h | 6 + .../kernel/cl/cl_kernel/conv_kernel.inc.cl | 196 ++++++++++++++++++ src/operators/kernel/cl/conv_add_kernel.cpp | 19 +- src/operators/kernel/cl/conv_relu_kernel.cpp | 19 +- 5 files changed, 363 insertions(+), 16 deletions(-) diff --git a/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp b/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp index bef8681376..40b7e5d97a 100644 --- a/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp +++ b/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp @@ -19,6 +19,7 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { +bool use_lws = true; template <> void winograd_transform_weight<4, 3>(framework::CLHelper *cl_helper, @@ -144,9 +145,18 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper, static_cast(maped_w), static_cast(default_work_size.data()[2])}; - status = clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, - default_work_size.size(), NULL, work_size, - NULL, 0, NULL, NULL); + if (work_size[1] % 60 == 0 && use_lws) { + const size_t local_work_size[3] = {static_cast(1), + static_cast(60), + static_cast(1)}; + status = clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, + default_work_size.size(), NULL, work_size, + local_work_size, 0, NULL, NULL); + } else { + status = clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, + default_work_size.size(), NULL, work_size, + NULL, 0, NULL, NULL); + } CL_CHECK_ERRORS(status); } else { status = clSetKernelArg(kernel, index++, sizeof(int), &c_block); @@ -335,11 +345,128 @@ void DWConvAddBnRelu(framework::CLHelper *cl_helper, status = clSetKernelArg(kernel, index++, sizeof(int), &output_height); CL_CHECK_ERRORS(status); - status = clEnqueueNDRangeKernel( - cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); + if (default_work_size.data()[1] % 60 == 0 && use_lws) { + const size_t local_work_size[3] = {static_cast(1), + static_cast(60), + static_cast(1)}; + status = clEnqueueNDRangeKernel( + cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), local_work_size, 0, NULL, NULL); + } else { + status = clEnqueueNDRangeKernel( + cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + } + CL_CHECK_ERRORS(status); } +void SWConvAddBnRelu(framework::CLHelper *cl_helper, + const ConvParam ¶m, bool ifRelu, + const framework::CLImage *biase, + const framework::CLImage *new_scale, + const framework::CLImage *new_bias) { + auto kernel = cl_helper->KernelAt(0); + auto default_work_size = cl_helper->DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + + int w_blk_size = 5; + int w_blk = (w + w_blk_size - 1) / w_blk_size; + default_work_size[1] = w_blk; + + int h_blk_size = 1; + int h_blk = (nh + h_blk_size - 1) / h_blk_size; + default_work_size[2] = h_blk; + + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + + auto output = param.Output()->GetCLImage(); + int stride = param.Strides()[0]; + int pad = param.Paddings()[0]; + int dilation = param.Dilations()[0]; + + int input_channel = param.Input()->dims()[1]; + int input_height = param.Input()->dims()[2]; + int input_width = param.Input()->dims()[3]; + + int output_height = param.Output()->dims()[2]; + int output_width = param.Output()->dims()[3]; + + cl_int status; + int index = 0; + + status = clSetKernelArg(kernel, index++, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &w_blk); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &h_blk); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + + if (biase) { + auto bias_mem = biase->GetCLImage(); + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem); + CL_CHECK_ERRORS(status); + } + + if (new_scale && new_bias) { + auto new_scale_mem = new_scale->GetCLImage(); + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem); + CL_CHECK_ERRORS(status); + + auto new_bias_mem = new_bias->GetCLImage(); + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem); + CL_CHECK_ERRORS(status); + } + + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &pad); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, index++, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &input_channel); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, index++, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + + if (default_work_size.data()[1] % 60 == 0 && use_lws) { + const size_t local_work_size[3] = {static_cast(1), + static_cast(60), + static_cast(1)}; + status = clEnqueueNDRangeKernel( + cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), local_work_size, 0, NULL, NULL); + } else { + status = clEnqueueNDRangeKernel( + cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + } + CL_CHECK_ERRORS(status); +} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/cl/cl-kernel-func/conv_func.h b/src/operators/kernel/cl/cl-kernel-func/conv_func.h index 08a62fde75..f8e0ca1720 100644 --- a/src/operators/kernel/cl/cl-kernel-func/conv_func.h +++ b/src/operators/kernel/cl/cl-kernel-func/conv_func.h @@ -47,6 +47,12 @@ void DWConvAddBnRelu(framework::CLHelper *cl_helper, const framework::CLImage *new_scale = nullptr, const framework::CLImage *new_bias = nullptr); +void SWConvAddBnRelu(framework::CLHelper *cl_helper, + const ConvParam ¶m, bool ifRelu = false, + const framework::CLImage *biase = nullptr, + const framework::CLImage *new_scale = nullptr, + const framework::CLImage *new_bias = nullptr); + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl index 99efc933ae..464c955382 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -424,6 +424,202 @@ __kernel void conv_3x3(__private const int global_size_dim0, write_imageh(output_image, output_pos, output); } + // dilation == 1 && stride == 1 && ou_nh == ou_h +__kernel void conv_3x3s1(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM +__read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // in_width_id_per_blk + int in_w_id0 = item_w_id - pad; + int in_w_id1 = in_w_id0 + item_w; + int in_w_id2 = in_w_id1 + item_w; + int in_w_id3 = in_w_id2 + item_w; + int in_w_id4 = in_w_id3 + item_w; + + // out_width_id_per_blk + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + +#ifdef BIASE_CH + + half4 output[5]; + output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + half4 output[5]; + output[0] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + half4 output[5] = {0.0f}; +#endif + + half4 filter[4] = {0.0f}; + half4 filter_trans[4] = {0.0f}; + half4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * 3; + int filter_h_val1 = filter_h_val0 + 3; + int filter_h_val2 = filter_h_val1 + 3; + int filter_h_val3 = filter_h_val2 + 3; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * 3; + + for (int h = 0; h < 3; h++) { + + int in_h_val = select(item_h_id + h - pad, -1, + (item_h_id + h - pad < 0 || item_h_id + h - pad >= in_h)); + + for (int w = 0; w < 3; w++) { + + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + + filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } +#ifdef BATCH_NORM + half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0)); + half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0)); + output[0] = mad(scale, output[0], biase); + if (out_w_id1 < out_w) { + output[1] = mad(scale, output[1], biase); + } + if (out_w_id2 < out_w) { + output[2] = mad(scale, output[2], biase); + } + if (out_w_id3 < out_w) { + output[3] = mad(scale, output[3], biase); + } + if (out_w_id4 < out_w) { + output[4] = mad(scale, output[4], biase); + } +#endif + +#ifdef RELU + output[0] = activation(output[0]); + output[1] = activation(output[1]); + output[2] = activation(output[2]); + output[3] = activation(output[3]); + output[4] = activation(output[4]); +#endif + write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), output[0]); + if (out_w_id1 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), output[1]); + } + if (out_w_id2 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), output[2]); + } + if (out_w_id3 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), output[3]); + } + if (out_w_id4 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]); + } +} diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp index 64929aaac0..8e21480b41 100644 --- a/src/operators/kernel/cl/conv_add_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_kernel.cpp @@ -82,11 +82,17 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { // winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter()); // // } else { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT; - param->Filter()->InitCLImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options); + if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) { + param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT; + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("conv_3x3s1", conv_kernel_file, build_options); + } else { + param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT; + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options); + } // } } else if (param->Filter()->dims()[2] == 7 && @@ -126,6 +132,9 @@ void ConvAddKernel::Compute( case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: DWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias()); break; + case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: + SWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias()); + break; default: PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", param.ExecMode()); diff --git a/src/operators/kernel/cl/conv_relu_kernel.cpp b/src/operators/kernel/cl/conv_relu_kernel.cpp index 5b9f1c7619..585b68f532 100644 --- a/src/operators/kernel/cl/conv_relu_kernel.cpp +++ b/src/operators/kernel/cl/conv_relu_kernel.cpp @@ -82,11 +82,17 @@ bool ConvReluKernel::Init(FusionConvReluParam *param) { // winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter()); // // } else { - param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT; - param->Filter()->InitCLImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue()); - - this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options); + if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) { + param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT; + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("conv_3x3s1", conv_kernel_file, build_options); + } else { + param->ExecMode() = ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT; + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options); + } // } DLOG << "conv 3x3"; @@ -112,6 +118,9 @@ void ConvReluKernel::Compute( case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: DWConvAddBnRelu(&this->cl_helper_, param, true); break; + case ConvParam::EXEC_SLIDINGWINDOW3x3S1_FLOAT: + SWConvAddBnRelu(&this->cl_helper_, param, true); + break; default: PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", param.ExecMode()); -- GitLab