From 0c51266afca6a3b3f5a7b49ba0af588d3d3887d2 Mon Sep 17 00:00:00 2001 From: StarryRain <36948762+StarryRain@users.noreply.github.com> Date: Wed, 25 Sep 2019 14:40:07 +0800 Subject: [PATCH] get opencl kernel_work_group_size (#2123) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * get opencl kernel_work_group_size and set local_work_group_size for conv and conv_trans kernel of GAN, test=develop * fix fpga CI error, test=develop * fix fpga CI error, test=develop * auto set local work group size according to devices * Optimal OPENCL conv_trans_7x7 Performance , test=develop * fix CI error, test=develop --- mobile/src/framework/cl/cl_engine.h | 12 ++ mobile/src/framework/cl/cl_helper.h | 3 + mobile/src/framework/cl/cl_scope.h | 4 + .../kernel/cl/cl-kernel-func/conv_func.cpp | 150 +++++++++++-- .../kernel/cl/cl_kernel/conv_kernel.inc.cl | 201 ++++++++++++++++++ .../operators/kernel/cl/conv_add_kernel.cpp | 5 +- 6 files changed, 352 insertions(+), 23 deletions(-) diff --git a/mobile/src/framework/cl/cl_engine.h b/mobile/src/framework/cl/cl_engine.h index f5b1e3c2d2..a395f011c2 100644 --- a/mobile/src/framework/cl/cl_engine.h +++ b/mobile/src/framework/cl/cl_engine.h @@ -133,6 +133,18 @@ class CLEngine { free(max_work_item_sizes); return localWorkSizeInfo_; } + size_t GetKernelWorkSize(cl_kernel kernel) { + cl_int status; + size_t kernel_work_size = 0; + status = + clGetKernelWorkGroupInfo(kernel, devices_[0], CL_KERNEL_WORK_GROUP_SIZE, + sizeof(size_t), &kernel_work_size, NULL); + if (status != CL_SUCCESS) { + return 0; + } + DLOG << "kernel_work_size: " << kernel_work_size; + return kernel_work_size; + } std::unique_ptr<_cl_program, CLProgramDeleter> CreateProgramWith( cl_context context, std::string file_name) { diff --git a/mobile/src/framework/cl/cl_helper.h b/mobile/src/framework/cl/cl_helper.h index f072edd82b..09beb7c72f 100644 --- a/mobile/src/framework/cl/cl_helper.h +++ b/mobile/src/framework/cl/cl_helper.h @@ -54,6 +54,9 @@ class CLHelper { CLLocalWorkSizeInfo LocalWorkSizeInfo() { return scope_->LocalWorkSizeInfo(); } + size_t KernelWorkSize(cl_kernel kernel) { + return scope_->KernelWorkSize(kernel); + } std::vector DefaultWorkSize(const CLImage &image) { // n c h w diff --git a/mobile/src/framework/cl/cl_scope.h b/mobile/src/framework/cl/cl_scope.h index ebe16b553a..643ce32b57 100644 --- a/mobile/src/framework/cl/cl_scope.h +++ b/mobile/src/framework/cl/cl_scope.h @@ -110,6 +110,10 @@ class CLScope { } CLLocalWorkSizeInfo LocalWorkSizeInfo() { return localWorkSizeInfo_; } + size_t KernelWorkSize(cl_kernel kernel) { + size_t kernel_work_size = CLEngine::Instance()->GetKernelWorkSize(kernel); + return kernel_work_size; + } private: cl_int status_; diff --git a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp index 666b384c85..4da7b62b1b 100644 --- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp +++ b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp @@ -20,6 +20,8 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { bool use_lws = true; +int preferred_lws = 0; +int preferred_lws_divisor = 2; template <> void winograd_transform_weight<4, 3>(framework::CLHelper *cl_helper, @@ -155,9 +157,38 @@ void ConvAddBnReluPt1x2(framework::CLHelper *cl_helper, } // DLOG<<"default_work_size"<CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); + auto kernel_work_size = cl_helper->KernelWorkSize(kernel); + auto tmp0 = default_work_size.data()[0]; + auto tmp1 = default_work_size.data()[1]; + auto tmp2 = default_work_size.data()[2]; + int max_work_size = static_cast(kernel_work_size); + if (preferred_lws_divisor > 1) { + max_work_size /= preferred_lws_divisor; + } + if (preferred_lws > 0 && preferred_lws <= max_work_size) { + max_work_size = preferred_lws; + } + while (tmp1 > max_work_size && max_work_size > 0) { + tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1; + } + while (tmp2 * tmp1 > max_work_size && max_work_size > 0) { + tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1; + } + while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) { + tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1; + } + const size_t local_work_size[3] = {static_cast(tmp0), + static_cast(tmp1), + static_cast(tmp2)}; + if (max_work_size > 0 && use_lws) { + status = clEnqueueNDRangeKernel( + cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), local_work_size, 0, NULL, NULL); + } else { + status = clEnqueueNDRangeKernel( + cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + } CL_CHECK_ERRORS(status); } @@ -274,10 +305,30 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper, static_cast(maped_w), static_cast(default_work_size.data()[2])}; - if (work_size[1] % 60 == 0 && use_lws) { - const size_t local_work_size[3] = {static_cast(1), - static_cast(60), - static_cast(1)}; + auto kernel_work_size = cl_helper->KernelWorkSize(kernel); + auto tmp0 = work_size[0]; + auto tmp1 = work_size[1]; + auto tmp2 = work_size[2]; + int max_work_size = static_cast(kernel_work_size); + if (preferred_lws_divisor > 1) { + max_work_size /= preferred_lws_divisor; + } + if (preferred_lws > 0 && preferred_lws <= max_work_size) { + max_work_size = preferred_lws; + } + while (tmp1 > max_work_size && max_work_size > 0) { + tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1; + } + while (tmp2 * tmp1 > max_work_size && max_work_size > 0) { + tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1; + } + while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) { + tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1; + } + const size_t local_work_size[3] = {static_cast(tmp0), + static_cast(tmp1), + static_cast(tmp2)}; + if (max_work_size > 0 && use_lws) { status = clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, work_size, local_work_size, 0, NULL, NULL); @@ -474,10 +525,30 @@ void DWConvAddBnRelu(framework::CLHelper *cl_helper, status = clSetKernelArg(kernel, index++, sizeof(int), &output_height); CL_CHECK_ERRORS(status); - if (default_work_size.data()[1] % 60 == 0 && use_lws) { - const size_t local_work_size[3] = {static_cast(1), - static_cast(60), - static_cast(1)}; + auto kernel_work_size = cl_helper->KernelWorkSize(kernel); + auto tmp0 = default_work_size.data()[0]; + auto tmp1 = default_work_size.data()[1]; + auto tmp2 = default_work_size.data()[2]; + int max_work_size = static_cast(kernel_work_size); + if (preferred_lws_divisor > 1) { + max_work_size /= preferred_lws_divisor; + } + if (preferred_lws > 0 && preferred_lws <= max_work_size) { + max_work_size = preferred_lws; + } + while (tmp1 > max_work_size && max_work_size > 0) { + tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1; + } + while (tmp2 * tmp1 > max_work_size && max_work_size > 0) { + tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1; + } + while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) { + tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1; + } + const size_t local_work_size[3] = {static_cast(tmp0), + static_cast(tmp1), + static_cast(tmp2)}; + if (max_work_size > 0 && use_lws) { status = clEnqueueNDRangeKernel( cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, default_work_size.data(), local_work_size, 0, NULL, NULL); @@ -520,7 +591,6 @@ void SWConvAddBnRelu(framework::CLHelper *cl_helper, int input_channel = param.Input()->dims()[1]; int input_height = param.Input()->dims()[2]; int input_width = param.Input()->dims()[3]; - int output_height = param.Output()->dims()[2]; int output_width = param.Output()->dims()[3]; @@ -583,10 +653,30 @@ void SWConvAddBnRelu(framework::CLHelper *cl_helper, status = clSetKernelArg(kernel, index++, sizeof(int), &output_height); CL_CHECK_ERRORS(status); - if (default_work_size.data()[1] % 60 == 0 && use_lws) { - const size_t local_work_size[3] = {static_cast(1), - static_cast(60), - static_cast(1)}; + auto kernel_work_size = cl_helper->KernelWorkSize(kernel); + auto tmp0 = default_work_size.data()[0]; + auto tmp1 = default_work_size.data()[1]; + auto tmp2 = default_work_size.data()[2]; + int max_work_size = static_cast(kernel_work_size); + if (preferred_lws_divisor > 1) { + max_work_size /= preferred_lws_divisor; + } + if (preferred_lws > 0 && preferred_lws <= max_work_size) { + max_work_size = preferred_lws; + } + while (tmp1 > max_work_size && max_work_size > 0) { + tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1; + } + while (tmp2 * tmp1 > max_work_size && max_work_size > 0) { + tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1; + } + while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) { + tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1; + } + const size_t local_work_size[3] = {static_cast(tmp0), + static_cast(tmp1), + static_cast(tmp2)}; + if (max_work_size > 0 && use_lws) { status = clEnqueueNDRangeKernel( cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, default_work_size.data(), local_work_size, 0, NULL, NULL); @@ -987,10 +1077,30 @@ void ConvTranspose3x3s2AddBnRelu(framework::CLHelper *cl_helper, status = clSetKernelArg(kernel, index++, sizeof(int), &filter_height); CL_CHECK_ERRORS(status); - if (default_work_size.data()[1] % 60 == 0 && use_lws) { - const size_t local_work_size[3] = {static_cast(1), - static_cast(60), - static_cast(1)}; + auto kernel_work_size = cl_helper->KernelWorkSize(kernel); + auto tmp0 = default_work_size.data()[0]; + auto tmp1 = default_work_size.data()[1]; + auto tmp2 = default_work_size.data()[2]; + int max_work_size = static_cast(kernel_work_size); + if (preferred_lws_divisor > 1) { + max_work_size /= preferred_lws_divisor; + } + if (preferred_lws > 0 && preferred_lws <= max_work_size) { + max_work_size = preferred_lws; + } + while (tmp1 > max_work_size && max_work_size > 0) { + tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1; + } + while (tmp2 * tmp1 > max_work_size && max_work_size > 0) { + tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1; + } + while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) { + tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1; + } + const size_t local_work_size[3] = {static_cast(tmp0), + static_cast(tmp1), + static_cast(tmp2)}; + if (max_work_size > 0 && use_lws) { status = clEnqueueNDRangeKernel( cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL, default_work_size.data(), local_work_size, 0, NULL, NULL); diff --git a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl index 35239e02e5..868296d3e7 100755 --- a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -2262,6 +2262,207 @@ __kernel void conv_7x7Pt1x2(__private const int global_size_dim0, } } +// dilation == 1 +__kernel void conv_7x7spl(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM +__read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + // filter + const int filter_w = 7; + const int filter_h = 7; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + half4 output[5]; + output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + half4 output[5]; + output[0] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + half4 output[5] = {0.0f}; +#endif + + half4 filter[4] = {0.0f}; + half4 filter_trans[4] = {0.0f}; + half4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + + int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1, + (out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h)); + + for (int w = 0; w < filter_w; w++) { + + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } +#ifdef BATCH_NORM + half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0)); + half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0)); + output[0] = mad(scale, output[0], biase); + if (out_w_id1 < out_w) { + output[1] = mad(scale, output[1], biase); + } + if (out_w_id2 < out_w) { + output[2] = mad(scale, output[2], biase); + } + if (out_w_id3 < out_w) { + output[3] = mad(scale, output[3], biase); + } + if (out_w_id4 < out_w) { + output[4] = mad(scale, output[4], biase); + } +#endif + +#ifdef RELU + output[0] = activation(output[0]); + output[1] = activation(output[1]); + output[2] = activation(output[2]); + output[3] = activation(output[3]); + output[4] = activation(output[4]); +#endif + write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), output[0]); + if (out_w_id1 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), output[1]); + } + if (out_w_id2 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), output[2]); + } + if (out_w_id3 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), output[3]); + } + if (out_w_id4 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]); + } +} + __kernel void conv_5x5(__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, diff --git a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp index 7b17595e54..7422514228 100644 --- a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp @@ -95,8 +95,7 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { param->Filter()->InitCLImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("conv_7x7Pt1x2", conv_kernel_file, - build_options); + this->cl_helper_.AddKernel("conv_7x7spl", conv_kernel_file, build_options); } else if (param->Filter()->dims()[2] == 5 && param->Filter()->dims()[3] == 5) { @@ -123,7 +122,7 @@ void ConvAddKernel::Compute( ConvAddBnRelu(&this->cl_helper_, param, false, param.Bias()); break; case ConvParam::EXEC_SLIDINGWINDOW7x7_FLOAT: - ConvAddBnReluPt1x2(&this->cl_helper_, param, false, param.Bias()); + SWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias()); break; case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: DWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias()); -- GitLab