From 23c56b1ea1fdd6c79fc6aaa04140a91d572987e8 Mon Sep 17 00:00:00 2001
From: Yuan Shuai
Date: Sun, 15 Mar 2020 19:00:02 +0800
Subject: [PATCH] [LITE][OPENCL] Add lws for opencl conv image kernel (#3191)

* [LITE][OPENCL] Change fp32 fc to fp16's. test=develop

* fix act in conv3x3opt opencl kernel. test=develop

* [LITE][OPENCL] fix opencl fc kernel. test=develop

* [LITE][OPENCL] add lws for opencl conv image kernel. test=develop
---
 lite/backends/opencl/cl_context.cc        | 47 ++++++++++++++++++++++++
 lite/backends/opencl/cl_context.h         |  2 +
 lite/kernels/opencl/conv_image_compute.cc | 45 +++++++++++++++++++++--
 lite/kernels/opencl/conv_image_compute.h  |  1 +
 4 files changed, 92 insertions(+), 3 deletions(-)

diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc
index 77bd8cd404..f0105e060f 100644
--- a/lite/backends/opencl/cl_context.cc
+++ b/lite/backends/opencl/cl_context.cc
@@ -121,5 +121,52 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) {
   }
 }
 
+// Chooses a local work size (work-group shape) for a 3-D kernel launch.
+//
+// The work-item budget is `max_work_size` divided by `divisor`, clamped to at
+// least 1 whenever `max_work_size` is non-zero. Dimension 1 is shrunk first,
+// then dimension 2, then dimension 0, until the product of the three local
+// sizes fits the budget. A dimension is halved while it is even and collapses
+// to 1 otherwise, so each returned size evenly divides its global size.
+//
+// global_work_size: global range; elements [0], [1] and [2] are read, so a
+//     three-dimensional range is assumed (TODO confirm for every caller).
+// max_work_size: upper bound on work-items per work-group. With 0 no
+//     shrinking is performed and the global sizes are returned unchanged.
+cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
+                                     size_t max_work_size) {
+  // Tuning knobs: a non-zero preferred_lws within the budget replaces it.
+  const size_t preferred_lws = 0;
+  const size_t divisor = 2;
+
+  auto tmp0 = global_work_size[0];
+  auto tmp1 = global_work_size[1];
+  auto tmp2 = global_work_size[2];
+
+  if (divisor > 1 && max_work_size > 0) {
+    max_work_size /= divisor;
+    // Keep at least one work-item in the budget. A quotient of 0 would skip
+    // every loop below and return the global sizes as the local sizes.
+    if (max_work_size == 0) {
+      max_work_size = 1;
+    }
+  }
+  if (preferred_lws > 0 && preferred_lws <= max_work_size) {
+    max_work_size = preferred_lws;
+  }
+  while (tmp1 > max_work_size && max_work_size > 0) {
+    tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1;
+  }
+  while (tmp2 * tmp1 > max_work_size && max_work_size > 0) {
+    tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1;
+  }
+  while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) {
+    tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1;
+  }
+  return cl::NDRange{static_cast<size_t>(tmp0),
+                     static_cast<size_t>(tmp1),
+                     static_cast<size_t>(tmp2)};
+}
+
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h
index a28f82f40e..1964c4bf56 100644
--- a/lite/backends/opencl/cl_context.h
+++ b/lite/backends/opencl/cl_context.h
@@ -44,6 +44,8 @@ class CLContext {
 
   cl::NDRange DefaultWorkSize(const CLImage &image);
 
+  cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size);
+
  private:
   std::unordered_map> programs_;
   std::vector> kernels_;
diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc
index cda2b82568..8a6017d1ad 100644
--- a/lite/kernels/opencl/conv_image_compute.cc
+++ b/lite/kernels/opencl/conv_image_compute.cc
@@ -367,11 +367,24 @@ void ConvImageCompute::Conv2d1x1() {
   VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
           << global_work_size[1] << "," << global_work_size[2] << "}";
 
+  size_t max_work_group_size = 0;
+  kernel.getWorkGroupInfo(CLRuntime::Global()->device(),
+                          CL_KERNEL_WORK_GROUP_SIZE,
+                          &max_work_group_size);
+  cl::NDRange local_work_size = cl::NullRange;
+  VLOG(4) << "max_work_group_size: " << max_work_group_size;
+  if (max_work_group_size > 0 && use_lws) {
+    local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
+                                                          max_work_group_size);
+    VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
+            << local_work_size[1] << "," << local_work_size[2] << "}";
+  }
+
   status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
       kernel,
       cl::NullRange,
       global_work_size,
-      cl::NullRange,
+      local_work_size,
       nullptr,
       event_.get());
   CL_CHECK_FATAL(status);
@@ -688,11 +701,24 @@ void ConvImageCompute::Conv2d3x3opt() {
   VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
           << global_work_size[1] << "," << global_work_size[2] << "}";
 
+  size_t max_work_group_size = 0;
+  kernel.getWorkGroupInfo(CLRuntime::Global()->device(),
+                          CL_KERNEL_WORK_GROUP_SIZE,
+                          &max_work_group_size);
+  cl::NDRange local_work_size = cl::NullRange;
+  VLOG(4) << "max_work_group_size: " << max_work_group_size;
+  if (max_work_group_size > 0 && use_lws) {
+    local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
+                                                          max_work_group_size);
+    VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
+            << local_work_size[1] << "," << local_work_size[2] << "}";
+  }
+
   status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
       kernel,
       cl::NullRange,
       global_work_size,
-      cl::NullRange,
+      local_work_size,
       nullptr,
       event_.get());
   CL_CHECK_FATAL(status);
@@ -1068,11 +1094,24 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
   status = kernel.setArg(++arg_idx, static_cast(output_dims[2]));
   CL_CHECK_FATAL(status);
 
+  size_t max_work_group_size = 0;
+  kernel.getWorkGroupInfo(CLRuntime::Global()->device(),
+                          CL_KERNEL_WORK_GROUP_SIZE,
+                          &max_work_group_size);
+  cl::NDRange local_work_size = cl::NullRange;
+  VLOG(4) << "max_work_group_size: " << max_work_group_size;
+  if (max_work_group_size > 0 && use_lws) {
+    local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
+                                                          max_work_group_size);
+    VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
+            << local_work_size[1] << "," << local_work_size[2] << "}";
+  }
+
   status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
       kernel,
       cl::NullRange,
       global_work_size,
-      cl::NullRange,
+      local_work_size,
       nullptr,
       event_.get());
   CL_CHECK_FATAL(status);
diff --git a/lite/kernels/opencl/conv_image_compute.h b/lite/kernels/opencl/conv_image_compute.h
index fccd2e92f8..3f8db82f4a 100644
--- a/lite/kernels/opencl/conv_image_compute.h
+++ b/lite/kernels/opencl/conv_image_compute.h
@@ -57,6 +57,7 @@ class ConvImageCompute : public KernelLite
   event_{new cl::Event};
   Tensor filter_gpu_image_;
   Tensor bias_gpu_image_;
+  bool use_lws{true};
 };
 
 }  // namespace opencl
--
GitLab