提交 23c56b1e 编写于 作者: Y Yuan Shuai 提交者: GitHub

[LITE][OPENCL] Add lws for opencl conv image kernel (#3191)

* [LITE][OPENCL] Change fp32 fc to fp16's. test=develop

* fix act in conv3x3opt opencl kernel. test=develop

* [LITE][OPENCL] fix opencl fc kernel. test=develop

* [LITE][OPENCL] add lws for opencl conv image kernel. test=develop
上级 0f110d69
...@@ -121,5 +121,34 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) { ...@@ -121,5 +121,34 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) {
} }
} }
cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
                                     size_t max_work_size) {
  // Derives a local work size whose element product does not exceed
  // `max_work_size`, by repeatedly halving the global dimensions in the
  // order y, then z, then x. A dimension that becomes odd is clamped to 1,
  // which guarantees every loop below terminates.
  //
  // kDivisor shrinks the device-reported limit to leave headroom for
  // register/local-memory pressure; kPreferredLws can pin a tuned upper
  // bound for a specific device (0 = disabled).
  constexpr size_t kPreferredLws = 0;
  constexpr size_t kDivisor = 2;

  size_t lws_x = global_work_size[0];
  size_t lws_y = global_work_size[1];
  size_t lws_z = global_work_size[2];

  if (kDivisor > 1) {
    max_work_size /= kDivisor;
  }
  if (kPreferredLws > 0 && kPreferredLws <= max_work_size) {
    max_work_size = kPreferredLws;
  }
  // Degenerate budget (e.g. the device limit was 1 and the division above
  // produced 0): returning the untrimmed global size here would exceed the
  // device's work-group limit, so let the driver choose instead.
  if (max_work_size == 0) {
    return cl::NullRange;
  }
  while (lws_y > max_work_size) {
    lws_y = (lws_y % 2 == 0) ? lws_y / 2 : 1;
  }
  while (lws_z * lws_y > max_work_size) {
    lws_z = (lws_z % 2 == 0) ? lws_z / 2 : 1;
  }
  while (lws_x * lws_y * lws_z > max_work_size) {
    lws_x = (lws_x % 2 == 0) ? lws_x / 2 : 1;
  }
  return cl::NDRange{lws_x, lws_y, lws_z};
}
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
...@@ -44,6 +44,8 @@ class CLContext { ...@@ -44,6 +44,8 @@ class CLContext {
cl::NDRange DefaultWorkSize(const CLImage &image); cl::NDRange DefaultWorkSize(const CLImage &image);
cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size);
private: private:
std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_; std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_;
std::vector<std::unique_ptr<cl::Kernel>> kernels_; std::vector<std::unique_ptr<cl::Kernel>> kernels_;
......
...@@ -367,11 +367,24 @@ void ConvImageCompute::Conv2d1x1() { ...@@ -367,11 +367,24 @@ void ConvImageCompute::Conv2d1x1() {
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}"; << global_work_size[1] << "," << global_work_size[2] << "}";
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
VLOG(4) << "max_work_group_size: " << max_work_group_size;
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, local_work_size,
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -688,11 +701,24 @@ void ConvImageCompute::Conv2d3x3opt() { ...@@ -688,11 +701,24 @@ void ConvImageCompute::Conv2d3x3opt() {
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}"; << global_work_size[1] << "," << global_work_size[2] << "}";
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
VLOG(4) << "max_work_group_size: " << max_work_group_size;
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, local_work_size,
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -1068,11 +1094,24 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() { ...@@ -1068,11 +1094,24 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2])); status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2]));
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
VLOG(4) << "max_work_group_size: " << max_work_group_size;
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size,
cl::NullRange, local_work_size,
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
......
...@@ -57,6 +57,7 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -57,6 +57,7 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor filter_gpu_image_; Tensor filter_gpu_image_;
Tensor bias_gpu_image_; Tensor bias_gpu_image_;
bool use_lws{true};
}; };
} // namespace opencl } // namespace opencl
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册