Unverified commit 44d98be8, authored by Yuan Shuai, committed by GitHub

[LITE][OPENCL] Improve kernel::Run/PrepareForRun of opencl kernel (#3302)

* [LITE][OPENCL] improve perf and support variable input length for fc_buffer, elementwise_add, scale, activation, grid_sampler. test=develop

Parent 942bc409
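Editor's note: the change applies the same caching pattern to every kernel touched in this commit. PrepareForRun() builds and caches the cl::Kernel once, ReInitWhenNeeded() redoes the shape-dependent work (image shapes, global work size) only when the input dims differ from the previous run, and Run() is left with just setArg and enqueue. Below is a minimal sketch of that pattern; the types and helpers (Shape, BuildKernel, ComputeGlobalWorkSize, Launch) are hypothetical placeholders, not the real Paddle-Lite/OpenCL API — only the control flow mirrors the diff that follows.

// Editor's illustrative sketch, not part of the commit. Placeholder helpers
// stand in for the real OpenCL calls; only the control flow is the point.
#include <cstdint>
#include <vector>

using Shape = std::vector<int64_t>;

class ReInitCachingKernel {
 public:
  void PrepareForRun() {
    // One-time work: build the device kernel and cache its handle.
    kernel_handle_ = BuildKernel();
  }

  void ReInitWhenNeeded(const Shape& x_dims) {
    // Shape-dependent work runs only on the first call or when dims change.
    if (first_epoch_for_reinit_ || x_dims != last_x_dims_) {
      last_x_dims_ = x_dims;
      first_epoch_for_reinit_ = false;
      global_work_size_ = ComputeGlobalWorkSize(x_dims);
    }
  }

  void Run() {
    // Per-run work only: set args and enqueue with the cached sizes.
    Launch(kernel_handle_, global_work_size_);
  }

 private:
  int BuildKernel() { return 1; }                            // placeholder
  Shape ComputeGlobalWorkSize(const Shape& s) { return s; }  // placeholder
  void Launch(int /*kernel*/, const Shape& /*gws*/) {}       // placeholder

  int kernel_handle_{0};
  bool first_epoch_for_reinit_{true};
  Shape last_x_dims_;
  Shape global_work_size_;
};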
@@ -37,11 +37,12 @@ class ActivationComputeImageDefault
   }
 
   void PrepareForRun() override {
-    auto& context = ctx_->As<OpenCLContext>();
     act_param_ = param_.get_mutable<param_t>();
     int act_type = static_cast<int>(act_param_->active_type);
+#ifndef LITE_SHUTDOWN_LOG
     VLOG(1) << "ActivationTypeToStr(act_param_->active_type):"
             << ActivationTypeToStr(act_param_->active_type);
+#endif
     switch (act_type) {
       case 1:
         kernel_func_name_ = "relu";
@@ -71,41 +72,68 @@ class ActivationComputeImageDefault
         LOG(FATAL) << "This act type:" << act_type << " doesn't support.";
         return;
     }
+#ifndef LITE_SHUTDOWN_LOG
     VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+#endif
+    auto& context = ctx_->As<OpenCLContext>();
     context.cl_context()->AddKernel(
         kernel_func_name_, "image/activation_kernel.cl", build_options_);
-  }
-
-  void Run() override {
-    auto& param = *param_.get_mutable<param_t>();
-    const auto& x_dims = param.X->dims();
-    auto* x_img = param.X->data<half_t, cl::Image2D>();
-    auto image_shape = InitImageDimInfoWith(x_dims);
-    auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>(
-        image_shape["width"], image_shape["height"]);
-    const auto& y_dims = param.Out->dims();  // useless: check dim only
-    auto& context = ctx_->As<OpenCLContext>();
-    CHECK(context.cl_context() != nullptr);
     STL::stringstream kernel_key;
     kernel_key << kernel_func_name_ << build_options_;
-    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
-
-    int arg_idx = 0;
-    cl_int status = kernel.setArg(arg_idx, *x_img);
+    kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+  }
+
+  void ReInitWhenNeeded() override {
+    act_param_ = param_.get_mutable<param_t>();
+    auto x_dims = act_param_->X->dims();
+    if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
+        first_epoch_for_reinit_) {
+      last_x_dims_ = x_dims;
+      first_epoch_for_reinit_ = false;
+
+      // compute image shape
+      paddle::lite::CLImageConverterDefault default_convertor;
+      x_img_shape_ = default_convertor.InitImageDimInfoWith(
+          act_param_->X->dims());  // w, h
+      out_img_shape_ = default_convertor.InitImageDimInfoWith(
+          act_param_->Out->dims());  // w, h
+
+      // compute global work size
+      GetGlobalWorkSize();
+    }
+  }
+
+  void GetGlobalWorkSize() {
+    global_work_size_ =
+        cl::NDRange{static_cast<cl::size_type>(x_img_shape_[0]),
+                    static_cast<cl::size_type>(x_img_shape_[1])};
+  }
+
+  void Run() override {
+    auto* x_img = act_param_->X->data<half_t, cl::Image2D>();
+    auto* out_img = act_param_->Out->mutable_data<half_t, cl::Image2D>(
+        out_img_shape_[0], out_img_shape_[1]);
+
+    auto kernel = kernel_;
+    cl_int status;
+    status = kernel.setArg(0, *x_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *out_img);
+    status = kernel.setArg(1, *out_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, threshold_);
+    status = kernel.setArg(2, threshold_);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, scale_);
+    status = kernel.setArg(3, scale_);
     CL_CHECK_FATAL(status);
 #ifndef LITE_SHUTDOWN_LOG
-    VLOG(4) << TargetToStr(param.X->target());
-    VLOG(4) << TargetToStr(param.Out->target());
-    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
-            << image_shape["height"];
+    const auto& x_dims = act_param_->X->dims();
+    const auto& y_dims = act_param_->Out->dims();  // useless: check dim only
+    VLOG(4) << TargetToStr(act_param_->X->target());
+    VLOG(4) << TargetToStr(act_param_->Out->target());
+    VLOG(4) << "x_img_shape_(w,h):" << x_img_shape_[0] << " "
+            << x_img_shape_[1];
     VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
             << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
     VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
@@ -115,13 +143,12 @@ class ActivationComputeImageDefault
     VLOG(4) << "kernel func name:" << kernel_func_name_;
 #endif
-    auto global_work_size =
-        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
-                    static_cast<cl::size_type>(image_shape["height"])};
+    auto& context = ctx_->As<OpenCLContext>();
+    CHECK(context.cl_context() != nullptr);
     status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
         kernel,
         cl::NullRange,
-        global_work_size,
+        global_work_size_,
         cl::NullRange,
         nullptr,
         event_.get());
@@ -131,9 +158,18 @@ class ActivationComputeImageDefault
 
  private:
   param_t* act_param_{nullptr};
+  DDim x_img_shape_ = DDim(std::vector<DDim::value_type>(
+      {static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
+  DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
+      {static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
+  DDim last_x_dims_;
   std::string kernel_func_name_{};
   float threshold_{6.f};
   float scale_{1.f};
+  cl::Kernel kernel_;
+  bool first_epoch_for_reinit_{true};
+  cl::NDRange global_work_size_ = cl::NDRange{
+      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
   std::string build_options_{"-DCL_DTYPE_half"};
   std::shared_ptr<cl::Event> event_{new cl::Event};
 };
......
@@ -23,10 +23,20 @@ namespace lite {
 namespace kernels {
 namespace opencl {
 
-void ElementwiseAddImageCompute::PrepareForRun() {
+void ElementwiseAddImageCompute::PrepareForRun() {}
+
+void ElementwiseAddImageCompute::ReInitWhenNeeded() {
   ele_param_ = param_.get_mutable<param_t>();
+  auto x_dims = ele_param_->X->dims();
+  if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
+      first_epoch_for_reinit_) {
+    last_x_dims_ = x_dims;
+    first_epoch_for_reinit_ = false;
+
+    // choose kernel
     auto* x = ele_param_->X;
     auto* y = ele_param_->Y;
+    auto* out = ele_param_->Out;
     auto axis = ele_param_->axis;
 
     if (y->dims().size() == 4) {
@@ -51,16 +61,44 @@ void ElementwiseAddImageCompute::PrepareForRun() {
     auto& context = ctx_->As<OpenCLContext>();
     context.cl_context()->AddKernel(
         kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_);
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_;
+    kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+
+    // compute image shape
+    paddle::lite::CLImageConverterDefault default_convertor;
+    x_img_shape_ = default_convertor.InitImageDimInfoWith(x->dims());  // w, h
+    y_img_shape_ = default_convertor.InitImageDimInfoWith(y->dims());
+    out_img_shape_ =
+        default_convertor.InitImageDimInfoWith(out->dims());  // w, h
+
+    // compute global work size
+    GetGlobalWorkSize();
+  }
 }
 
-void ElementwiseAddImageCompute::Run() {
-  auto& context = ctx_->As<OpenCLContext>();
-  CHECK(context.cl_context() != nullptr);
+void ElementwiseAddImageCompute::GetGlobalWorkSize() {
+  global_work_size_ = cl::NDRange{static_cast<cl::size_type>(x_img_shape_[0]),
+                                  static_cast<cl::size_type>(x_img_shape_[1])};
+#ifndef LITE_SHUTDOWN_LOG
+  VLOG(4) << "global_work_size:[2D]:" << x_img_shape_[0] << " "
+          << x_img_shape_[1];
+#endif
+}
 
+void ElementwiseAddImageCompute::Run() {
   auto* x = ele_param_->X;
   auto* y = ele_param_->Y;
   auto* out = ele_param_->Out;
   auto axis = ele_param_->axis;
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+
+  auto* x_img = x->data<half_t, cl::Image2D>();
+  auto* y_img = y->data<half_t, cl::Image2D>();
+  auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape_[0],
+                                                         out_img_shape_[1]);
 
 #ifndef LITE_SHUTDOWN_LOG
   VLOG(4) << "x->target():" << TargetToStr(x->target());
@@ -70,75 +108,53 @@ void ElementwiseAddImageCompute::Run() {
   VLOG(4) << "y->dims():" << y->dims();
   VLOG(4) << "out->dims():" << out->dims();
   VLOG(4) << "axis:" << axis;
-#endif
-
-  paddle::lite::CLImageConverterDefault default_convertor;
-  auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims());  // w, h
-  auto x_img_width = x_img_shape[0];
-  auto x_img_height = x_img_shape[1];
-  auto out_img_shape =
-      default_convertor.InitImageDimInfoWith(out->dims());  // w, h
-  auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims());
-
-  auto* x_img = x->data<half_t, cl::Image2D>();
-  auto* y_img = y->data<half_t, cl::Image2D>();
-  auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape[0],
-                                                         out_img_shape[1]);
-
-#ifndef LITE_SHUTDOWN_LOG
-  VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
-  VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
-  VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
-          << out_img_shape[1];
+  VLOG(4) << "x_img_shape_[w,h]:" << x_img_shape_[0] << " " << x_img_shape_[1];
+  VLOG(4) << "y_img_shape_[w,h]:" << y_img_shape_[0] << " " << y_img_shape_[1];
+  VLOG(4) << "out_img_shape_[w,h]:" << out_img_shape_[0] << " "
+          << out_img_shape_[1];
 #endif
 
-  STL::stringstream kernel_key;
-  kernel_key << kernel_func_name_ << build_options_;
-  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
-
-  int arg_idx = 0;
-  auto y_dims = y->dims();
+  cl_int status;
+  auto kernel = kernel_;
+
   if (y_dims.size() == 4) {
-    cl_int status = kernel.setArg(arg_idx, *x_img);
+    status = kernel.setArg(0, *x_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *y_img);
+    status = kernel.setArg(1, *y_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *out_img);
+    status = kernel.setArg(2, *out_img);
     CL_CHECK_FATAL(status);
   } else if (y_dims.size() == 1) {
-    if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) {
-      int tensor_w = x->dims()[x->dims().size() - 1];
+    if (axis == x_dims.size() - 1 || axis == x_dims.size() - 3) {
+      const int tensor_w = x_dims[x_dims.size() - 1];
 #ifndef LITE_SHUTDOWN_LOG
       VLOG(4) << "tensor_w:" << tensor_w;
 #endif
-      cl_int status = kernel.setArg(arg_idx, *x_img);
+      status = kernel.setArg(0, *x_img);
       CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, *y_img);
+      status = kernel.setArg(1, *y_img);
      CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, *out_img);
+      status = kernel.setArg(2, *out_img);
      CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, static_cast<const int>(tensor_w));
+      status = kernel.setArg(3, tensor_w);
      CL_CHECK_FATAL(status);
     } else {
       LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
-                 << ", x->dims().size():" << x->dims().size()
-                 << ", y->dims.size():" << y->dims().size();
+                 << ", x->dims().size():" << x_dims.size()
+                 << ", y->dims.size():" << y_dims.size();
     }
   } else {
     LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
-               << ", x->dims().size():" << x->dims().size()
-               << ", y->dims.size():" << y->dims().size();
+               << ", x->dims().size():" << x_dims.size()
+               << ", y->dims.size():" << y_dims.size();
   }
 
-  auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width),
-                                      static_cast<cl::size_type>(x_img_height)};
-#ifndef LITE_SHUTDOWN_LOG
-  VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
-#endif
-  auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
+  auto& context = ctx_->As<OpenCLContext>();
+  CHECK(context.cl_context() != nullptr);
+  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
-      global_work_size,
+      global_work_size_,
      cl::NullRange,
      nullptr,
      event_.get());
......
@@ -15,6 +15,7 @@
 
 #include <memory>
 #include <string>
+#include <vector>
 #include "lite/backends/opencl/cl_half.h"
 #include "lite/core/kernel.h"
 #include "lite/operators/op_params.h"
@@ -34,6 +35,10 @@ class ElementwiseAddImageCompute
 
   void PrepareForRun() override;
 
+  void ReInitWhenNeeded() override;
+
+  void GetGlobalWorkSize();
+
   void Run() override;
 
   std::string doc() const override {
@@ -42,8 +47,19 @@ class ElementwiseAddImageCompute
 
  protected:
   param_t* ele_param_{nullptr};
+  DDim last_x_dims_;
+  DDim x_img_shape_ = DDim(std::vector<DDim::value_type>(
+      {static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
+  DDim y_img_shape_ = DDim(std::vector<DDim::value_type>(
+      {static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
+  DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
+      {static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
   std::string kernel_func_name_{"elementwise_add"};
   std::string build_options_{"-DCL_DTYPE_half"};
+  bool first_epoch_for_reinit_{true};
+  cl::Kernel kernel_;
+  cl::NDRange global_work_size_ = cl::NDRange{
+      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
   std::shared_ptr<cl::Event> event_{new cl::Event};
 };
......
@@ -30,74 +30,96 @@ class FcCompute
  public:
   using param_t = operators::FcParam;
 
-  void PrepareForRun() override {
-    const auto& param = *param_.get_mutable<param_t>();
-    const auto x_dims = param.input->dims();
-    const auto w_dims = param.w->dims();
+  void PrepareForRun() override {}
+
+  void ReInitWhenNeeded() override {
+    fc_param_ = param_.get_mutable<param_t>();
+    const auto x_dims = fc_param_->input->dims();
+    if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
+        first_epoch_for_reinit_) {
+      last_x_dims_ = x_dims;
+      first_epoch_for_reinit_ = false;
+
+      // compute m,n,k
+      const auto w_dims = fc_param_->w->dims();
       CHECK_GE(x_dims.size(), 2UL);
       CHECK_GE(w_dims.size(), 2UL);
-    CHECK_EQ(param.output->dims().size(), 2UL);
+      CHECK_EQ(fc_param_->output->dims().size(), 2UL);
 
-    m_ = x_dims.Slice(0, param.in_num_col_dims).production();
-    k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
+      m_ = x_dims.Slice(0, fc_param_->in_num_col_dims).production();
+      k_ = x_dims.Slice(fc_param_->in_num_col_dims, x_dims.size()).production();
       n_ = w_dims[1];
       CHECK_EQ(k_, static_cast<int>(w_dims[0]));
+#ifndef LITE_SHUTDOWN_LOG
       VLOG(4) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2]
              << " " << x_dims[3];
      VLOG(4) << "w_dims:" << w_dims[0] << " " << w_dims[1] << " " << w_dims[2]
             << " " << w_dims[3];
      VLOG(4) << "m_: " << m_ << " n_: " << n_ << " k_: " << k_;
+#endif
 
+      // choose kernel
      if (m_ == 1) {  // gemv
        kernel_func_name_ = "fc_gemv_1x4";
-      global_work_size_ = cl::NDRange{static_cast<size_t>((n_ + 3) / 4)};
      } else {  // gemm
        kernel_func_name_ = "fc_gemm_4x4";
-      global_work_size_ = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
-                                      static_cast<size_t>((n_ + 3) / 4)};
      }
+#ifndef LITE_SHUTDOWN_LOG
      VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+#endif
 
-    if (param.activation_type == "relu") {
+      if (fc_param_->activation_type == "relu") {
        build_options_ += "-DRELU";
      }
 
      auto& context = ctx_->As<OpenCLContext>();
      context.cl_context()->AddKernel(
          kernel_func_name_, "buffer/fc_kernel.cl", build_options_);
+      STL::stringstream kernel_key;
+      kernel_key << kernel_func_name_ << build_options_;
+      kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+
+      // compute global work size
+      GetGlobalWorkSize();
+    }
+  }
+
+  void GetGlobalWorkSize() {
+    if (m_ == 1) {  // gemv
+      global_work_size_ = cl::NDRange{static_cast<size_t>((n_ + 3) / 4)};
+    } else {  // gemm
+      global_work_size_ = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
+                                      static_cast<size_t>((n_ + 3) / 4)};
+    }
   }
 
   void Run() override {
-    const auto& param = *param_.get_mutable<param_t>();
-    auto& context = ctx_->As<OpenCLContext>();
-    CHECK(context.cl_context() != nullptr);
-    auto* x_buf = param.input->data<float, cl::Buffer>();
-    auto* w_buf = param.w->data<float, cl::Buffer>();
-    auto* bias_buf = param.bias->data<float, cl::Buffer>();
+    auto* x_buf = fc_param_->input->data<float, cl::Buffer>();
+    auto* w_buf = fc_param_->w->data<float, cl::Buffer>();
+    auto* bias_buf = fc_param_->bias->data<float, cl::Buffer>();
     auto* out_buf =
-        param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
-
-    STL::stringstream kernel_key;
-    kernel_key << kernel_func_name_ << build_options_;
-    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+        fc_param_->output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
 
+    auto kernel = kernel_;
     cl_int status;
-    int arg_idx = 0;
-    status = kernel.setArg(arg_idx, *x_buf);
+    status = kernel.setArg(0, *x_buf);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *w_buf);
+    status = kernel.setArg(1, *w_buf);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *bias_buf);
+    status = kernel.setArg(2, *bias_buf);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *out_buf);
+    status = kernel.setArg(3, *out_buf);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(m_));
+    status = kernel.setArg(4, static_cast<const int>(m_));
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(n_));
+    status = kernel.setArg(5, static_cast<const int>(n_));
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(k_));
+    status = kernel.setArg(6, static_cast<const int>(k_));
     CL_CHECK_FATAL(status);
 
+    auto& context = ctx_->As<OpenCLContext>();
+    CHECK(context.cl_context() != nullptr);
     status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
@@ -111,9 +133,13 @@ class FcCompute
 
  private:
   int m_, n_, k_;
+  param_t* fc_param_{nullptr};
   std::string kernel_func_name_{};
   std::string build_options_{"-DCL_DTYPE_float "};
+  bool first_epoch_for_reinit_{true};
+  DDim last_x_dims_;
   cl::NDRange global_work_size_;
+  cl::Kernel kernel_;
   std::shared_ptr<cl::Event> event_{new cl::Event};
 };
......
@@ -39,95 +39,116 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
   }
 
   void PrepareForRun() override {
+    grid_param_ = param_.get_mutable<param_t>();
     auto& context = ctx_->As<OpenCLContext>();
     context.cl_context()->AddKernel(
         kernel_func_name_, "image/grid_sampler_kernel.cl", build_options_);
-    VLOG(4) << "kernel_func_name_:" << kernel_func_name_;
+    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_;
+    kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+    VLOG(4) << "kernel_key: " << kernel_key.str();
   }
 
-  void Run() override {
-    auto& context = ctx_->As<OpenCLContext>();
-    CHECK(context.cl_context() != nullptr);
+  void ReInitWhenNeeded() override {
+    grid_param_ = param_.get_mutable<param_t>();
+    auto x_dims = grid_param_->x->dims();
+    if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
+        first_epoch_for_reinit_) {
+      last_x_dims_ = x_dims;
+      first_epoch_for_reinit_ = false;
+
+      // compute image shape
+      paddle::lite::CLImageConverterDefault default_convertor;
+      out_img_shape_ =
+          default_convertor.InitImageDimInfoWith(grid_param_->out->dims());
+
+      // compute global work size
+      GetGlobalWorkSize();
+    }
+  }
+
+  void GetGlobalWorkSize() {
+    auto default_work_size =
+        DefaultWorkSize(grid_param_->out->dims(),
+                        DDim(std::vector<DDim::value_type>{
+                            static_cast<int64_t>(out_img_shape_[0]),
+                            static_cast<int64_t>(out_img_shape_[1])}));
+    global_work_size_ =
+        cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
+                    static_cast<cl::size_type>(default_work_size[1]),
+                    static_cast<cl::size_type>(default_work_size[2] / 4)};
+#ifndef LITE_SHUTDOWN_LOG
+    VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
+            << default_work_size[1] << ", " << default_work_size[2];
+    VLOG(4) << "global_work_size_:[2D]:" << global_work_size_[0] << " "
+            << global_work_size_[1] << " " << global_work_size_[2];
+#endif
+  }
 
+  void Run() override {
     auto* x = grid_param_->x;
-    auto* out = grid_param_->out;
     auto* grid = grid_param_->grid;
+    auto* out = grid_param_->out;
     auto out_dims = out->dims();
-    auto in_dims = x->dims();
+    int out_height = out_dims[2];
+    int out_width = out_dims[3];
+
+    auto* x_img = x->data<half_t, cl::Image2D>();
+    auto* grid_img = x->data<half_t, cl::Image2D>();
+    auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape_[0],
+                                                           out_img_shape_[1]);
 
 #ifndef LITE_SHUTDOWN_LOG
+    auto in_dims = x->dims();
     VLOG(4) << "x->target():" << TargetToStr(x->target());
     VLOG(4) << "out->target():" << TargetToStr(out->target());
     VLOG(4) << "x->dims():" << in_dims;
     VLOG(4) << "out->dims():" << out_dims;
-#endif
-
-    auto out_image_shape = InitImageDimInfoWith(out_dims);
-    auto* x_img = x->data<half_t, cl::Image2D>();
     // VLOG(4) << "x_image: " << x_img;
-    auto* grid_img = x->data<half_t, cl::Image2D>();
     // VLOG(4) << "grid_img: " << grid_img;
-    auto* out_img = out->mutable_data<half_t, cl::Image2D>(
-        out_image_shape["width"], out_image_shape["height"]);
-
-#ifndef LITE_SHUTDOWN_LOG
     // VLOG(4) << "out_image" << out_img;
-    VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " "
-            << out_image_shape["height"];
+    VLOG(4) << "out_img_shape_[w,h]:" << out_img_shape_[0] << " "
+            << out_img_shape_[1];
 #endif
 
-    STL::stringstream kernel_key;
-    kernel_key << kernel_func_name_ << build_options_;
-    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
-
-    int arg_idx = 0;
-    int out_height = out_dims[2];
-    int out_width = out_dims[3];
-    auto default_work_size =
-        DefaultWorkSize(out_dims,
-                        DDim(std::vector<DDim::value_type>{
-                            static_cast<int64_t>(out_image_shape["width"]),
-                            static_cast<int64_t>(out_image_shape["height"])}));
-#ifndef LITE_SHUTDOWN_LOG
-    VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
-            << default_work_size[1] << ", " << default_work_size[2];
-#endif
-    cl_int status = kernel.setArg(arg_idx++, *x_img);
+    cl_int status;
+    auto kernel = kernel_;
+    status = kernel.setArg(0, *x_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(arg_idx++, *grid_img);
+    status = kernel.setArg(1, *grid_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(arg_idx++, *out_img);
+    status = kernel.setArg(2, *out_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(arg_idx++, out_height);
+    status = kernel.setArg(3, out_height);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(arg_idx++, out_width);
+    status = kernel.setArg(4, out_width);
     CL_CHECK_FATAL(status);
 
-    auto global_work_size =
-        cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
-                    static_cast<cl::size_type>(default_work_size[1]),
-                    static_cast<cl::size_type>(default_work_size[2] / 4)};
+    auto& context = ctx_->As<OpenCLContext>();
+    CHECK(context.cl_context() != nullptr);
     status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
         kernel,
         cl::NullRange,
-        global_work_size,
+        global_work_size_,
         cl::NullRange,
         nullptr,
         event_.get());
     CL_CHECK_FATAL(status);
     context.cl_wait_list()->emplace(out_img, event_);
-#ifndef LITE_SHUTDOWN_LOG
-    VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
-            << global_work_size[1] << " " << global_work_size[2];
-#endif
   }
 
  protected:
   param_t* grid_param_{nullptr};
+  bool first_epoch_for_reinit_{true};
+  DDim last_x_dims_;
+  DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
+      {static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
   std::string kernel_func_name_{"grid_sampler"};
+  cl::Kernel kernel_;
+  cl::NDRange global_work_size_ = cl::NDRange{
+      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
   std::string build_options_{"-DCL_DTYPE_half"};
   std::shared_ptr<cl::Event> event_{new cl::Event};
 };
......
@@ -37,53 +37,64 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
   void PrepareForRun() override {
     auto& context = ctx_->As<OpenCLContext>();
-    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
     context.cl_context()->AddKernel(
         kernel_func_name_, "image/scale_kernel.cl", build_options_);
+    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_;
+    kernel_ = context.cl_context()->GetKernel(kernel_key.str());
   }
 
+  void ReInitWhenNeeded() override {
+    scale_param_ = param_.get_mutable<param_t>();
+    auto x_dims = scale_param_->x->dims();
+    if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
+        first_epoch_for_reinit_) {
+      last_x_dims_ = x_dims;
+      first_epoch_for_reinit_ = false;
+
+      // compute image shape
+      paddle::lite::CLImageConverterDefault default_convertor;
+      out_img_shape_ =
+          default_convertor.InitImageDimInfoWith(scale_param_->output->dims());
+
+      // compute global work size
+      GetGlobalWorkSize();
+    }
+  }
+
+  void GetGlobalWorkSize() {
+    global_work_size_ =
+        cl::NDRange{static_cast<cl::size_type>(out_img_shape_[0]),
+                    static_cast<cl::size_type>(out_img_shape_[1])};
+  }
 
   void Run() override {
-    const auto& param = *param_.get_mutable<param_t>();
-    const auto& in_dims = param.x->dims();
-    auto* x_img = param.x->data<half_t, cl::Image2D>();
-    const float scale = param.scale;
-    const float bias = param.bias;
-
-    // LOG(INFO) << "x_image" << x_img;
-    auto out_image_shape = InitImageDimInfoWith(in_dims);
-#ifndef LITE_SHUTDOWN_LOG
-    VLOG(4) << "out_image_shape = " << out_image_shape["width"] << " "
-            << out_image_shape["height"];
-#endif
-    auto* out_img = param.output->mutable_data<half_t, cl::Image2D>(
-        out_image_shape["width"], out_image_shape["height"]);
-    // LOG(INFO) << "out_image" << out_img;
+    auto* x_img = scale_param_->x->data<half_t, cl::Image2D>();
+    auto* out_img = scale_param_->output->mutable_data<half_t, cl::Image2D>(
+        out_img_shape_[0], out_img_shape_[1]);
+    const float scale = scale_param_->scale;
+    const float bias = scale_param_->bias;
 
     auto& context = ctx_->As<OpenCLContext>();
     CHECK(context.cl_context() != nullptr);
-    STL::stringstream kernel_key;
-    kernel_key << kernel_func_name_ << build_options_;
-    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
-
-    auto global_work_size =
-        cl::NDRange{static_cast<cl::size_type>(out_image_shape["width"]),
-                    static_cast<cl::size_type>(out_image_shape["height"])};
 
+    auto kernel = kernel_;
     cl_int status;
-    int arg_idx = 0;
-    status = kernel.setArg(arg_idx, *x_img);
+    status = kernel.setArg(0, *x_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *out_img);
+    status = kernel.setArg(1, *out_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, scale);
+    status = kernel.setArg(2, scale);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, bias);
+    status = kernel.setArg(3, bias);
     CL_CHECK_FATAL(status);
 
     status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
         kernel,
         cl::NullRange,
-        global_work_size,
+        global_work_size_,
         cl::NullRange,
         nullptr,
         event_.get());
@@ -95,6 +106,15 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
   std::string kernel_func_name_{"scale"};
   std::string build_options_{"-DCL_DTYPE_half"};
   std::shared_ptr<cl::Event> event_{new cl::Event};
+  param_t* scale_param_{nullptr};
+  cl::Kernel kernel_;
+  bool first_epoch_for_reinit_{true};
+  DDim last_x_dims_;
+  DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
+      {static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
+  cl::NDRange global_work_size_ = cl::NDRange{
+      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
 };
 
 }  // namespace opencl
......