未验证 提交 44d98be8 编写于 作者: Y Yuan Shuai 提交者: GitHub

[LITE][OPENCL] Improve kernel::Run/PrepareForRun of opencl kernel (#3302)

* [LITE][OPENCL] improve perf and support variable length for fc_buffer、elementwise_add、scale、activation、grid_sampler. test=develop
上级 942bc409
......@@ -37,11 +37,12 @@ class ActivationComputeImageDefault
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
act_param_ = param_.get_mutable<param_t>();
int act_type = static_cast<int>(act_param_->active_type);
#ifndef LITE_SHUTDOWN_LOG
VLOG(1) << "ActivationTypeToStr(act_param_->active_type):"
<< ActivationTypeToStr(act_param_->active_type);
#endif
switch (act_type) {
case 1:
kernel_func_name_ = "relu";
......@@ -71,41 +72,68 @@ class ActivationComputeImageDefault
LOG(FATAL) << "This act type:" << act_type << " doesn't support.";
return;
}
#ifndef LITE_SHUTDOWN_LOG
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
#endif
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/activation_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
auto* x_img = param.X->data<half_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
}
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_img);
void ReInitWhenNeeded() override {
act_param_ = param_.get_mutable<param_t>();
auto x_dims = act_param_->X->dims();
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
last_x_dims_ = x_dims;
first_epoch_for_reinit_ = false;
// compute image shape
paddle::lite::CLImageConverterDefault default_convertor;
x_img_shape_ = default_convertor.InitImageDimInfoWith(
act_param_->X->dims()); // w, h
out_img_shape_ = default_convertor.InitImageDimInfoWith(
act_param_->Out->dims()); // w, h
// compute global work size
GetGlobalWorkSize();
}
}
void GetGlobalWorkSize() {
global_work_size_ =
cl::NDRange{static_cast<cl::size_type>(x_img_shape_[0]),
static_cast<cl::size_type>(x_img_shape_[1])};
}
void Run() override {
auto* x_img = act_param_->X->data<half_t, cl::Image2D>();
auto* out_img = act_param_->Out->mutable_data<half_t, cl::Image2D>(
out_img_shape_[0], out_img_shape_[1]);
auto kernel = kernel_;
cl_int status;
status = kernel.setArg(0, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
status = kernel.setArg(1, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, threshold_);
status = kernel.setArg(2, threshold_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, scale_);
status = kernel.setArg(3, scale_);
CL_CHECK_FATAL(status);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
const auto& x_dims = act_param_->X->dims();
const auto& y_dims = act_param_->Out->dims(); // useless: check dim only
VLOG(4) << TargetToStr(act_param_->X->target());
VLOG(4) << TargetToStr(act_param_->Out->target());
VLOG(4) << "x_img_shape_(w,h):" << x_img_shape_[0] << " "
<< x_img_shape_[1];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
......@@ -115,13 +143,12 @@ class ActivationComputeImageDefault
VLOG(4) << "kernel func name:" << kernel_func_name_;
#endif
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
static_cast<cl::size_type>(image_shape["height"])};
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
global_work_size_,
cl::NullRange,
nullptr,
event_.get());
......@@ -131,9 +158,18 @@ class ActivationComputeImageDefault
private:
// Op parameters; re-read from param_ in PrepareForRun() and
// ReInitWhenNeeded().
param_t* act_param_{nullptr};
// Cached image shape (w, h) of X; recomputed in ReInitWhenNeeded().
DDim x_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
// Cached image shape (w, h) of Out; recomputed in ReInitWhenNeeded().
DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
// Dims of X seen by the last refresh; compared against the current dims to
// decide whether the cached values must be recomputed.
DDim last_x_dims_;
// Name of the OpenCL kernel function, selected in PrepareForRun() from the
// activation type.
std::string kernel_func_name_{};
// Passed to the kernel as argument 2 in Run().
float threshold_{6.f};
// Passed to the kernel as argument 3 in Run().
float scale_{1.f};
// Kernel object fetched once from the CL context and reused by Run().
cl::Kernel kernel_;
// True until the first ReInitWhenNeeded() call has refreshed the caches.
bool first_epoch_for_reinit_{true};
// Global work size used when enqueueing the kernel; overwritten by
// GetGlobalWorkSize().
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
// Build options passed to AddKernel() and appended to the kernel key.
std::string build_options_{"-DCL_DTYPE_half"};
// Event handed to enqueueNDRangeKernel() in Run().
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -23,44 +23,82 @@ namespace lite {
namespace kernels {
namespace opencl {
void ElementwiseAddImageCompute::PrepareForRun() {
ele_param_ = param_.get_mutable<param_t>();
auto* x = ele_param_->X;
auto* y = ele_param_->Y;
auto axis = ele_param_->axis;
void ElementwiseAddImageCompute::PrepareForRun() {}
if (y->dims().size() == 4) {
kernel_func_name_ = "elementwise_add"; // y: ImageDefault
} else if (y->dims().size() == 1) {
if (axis == x->dims().size() - 1) {
kernel_func_name_ = "width_add"; // y: ImageDefault
} else if (axis == x->dims().size() - 3) {
kernel_func_name_ = "channel_add"; // y: ImageFolder
void ElementwiseAddImageCompute::ReInitWhenNeeded() {
ele_param_ = param_.get_mutable<param_t>();
auto x_dims = ele_param_->X->dims();
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
last_x_dims_ = x_dims;
first_epoch_for_reinit_ = false;
// choose kernel
auto* x = ele_param_->X;
auto* y = ele_param_->Y;
auto* out = ele_param_->Out;
auto axis = ele_param_->axis;
if (y->dims().size() == 4) {
kernel_func_name_ = "elementwise_add"; // y: ImageDefault
} else if (y->dims().size() == 1) {
if (axis == x->dims().size() - 1) {
kernel_func_name_ = "width_add"; // y: ImageDefault
} else if (axis == x->dims().size() - 3) {
kernel_func_name_ = "channel_add"; // y: ImageFolder
} else {
LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
<< ", x->dims().size():" << x->dims().size()
<< ", y->dims.size():" << y->dims().size();
}
} else {
LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
<< ", x->dims().size():" << x->dims().size()
<< ", y->dims.size():" << y->dims().size();
}
} else {
LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
<< ", x->dims().size():" << x->dims().size()
<< ", y->dims.size():" << y->dims().size();
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
// compute image shape
paddle::lite::CLImageConverterDefault default_convertor;
x_img_shape_ = default_convertor.InitImageDimInfoWith(x->dims()); // w, h
y_img_shape_ = default_convertor.InitImageDimInfoWith(y->dims());
out_img_shape_ =
default_convertor.InitImageDimInfoWith(out->dims()); // w, h
// compute global work size
GetGlobalWorkSize();
}
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_);
void ElementwiseAddImageCompute::GetGlobalWorkSize() {
global_work_size_ = cl::NDRange{static_cast<cl::size_type>(x_img_shape_[0]),
static_cast<cl::size_type>(x_img_shape_[1])};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << x_img_shape_[0] << " "
<< x_img_shape_[1];
#endif
}
void ElementwiseAddImageCompute::Run() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x = ele_param_->X;
auto* y = ele_param_->Y;
auto* out = ele_param_->Out;
auto axis = ele_param_->axis;
auto x_dims = x->dims();
auto y_dims = y->dims();
auto* x_img = x->data<half_t, cl::Image2D>();
auto* y_img = y->data<half_t, cl::Image2D>();
auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape_[0],
out_img_shape_[1]);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target());
......@@ -70,75 +108,53 @@ void ElementwiseAddImageCompute::Run() {
VLOG(4) << "y->dims():" << y->dims();
VLOG(4) << "out->dims():" << out->dims();
VLOG(4) << "axis:" << axis;
#endif
paddle::lite::CLImageConverterDefault default_convertor;
auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h
auto x_img_width = x_img_shape[0];
auto x_img_height = x_img_shape[1];
auto out_img_shape =
default_convertor.InitImageDimInfoWith(out->dims()); // w, h
auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims());
auto* x_img = x->data<half_t, cl::Image2D>();
auto* y_img = y->data<half_t, cl::Image2D>();
auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape[0],
out_img_shape[1]);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
<< out_img_shape[1];
VLOG(4) << "x_img_shape_[w,h]:" << x_img_shape_[0] << " " << x_img_shape_[1];
VLOG(4) << "y_img_shape_[w,h]:" << y_img_shape_[0] << " " << y_img_shape_[1];
VLOG(4) << "out_img_shape_[w,h]:" << out_img_shape_[0] << " "
<< out_img_shape_[1];
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
auto y_dims = y->dims();
cl_int status;
auto kernel = kernel_;
if (y_dims.size() == 4) {
cl_int status = kernel.setArg(arg_idx, *x_img);
status = kernel.setArg(0, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img);
status = kernel.setArg(1, *y_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
status = kernel.setArg(2, *out_img);
CL_CHECK_FATAL(status);
} else if (y_dims.size() == 1) {
if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) {
int tensor_w = x->dims()[x->dims().size() - 1];
if (axis == x_dims.size() - 1 || axis == x_dims.size() - 3) {
const int tensor_w = x_dims[x_dims.size() - 1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "tensor_w:" << tensor_w;
#endif
cl_int status = kernel.setArg(arg_idx, *x_img);
status = kernel.setArg(0, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img);
status = kernel.setArg(1, *y_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
status = kernel.setArg(2, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(tensor_w));
status = kernel.setArg(3, tensor_w);
CL_CHECK_FATAL(status);
} else {
LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
<< ", x->dims().size():" << x->dims().size()
<< ", y->dims.size():" << y->dims().size();
<< ", x->dims().size():" << x_dims.size()
<< ", y->dims.size():" << y_dims.size();
}
} else {
LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
<< ", x->dims().size():" << x->dims().size()
<< ", y->dims.size():" << y->dims().size();
<< ", x->dims().size():" << x_dims.size()
<< ", y->dims.size():" << y_dims.size();
}
auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width),
static_cast<cl::size_type>(x_img_height)};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
#endif
auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
global_work_size_,
cl::NullRange,
nullptr,
event_.get());
......
......@@ -15,6 +15,7 @@
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/opencl/cl_half.h"
#include "lite/core/kernel.h"
#include "lite/operators/op_params.h"
......@@ -34,6 +35,10 @@ class ElementwiseAddImageCompute
void PrepareForRun() override;
void ReInitWhenNeeded() override;
void GetGlobalWorkSize();
void Run() override;
std::string doc() const override {
......@@ -42,8 +47,19 @@ class ElementwiseAddImageCompute
protected:
// Op parameters; re-read from param_ in ReInitWhenNeeded().
param_t* ele_param_{nullptr};
// Dims of X seen by the last refresh; used to detect an input shape change.
DDim last_x_dims_;
// Cached image shape (w, h) of X; recomputed in ReInitWhenNeeded().
DDim x_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
// Cached image shape of Y; recomputed in ReInitWhenNeeded().
DDim y_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
// Cached image shape (w, h) of Out; recomputed in ReInitWhenNeeded().
DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
// Kernel function name; chosen from the rank of Y and the axis
// ("elementwise_add", "width_add" or "channel_add").
std::string kernel_func_name_{"elementwise_add"};
// Build options passed to AddKernel() and appended to the kernel key.
std::string build_options_{"-DCL_DTYPE_half"};
// True until the first ReInitWhenNeeded() call has refreshed the caches.
bool first_epoch_for_reinit_{true};
// Kernel object fetched from the CL context and reused by Run().
cl::Kernel kernel_;
// Global work size used when enqueueing the kernel; overwritten by
// GetGlobalWorkSize().
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
// Event handed to enqueueNDRangeKernel() in Run().
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -30,74 +30,96 @@ class FcCompute
public:
using param_t = operators::FcParam;
void PrepareForRun() override {
const auto& param = *param_.get_mutable<param_t>();
const auto x_dims = param.input->dims();
const auto w_dims = param.w->dims();
CHECK_GE(x_dims.size(), 2UL);
CHECK_GE(w_dims.size(), 2UL);
CHECK_EQ(param.output->dims().size(), 2UL);
m_ = x_dims.Slice(0, param.in_num_col_dims).production();
k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
n_ = w_dims[1];
CHECK_EQ(k_, static_cast<int>(w_dims[0]));
VLOG(4) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2]
<< " " << x_dims[3];
VLOG(4) << "w_dims:" << w_dims[0] << " " << w_dims[1] << " " << w_dims[2]
<< " " << w_dims[3];
VLOG(4) << "m_: " << m_ << " n_: " << n_ << " k_: " << k_;
void PrepareForRun() override {}
void ReInitWhenNeeded() override {
fc_param_ = param_.get_mutable<param_t>();
const auto x_dims = fc_param_->input->dims();
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
last_x_dims_ = x_dims;
first_epoch_for_reinit_ = false;
// compute m,n,k
const auto w_dims = fc_param_->w->dims();
CHECK_GE(x_dims.size(), 2UL);
CHECK_GE(w_dims.size(), 2UL);
CHECK_EQ(fc_param_->output->dims().size(), 2UL);
m_ = x_dims.Slice(0, fc_param_->in_num_col_dims).production();
k_ = x_dims.Slice(fc_param_->in_num_col_dims, x_dims.size()).production();
n_ = w_dims[1];
CHECK_EQ(k_, static_cast<int>(w_dims[0]));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2]
<< " " << x_dims[3];
VLOG(4) << "w_dims:" << w_dims[0] << " " << w_dims[1] << " " << w_dims[2]
<< " " << w_dims[3];
VLOG(4) << "m_: " << m_ << " n_: " << n_ << " k_: " << k_;
#endif
// choose kernel
if (m_ == 1) { // gemv
kernel_func_name_ = "fc_gemv_1x4";
} else { // gemm
kernel_func_name_ = "fc_gemm_4x4";
}
#ifndef LITE_SHUTDOWN_LOG
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
#endif
if (fc_param_->activation_type == "relu") {
build_options_ += "-DRELU";
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/fc_kernel.cl", build_options_);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
// compute global work size
GetGlobalWorkSize();
}
}
void GetGlobalWorkSize() {
if (m_ == 1) { // gemv
kernel_func_name_ = "fc_gemv_1x4";
global_work_size_ = cl::NDRange{static_cast<size_t>((n_ + 3) / 4)};
} else { // gemm
kernel_func_name_ = "fc_gemm_4x4";
global_work_size_ = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
static_cast<size_t>((n_ + 3) / 4)};
}
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
if (param.activation_type == "relu") {
build_options_ += "-DRELU";
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/fc_kernel.cl", build_options_);
}
void Run() override {
const auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x_buf = param.input->data<float, cl::Buffer>();
auto* w_buf = param.w->data<float, cl::Buffer>();
auto* bias_buf = param.bias->data<float, cl::Buffer>();
auto* x_buf = fc_param_->input->data<float, cl::Buffer>();
auto* w_buf = fc_param_->w->data<float, cl::Buffer>();
auto* bias_buf = fc_param_->bias->data<float, cl::Buffer>();
auto* out_buf =
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
fc_param_->output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, *x_buf);
status = kernel.setArg(0, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *w_buf);
status = kernel.setArg(1, *w_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *bias_buf);
status = kernel.setArg(2, *bias_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
status = kernel.setArg(3, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(m_));
status = kernel.setArg(4, static_cast<const int>(m_));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(n_));
status = kernel.setArg(5, static_cast<const int>(n_));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(k_));
status = kernel.setArg(6, static_cast<const int>(k_));
CL_CHECK_FATAL(status);
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
......@@ -111,9 +133,13 @@ class FcCompute
private:
// GEMM sizes computed in ReInitWhenNeeded(): m_ is the product of the input
// dims before in_num_col_dims, k_ the product of the remaining input dims,
// and n_ is w_dims[1]. They have no initializer here.
int m_, n_, k_;
// Op parameters; re-read from param_ in ReInitWhenNeeded().
param_t* fc_param_{nullptr};
// Kernel function name: "fc_gemv_1x4" when m_ == 1, otherwise "fc_gemm_4x4".
std::string kernel_func_name_{};
// Build options passed to AddKernel() and appended to the kernel key;
// "-DRELU" is appended when the activation type is "relu".
std::string build_options_{"-DCL_DTYPE_float "};
// True until the first ReInitWhenNeeded() call has refreshed the caches.
bool first_epoch_for_reinit_{true};
// Input dims seen by the last refresh; used to detect an input shape change.
DDim last_x_dims_;
// Global work size used when enqueueing the kernel; set by
// GetGlobalWorkSize().
cl::NDRange global_work_size_;
// Kernel object fetched from the CL context and reused by Run().
cl::Kernel kernel_;
// Event handed to enqueueNDRangeKernel() in Run().
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -39,95 +39,116 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
}
void PrepareForRun() override {
grid_param_ = param_.get_mutable<param_t>();
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/grid_sampler_kernel.cl", build_options_);
VLOG(4) << "kernel_func_name_:" << kernel_func_name_;
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << "kernel_key: " << kernel_key.str();
}
void Run() override {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
void ReInitWhenNeeded() override {
grid_param_ = param_.get_mutable<param_t>();
auto x_dims = grid_param_->x->dims();
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
last_x_dims_ = x_dims;
first_epoch_for_reinit_ = false;
// compute image shape
paddle::lite::CLImageConverterDefault default_convertor;
out_img_shape_ =
default_convertor.InitImageDimInfoWith(grid_param_->out->dims());
// compute global work size
GetGlobalWorkSize();
}
}
void GetGlobalWorkSize() {
auto default_work_size =
DefaultWorkSize(grid_param_->out->dims(),
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_img_shape_[0]),
static_cast<int64_t>(out_img_shape_[1])}));
global_work_size_ =
cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
static_cast<cl::size_type>(default_work_size[1]),
static_cast<cl::size_type>(default_work_size[2] / 4)};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
VLOG(4) << "global_work_size_:[2D]:" << global_work_size_[0] << " "
<< global_work_size_[1] << " " << global_work_size_[2];
#endif
}
void Run() override {
auto* x = grid_param_->x;
auto* out = grid_param_->out;
auto* grid = grid_param_->grid;
auto* out = grid_param_->out;
auto out_dims = out->dims();
auto in_dims = x->dims();
int out_height = out_dims[2];
int out_width = out_dims[3];
auto* x_img = x->data<half_t, cl::Image2D>();
auto* grid_img = x->data<half_t, cl::Image2D>();
auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape_[0],
out_img_shape_[1]);
#ifndef LITE_SHUTDOWN_LOG
auto in_dims = x->dims();
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << in_dims;
VLOG(4) << "out->dims():" << out_dims;
#endif
auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>();
// VLOG(4) << "x_image: " << x_img;
auto* grid_img = x->data<half_t, cl::Image2D>();
// VLOG(4) << "grid_img: " << grid_img;
auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image" << out_img;
VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " "
<< out_image_shape["height"];
VLOG(4) << "out_img_shape_[w,h]:" << out_img_shape_[0] << " "
<< out_img_shape_[1];
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
int out_height = out_dims[2];
int out_width = out_dims[3];
auto default_work_size =
DefaultWorkSize(out_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
#endif
cl_int status = kernel.setArg(arg_idx++, *x_img);
cl_int status;
auto kernel = kernel_;
status = kernel.setArg(0, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *grid_img);
status = kernel.setArg(1, *grid_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *out_img);
status = kernel.setArg(2, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, out_height);
status = kernel.setArg(3, out_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, out_width);
status = kernel.setArg(4, out_width);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
static_cast<cl::size_type>(default_work_size[1]),
static_cast<cl::size_type>(default_work_size[2] / 4)};
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
global_work_size_,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2];
#endif
}
protected:
// Op parameters; re-read from param_ in PrepareForRun() and
// ReInitWhenNeeded().
param_t* grid_param_{nullptr};
// True until the first ReInitWhenNeeded() call has refreshed the caches.
bool first_epoch_for_reinit_{true};
// Dims of x seen by the last refresh; used to detect an input shape change.
DDim last_x_dims_;
// Cached image shape of the output; recomputed in ReInitWhenNeeded().
DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
// Name of the OpenCL kernel function registered in PrepareForRun().
std::string kernel_func_name_{"grid_sampler"};
// Kernel object fetched from the CL context and reused by Run().
cl::Kernel kernel_;
// Global work size used when enqueueing the kernel; overwritten by
// GetGlobalWorkSize().
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
// Build options passed to AddKernel() and appended to the kernel key.
std::string build_options_{"-DCL_DTYPE_half"};
// Event handed to enqueueNDRangeKernel() and stored in the CL wait list.
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -37,53 +37,64 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel(
kernel_func_name_, "image/scale_kernel.cl", build_options_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
}
void ReInitWhenNeeded() override {
scale_param_ = param_.get_mutable<param_t>();
auto x_dims = scale_param_->x->dims();
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
last_x_dims_ = x_dims;
first_epoch_for_reinit_ = false;
// compute image shape
paddle::lite::CLImageConverterDefault default_convertor;
out_img_shape_ =
default_convertor.InitImageDimInfoWith(scale_param_->output->dims());
// compute global work size
GetGlobalWorkSize();
}
}
void GetGlobalWorkSize() {
global_work_size_ =
cl::NDRange{static_cast<cl::size_type>(out_img_shape_[0]),
static_cast<cl::size_type>(out_img_shape_[1])};
}
void Run() override {
const auto& param = *param_.get_mutable<param_t>();
const auto& in_dims = param.x->dims();
auto* x_img = param.x->data<half_t, cl::Image2D>();
const float scale = param.scale;
const float bias = param.bias;
// LOG(INFO) << "x_image" << x_img;
auto out_image_shape = InitImageDimInfoWith(in_dims);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"];
#endif
auto* out_img = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
// LOG(INFO) << "out_image" << out_img;
auto* x_img = scale_param_->x->data<half_t, cl::Image2D>();
auto* out_img = scale_param_->output->mutable_data<half_t, cl::Image2D>(
out_img_shape_[0], out_img_shape_[1]);
const float scale = scale_param_->scale;
const float bias = scale_param_->bias;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(out_image_shape["width"]),
static_cast<cl::size_type>(out_image_shape["height"])};
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, *x_img);
status = kernel.setArg(0, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
status = kernel.setArg(1, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, scale);
status = kernel.setArg(2, scale);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, bias);
status = kernel.setArg(3, bias);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
global_work_size_,
cl::NullRange,
nullptr,
event_.get());
......@@ -95,6 +106,15 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
// Name of the OpenCL kernel function registered in PrepareForRun().
std::string kernel_func_name_{"scale"};
// Build options passed to AddKernel() and appended to the kernel key.
std::string build_options_{"-DCL_DTYPE_half"};
// Event handed to enqueueNDRangeKernel() in Run().
std::shared_ptr<cl::Event> event_{new cl::Event};
// Op parameters; re-read from param_ in ReInitWhenNeeded().
param_t* scale_param_{nullptr};
// Kernel object fetched from the CL context in PrepareForRun() and reused by
// Run().
cl::Kernel kernel_;
// True until the first ReInitWhenNeeded() call has refreshed the caches.
bool first_epoch_for_reinit_{true};
// Dims of x seen by the last refresh; used to detect an input shape change.
DDim last_x_dims_;
// Cached image shape of the output; recomputed in ReInitWhenNeeded().
DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
// Global work size used when enqueueing the kernel; overwritten by
// GetGlobalWorkSize().
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
};
} // namespace opencl
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册