OpenCL kernels: cache kernel handle, image shapes and global work size

Touches the activation, elementwise_add, fc, grid_sampler and scale OpenCL
kernels. Each kernel now keeps its cl::Kernel in a member, recomputes image
shapes / global work size in ReInitWhenNeeded() only when the input dims
differ from the last seen dims, and binds kernel arguments by fixed index
in Run().

Review fixes folded into the added lines:
  * grid_sampler Run(): grid_img was read from `x`; it now reads from `grid`.
  * fc ReInitWhenNeeded(): "-DRELU" was appended to build_options_ on every
    re-init, so a second input shape produced "...-DRELU-DRELU"; it is now
    appended at most once. The fc hunk headers were adjusted for the two
    extra added lines.

NOTE(review): this copy of the patch appears to have lost all text between
'<' and '>' (template arguments, #include targets, and the leading context
of the grid_sampler and scale hunks); that text is left as found, not
guessed. Blank context lines were re-inferred and the `index` hashes of the
fc and grid_sampler files no longer match the edited post-image -- confirm
against the real commit before applying.

diff --git a/lite/kernels/opencl/activation_image_compute.cc b/lite/kernels/opencl/activation_image_compute.cc
index dbe487ba91d00c2de4c08edf140526d727bac6b5..0e0613237ba4d53bfab15ac267d12efc8451f88d 100644
--- a/lite/kernels/opencl/activation_image_compute.cc
+++ b/lite/kernels/opencl/activation_image_compute.cc
@@ -37,11 +37,12 @@ class ActivationComputeImageDefault
   }
 
   void PrepareForRun() override {
-    auto& context = ctx_->As();
     act_param_ = param_.get_mutable();
     int act_type = static_cast(act_param_->active_type);
+#ifndef LITE_SHUTDOWN_LOG
     VLOG(1) << "ActivationTypeToStr(act_param_->active_type):"
             << ActivationTypeToStr(act_param_->active_type);
+#endif
     switch (act_type) {
       case 1:
         kernel_func_name_ = "relu";
@@ -71,41 +72,68 @@ class ActivationComputeImageDefault
         LOG(FATAL) << "This act type:" << act_type << " doesn't support.";
         return;
     }
+#ifndef LITE_SHUTDOWN_LOG
     VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+#endif
+
+    auto& context = ctx_->As();
     context.cl_context()->AddKernel(
         kernel_func_name_, "image/activation_kernel.cl", build_options_);
-  }
-
-  void Run() override {
-    auto& param = *param_.get_mutable();
-    const auto& x_dims = param.X->dims();
-    auto* x_img = param.X->data();
-    auto image_shape = InitImageDimInfoWith(x_dims);
-    auto* out_img = param.Out->mutable_data(
-        image_shape["width"], image_shape["height"]);
-    const auto& y_dims = param.Out->dims();  // useless: check dim only
-    auto& context = ctx_->As();
-    CHECK(context.cl_context() != nullptr);
 
     STL::stringstream kernel_key;
     kernel_key << kernel_func_name_ << build_options_;
-    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+    kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+  }
 
-    int arg_idx = 0;
-    cl_int status = kernel.setArg(arg_idx, *x_img);
+  void ReInitWhenNeeded() override {
+    act_param_ = param_.get_mutable();
+    auto x_dims = act_param_->X->dims();
+    if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
+        first_epoch_for_reinit_) {
+      last_x_dims_ = x_dims;
+      first_epoch_for_reinit_ = false;
+
+      // compute image shape
+      paddle::lite::CLImageConverterDefault default_convertor;
+      x_img_shape_ = default_convertor.InitImageDimInfoWith(
+          act_param_->X->dims());  // w, h
+      out_img_shape_ = default_convertor.InitImageDimInfoWith(
+          act_param_->Out->dims());  // w, h
+
+      // compute global work size
+      GetGlobalWorkSize();
+    }
+  }
+
+  void GetGlobalWorkSize() {
+    global_work_size_ =
+        cl::NDRange{static_cast(x_img_shape_[0]),
+                    static_cast(x_img_shape_[1])};
+  }
+
+  void Run() override {
+    auto* x_img = act_param_->X->data();
+    auto* out_img = act_param_->Out->mutable_data(
+        out_img_shape_[0], out_img_shape_[1]);
+
+    auto kernel = kernel_;
+    cl_int status;
+    status = kernel.setArg(0, *x_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *out_img);
+    status = kernel.setArg(1, *out_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, threshold_);
+    status = kernel.setArg(2, threshold_);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, scale_);
+    status = kernel.setArg(3, scale_);
     CL_CHECK_FATAL(status);
 
 #ifndef LITE_SHUTDOWN_LOG
-    VLOG(4) << TargetToStr(param.X->target());
-    VLOG(4) << TargetToStr(param.Out->target());
-    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
-            << image_shape["height"];
+    const auto& x_dims = act_param_->X->dims();
+    const auto& y_dims = act_param_->Out->dims();  // useless: check dim only
+    VLOG(4) << TargetToStr(act_param_->X->target());
+    VLOG(4) << TargetToStr(act_param_->Out->target());
+    VLOG(4) << "x_img_shape_(w,h):" << x_img_shape_[0] << " "
+            << x_img_shape_[1];
     VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
             << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
     VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
@@ -115,13 +143,12 @@ class ActivationComputeImageDefault
     VLOG(4) << "kernel func name:" << kernel_func_name_;
 #endif
 
-    auto global_work_size =
-        cl::NDRange{static_cast(image_shape["width"]),
-                    static_cast(image_shape["height"])};
+    auto& context = ctx_->As();
+    CHECK(context.cl_context() != nullptr);
     status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
         kernel,
         cl::NullRange,
-        global_work_size,
+        global_work_size_,
         cl::NullRange,
         nullptr,
         event_.get());
@@ -131,9 +158,18 @@ class ActivationComputeImageDefault
 
  private:
   param_t* act_param_{nullptr};
+  DDim x_img_shape_ = DDim(std::vector(
+      {static_cast(1), static_cast(1)}));
+  DDim out_img_shape_ = DDim(std::vector(
+      {static_cast(1), static_cast(1)}));
+  DDim last_x_dims_;
   std::string kernel_func_name_{};
   float threshold_{6.f};
   float scale_{1.f};
+  cl::Kernel kernel_;
+  bool first_epoch_for_reinit_{true};
+  cl::NDRange global_work_size_ = cl::NDRange{
+      static_cast(1), static_cast(1), static_cast(1)};
   std::string build_options_{"-DCL_DTYPE_half"};
   std::shared_ptr event_{new cl::Event};
 };
diff --git a/lite/kernels/opencl/elementwise_add_image_compute.cc b/lite/kernels/opencl/elementwise_add_image_compute.cc
index 6d0ebf638f0a8967e27a657131e1cac89967ee0b..b13326056682cbe81077e248dd477a8f698ca602 100644
--- a/lite/kernels/opencl/elementwise_add_image_compute.cc
+++ b/lite/kernels/opencl/elementwise_add_image_compute.cc
@@ -23,44 +23,82 @@ namespace lite {
 namespace kernels {
 namespace opencl {
 
-void ElementwiseAddImageCompute::PrepareForRun() {
-  ele_param_ = param_.get_mutable();
-  auto* x = ele_param_->X;
-  auto* y = ele_param_->Y;
-  auto axis = ele_param_->axis;
+void ElementwiseAddImageCompute::PrepareForRun() {}
 
-  if (y->dims().size() == 4) {
-    kernel_func_name_ = "elementwise_add";  // y: ImageDefault
-  } else if (y->dims().size() == 1) {
-    if (axis == x->dims().size() - 1) {
-      kernel_func_name_ = "width_add";  // y: ImageDefault
-    } else if (axis == x->dims().size() - 3) {
-      kernel_func_name_ = "channel_add";  // y: ImageFolder
+void ElementwiseAddImageCompute::ReInitWhenNeeded() {
+  ele_param_ = param_.get_mutable();
+  auto x_dims = ele_param_->X->dims();
+  if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
+      first_epoch_for_reinit_) {
+    last_x_dims_ = x_dims;
+    first_epoch_for_reinit_ = false;
+
+    // choose kernel
+    auto* x = ele_param_->X;
+    auto* y = ele_param_->Y;
+    auto* out = ele_param_->Out;
+    auto axis = ele_param_->axis;
+
+    if (y->dims().size() == 4) {
+      kernel_func_name_ = "elementwise_add";  // y: ImageDefault
+    } else if (y->dims().size() == 1) {
+      if (axis == x->dims().size() - 1) {
+        kernel_func_name_ = "width_add";  // y: ImageDefault
+      } else if (axis == x->dims().size() - 3) {
+        kernel_func_name_ = "channel_add";  // y: ImageFolder
+      } else {
+        LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
+                   << ", x->dims().size():" << x->dims().size()
+                   << ", y->dims.size():" << y->dims().size();
+      }
     } else {
       LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
                  << ", x->dims().size():" << x->dims().size()
                  << ", y->dims.size():" << y->dims().size();
     }
-  } else {
-    LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
-               << ", x->dims().size():" << x->dims().size()
-               << ", y->dims.size():" << y->dims().size();
+    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+
+    auto& context = ctx_->As();
+    context.cl_context()->AddKernel(
+        kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_);
+
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_;
+    kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+
+    // compute image shape
+    paddle::lite::CLImageConverterDefault default_convertor;
+    x_img_shape_ = default_convertor.InitImageDimInfoWith(x->dims());  // w, h
+    y_img_shape_ = default_convertor.InitImageDimInfoWith(y->dims());
+    out_img_shape_ =
+        default_convertor.InitImageDimInfoWith(out->dims());  // w, h
+
+    // compute global work size
+    GetGlobalWorkSize();
   }
-  VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+}
 
-  auto& context = ctx_->As();
-  context.cl_context()->AddKernel(
-      kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_);
+void ElementwiseAddImageCompute::GetGlobalWorkSize() {
+  global_work_size_ = cl::NDRange{static_cast(x_img_shape_[0]),
+                                  static_cast(x_img_shape_[1])};
+#ifndef LITE_SHUTDOWN_LOG
+  VLOG(4) << "global_work_size:[2D]:" << x_img_shape_[0] << " "
+          << x_img_shape_[1];
+#endif
 }
 
 void ElementwiseAddImageCompute::Run() {
-  auto& context = ctx_->As();
-  CHECK(context.cl_context() != nullptr);
-
   auto* x = ele_param_->X;
   auto* y = ele_param_->Y;
   auto* out = ele_param_->Out;
   auto axis = ele_param_->axis;
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+
+  auto* x_img = x->data();
+  auto* y_img = y->data();
+  auto* out_img = out->mutable_data(out_img_shape_[0],
+                                    out_img_shape_[1]);
 
 #ifndef LITE_SHUTDOWN_LOG
   VLOG(4) << "x->target():" << TargetToStr(x->target());
@@ -70,75 +108,53 @@ void ElementwiseAddImageCompute::Run() {
   VLOG(4) << "y->dims():" << y->dims();
   VLOG(4) << "out->dims():" << out->dims();
   VLOG(4) << "axis:" << axis;
-#endif
-
-  paddle::lite::CLImageConverterDefault default_convertor;
-  auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims());  // w, h
-  auto x_img_width = x_img_shape[0];
-  auto x_img_height = x_img_shape[1];
-  auto out_img_shape =
-      default_convertor.InitImageDimInfoWith(out->dims());  // w, h
-  auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims());
-
-  auto* x_img = x->data();
-  auto* y_img = y->data();
-  auto* out_img = out->mutable_data(out_img_shape[0],
-                                    out_img_shape[1]);
 
-#ifndef LITE_SHUTDOWN_LOG
-  VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
-  VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
-  VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
-          << out_img_shape[1];
+  VLOG(4) << "x_img_shape_[w,h]:" << x_img_shape_[0] << " " << x_img_shape_[1];
+  VLOG(4) << "y_img_shape_[w,h]:" << y_img_shape_[0] << " " << y_img_shape_[1];
+  VLOG(4) << "out_img_shape_[w,h]:" << out_img_shape_[0] << " "
+          << out_img_shape_[1];
 #endif
 
-  STL::stringstream kernel_key;
-  kernel_key << kernel_func_name_ << build_options_;
-  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
-
-  int arg_idx = 0;
-  auto y_dims = y->dims();
+  cl_int status;
+  auto kernel = kernel_;
   if (y_dims.size() == 4) {
-    cl_int status = kernel.setArg(arg_idx, *x_img);
+    status = kernel.setArg(0, *x_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *y_img);
+    status = kernel.setArg(1, *y_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *out_img);
+    status = kernel.setArg(2, *out_img);
     CL_CHECK_FATAL(status);
   } else if (y_dims.size() == 1) {
-    if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) {
-      int tensor_w = x->dims()[x->dims().size() - 1];
+    if (axis == x_dims.size() - 1 || axis == x_dims.size() - 3) {
+      const int tensor_w = x_dims[x_dims.size() - 1];
 #ifndef LITE_SHUTDOWN_LOG
       VLOG(4) << "tensor_w:" << tensor_w;
 #endif
-      cl_int status = kernel.setArg(arg_idx, *x_img);
+      status = kernel.setArg(0, *x_img);
       CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, *y_img);
+      status = kernel.setArg(1, *y_img);
       CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, *out_img);
+      status = kernel.setArg(2, *out_img);
       CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, static_cast(tensor_w));
+      status = kernel.setArg(3, tensor_w);
       CL_CHECK_FATAL(status);
     } else {
       LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
-                 << ", x->dims().size():" << x->dims().size()
-                 << ", y->dims.size():" << y->dims().size();
+                 << ", x->dims().size():" << x_dims.size()
+                 << ", y->dims.size():" << y_dims.size();
     }
   } else {
     LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
-               << ", x->dims().size():" << x->dims().size()
-               << ", y->dims.size():" << y->dims().size();
+               << ", x->dims().size():" << x_dims.size()
+               << ", y->dims.size():" << y_dims.size();
   }
 
-  auto global_work_size = cl::NDRange{static_cast(x_img_width),
-                                      static_cast(x_img_height)};
-#ifndef LITE_SHUTDOWN_LOG
-  VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
-#endif
-  auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
+  auto& context = ctx_->As();
+  CHECK(context.cl_context() != nullptr);
+  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
       kernel,
       cl::NullRange,
-      global_work_size,
+      global_work_size_,
       cl::NullRange,
       nullptr,
       event_.get());
diff --git a/lite/kernels/opencl/elementwise_add_image_compute.h b/lite/kernels/opencl/elementwise_add_image_compute.h
index 084f0fe7fb18f9abe3c6ef41f10a9e38e31a54fc..7e38f300430b6faf199976088ad0cef69f94b789 100644
--- a/lite/kernels/opencl/elementwise_add_image_compute.h
+++ b/lite/kernels/opencl/elementwise_add_image_compute.h
@@ -15,6 +15,7 @@
 
 #include
 #include
+#include
 #include "lite/backends/opencl/cl_half.h"
 #include "lite/core/kernel.h"
 #include "lite/operators/op_params.h"
@@ -34,6 +35,10 @@ class ElementwiseAddImageCompute
 
   void PrepareForRun() override;
 
+  void ReInitWhenNeeded() override;
+
+  void GetGlobalWorkSize();
+
   void Run() override;
 
   std::string doc() const override {
@@ -42,8 +47,19 @@ class ElementwiseAddImageCompute
 
  protected:
   param_t* ele_param_{nullptr};
+  DDim last_x_dims_;
+  DDim x_img_shape_ = DDim(std::vector(
+      {static_cast(1), static_cast(1)}));
+  DDim y_img_shape_ = DDim(std::vector(
+      {static_cast(1), static_cast(1)}));
+  DDim out_img_shape_ = DDim(std::vector(
+      {static_cast(1), static_cast(1)}));
   std::string kernel_func_name_{"elementwise_add"};
   std::string build_options_{"-DCL_DTYPE_half"};
+  bool first_epoch_for_reinit_{true};
+  cl::Kernel kernel_;
+  cl::NDRange global_work_size_ = cl::NDRange{
+      static_cast(1), static_cast(1), static_cast(1)};
   std::shared_ptr event_{new cl::Event};
 };
 
diff --git a/lite/kernels/opencl/fc_buffer_compute.cc b/lite/kernels/opencl/fc_buffer_compute.cc
index dbdedd136ea6b8c6b06d02d4f6d893e4ea849e8a..d486e97bb0ca799dcaf671ad55d0cb76c6fac389 100644
--- a/lite/kernels/opencl/fc_buffer_compute.cc
+++ b/lite/kernels/opencl/fc_buffer_compute.cc
@@ -30,74 +30,98 @@ class FcCompute
  public:
   using param_t = operators::FcParam;
 
-  void PrepareForRun() override {
-    const auto& param = *param_.get_mutable();
-    const auto x_dims = param.input->dims();
-    const auto w_dims = param.w->dims();
-
-    CHECK_GE(x_dims.size(), 2UL);
-    CHECK_GE(w_dims.size(), 2UL);
-    CHECK_EQ(param.output->dims().size(), 2UL);
-
-    m_ = x_dims.Slice(0, param.in_num_col_dims).production();
-    k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
-    n_ = w_dims[1];
-    CHECK_EQ(k_, static_cast(w_dims[0]));
-    VLOG(4) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2]
-            << " " << x_dims[3];
-    VLOG(4) << "w_dims:" << w_dims[0] << " " << w_dims[1] << " " << w_dims[2]
-            << " " << w_dims[3];
-    VLOG(4) << "m_: " << m_ << " n_: " << n_ << " k_: " << k_;
+  void PrepareForRun() override {}
 
+  void ReInitWhenNeeded() override {
+    fc_param_ = param_.get_mutable();
+    const auto x_dims = fc_param_->input->dims();
+    if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
+        first_epoch_for_reinit_) {
+      last_x_dims_ = x_dims;
+      first_epoch_for_reinit_ = false;
+
+      // compute m,n,k
+      const auto w_dims = fc_param_->w->dims();
+      CHECK_GE(x_dims.size(), 2UL);
+      CHECK_GE(w_dims.size(), 2UL);
+      CHECK_EQ(fc_param_->output->dims().size(), 2UL);
+
+      m_ = x_dims.Slice(0, fc_param_->in_num_col_dims).production();
+      k_ = x_dims.Slice(fc_param_->in_num_col_dims, x_dims.size()).production();
+      n_ = w_dims[1];
+      CHECK_EQ(k_, static_cast(w_dims[0]));
+
+#ifndef LITE_SHUTDOWN_LOG
+      VLOG(4) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2]
+              << " " << x_dims[3];
+      VLOG(4) << "w_dims:" << w_dims[0] << " " << w_dims[1] << " " << w_dims[2]
+              << " " << w_dims[3];
+      VLOG(4) << "m_: " << m_ << " n_: " << n_ << " k_: " << k_;
+#endif
+
+      // choose kernel
+      if (m_ == 1) {  // gemv
+        kernel_func_name_ = "fc_gemv_1x4";
+      } else {  // gemm
+        kernel_func_name_ = "fc_gemm_4x4";
+      }
+#ifndef LITE_SHUTDOWN_LOG
+      VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+#endif
+
+      // Add -DRELU only once; this block re-runs whenever x_dims changes.
+      if (fc_param_->activation_type == "relu" &&
+          build_options_.find("-DRELU") == std::string::npos) {
+        build_options_ += "-DRELU";
+      }
+
+      auto& context = ctx_->As();
+      context.cl_context()->AddKernel(
+          kernel_func_name_, "buffer/fc_kernel.cl", build_options_);
+      STL::stringstream kernel_key;
+      kernel_key << kernel_func_name_ << build_options_;
+      kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+
+      // compute global work size
+      GetGlobalWorkSize();
+    }
+  }
+
+  void GetGlobalWorkSize() {
     if (m_ == 1) {  // gemv
-      kernel_func_name_ = "fc_gemv_1x4";
       global_work_size_ = cl::NDRange{static_cast((n_ + 3) / 4)};
     } else {  // gemm
-      kernel_func_name_ = "fc_gemm_4x4";
       global_work_size_ = cl::NDRange{static_cast((m_ + 3) / 4),
                                       static_cast((n_ + 3) / 4)};
     }
-    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
-
-    if (param.activation_type == "relu") {
-      build_options_ += "-DRELU";
-    }
-    auto& context = ctx_->As();
-    context.cl_context()->AddKernel(
-        kernel_func_name_, "buffer/fc_kernel.cl", build_options_);
   }
 
   void Run() override {
-    const auto& param = *param_.get_mutable();
-    auto& context = ctx_->As();
-    CHECK(context.cl_context() != nullptr);
-    auto* x_buf = param.input->data();
-    auto* w_buf = param.w->data();
-    auto* bias_buf = param.bias->data();
+    auto* x_buf = fc_param_->input->data();
+    auto* w_buf = fc_param_->w->data();
+    auto* bias_buf = fc_param_->bias->data();
     auto* out_buf =
-        param.output->mutable_data(TARGET(kOpenCL));
-
-    STL::stringstream kernel_key;
-    kernel_key << kernel_func_name_ << build_options_;
-    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+        fc_param_->output->mutable_data(TARGET(kOpenCL));
 
+    auto kernel = kernel_;
     cl_int status;
-    int arg_idx = 0;
-    status = kernel.setArg(arg_idx, *x_buf);
+    status = kernel.setArg(0, *x_buf);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *w_buf);
+    status = kernel.setArg(1, *w_buf);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *bias_buf);
+    status = kernel.setArg(2, *bias_buf);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *out_buf);
+    status = kernel.setArg(3, *out_buf);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast(m_));
+    status = kernel.setArg(4, static_cast(m_));
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast(n_));
+    status = kernel.setArg(5, static_cast(n_));
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast(k_));
+    status = kernel.setArg(6, static_cast(k_));
     CL_CHECK_FATAL(status);
 
+    auto& context = ctx_->As();
+    CHECK(context.cl_context() != nullptr);
     status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
         kernel,
         cl::NullRange,
@@ -111,9 +135,13 @@ class FcCompute
 
  private:
   int m_, n_, k_;
+  param_t* fc_param_{nullptr};
   std::string kernel_func_name_{};
   std::string build_options_{"-DCL_DTYPE_float "};
+  bool first_epoch_for_reinit_{true};
+  DDim last_x_dims_;
   cl::NDRange global_work_size_;
+  cl::Kernel kernel_;
   std::shared_ptr event_{new cl::Event};
 };
 
diff --git a/lite/kernels/opencl/grid_sampler_image_compute.cc b/lite/kernels/opencl/grid_sampler_image_compute.cc
index 243737a81331a7159834d30ccfb2fab181baeebe..8edfff8a5952680ae559e0bf78b798c0abc365f1 100644
--- a/lite/kernels/opencl/grid_sampler_image_compute.cc
+++ b/lite/kernels/opencl/grid_sampler_image_compute.cc
@@ -39,95 +39,116 @@ class GridSamplerImageCompute : public KernelLite();
-
     auto& context = ctx_->As();
     context.cl_context()->AddKernel(
         kernel_func_name_, "image/grid_sampler_kernel.cl", build_options_);
-    VLOG(4) << "kernel_func_name_:" << kernel_func_name_;
+    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_;
+    kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+    VLOG(4) << "kernel_key: " << kernel_key.str();
   }
 
-  void Run() override {
-    auto& context = ctx_->As();
-    CHECK(context.cl_context() != nullptr);
+  void ReInitWhenNeeded() override {
+    grid_param_ = param_.get_mutable();
+    auto x_dims = grid_param_->x->dims();
+    if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
+        first_epoch_for_reinit_) {
+      last_x_dims_ = x_dims;
+      first_epoch_for_reinit_ = false;
+
+      // compute image shape
+      paddle::lite::CLImageConverterDefault default_convertor;
+      out_img_shape_ =
+          default_convertor.InitImageDimInfoWith(grid_param_->out->dims());
+
+      // compute global work size
+      GetGlobalWorkSize();
+    }
+  }
 
+  void GetGlobalWorkSize() {
+    auto default_work_size =
+        DefaultWorkSize(grid_param_->out->dims(),
+                        DDim(std::vector{
+                            static_cast(out_img_shape_[0]),
+                            static_cast(out_img_shape_[1])}));
+    global_work_size_ =
+        cl::NDRange{static_cast(default_work_size[0]),
+                    static_cast(default_work_size[1]),
+                    static_cast(default_work_size[2] / 4)};
+#ifndef LITE_SHUTDOWN_LOG
+    VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
+            << default_work_size[1] << ", " << default_work_size[2];
+    VLOG(4) << "global_work_size_:[2D]:" << global_work_size_[0] << " "
+            << global_work_size_[1] << " " << global_work_size_[2];
+#endif
+  }
+
+  void Run() override {
     auto* x = grid_param_->x;
-    auto* out = grid_param_->out;
     auto* grid = grid_param_->grid;
+    auto* out = grid_param_->out;
+
     auto out_dims = out->dims();
-    auto in_dims = x->dims();
+    int out_height = out_dims[2];
+    int out_width = out_dims[3];
+
+    auto* x_img = x->data();
+    auto* grid_img = grid->data();
+    auto* out_img = out->mutable_data(out_img_shape_[0],
+                                      out_img_shape_[1]);
 
 #ifndef LITE_SHUTDOWN_LOG
+    auto in_dims = x->dims();
     VLOG(4) << "x->target():" << TargetToStr(x->target());
     VLOG(4) << "out->target():" << TargetToStr(out->target());
     VLOG(4) << "x->dims():" << in_dims;
     VLOG(4) << "out->dims():" << out_dims;
-#endif
-
-    auto out_image_shape = InitImageDimInfoWith(out_dims);
-    auto* x_img = x->data();
     // VLOG(4) << "x_image: " << x_img;
-
-    auto* grid_img = x->data();
     // VLOG(4) << "grid_img: " << grid_img;
-
-    auto* out_img = out->mutable_data(
-        out_image_shape["width"], out_image_shape["height"]);
-#ifndef LITE_SHUTDOWN_LOG
     // VLOG(4) << "out_image" << out_img;
-    VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " "
-            << out_image_shape["height"];
+    VLOG(4) << "out_img_shape_[w,h]:" << out_img_shape_[0] << " "
+            << out_img_shape_[1];
 #endif
 
-    STL::stringstream kernel_key;
-    kernel_key << kernel_func_name_ << build_options_;
-    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
-    int arg_idx = 0;
-    int out_height = out_dims[2];
-    int out_width = out_dims[3];
-    auto default_work_size =
-        DefaultWorkSize(out_dims,
-                        DDim(std::vector{
-                            static_cast(out_image_shape["width"]),
-                            static_cast(out_image_shape["height"])}));
-#ifndef LITE_SHUTDOWN_LOG
-    VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
-            << default_work_size[1] << ", " << default_work_size[2];
-#endif
-    cl_int status = kernel.setArg(arg_idx++, *x_img);
+    cl_int status;
+    auto kernel = kernel_;
+    status = kernel.setArg(0, *x_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(arg_idx++, *grid_img);
+    status = kernel.setArg(1, *grid_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(arg_idx++, *out_img);
+    status = kernel.setArg(2, *out_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(arg_idx++, out_height);
+    status = kernel.setArg(3, out_height);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(arg_idx++, out_width);
+    status = kernel.setArg(4, out_width);
     CL_CHECK_FATAL(status);
 
-    auto global_work_size =
-        cl::NDRange{static_cast(default_work_size[0]),
-                    static_cast(default_work_size[1]),
-                    static_cast(default_work_size[2] / 4)};
-
+    auto& context = ctx_->As();
+    CHECK(context.cl_context() != nullptr);
     status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
         kernel,
         cl::NullRange,
-        global_work_size,
+        global_work_size_,
         cl::NullRange,
         nullptr,
         event_.get());
     CL_CHECK_FATAL(status);
     context.cl_wait_list()->emplace(out_img, event_);
-#ifndef LITE_SHUTDOWN_LOG
-    VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
-            << global_work_size[1] << " " << global_work_size[2];
-#endif
   }
 
  protected:
   param_t* grid_param_{nullptr};
+  bool first_epoch_for_reinit_{true};
+  DDim last_x_dims_;
+  DDim out_img_shape_ = DDim(std::vector(
+      {static_cast(1), static_cast(1)}));
   std::string kernel_func_name_{"grid_sampler"};
+  cl::Kernel kernel_;
+  cl::NDRange global_work_size_ = cl::NDRange{
+      static_cast(1), static_cast(1), static_cast(1)};
   std::string build_options_{"-DCL_DTYPE_half"};
   std::shared_ptr event_{new cl::Event};
 };
diff --git a/lite/kernels/opencl/scale_image_compute.cc b/lite/kernels/opencl/scale_image_compute.cc
index 5fd9a2b46b5ce3b0ad84449785f510d5f0391250..3535b4f8030ec8681320d85333c3d5b0cc7d4805 100644
--- a/lite/kernels/opencl/scale_image_compute.cc
+++ b/lite/kernels/opencl/scale_image_compute.cc
@@ -37,53 +37,64 @@ class ScaleComputeImage2D : public KernelLiteAs();
-    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
     context.cl_context()->AddKernel(
         kernel_func_name_, "image/scale_kernel.cl", build_options_);
+    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_;
+    kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+  }
+
+  void ReInitWhenNeeded() override {
+    scale_param_ = param_.get_mutable();
+    auto x_dims = scale_param_->x->dims();
+    if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
+        first_epoch_for_reinit_) {
+      last_x_dims_ = x_dims;
+      first_epoch_for_reinit_ = false;
+
+      // compute image shape
+      paddle::lite::CLImageConverterDefault default_convertor;
+      out_img_shape_ =
+          default_convertor.InitImageDimInfoWith(scale_param_->output->dims());
+
+      // compute global work size
+      GetGlobalWorkSize();
+    }
+  }
+
+  void GetGlobalWorkSize() {
+    global_work_size_ =
+        cl::NDRange{static_cast(out_img_shape_[0]),
+                    static_cast(out_img_shape_[1])};
   }
 
   void Run() override {
-    const auto& param = *param_.get_mutable();
-    const auto& in_dims = param.x->dims();
-    auto* x_img = param.x->data();
-    const float scale = param.scale;
-    const float bias = param.bias;
-
-    // LOG(INFO) << "x_image" << x_img;
-    auto out_image_shape = InitImageDimInfoWith(in_dims);
-#ifndef LITE_SHUTDOWN_LOG
-    VLOG(4) << "out_image_shape = " << out_image_shape["width"] << " "
-            << out_image_shape["height"];
-#endif
-    auto* out_img = param.output->mutable_data(
-        out_image_shape["width"], out_image_shape["height"]);
-    // LOG(INFO) << "out_image" << out_img;
+    auto* x_img = scale_param_->x->data();
+    auto* out_img = scale_param_->output->mutable_data(
+        out_img_shape_[0], out_img_shape_[1]);
+    const float scale = scale_param_->scale;
+    const float bias = scale_param_->bias;
 
     auto& context = ctx_->As();
     CHECK(context.cl_context() != nullptr);
-    STL::stringstream kernel_key;
-    kernel_key << kernel_func_name_ << build_options_;
-    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
-
-    auto global_work_size =
-        cl::NDRange{static_cast(out_image_shape["width"]),
-                    static_cast(out_image_shape["height"])};
+    auto kernel = kernel_;
 
     cl_int status;
-    int arg_idx = 0;
-    status = kernel.setArg(arg_idx, *x_img);
+    status = kernel.setArg(0, *x_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *out_img);
+    status = kernel.setArg(1, *out_img);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, scale);
+    status = kernel.setArg(2, scale);
     CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, bias);
+    status = kernel.setArg(3, bias);
     CL_CHECK_FATAL(status);
 
     status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
         kernel,
         cl::NullRange,
-        global_work_size,
+        global_work_size_,
         cl::NullRange,
         nullptr,
         event_.get());
@@ -95,6 +106,15 @@ class ScaleComputeImage2D : public KernelLite event_{new cl::Event};
+
+  param_t* scale_param_{nullptr};
+  cl::Kernel kernel_;
+  bool first_epoch_for_reinit_{true};
+  DDim last_x_dims_;
+  DDim out_img_shape_ = DDim(std::vector(
+      {static_cast(1), static_cast(1)}));
+  cl::NDRange global_work_size_ = cl::NDRange{
+      static_cast(1), static_cast(1), static_cast(1)};
 };
 
 }  // namespace opencl