[LITE][OPENCL] Improve kernel::Run/PrepareForRun of opencl kernel (#3302)

* [LITE][OPENCL] Improve performance and support variable-length input for fc_buffer, elementwise_add, scale, activation, and grid_sampler. test=develop

[LITE][OPENCL] Improve kernel::Run/PrepareForRun of opencl kernel (#3302)
* [LITE][OPENCL] Improve performance and support variable-length input for fc_buffer, elementwise_add, scale, activation, and grid_sampler. test=develop
44d98be8 · Yuan Shuai · GitHub · 942bc409 · 44d98be8 · 44d98be8
6 changed files
--- a/lite/kernels/opencl/activation_image_compute.cc
+++ b/lite/kernels/opencl/activation_image_compute.cc
@@ -37,11 +37,12 @@ class ActivationComputeImageDefault
  }
  void PrepareForRun() override {
-    auto& context = ctx_->As<OpenCLContext>();
    act_param_ = param_.get_mutable<param_t>();
    int act_type = static_cast<int>(act_param_->active_type);
+#ifndef LITE_SHUTDOWN_LOG
    VLOG(1) << "ActivationTypeToStr(act_param_->active_type):"
            << ActivationTypeToStr(act_param_->active_type);
+#endif
    switch (act_type) {
      case 1:
        kernel_func_name_ = "relu";
@@ -71,41 +72,68 @@ class ActivationComputeImageDefault
        LOG(FATAL) << "This act type:" << act_type << " doesn't support.";
        return;
    }
+#ifndef LITE_SHUTDOWN_LOG
    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+#endif
+    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/activation_kernel.cl", build_options_);
-  }
-  void Run() override {
-    auto& param = *param_.get_mutable<param_t>();
-    const auto& x_dims = param.X->dims();
-    auto* x_img = param.X->data<half_t, cl::Image2D>();
-    auto image_shape = InitImageDimInfoWith(x_dims);
-    auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>(
-        image_shape["width"], image_shape["height"]);
-    const auto& y_dims = param.Out->dims();  // useless: check dim only
-    auto& context = ctx_->As<OpenCLContext>();
-    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
-    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+    kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+  }
-    int arg_idx = 0;
+  void ReInitWhenNeeded() override {
-    cl_int status = kernel.setArg(arg_idx, *x_img);
+    act_param_ = param_.get_mutable<param_t>();
+    auto x_dims = act_param_->X->dims();
+    if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
+        first_epoch_for_reinit_) {
+      last_x_dims_ = x_dims;
+      first_epoch_for_reinit_ = false;
+      // compute image shape
+      paddle::lite::CLImageConverterDefault default_convertor;
+      x_img_shape_ = default_convertor.InitImageDimInfoWith(
+          act_param_->X->dims());  // w, h
+      out_img_shape_ = default_convertor.InitImageDimInfoWith(
+          act_param_->Out->dims());  // w, h
+      // compute global work size
+      GetGlobalWorkSize();
+    }
+  }
+  void GetGlobalWorkSize() {
+    global_work_size_ =
+        cl::NDRange{static_cast<cl::size_type>(x_img_shape_[0]),
+                    static_cast<cl::size_type>(x_img_shape_[1])};
+  }
+  void Run() override {
+    auto* x_img = act_param_->X->data<half_t, cl::Image2D>();
+    auto* out_img = act_param_->Out->mutable_data<half_t, cl::Image2D>(
+        out_img_shape_[0], out_img_shape_[1]);
+    auto kernel = kernel_;
+    cl_int status;
+    status = kernel.setArg(0, *x_img);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *out_img);
+    status = kernel.setArg(1, *out_img);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, threshold_);
+    status = kernel.setArg(2, threshold_);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, scale_);
+    status = kernel.setArg(3, scale_);
    CL_CHECK_FATAL(status);
 #ifndef LITE_SHUTDOWN_LOG
-    VLOG(4) << TargetToStr(param.X->target());
+    const auto& x_dims = act_param_->X->dims();
-    VLOG(4) << TargetToStr(param.Out->target());
+    const auto& y_dims = act_param_->Out->dims();  // useless: check dim only
-    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
+    VLOG(4) << TargetToStr(act_param_->X->target());
-            << image_shape["height"];
+    VLOG(4) << TargetToStr(act_param_->Out->target());
+    VLOG(4) << "x_img_shape_(w,h):" << x_img_shape_[0] << " "
+            << x_img_shape_[1];
    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
@@ -115,13 +143,12 @@ class ActivationComputeImageDefault
    VLOG(4) << "kernel func name:" << kernel_func_name_;
 #endif
-    auto global_work_size =
+    auto& context = ctx_->As<OpenCLContext>();
-        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
+    CHECK(context.cl_context() != nullptr);
-                    static_cast<cl::size_type>(image_shape["height"])};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
-        global_work_size,
+        global_work_size_,
        cl::NullRange,
        nullptr,
        event_.get());
@@ -131,9 +158,18 @@ class ActivationComputeImageDefault
 private:
  param_t* act_param_{nullptr};
+  DDim x_img_shape_ = DDim(std::vector<DDim::value_type>(
+      {static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
+  DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
+      {static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
+  DDim last_x_dims_;
  std::string kernel_func_name_{};
  float threshold_{6.f};
  float scale_{1.f};
+  cl::Kernel kernel_;
+  bool first_epoch_for_reinit_{true};
+  cl::NDRange global_work_size_ = cl::NDRange{
+      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
 };

--- a/lite/kernels/opencl/elementwise_add_image_compute.cc
+++ b/lite/kernels/opencl/elementwise_add_image_compute.cc
@@ -23,44 +23,82 @@ namespace lite {
 namespace kernels {
 namespace opencl {
-void ElementwiseAddImageCompute::PrepareForRun() {
+void ElementwiseAddImageCompute::PrepareForRun() {}
-  ele_param_ = param_.get_mutable<param_t>();
-  auto* x = ele_param_->X;
-  auto* y = ele_param_->Y;
-  auto axis = ele_param_->axis;
-  if (y->dims().size() == 4) {
+void ElementwiseAddImageCompute::ReInitWhenNeeded() {
-    kernel_func_name_ = "elementwise_add";  // y: ImageDefault
+  ele_param_ = param_.get_mutable<param_t>();
-  } else if (y->dims().size() == 1) {
+  auto x_dims = ele_param_->X->dims();
-    if (axis == x->dims().size() - 1) {
+  if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
-      kernel_func_name_ = "width_add";  // y: ImageDefault
+      first_epoch_for_reinit_) {
-    } else if (axis == x->dims().size() - 3) {
+    last_x_dims_ = x_dims;
-      kernel_func_name_ = "channel_add";  // y: ImageFolder
+    first_epoch_for_reinit_ = false;
+    // choose kernel
+    auto* x = ele_param_->X;
+    auto* y = ele_param_->Y;
+    auto* out = ele_param_->Out;
+    auto axis = ele_param_->axis;
+    if (y->dims().size() == 4) {
+      kernel_func_name_ = "elementwise_add";  // y: ImageDefault
+    } else if (y->dims().size() == 1) {
+      if (axis == x->dims().size() - 1) {
+        kernel_func_name_ = "width_add";  // y: ImageDefault
+      } else if (axis == x->dims().size() - 3) {
+        kernel_func_name_ = "channel_add";  // y: ImageFolder
+      } else {
+        LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
+                   << ", x->dims().size():" << x->dims().size()
+                   << ", y->dims.size():" << y->dims().size();
+      }
    } else {
      LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
                 << ", x->dims().size():" << x->dims().size()
                 << ", y->dims.size():" << y->dims().size();
    }
-  } else {
+    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
-    LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
-               << ", x->dims().size():" << x->dims().size()
+    auto& context = ctx_->As<OpenCLContext>();
-               << ", y->dims.size():" << y->dims().size();
+    context.cl_context()->AddKernel(
+        kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_);
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_;
+    kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+    // compute image shape
+    paddle::lite::CLImageConverterDefault default_convertor;
+    x_img_shape_ = default_convertor.InitImageDimInfoWith(x->dims());  // w, h
+    y_img_shape_ = default_convertor.InitImageDimInfoWith(y->dims());
+    out_img_shape_ =
+        default_convertor.InitImageDimInfoWith(out->dims());  // w, h
+    // compute global work size
+    GetGlobalWorkSize();
  }
-  VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+}
-  auto& context = ctx_->As<OpenCLContext>();
+void ElementwiseAddImageCompute::GetGlobalWorkSize() {
-  context.cl_context()->AddKernel(
+  global_work_size_ = cl::NDRange{static_cast<cl::size_type>(x_img_shape_[0]),
-      kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_);
+                                  static_cast<cl::size_type>(x_img_shape_[1])};
+#ifndef LITE_SHUTDOWN_LOG
+  VLOG(4) << "global_work_size:[2D]:" << x_img_shape_[0] << " "
+          << x_img_shape_[1];
+#endif
 }
 void ElementwiseAddImageCompute::Run() {
-  auto& context = ctx_->As<OpenCLContext>();
-  CHECK(context.cl_context() != nullptr);
  auto* x = ele_param_->X;
  auto* y = ele_param_->Y;
  auto* out = ele_param_->Out;
  auto axis = ele_param_->axis;
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+  auto* x_img = x->data<half_t, cl::Image2D>();
+  auto* y_img = y->data<half_t, cl::Image2D>();
+  auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape_[0],
+                                                         out_img_shape_[1]);
 #ifndef LITE_SHUTDOWN_LOG
  VLOG(4) << "x->target():" << TargetToStr(x->target());
@@ -70,75 +108,53 @@ void ElementwiseAddImageCompute::Run() {
  VLOG(4) << "y->dims():" << y->dims();
  VLOG(4) << "out->dims():" << out->dims();
  VLOG(4) << "axis:" << axis;
-#endif
-  paddle::lite::CLImageConverterDefault default_convertor;
-  auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims());  // w, h
-  auto x_img_width = x_img_shape[0];
-  auto x_img_height = x_img_shape[1];
-  auto out_img_shape =
-      default_convertor.InitImageDimInfoWith(out->dims());  // w, h
-  auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims());
-  auto* x_img = x->data<half_t, cl::Image2D>();
-  auto* y_img = y->data<half_t, cl::Image2D>();
-  auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape[0],
-                                                         out_img_shape[1]);
-#ifndef LITE_SHUTDOWN_LOG
+  VLOG(4) << "x_img_shape_[w,h]:" << x_img_shape_[0] << " " << x_img_shape_[1];
-  VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
+  VLOG(4) << "y_img_shape_[w,h]:" << y_img_shape_[0] << " " << y_img_shape_[1];
-  VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
+  VLOG(4) << "out_img_shape_[w,h]:" << out_img_shape_[0] << " "
-  VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
+          << out_img_shape_[1];
-          << out_img_shape[1];
 #endif
-  STL::stringstream kernel_key;
+  cl_int status;
-  kernel_key << kernel_func_name_ << build_options_;
+  auto kernel = kernel_;
-  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
-  int arg_idx = 0;
-  auto y_dims = y->dims();
  if (y_dims.size() == 4) {
-    cl_int status = kernel.setArg(arg_idx, *x_img);
+    status = kernel.setArg(0, *x_img);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *y_img);
+    status = kernel.setArg(1, *y_img);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *out_img);
+    status = kernel.setArg(2, *out_img);
    CL_CHECK_FATAL(status);
  } else if (y_dims.size() == 1) {
-    if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) {
+    if (axis == x_dims.size() - 1 || axis == x_dims.size() - 3) {
-      int tensor_w = x->dims()[x->dims().size() - 1];
+      const int tensor_w = x_dims[x_dims.size() - 1];
 #ifndef LITE_SHUTDOWN_LOG
      VLOG(4) << "tensor_w:" << tensor_w;
 #endif
-      cl_int status = kernel.setArg(arg_idx, *x_img);
+      status = kernel.setArg(0, *x_img);
      CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, *y_img);
+      status = kernel.setArg(1, *y_img);
      CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, *out_img);
+      status = kernel.setArg(2, *out_img);
      CL_CHECK_FATAL(status);
-      status = kernel.setArg(++arg_idx, static_cast<const int>(tensor_w));
+      status = kernel.setArg(3, tensor_w);
      CL_CHECK_FATAL(status);
    } else {
      LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
-                 << ", x->dims().size():" << x->dims().size()
+                 << ", x->dims().size():" << x_dims.size()
-                 << ", y->dims.size():" << y->dims().size();
+                 << ", y->dims.size():" << y_dims.size();
    }
  } else {
    LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
-               << ", x->dims().size():" << x->dims().size()
+               << ", x->dims().size():" << x_dims.size()
-               << ", y->dims.size():" << y->dims().size();
+               << ", y->dims.size():" << y_dims.size();
  }
-  auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width),
+  auto& context = ctx_->As<OpenCLContext>();
-                                      static_cast<cl::size_type>(x_img_height)};
+  CHECK(context.cl_context() != nullptr);
-#ifndef LITE_SHUTDOWN_LOG
+  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-  VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
-#endif
-  auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
-      global_work_size,
+      global_work_size_,
      cl::NullRange,
      nullptr,
      event_.get());

--- a/lite/kernels/opencl/elementwise_add_image_compute.h
+++ b/lite/kernels/opencl/elementwise_add_image_compute.h
@@ -15,6 +15,7 @@
 #include <memory>
 #include <string>
+#include <vector>
 #include "lite/backends/opencl/cl_half.h"
 #include "lite/core/kernel.h"
 #include "lite/operators/op_params.h"
@@ -34,6 +35,10 @@ class ElementwiseAddImageCompute
  void PrepareForRun() override;
+  void ReInitWhenNeeded() override;
+  void GetGlobalWorkSize();
  void Run() override;
  std::string doc() const override {
@@ -42,8 +47,19 @@ class ElementwiseAddImageCompute
 protected:
  param_t* ele_param_{nullptr};
+  DDim last_x_dims_;
+  DDim x_img_shape_ = DDim(std::vector<DDim::value_type>(
+      {static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
+  DDim y_img_shape_ = DDim(std::vector<DDim::value_type>(
+      {static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
+  DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
+      {static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
  std::string kernel_func_name_{"elementwise_add"};
  std::string build_options_{"-DCL_DTYPE_half"};
+  bool first_epoch_for_reinit_{true};
+  cl::Kernel kernel_;
+  cl::NDRange global_work_size_ = cl::NDRange{
+      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
  std::shared_ptr<cl::Event> event_{new cl::Event};
 };

--- a/lite/kernels/opencl/fc_buffer_compute.cc
+++ b/lite/kernels/opencl/fc_buffer_compute.cc
@@ -30,74 +30,96 @@ class FcCompute
 public:
  using param_t = operators::FcParam;
-  void PrepareForRun() override {
+  void PrepareForRun() override {}
-    const auto& param = *param_.get_mutable<param_t>();
-    const auto x_dims = param.input->dims();
-    const auto w_dims = param.w->dims();
-    CHECK_GE(x_dims.size(), 2UL);
-    CHECK_GE(w_dims.size(), 2UL);
-    CHECK_EQ(param.output->dims().size(), 2UL);
-    m_ = x_dims.Slice(0, param.in_num_col_dims).production();
-    k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
-    n_ = w_dims[1];
-    CHECK_EQ(k_, static_cast<int>(w_dims[0]));
-    VLOG(4) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2]
-            << " " << x_dims[3];
-    VLOG(4) << "w_dims:" << w_dims[0] << " " << w_dims[1] << " " << w_dims[2]
-            << " " << w_dims[3];
-    VLOG(4) << "m_: " << m_ << " n_: " << n_ << " k_: " << k_;
+  void ReInitWhenNeeded() override {
+    fc_param_ = param_.get_mutable<param_t>();
+    const auto x_dims = fc_param_->input->dims();
+    if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
+        first_epoch_for_reinit_) {
+      last_x_dims_ = x_dims;
+      first_epoch_for_reinit_ = false;
+      // compute m,n,k
+      const auto w_dims = fc_param_->w->dims();
+      CHECK_GE(x_dims.size(), 2UL);
+      CHECK_GE(w_dims.size(), 2UL);
+      CHECK_EQ(fc_param_->output->dims().size(), 2UL);
+      m_ = x_dims.Slice(0, fc_param_->in_num_col_dims).production();
+      k_ = x_dims.Slice(fc_param_->in_num_col_dims, x_dims.size()).production();
+      n_ = w_dims[1];
+      CHECK_EQ(k_, static_cast<int>(w_dims[0]));
+#ifndef LITE_SHUTDOWN_LOG
+      VLOG(4) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2]
+              << " " << x_dims[3];
+      VLOG(4) << "w_dims:" << w_dims[0] << " " << w_dims[1] << " " << w_dims[2]
+              << " " << w_dims[3];
+      VLOG(4) << "m_: " << m_ << " n_: " << n_ << " k_: " << k_;
+#endif
+      // choose kernel
+      if (m_ == 1) {  // gemv
+        kernel_func_name_ = "fc_gemv_1x4";
+      } else {  // gemm
+        kernel_func_name_ = "fc_gemm_4x4";
+      }
+#ifndef LITE_SHUTDOWN_LOG
+      VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+#endif
+      if (fc_param_->activation_type == "relu") {
+        build_options_ += "-DRELU";
+      }
+      auto& context = ctx_->As<OpenCLContext>();
+      context.cl_context()->AddKernel(
+          kernel_func_name_, "buffer/fc_kernel.cl", build_options_);
+      STL::stringstream kernel_key;
+      kernel_key << kernel_func_name_ << build_options_;
+      kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+      // compute global work size
+      GetGlobalWorkSize();
+    }
+  }
+  void GetGlobalWorkSize() {
    if (m_ == 1) {  // gemv
-      kernel_func_name_ = "fc_gemv_1x4";
      global_work_size_ = cl::NDRange{static_cast<size_t>((n_ + 3) / 4)};
    } else {  // gemm
-      kernel_func_name_ = "fc_gemm_4x4";
      global_work_size_ = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
                                      static_cast<size_t>((n_ + 3) / 4)};
    }
-    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
-    if (param.activation_type == "relu") {
-      build_options_ += "-DRELU";
-    }
-    auto& context = ctx_->As<OpenCLContext>();
-    context.cl_context()->AddKernel(
-        kernel_func_name_, "buffer/fc_kernel.cl", build_options_);
  }
  void Run() override {
-    const auto& param = *param_.get_mutable<param_t>();
+    auto* x_buf = fc_param_->input->data<float, cl::Buffer>();
-    auto& context = ctx_->As<OpenCLContext>();
+    auto* w_buf = fc_param_->w->data<float, cl::Buffer>();
-    CHECK(context.cl_context() != nullptr);
+    auto* bias_buf = fc_param_->bias->data<float, cl::Buffer>();
-    auto* x_buf = param.input->data<float, cl::Buffer>();
-    auto* w_buf = param.w->data<float, cl::Buffer>();
-    auto* bias_buf = param.bias->data<float, cl::Buffer>();
    auto* out_buf =
-        param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
+        fc_param_->output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
-    STL::stringstream kernel_key;
-    kernel_key << kernel_func_name_ << build_options_;
-    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+    auto kernel = kernel_;
    cl_int status;
-    int arg_idx = 0;
+    status = kernel.setArg(0, *x_buf);
-    status = kernel.setArg(arg_idx, *x_buf);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *w_buf);
+    status = kernel.setArg(1, *w_buf);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *bias_buf);
+    status = kernel.setArg(2, *bias_buf);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *out_buf);
+    status = kernel.setArg(3, *out_buf);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(m_));
+    status = kernel.setArg(4, static_cast<const int>(m_));
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(n_));
+    status = kernel.setArg(5, static_cast<const int>(n_));
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(k_));
+    status = kernel.setArg(6, static_cast<const int>(k_));
    CL_CHECK_FATAL(status);
+    auto& context = ctx_->As<OpenCLContext>();
+    CHECK(context.cl_context() != nullptr);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
@@ -111,9 +133,13 @@ class FcCompute
 private:
  int m_, n_, k_;
+  param_t* fc_param_{nullptr};
  std::string kernel_func_name_{};
  std::string build_options_{"-DCL_DTYPE_float "};
+  bool first_epoch_for_reinit_{true};
+  DDim last_x_dims_;
  cl::NDRange global_work_size_;
+  cl::Kernel kernel_;
  std::shared_ptr<cl::Event> event_{new cl::Event};
 };

--- a/lite/kernels/opencl/grid_sampler_image_compute.cc
+++ b/lite/kernels/opencl/grid_sampler_image_compute.cc
@@ -39,95 +39,116 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
  }
  void PrepareForRun() override {
-    grid_param_ = param_.get_mutable<param_t>();
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/grid_sampler_kernel.cl", build_options_);
-    VLOG(4) << "kernel_func_name_:" << kernel_func_name_;
+    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_;
+    kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+    VLOG(4) << "kernel_key: " << kernel_key.str();
  }
-  void Run() override {
+  void ReInitWhenNeeded() override {
-    auto& context = ctx_->As<OpenCLContext>();
+    grid_param_ = param_.get_mutable<param_t>();
-    CHECK(context.cl_context() != nullptr);
+    auto x_dims = grid_param_->x->dims();
+    if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
+        first_epoch_for_reinit_) {
+      last_x_dims_ = x_dims;
+      first_epoch_for_reinit_ = false;
+      // compute image shape
+      paddle::lite::CLImageConverterDefault default_convertor;
+      out_img_shape_ =
+          default_convertor.InitImageDimInfoWith(grid_param_->out->dims());
+      // compute global work size
+      GetGlobalWorkSize();
+    }
+  }
+  void GetGlobalWorkSize() {
+    auto default_work_size =
+        DefaultWorkSize(grid_param_->out->dims(),
+                        DDim(std::vector<DDim::value_type>{
+                            static_cast<int64_t>(out_img_shape_[0]),
+                            static_cast<int64_t>(out_img_shape_[1])}));
+    global_work_size_ =
+        cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
+                    static_cast<cl::size_type>(default_work_size[1]),
+                    static_cast<cl::size_type>(default_work_size[2] / 4)};
+#ifndef LITE_SHUTDOWN_LOG
+    VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
+            << default_work_size[1] << ", " << default_work_size[2];
+    VLOG(4) << "global_work_size_:[2D]:" << global_work_size_[0] << " "
+            << global_work_size_[1] << " " << global_work_size_[2];
+#endif
+  }
+  void Run() override {
    auto* x = grid_param_->x;
-    auto* out = grid_param_->out;
    auto* grid = grid_param_->grid;
+    auto* out = grid_param_->out;
    auto out_dims = out->dims();
-    auto in_dims = x->dims();
+    int out_height = out_dims[2];
+    int out_width = out_dims[3];
+    auto* x_img = x->data<half_t, cl::Image2D>();
+    auto* grid_img = x->data<half_t, cl::Image2D>();
+    auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape_[0],
+                                                           out_img_shape_[1]);
 #ifndef LITE_SHUTDOWN_LOG
+    auto in_dims = x->dims();
    VLOG(4) << "x->target():" << TargetToStr(x->target());
    VLOG(4) << "out->target():" << TargetToStr(out->target());
    VLOG(4) << "x->dims():" << in_dims;
    VLOG(4) << "out->dims():" << out_dims;
-#endif
-    auto out_image_shape = InitImageDimInfoWith(out_dims);
-    auto* x_img = x->data<half_t, cl::Image2D>();
    // VLOG(4) << "x_image: " << x_img;
-    auto* grid_img = x->data<half_t, cl::Image2D>();
    // VLOG(4) << "grid_img: " << grid_img;
-    auto* out_img = out->mutable_data<half_t, cl::Image2D>(
-        out_image_shape["width"], out_image_shape["height"]);
-#ifndef LITE_SHUTDOWN_LOG
    // VLOG(4) << "out_image" << out_img;
-    VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " "
+    VLOG(4) << "out_img_shape_[w,h]:" << out_img_shape_[0] << " "
-            << out_image_shape["height"];
+            << out_img_shape_[1];
 #endif
-    STL::stringstream kernel_key;
-    kernel_key << kernel_func_name_ << build_options_;
-    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
-    int arg_idx = 0;
+    cl_int status;
-    int out_height = out_dims[2];
+    auto kernel = kernel_;
-    int out_width = out_dims[3];
+    status = kernel.setArg(0, *x_img);
-    auto default_work_size =
-        DefaultWorkSize(out_dims,
-                        DDim(std::vector<DDim::value_type>{
-                            static_cast<int64_t>(out_image_shape["width"]),
-                            static_cast<int64_t>(out_image_shape["height"])}));
-#ifndef LITE_SHUTDOWN_LOG
-    VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
-            << default_work_size[1] << ", " << default_work_size[2];
-#endif
-    cl_int status = kernel.setArg(arg_idx++, *x_img);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(arg_idx++, *grid_img);
+    status = kernel.setArg(1, *grid_img);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(arg_idx++, *out_img);
+    status = kernel.setArg(2, *out_img);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(arg_idx++, out_height);
+    status = kernel.setArg(3, out_height);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(arg_idx++, out_width);
+    status = kernel.setArg(4, out_width);
    CL_CHECK_FATAL(status);
-    auto global_work_size =
+    auto& context = ctx_->As<OpenCLContext>();
-        cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
+    CHECK(context.cl_context() != nullptr);
-                    static_cast<cl::size_type>(default_work_size[1]),
-                    static_cast<cl::size_type>(default_work_size[2] / 4)};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
-        global_work_size,
+        global_work_size_,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    context.cl_wait_list()->emplace(out_img, event_);
-#ifndef LITE_SHUTDOWN_LOG
-    VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
-            << global_work_size[1] << " " << global_work_size[2];
-#endif
  }
 protected:
  param_t* grid_param_{nullptr};
+  bool first_epoch_for_reinit_{true};
+  DDim last_x_dims_;
+  DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
+      {static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
  std::string kernel_func_name_{"grid_sampler"};
+  cl::Kernel kernel_;
+  cl::NDRange global_work_size_ = cl::NDRange{
+      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
 };

--- a/lite/kernels/opencl/scale_image_compute.cc
+++ b/lite/kernels/opencl/scale_image_compute.cc
@@ -37,53 +37,64 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
-    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/scale_kernel.cl", build_options_);
+    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_;
+    kernel_ = context.cl_context()->GetKernel(kernel_key.str());
+  }
+  void ReInitWhenNeeded() override {
+    scale_param_ = param_.get_mutable<param_t>();
+    auto x_dims = scale_param_->x->dims();
+    if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
+        first_epoch_for_reinit_) {
+      last_x_dims_ = x_dims;
+      first_epoch_for_reinit_ = false;
+      // compute image shape
+      paddle::lite::CLImageConverterDefault default_convertor;
+      out_img_shape_ =
+          default_convertor.InitImageDimInfoWith(scale_param_->output->dims());
+      // compute global work size
+      GetGlobalWorkSize();
+    }
+  }
+  void GetGlobalWorkSize() {
+    global_work_size_ =
+        cl::NDRange{static_cast<cl::size_type>(out_img_shape_[0]),
+                    static_cast<cl::size_type>(out_img_shape_[1])};
  }
  void Run() override {
-    const auto& param = *param_.get_mutable<param_t>();
+    auto* x_img = scale_param_->x->data<half_t, cl::Image2D>();
-    const auto& in_dims = param.x->dims();
+    auto* out_img = scale_param_->output->mutable_data<half_t, cl::Image2D>(
-    auto* x_img = param.x->data<half_t, cl::Image2D>();
+        out_img_shape_[0], out_img_shape_[1]);
-    const float scale = param.scale;
+    const float scale = scale_param_->scale;
-    const float bias = param.bias;
+    const float bias = scale_param_->bias;
-    //    LOG(INFO) << "x_image" << x_img;
-    auto out_image_shape = InitImageDimInfoWith(in_dims);
-#ifndef LITE_SHUTDOWN_LOG
-    VLOG(4) << "out_image_shape = " << out_image_shape["width"] << " "
-            << out_image_shape["height"];
-#endif
-    auto* out_img = param.output->mutable_data<half_t, cl::Image2D>(
-        out_image_shape["width"], out_image_shape["height"]);
-    //    LOG(INFO) << "out_image" << out_img;
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
-    STL::stringstream kernel_key;
-    kernel_key << kernel_func_name_ << build_options_;
-    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
-    auto global_work_size =
-        cl::NDRange{static_cast<cl::size_type>(out_image_shape["width"]),
-                    static_cast<cl::size_type>(out_image_shape["height"])};
+    auto kernel = kernel_;
    cl_int status;
-    int arg_idx = 0;
+    status = kernel.setArg(0, *x_img);
-    status = kernel.setArg(arg_idx, *x_img);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *out_img);
+    status = kernel.setArg(1, *out_img);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, scale);
+    status = kernel.setArg(2, scale);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, bias);
+    status = kernel.setArg(3, bias);
    CL_CHECK_FATAL(status);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
-        global_work_size,
+        global_work_size_,
        cl::NullRange,
        nullptr,
        event_.get());
@@ -95,6 +106,15 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
  std::string kernel_func_name_{"scale"};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
+  param_t* scale_param_{nullptr};
+  cl::Kernel kernel_;
+  bool first_epoch_for_reinit_{true};
+  DDim last_x_dims_;
+  DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
+      {static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
+  cl::NDRange global_work_size_ = cl::NDRange{
+      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
 };
 }  // namespace opencl