Commit 8f5e912e authored by xiebaiyuan, committed by GitHub

[LITE][OPENCL][Image] mv kernel init gws lws into prepare for run (#3285)

* [LITE][OPENCL][Image] mv kernel init gws lws into prepare for run, test=develop

* [LITE][OPENCL][Image] shut down profile, test=develop

* [LITE][OPENCL][Image] move log ahead, test=develop
Parent e3cf724e
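
The diff below follows a prepare-once / run-many pattern: the kernel handle, the global work size (`c_blk_`, `w_blk_`, `nh_blk_`, `global_work_size_`) and the local work size are computed once in `PrepareForRun()`, so each `Run()` call only sets kernel arguments and enqueues. The following is a minimal sketch of that split using simplified stand-in types rather than the real Paddle-Lite/OpenCL API; `ConvComputeSketch`, `NDRange3` and the console "enqueue" are illustrative placeholders, not names from the patch.

```cpp
// Sketch only: illustrates moving work-size setup out of the per-inference hot path.
// Types and method names are placeholders, not the real Lite/OpenCL API.
#include <cstddef>
#include <iostream>

struct NDRange3 { size_t x{1}, y{1}, z{1}; };

class ConvComputeSketch {
 public:
  // Called once per (shape, kernel) configuration.
  void PrepareForRun(int out_c, int out_w, int out_n, int out_h) {
    // Default gws: one work-item per 4-channel block, per output column,
    // per (batch * row) slice -- mirrors DefaultWorkSize in the patch.
    c_blk_ = (out_c + 3) / 4;
    w_blk_ = out_w;
    nh_blk_ = out_n * out_h;
    global_work_size_ = {static_cast<size_t>(c_blk_),
                         static_cast<size_t>(w_blk_),
                         static_cast<size_t>(nh_blk_)};
    // A real implementation would also build the cl::Kernel here and query
    // CL_KERNEL_WORK_GROUP_SIZE to choose local_work_size_.
  }

  // Called on every inference; no work-size recomputation.
  void Run() const {
    std::cout << "enqueue gws = {" << global_work_size_.x << ", "
              << global_work_size_.y << ", " << global_work_size_.z << "}\n";
  }

 private:
  int c_blk_{1}, w_blk_{1}, nh_blk_{1};
  NDRange3 global_work_size_;
};

int main() {
  ConvComputeSketch conv;
  conv.PrepareForRun(/*out_c=*/32, /*out_w=*/56, /*out_n=*/1, /*out_h=*/56);
  conv.Run();  // reuses the precomputed sizes
  conv.Run();
}
```
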
......@@ -14,8 +14,8 @@
#include "lite/kernels/opencl/conv_image_compute.h"
#include <iomanip>
#include <sstream>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h"
......@@ -78,9 +78,27 @@ void ConvImageCompute::PrepareForRun() {
VLOG(3) << "dilation_equal:" << dilation_equal;
VLOG(3) << "padding :" << paddings[0] << " " << paddings[1] << " "
<< paddings[2] << " " << paddings[3];
CHECK(pad_equal && stride_equal && dilation_equal);
// general gws..
auto out_image_shape = InitImageDimInfoWith(output_dims);
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
default_c_blk_ = default_work_size[0];
default_w_blk_ = default_work_size[1];
default_nh_blk_ = default_work_size[2];
c_blk_ = default_c_blk_;
w_blk_ = default_w_blk_;
nh_blk_ = default_nh_blk_;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
if (kernel_h == 1 && kernel_w == 1) {
// conv2d_1x1
if (param.x->dims()[1] % 4 == 0) {
......@@ -99,6 +117,15 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d1x1opt;
{
// calc 1x1 gws
w_blk_ = maptofactor(default_w_blk_, 4);
c_blk_ = default_c_blk_;
nh_blk_ = default_nh_blk_;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
#define DEPTH_CONV_USE_SPL
#ifdef DEPTH_CONV_USE_SPL
} else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] &&
......@@ -107,9 +134,38 @@ void ConvImageCompute::PrepareForRun() {
if (stride_h == 1 && dilations[0] == 1) {
kernel_func_names_.push_back("depth_conv2d_3x3s1");
impl_ = &ConvImageCompute::DepthwiseConv2d3x3s1;
{
// depthwise spl gws s1
int c_block = (output_dims[1] + 3) / 4;
int w = output_dims[3];
int nh = output_dims[0] * output_dims[2];
int w_blk_size = 2;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
c_blk_ = c_block;
w_blk_ = w_blk;
nh_blk_ = nh;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
} else {
kernel_func_names_.push_back("depth_conv2d_3x3");
impl_ = &ConvImageCompute::DepthwiseConv2d3x3;
{
// depthwise spl gws
int c_block = (output_dims[1] + 3) / 4;
int w = output_dims[3];
int nh = output_dims[0] * output_dims[2];
c_blk_ = c_block;
w_blk_ = w;
nh_blk_ = nh;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
}
kernel_func_paths_.push_back("image/depthwise_conv2d_kernel.cl");
......@@ -157,6 +213,22 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d3x3opt;
{
int w_blk_size = 5;
int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size;
int h_blk_size = 1;
int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size;
c_blk_ = default_c_blk_;
w_blk_ = w_blk;
nh_blk_ = h_blk;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
} else if (kernel_h == 5 && kernel_w == 5) {
#define CONV_5x5_OPT
#ifndef CONV_5x5_OPT
......@@ -189,6 +261,21 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d5x5opt;
{
int w_blk_size = 5;
int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size;
int h_blk_size = 1;
int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size;
c_blk_ = default_c_blk_;
w_blk_ = w_blk;
nh_blk_ = h_blk;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
#endif
#undef CONV_5x5_OPT
} else if (kernel_h == 7 && kernel_w == 7) {
......@@ -223,6 +310,21 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d7x7opt;
{
int w_blk_size = 5;
int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size;
int h_blk_size = 1;
int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size;
c_blk_ = default_c_blk_;
w_blk_ = w_blk;
nh_blk_ = h_blk;
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
}
#endif
#undef CONV_7x7_OPT
......@@ -270,9 +372,36 @@ void ConvImageCompute::PrepareForRun() {
context.cl_context()->AddKernel(
kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]);
}
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
std::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
size_t max_work_group_size = 0;
kernel_.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
VLOG(4) << "max_work_group_size: " << max_work_group_size;
if (max_work_group_size > 0 && use_lws) {
// local_work_size_ = context.cl_context()->LocalWorkSizeConv1x1(
// global_work_size_, max_work_group_size);
local_work_size_ = context.cl_context()->LocalWorkSize(global_work_size_,
max_work_group_size);
VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << ","
<< local_work_size_[1] << "," << local_work_size_[2] << "}";
}
}
void ConvImageCompute::Conv2d1x1opt() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
......@@ -302,16 +431,28 @@ void ConvImageCompute::Conv2d1x1opt() {
int input_c = input_dims[1];
auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
// const std::vector<size_t>& default_work_size =
// DefaultWorkSize(output_dims,
// DDim(std::vector<DDim::value_type>{
// static_cast<int64_t>(out_image_shape["width"]),
// static_cast<int64_t>(out_image_shape["height"])}));
// int c_block = default_work_size[0];
// int w = default_work_size[1];
// int nh = default_work_size[2];
// int maped_w = maptofactor(w, 4);
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
// auto global_work_size_ =
// cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
// static_cast<size_t>(maped_w),
// static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d_1x1 params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
......@@ -331,9 +472,9 @@ void ConvImageCompute::Conv2d1x1opt() {
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
// VLOG(4) << "default work size{c_block, w, nh}: "
// << "{" << c_block << ", " << w << ", " << nh << ""
// << "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
......@@ -350,27 +491,14 @@ void ConvImageCompute::Conv2d1x1opt() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
std::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int maped_w = maptofactor(w, 4);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "maped_w: " << maped_w;
VLOG(4) << "hasbias: " << has_bias;
#endif
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, maped_w);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
......@@ -401,49 +529,87 @@ void ConvImageCompute::Conv2d1x1opt() {
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
status = kernel.setArg(++arg_idx, default_w_blk_);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(maped_w),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
local_work_size,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
#ifdef PROFILE_CONV_KERNEL
bool use_profile = false;
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
double start = GetCurrentUS();
if (use_profile) {
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
} else {
int count = 50;
double sumtime = 0;
if (!use_profile) {
count = 1;
}
for (size_t i = 0; i < count; i++) {
start = GetCurrentUS();
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
if (use_profile) {
event_->wait();
double duration = GetCurrentUS() - start;
sumtime += duration;
}
}
auto dims_string = [](DDimLite dims) -> std::string {
std::ostringstream stream;
stream << "[" << dims[0] << "," << dims[1] << "," << dims[2] << ","
<< dims[3] << "]";
return stream.str();
};
if (use_profile) {
// LOG(INFO) << "input: " << input_dims;
// LOG(INFO) << "filter: " << filter_dims;
// LOG(INFO) << "output: " << output_dims;
std::cout << std::setw(25) << std::left << dims_string(input_dims)
<< std::setw(25) << std::left << dims_string(filter_dims)
<< std::setw(25) << std::left << dims_string(output_dims)
<< std::setw(25) << std::left << sumtime / count << std::endl;
} else {
dims_string(input_dims);
}
}
#endif
}
void ConvImageCompute::Conv2d3x3() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
......@@ -486,24 +652,14 @@ void ConvImageCompute::Conv2d3x3() {
} else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) {
new_groups = input_channel / filter_channel;
}
/* TODO(ysh329): mobile has no case below
else {
LOG(FATAL) << "Not support conv3x3 case with"
<< " input_dims:" << input_dims << " output_dims:" <<
output_dims
<< " filter_dims:" << filter_dims;
}
*/
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
/* TODO(ysh329): mobile has no case below
else {
LOG(FATAL) << "Not support conv3x3 case with"
<< " input_dims:" << input_dims << " output_dims:" <<
output_dims
<< " filter_dims:" << filter_dims;
}
*/
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
......@@ -527,9 +683,9 @@ void ConvImageCompute::Conv2d3x3() {
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "param.groups(groups):" << param.groups;
VLOG(4) << "new_groups:" << new_groups;
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
// VLOG(4) << "default work size{c_block, w, nh}: "
// << "{" << c_block << ", " << w << ", " << nh << ""
// << "}";
#endif
CHECK_GE(dilations.size(), 2);
......@@ -544,26 +700,15 @@ void ConvImageCompute::Conv2d3x3() {
if (has_bias) {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
......@@ -607,21 +752,16 @@ void ConvImageCompute::Conv2d3x3() {
status = kernel.setArg(++arg_idx, new_groups);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
global_work_size_,
cl::NullRange,
nullptr,
event_.get());
......@@ -630,6 +770,8 @@ void ConvImageCompute::Conv2d3x3() {
}
void ConvImageCompute::Conv2d3x3opt() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
......@@ -657,24 +799,6 @@ void ConvImageCompute::Conv2d3x3opt() {
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
int w_blk_size = 5;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
// default_work_size[1] = w_blk;
int h_blk_size = 1;
int h_blk = (nh + h_blk_size - 1) / h_blk_size;
// default_work_size[2] = h_blk;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
// VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
......@@ -692,9 +816,6 @@ void ConvImageCompute::Conv2d3x3opt() {
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
......@@ -710,24 +831,15 @@ void ConvImageCompute::Conv2d3x3opt() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
#endif
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, h_blk);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
......@@ -763,38 +875,17 @@ void ConvImageCompute::Conv2d3x3opt() {
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(w_blk),
static_cast<size_t>(h_blk)};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
local_work_size,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
......@@ -802,6 +893,8 @@ void ConvImageCompute::Conv2d3x3opt() {
}
void ConvImageCompute::Conv2d5x5() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
......@@ -833,16 +926,6 @@ void ConvImageCompute::Conv2d5x5() {
int input_c = input_dims[1];
auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
......@@ -863,9 +946,6 @@ void ConvImageCompute::Conv2d5x5() {
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
......@@ -881,25 +961,15 @@ void ConvImageCompute::Conv2d5x5() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
......@@ -933,21 +1003,16 @@ void ConvImageCompute::Conv2d5x5() {
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
global_work_size_,
cl::NullRange,
nullptr,
event_.get());
......@@ -956,6 +1021,8 @@ void ConvImageCompute::Conv2d5x5() {
}
void ConvImageCompute::Conv2d5x5opt() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
......@@ -984,22 +1051,6 @@ void ConvImageCompute::Conv2d5x5opt() {
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
int w_blk_size = 5;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
// default_work_size[1] = w_blk;
int h_blk_size = 1;
int h_blk = (nh + h_blk_size - 1) / h_blk_size;
// default_work_size[2] = h_blk;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
......@@ -1018,9 +1069,6 @@ void ConvImageCompute::Conv2d5x5opt() {
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
......@@ -1035,22 +1083,14 @@ void ConvImageCompute::Conv2d5x5opt() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
#endif
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, h_blk);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
......@@ -1083,38 +1123,13 @@ void ConvImageCompute::Conv2d5x5opt() {
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(w_blk),
static_cast<size_t>(h_blk)};
// VLOG(4) << "out_image: " << out_image;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
// VLOG(4) << "out_image: " << out_image;
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
local_work_size,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
......@@ -1122,6 +1137,8 @@ void ConvImageCompute::Conv2d5x5opt() {
}
void ConvImageCompute::Conv2d7x7() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
......@@ -1153,16 +1170,6 @@ void ConvImageCompute::Conv2d7x7() {
int input_c = input_dims[1];
auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
......@@ -1183,9 +1190,6 @@ void ConvImageCompute::Conv2d7x7() {
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
......@@ -1201,25 +1205,15 @@ void ConvImageCompute::Conv2d7x7() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
......@@ -1253,21 +1247,16 @@ void ConvImageCompute::Conv2d7x7() {
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
global_work_size_,
cl::NullRange,
nullptr,
event_.get());
......@@ -1275,6 +1264,8 @@ void ConvImageCompute::Conv2d7x7() {
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d7x7opt() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
......@@ -1302,23 +1293,6 @@ void ConvImageCompute::Conv2d7x7opt() {
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
int w_blk_size = 5;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
// default_work_size[1] = w_blk;
int h_blk_size = 1;
int h_blk = (nh + h_blk_size - 1) / h_blk_size;
// default_work_size[2] = h_blk;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d 7x7 params ============";
// VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
......@@ -1336,9 +1310,6 @@ void ConvImageCompute::Conv2d7x7opt() {
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
......@@ -1353,24 +1324,15 @@ void ConvImageCompute::Conv2d7x7opt() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
#endif
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, h_blk);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
......@@ -1403,39 +1365,19 @@ void ConvImageCompute::Conv2d7x7opt() {
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(w_blk),
static_cast<size_t>(h_blk)};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
local_work_size,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::DepthwiseConv2d3x3s1() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto x_dims = param.x->dims();
auto filter_dims = param.filter->dims();
......@@ -1444,8 +1386,6 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
auto strides = param.strides;
auto dilations = *param.dilations;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* input_img = param.x->data<half_t, cl::Image2D>();
auto* filter_img = filter_gpu_image_.data<half_t, cl::Image2D>();
......@@ -1459,26 +1399,15 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
auto* output_img = param.output->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int c_block = (output_dims[1] + 3) / 4;
int w = output_dims[3];
int nh = output_dims[0] * output_dims[2];
int w_blk_size = 2;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
auto global_work_size = cl::NDRange(c_block, w_blk, nh);
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, static_cast<const int>(c_block));
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(w_blk));
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(nh));
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_img);
CL_CHECK_FATAL(status);
......@@ -1516,28 +1445,11 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2]));
CL_CHECK_FATAL(status);
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
local_work_size,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
......@@ -1545,6 +1457,8 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
}
void ConvImageCompute::DepthwiseConv2d3x3() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto x_dims = param.x->dims();
auto filter_dims = param.filter->dims();
......@@ -1555,8 +1469,6 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
int offset = filter_dims[2] / 2 - paddings[0];
int input_c_block = (x_dims[1] + 3) / 4;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* input_img = param.x->data<half_t, cl::Image2D>();
auto* filter_img = filter_gpu_image_.data<half_t, cl::Image2D>();
......@@ -1570,21 +1482,10 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
auto* output_img = param.output->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int c_block = (output_dims[1] + 3) / 4;
int w = output_dims[3];
int nh = output_dims[0] * output_dims[2];
auto global_work_size = cl::NDRange(c_block, w, nh);
auto kernel = kernel_;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "setArg";
VLOG(4) << "c_block = " << c_block;
VLOG(4) << "w = " << w;
VLOG(4) << "nh = " << nh;
VLOG(4) << "strides = " << strides[0];
VLOG(4) << "offset = " << offset;
VLOG(4) << "dilations = " << dilations[0];
......@@ -1597,11 +1498,11 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, static_cast<const int>(c_block));
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(w));
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(nh));
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_img);
CL_CHECK_FATAL(status);
......@@ -1641,7 +1542,7 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
global_work_size_,
cl::NullRange,
nullptr,
event_.get());
......@@ -1650,6 +1551,8 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
}
void ConvImageCompute::DepthwiseConv2d() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
......@@ -1681,16 +1584,6 @@ void ConvImageCompute::DepthwiseConv2d() {
int input_c = input_dims[1];
auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ depthwise conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
......@@ -1710,9 +1603,6 @@ void ConvImageCompute::DepthwiseConv2d() {
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
......@@ -1730,25 +1620,15 @@ void ConvImageCompute::DepthwiseConv2d() {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
status = kernel.setArg(arg_idx, c_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
status = kernel.setArg(++arg_idx, w_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
status = kernel.setArg(++arg_idx, nh_blk_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
......@@ -1786,21 +1666,16 @@ void ConvImageCompute::DepthwiseConv2d() {
status = kernel.setArg(++arg_idx, filter_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
global_work_size_,
cl::NullRange,
nullptr,
event_.get());
......@@ -1809,7 +1684,7 @@ void ConvImageCompute::DepthwiseConv2d() {
}
void ConvImageCompute::Run() { (this->*impl_)(); }
#undef PROFILE_CONV_KERNEL
} // namespace opencl
} // namespace kernels
} // namespace lite
......
......@@ -59,6 +59,19 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor filter_gpu_image_;
Tensor bias_gpu_image_;
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
int c_blk_ = 1;
int w_blk_ = 1;
int nh_blk_ = 1;
int default_c_blk_ = 1;
int default_w_blk_ = 1;
int default_nh_blk_ = 1;
cl::Kernel kernel_;
cl::NDRange local_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
bool use_lws{true};
};
......