[LITE][OPENCL]Fix opencl (#3433)

* [lite][opencl] remove event with clfinish, add strict check for cl warning. add conv 3x3opt fallback opt layout cast, test=develop
* [LITE][OPENCL] rm event in element_add_buffer_compute test=develop
* [LITE][OPENCL] suite cl_functions_test.cc test=develop
* [LITE][OPENCL] suite cl_common.sh lint check test=develop
* [LITE][OPENCL] suite conv_image_compute.cc lint check test=develop
* [LITE][OPENCL] suite cl_wait_list() lint check test=develop

[LITE][OPENCL]Fix opencl (#3433)
* [lite][opencl] remove event with clfinish, add strict check for cl warning. add conv 3x3opt fallback opt layout cast, test=develop
* [LITE][OPENCL] rm event in element_add_buffer_compute test=develop
* [LITE][OPENCL] suite cl_functions_test.cc test=develop
* [LITE][OPENCL] suite cl_common.sh lint check test=develop
* [LITE][OPENCL] suite conv_image_compute.cc lint check test=develop
* [LITE][OPENCL] suite cl_wait_list() lint check test=develop
9bd9311b · xiebaiyuan · GitHub · 3a04e11d · 9bd9311b · 9bd9311b
70 changed files
--- a/lite/backends/opencl/cl_functions_test.cc
+++ b/lite/backends/opencl/cl_functions_test.cc
@@ -100,16 +100,18 @@ TEST(cl_test, kernel_test) {
  size_t width = in_image.ImageWidth();
  size_t height = in_image.ImageHeight();
  auto global_work_size = cl::NDRange{width, height};
-  cl::Event event;
  status = context->GetCommandQueue().enqueueNDRangeKernel(
-      kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event);
+      kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
  CL_CHECK_FATAL(status);
  status = context->GetCommandQueue().finish();
  CL_CHECK_FATAL(status);
+#if 0
  double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
  double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
  double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
  LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us.";
+#endif
  LOG(INFO) << out_image;
 }

--- a/lite/backends/opencl/cl_image_converter.cc
+++ b/lite/backends/opencl/cl_image_converter.cc
@@ -73,7 +73,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw,
            i2 += 4;
            p++;
          } else {
-            image[i2] = 0.0;
+            image[i2] = Float2Half(0.f);
            i2 += 4;
          }
        }
@@ -261,7 +261,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor,
            image[index] = Float2Half(*p);
            p++;
          } else {
-            image[index] = 0.0;
+            image[index] = Float2Half(0.f);
          }
          if (index >= (width * height * 4)) {
            LOG(INFO) << " index out of range ";

--- a/lite/backends/opencl/cl_kernel/cl_common.h
+++ b/lite/backends/opencl/cl_kernel/cl_common.h
@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 /////////////////////////////////
@@ -108,7 +107,8 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in
 #endif
 #ifdef RELU6
-  output = clamp(in, (CL_DTYPE4)0, (CL_DTYPE4)6);
+  in = fmax((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in);
+  output = fmin((CL_DTYPE4)(6.0f, 6.0f, 6.0f, 6.0f), in);
 #endif
  return output;
 }
--- a/lite/backends/opencl/cl_kernel/image/activation_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl
@@ -14,36 +14,30 @@ limitations under the License. */
 #include <cl_common.h>
 __kernel void relu(__read_only image2d_t input,
                   __write_only image2d_t output,
                   __private const float threshold,
                   __private const float scale) {
  const int x = get_global_id(0);  // image_width
  const int y = get_global_id(1);  // image_height
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+  const sampler_t sampler =
-                            CLK_ADDRESS_CLAMP |
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-                            CLK_FILTER_NEAREST;
  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
  in = max((CL_DTYPE4)(0.0f), in);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
 }
 __kernel void relu6(__read_only image2d_t input,
                    __write_only image2d_t output,
                    __private const float threshold,
-                   __private const float scale){
+                    __private const float scale) {
  const int x = get_global_id(0);
  const int y = get_global_id(1);
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+  const sampler_t sampler =
-                            CLK_ADDRESS_CLAMP |
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-                            CLK_FILTER_NEAREST;
  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
  in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in);
@@ -51,7 +45,6 @@ __kernel void relu6(__read_only image2d_t input,
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
 }
 __kernel void sigmoid(__read_only image2d_t input,
                      __write_only image2d_t output,
                      __private const float threshold,
@@ -64,10 +57,11 @@ __kernel void sigmoid(__read_only image2d_t input,
  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
  CL_DTYPE4 out;
-  out.x = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.x)));
-  out.y = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.y)));
+  out.x = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.x))));
-  out.z = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.z)));
+  out.y = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.y))));
-  out.w = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.w)));
+  out.z = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.z))));
+  out.w = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.w))));
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
 }
@@ -79,22 +73,21 @@ __kernel void leaky_relu(__read_only image2d_t input,
  const int x = get_global_id(0);
  const int y = get_global_id(1);
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+  const sampler_t sampler =
-                            CLK_ADDRESS_CLAMP |
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-                            CLK_FILTER_NEAREST;
  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
  CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in;
-  if (in.x < 0.0f){
+  if (in.x < 0.0f) {
    in.x = s_val.x;
  }
-  if (in.y < 0.0f){
+  if (in.y < 0.0f) {
    in.y = s_val.y;
  }
-  if (in.z < 0.0f){
+  if (in.z < 0.0f) {
    in.z = s_val.z;
  }
-  if (in.w < 0.0f){
+  if (in.w < 0.0f) {
    in.w = s_val.w;
  }
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
@@ -104,16 +97,14 @@ __kernel void tanh_act(__read_only image2d_t input,
                       __write_only image2d_t output,
                       __private const float threshold,
                       __private const float scale) {
  const int x = get_global_id(0);  // image_width
  const int y = get_global_id(1);  // image_height
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+  const sampler_t sampler =
-                            CLK_ADDRESS_CLAMP |
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-                            CLK_FILTER_NEAREST;
  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
-  CL_DTYPE4 out= (exp(in) - exp(-in))/ (exp(in) + exp(-in));
+  CL_DTYPE4 out = (exp(in) - exp(-in)) / (exp(in) + exp(-in));
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
 }
@@ -121,13 +112,11 @@ __kernel void exp_act(__read_only image2d_t input,
                      __write_only image2d_t output,
                      __private const float threshold,
                      __private const float scale) {
  const int x = get_global_id(0);  // image_width
  const int y = get_global_id(1);  // image_height
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+  const sampler_t sampler =
-                            CLK_ADDRESS_CLAMP |
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-                            CLK_FILTER_NEAREST;
  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
  CL_DTYPE4 out = exp(in);
@@ -138,16 +127,13 @@ __kernel void swish(__read_only image2d_t input,
                    __write_only image2d_t output,
                    __private const float threshold,
                    __private const float scale) {
  const int x = get_global_id(0);  // image_width
  const int y = get_global_id(1);  // image_height
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+  const sampler_t sampler =
-                            CLK_ADDRESS_CLAMP |
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-                            CLK_FILTER_NEAREST;
  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
  CL_DTYPE4 out = in / (1 + exp(-(CL_DTYPE)scale * in));
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
 }
--- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl
 #include <cl_common.h>
+__kernel void conv2d_1x1_opt(
-__kernel void conv2d_1x1_opt(__private const int global_size_dim0,
+    __private const int global_size_dim0,
    __private const int global_size_dim1,
    __private const int global_size_dim2,
    __read_only image2d_t input_image,
@@ -9,7 +9,7 @@ __kernel void conv2d_1x1_opt(__private const int global_size_dim0,
    __read_only image2d_t bias,
 #endif
 #ifdef BATCH_NORM
-__read_only image2d_t new_scale,
+    __read_only image2d_t new_scale,
    __read_only image2d_t new_biase,
 #endif
    __write_only image2d_t output_image,
@@ -27,7 +27,10 @@ __read_only image2d_t new_scale,
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
+  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }
  int out_w0 = out_w;
  int out_w1 = out_w + global_size_dim1;
  int out_w2 = out_w + global_size_dim1 * 2;
@@ -73,10 +76,10 @@ __read_only image2d_t new_scale,
  CL_DTYPE4 output3 = output0;
 #else
-  CL_DTYPE4 output0 = 0.0f;
+  CL_DTYPE4 output0 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
-  CL_DTYPE4 output1 = 0.0f;
+  CL_DTYPE4 output1 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
-  CL_DTYPE4 output2 = 0.0f;
+  CL_DTYPE4 output2 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
-  CL_DTYPE4 output3 = 0.0f;
+  CL_DTYPE4 output3 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
 #endif
  int max_w_bound = input_c_block * input_width;
@@ -85,6 +88,14 @@ __read_only image2d_t new_scale,
    // ------------0---------------
    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
                         in_pos_in_one_block0.y);
+    pos_in.x = select(
+        pos_in.x,
+        -1,
+        (pos_in.x < i * input_width + in_pos_in_one_block0.x ||
+         pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
+    pos_in.y =
+        select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
    CL_DTYPE4 input0 =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
@@ -131,6 +142,14 @@ __read_only image2d_t new_scale,
    // -------------1--------------
    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
                    in_pos_in_one_block1.y);
+    pos_in.x = select(
+        pos_in.x,
+        -1,
+        (pos_in.x < i * input_width + in_pos_in_one_block0.x ||
+         pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
+    pos_in.y =
+        select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
    CL_DTYPE4 input1 =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
@@ -167,6 +186,14 @@ __read_only image2d_t new_scale,
    // -------------2--------------
    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
                    in_pos_in_one_block2.y);
+    pos_in.x = select(
+        pos_in.x,
+        -1,
+        (pos_in.x < i * input_width + in_pos_in_one_block0.x ||
+         pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
+    pos_in.y =
+        select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
    CL_DTYPE4 input2 =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
@@ -203,6 +230,14 @@ __read_only image2d_t new_scale,
    // -------------3--------------
    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
                    in_pos_in_one_block3.y);
+    pos_in.x = select(
+        pos_in.x,
+        -1,
+        (pos_in.x < i * input_width + in_pos_in_one_block0.x ||
+         pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
+    pos_in.y =
+        select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
    CL_DTYPE4 input3 =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
@@ -287,7 +322,7 @@ __kernel void conv2d_1x1_simple(
    __read_only image2d_t bias,
 #endif
 #ifdef BATCH_NORM
-__read_only image2d_t new_scale,
+    __read_only image2d_t new_scale,
    __read_only image2d_t new_biase,
 #endif
    __write_only image2d_t output_image,
@@ -304,7 +339,10 @@ __read_only image2d_t new_scale,
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
+  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }
  int out_w0 = out_w;
  int out_w1 = out_w + global_size_dim1;
  int out_w2 = out_w + global_size_dim1 * 2;
@@ -350,16 +388,25 @@ __read_only image2d_t new_scale,
  CL_DTYPE4 output3 = output0;
 #else
-  CL_DTYPE4 output0 = 0.0f;
+  CL_DTYPE4 output0 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
-  CL_DTYPE4 output1 = 0.0f;
+  CL_DTYPE4 output1 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
-  CL_DTYPE4 output2 = 0.0f;
+  CL_DTYPE4 output2 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
-  CL_DTYPE4 output3 = 0.0f;
+  CL_DTYPE4 output3 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
 #endif
  for (int i = 0; i < input_c; ++i) {
    // ------------0---------------
    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
                         in_pos_in_one_block0.y);
+    pos_in.x = select(
+        pos_in.x,
+        -1,
+        (pos_in.x < i * input_width + in_pos_in_one_block0.x ||
+         pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
+    pos_in.y =
+        select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
    CL_DTYPE4 input0 =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
@@ -379,6 +426,15 @@ __read_only image2d_t new_scale,
    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
                    in_pos_in_one_block1.y);
+    pos_in.x = select(
+        pos_in.x,
+        -1,
+        (pos_in.x < i * input_width + in_pos_in_one_block0.x ||
+         pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
+    pos_in.y =
+        select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
    CL_DTYPE4 input1 =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
    output1 = mad(input1.x, weight0, output1);
@@ -388,6 +444,14 @@ __read_only image2d_t new_scale,
    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
                    in_pos_in_one_block2.y);
+    pos_in.x = select(
+        pos_in.x,
+        -1,
+        (pos_in.x < i * input_width + in_pos_in_one_block0.x ||
+         pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
+    pos_in.y =
+        select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
    CL_DTYPE4 input2 =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
    output2 = mad(input2.x, weight0, output2);
@@ -397,6 +461,16 @@ __read_only image2d_t new_scale,
    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
                    in_pos_in_one_block3.y);
+    pos_in.x = select(
+        pos_in.x,
+        -1,
+        (pos_in.x < i * input_width + in_pos_in_one_block0.x ||
+         pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
+    pos_in.y =
+        select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
    CL_DTYPE4 input3 =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
    output3 = mad(input3.x, weight0, output3);
@@ -428,6 +502,16 @@ __read_only image2d_t new_scale,
  output2 = activation_type4(output2);
  output3 = activation_type4(output3);
+  // const int debug_pos = 0;
+  // int2 pos_test = (int2)(debug_pos, debug_pos);
+  // if (input_height == 112 && input_width == 112 && output_width == 112 &&
+  //     output_height == 112) {
+  //   output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_test);
+  //   output1 = output0;
+  //   output2 = output1;
+  //   output3 = output2;
+  // }
  if (out_w0 < old_w) {
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
  }

--- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl
@@ -27,33 +27,33 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
                         __private const int offset,
                         __private const int input_c,
                         __private const int dilation,
-                         __private const int input_width,/* of one block */
+                         __private const int input_width,  /* of one block */
-                         __private const int input_height,/* of one block */
+                         __private const int input_height, /* of one block */
                         __private const int output_width,
                         __private const int output_height,
                         __private const int output_c,
                         __private const int filter_channel,
                         __private const int filter_width,
                         __private const int filter_height,
-                         __private const int group) {
+                         __private const int group,
+                         __private const int input_tensor_c
+) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+  const sampler_t sampler =
-                              CLK_ADDRESS_CLAMP          |
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-                              CLK_FILTER_NEAREST;
  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
-    if (out_c >= global_size_dim0 ||
+  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
-        out_w >= global_size_dim1 ||
      out_nh >= global_size_dim2) {
    return;
  }
  int2 stride_xy;
  stride_xy.x = stride;
  stride_xy.y = stride;
@@ -67,80 +67,167 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
  in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
 #ifdef BIASE_CH
-    CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
+  CL_DTYPE4 output =
+      READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
 #elif defined(BIASE_ELE)
  CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
 #else
-    CL_DTYPE4 output = 0.0f;
+  CL_DTYPE4 output = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
 #endif
  CL_DTYPE4 input[9];  // 3x3 region of input
  if (group == 1) {
    for (int i = 0; i < input_c; ++i) {  // each run for 3x3
-            int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+      int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x,
+                           in_pos_in_one_block.y);
-            input[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+      input[0] = select(
+          READ_IMG_TYPE(CL_DTYPE_CHAR,
+                        input_image,
+                        sampler,
                        (int2)(pos_in.x - dilation, pos_in.y - dilation)),
-                                (CL_DTYPE4)(0.0f),
+          (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
-                                (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
+          (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                     in_pos_in_one_block.y - dilation < 0 ||
+                     in_pos_in_one_block.x - dilation >= input_width ||
+                     in_pos_in_one_block.y - dilation >= input_height)
+                    << 15));
-            input[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+      input[1] =
+          select(READ_IMG_TYPE(CL_DTYPE_CHAR,
+                               input_image,
+                               sampler,
                               (int2)(pos_in.x, pos_in.y - dilation)),
-                              (CL_DTYPE4)(0.0f),
+                 (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
-                              (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
+                 (ushort4)((in_pos_in_one_block.x < 0 ||
+                            in_pos_in_one_block.y - dilation < 0 ||
+                            in_pos_in_one_block.x >= input_width ||
+                            in_pos_in_one_block.y - dilation >= input_height)
+                           << 15));
-            input[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+      input[2] = select(
+          READ_IMG_TYPE(CL_DTYPE_CHAR,
+                        input_image,
+                        sampler,
                        (int2)(pos_in.x + dilation, pos_in.y - dilation)),
-                              (CL_DTYPE4)(0.0f),
+          (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
-                              (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
+          (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                     in_pos_in_one_block.y - dilation < 0 ||
+                     in_pos_in_one_block.x + dilation >= input_width ||
+                     in_pos_in_one_block.y - dilation >= input_height)
+                    << 15));
-            input[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+      input[3] =
+          select(READ_IMG_TYPE(CL_DTYPE_CHAR,
+                               input_image,
+                               sampler,
                               (int2)(pos_in.x - dilation, pos_in.y)),
-                              (CL_DTYPE4)(0.0f),
+                 (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
-                              (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+                 (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                            in_pos_in_one_block.y < 0 ||
+                            in_pos_in_one_block.x - dilation >= input_width ||
+                            in_pos_in_one_block.y >= input_height)
+                           << 15));
-            input[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+      input[4] = select(
-                              (int2)(pos_in.x, pos_in.y)),
+          READ_IMG_TYPE(
-                              (CL_DTYPE4)(0.0f),
+              CL_DTYPE_CHAR, input_image, sampler, (int2)(pos_in.x, pos_in.y)),
-                              (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+          (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
+          (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
+                     in_pos_in_one_block.x >= input_width ||
+                     in_pos_in_one_block.y >= input_height)
+                    << 15));
-            input[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+      input[5] =
+          select(READ_IMG_TYPE(CL_DTYPE_CHAR,
+                               input_image,
+                               sampler,
                               (int2)(pos_in.x + dilation, pos_in.y)),
-                              (CL_DTYPE4)(0.0f),
+                 (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
-                              (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+                 (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                            in_pos_in_one_block.y < 0 ||
+                            in_pos_in_one_block.x + dilation >= input_width ||
+                            in_pos_in_one_block.y >= input_height)
+                           << 15));
-            input[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+      input[6] = select(
+          READ_IMG_TYPE(CL_DTYPE_CHAR,
+                        input_image,
+                        sampler,
                        (int2)(pos_in.x - dilation, pos_in.y + dilation)),
-                              (CL_DTYPE4)(0.0f),
+          (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
-                              (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
+          (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                     in_pos_in_one_block.y + dilation < 0 ||
+                     in_pos_in_one_block.x - dilation >= input_width ||
+                     in_pos_in_one_block.y + dilation >= input_height)
+                    << 15));
-            input[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+      input[7] =
+          select(READ_IMG_TYPE(CL_DTYPE_CHAR,
+                               input_image,
+                               sampler,
                               (int2)(pos_in.x, pos_in.y + dilation)),
-                              (CL_DTYPE4)(0.0f),
+                 (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
-                              (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
+                 (ushort4)((in_pos_in_one_block.x < 0 ||
+                            in_pos_in_one_block.y + dilation < 0 ||
+                            in_pos_in_one_block.x >= input_width ||
+                            in_pos_in_one_block.y + dilation >= input_height)
+                           << 15));
-            input[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+      input[8] = select(
+          READ_IMG_TYPE(CL_DTYPE_CHAR,
+                        input_image,
+                        sampler,
                        (int2)(pos_in.x + dilation, pos_in.y + dilation)),
-                              (CL_DTYPE4)(0.0f),
+          (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
-                              (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
+          (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                     in_pos_in_one_block.y + dilation < 0 ||
+                     in_pos_in_one_block.x + dilation >= input_width ||
+                     in_pos_in_one_block.y + dilation >= input_height)
+                    << 15));
+      if (i == input_c - 1) {
+        int c_shr = input_tensor_c % 4;
+        if (c_shr == 1) {
+          for (int k = 0; k < 9; k++) {
+            input[k].y = (half)0.f;
+            input[k].z = (half)0.f;
+            input[k].w = (half)0.f;
+          }
+        } else if (c_shr == 2) {
+          for (int k = 0; k < 9; k++) {
+            input[k].z = (half)0.f;
+            input[k].w = (half)0.f;
+          }
+        } else if (c_shr == 3) {
+          for (int k = 0; k < 9; k++) {
+            input[k].w = (half)0.f;
+          }
+        } else if (c_shr == 0) {
+        }
+      }
      int j = 0;
      int2 pos_of_weight;
      pos_of_weight.x = i * 3 + j % 3;
      pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
-                CL_DTYPE4 weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
+      CL_DTYPE4 weight_x =
+          READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
      output.x += dot(input[j], weight_x);
      pos_of_weight.y += 3;
-                CL_DTYPE4 weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
+      CL_DTYPE4 weight_y =
+          READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
      output.y += dot(input[j], weight_y);
      pos_of_weight.y += 3;
-                CL_DTYPE4 weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
+      CL_DTYPE4 weight_z =
+          READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
      output.z += dot(input[j], weight_z);
      pos_of_weight.y += 3;
-                CL_DTYPE4 weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
+      CL_DTYPE4 weight_w =
+          READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
      output.w += dot(input[j], weight_w);
      j = 1;
@@ -297,79 +384,98 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
        int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x,
                             in_pos_in_one_block.y);
        input[0] = select(
-              READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+            READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_image,
+                          sampler,
                          (int2)(pos_in.x - dilation, pos_in.y - dilation)),
-              (CL_DTYPE4)(0.0f),
+            (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
            (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
                       in_pos_in_one_block.y - dilation < 0 ||
                       in_pos_in_one_block.x - dilation >= input_width ||
                       in_pos_in_one_block.y - dilation >= input_height)
                      << 15));
        input[1] =
-              select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+            select(READ_IMG_TYPE(CL_DTYPE_CHAR,
+                                 input_image,
+                                 sampler,
                                 (int2)(pos_in.x, pos_in.y - dilation)),
-                     (CL_DTYPE4)(0.0f),
+                   (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
                   (ushort4)((in_pos_in_one_block.x < 0 ||
                              in_pos_in_one_block.y - dilation < 0 ||
                              in_pos_in_one_block.x >= input_width ||
                              in_pos_in_one_block.y - dilation >= input_height)
                             << 15));
        input[2] = select(
-              READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+            READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_image,
+                          sampler,
                          (int2)(pos_in.x + dilation, pos_in.y - dilation)),
-                          (CL_DTYPE4)(0.0f),
+            (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
            (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
                       in_pos_in_one_block.y - dilation < 0 ||
                       in_pos_in_one_block.x + dilation >= input_width ||
                       in_pos_in_one_block.y - dilation >= input_height)
                      << 15));
-          input[3] = select(
+        input[3] =
-              READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+            select(READ_IMG_TYPE(CL_DTYPE_CHAR,
+                                 input_image,
+                                 sampler,
                                 (int2)(pos_in.x - dilation, pos_in.y)),
-                          (CL_DTYPE4)(0.0f),
+                   (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
                   (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
                              in_pos_in_one_block.y < 0 ||
                              in_pos_in_one_block.x - dilation >= input_width ||
                              in_pos_in_one_block.y >= input_height)
                             << 15));
        input[4] = select(
-              READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(pos_in.x, pos_in.y)),
+            READ_IMG_TYPE(CL_DTYPE_CHAR,
-                          (CL_DTYPE4)(0.0f),
+                          input_image,
+                          sampler,
+                          (int2)(pos_in.x, pos_in.y)),
+            (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
            (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
                       in_pos_in_one_block.x >= input_width ||
                       in_pos_in_one_block.y >= input_height)
                      << 15));
        input[5] =
-            select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+            select(READ_IMG_TYPE(CL_DTYPE_CHAR,
+                                 input_image,
+                                 sampler,
                                 (int2)(pos_in.x + dilation, pos_in.y)),
-                   (CL_DTYPE4)(0.0f),
+                   (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
                   (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
                              in_pos_in_one_block.y < 0 ||
                              in_pos_in_one_block.x + dilation >= input_width ||
                              in_pos_in_one_block.y >= input_height)
                             << 15));
        input[6] = select(
-              READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+            READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_image,
+                          sampler,
                          (int2)(pos_in.x - dilation, pos_in.y + dilation)),
-                          (CL_DTYPE4)(0.0f),
+            (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
            (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
                       in_pos_in_one_block.y + dilation < 0 ||
                       in_pos_in_one_block.x - dilation >= input_width ||
                       in_pos_in_one_block.y + dilation >= input_height)
                      << 15));
        input[7] =
-              select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+            select(READ_IMG_TYPE(CL_DTYPE_CHAR,
+                                 input_image,
+                                 sampler,
                                 (int2)(pos_in.x, pos_in.y + dilation)),
-                     (CL_DTYPE4)(0.0f),
+                   (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
                   (ushort4)((in_pos_in_one_block.x < 0 ||
                              in_pos_in_one_block.y + dilation < 0 ||
                              in_pos_in_one_block.x >= input_width ||
                              in_pos_in_one_block.y + dilation >= input_height)
                             << 15));
        input[8] = select(
-              READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+            READ_IMG_TYPE(CL_DTYPE_CHAR,
+                          input_image,
+                          sampler,
                          (int2)(pos_in.x + dilation, pos_in.y + dilation)),
-                          (CL_DTYPE4)(0.0f),
+            (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f),
            (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
                       in_pos_in_one_block.y + dilation < 0 ||
                       in_pos_in_one_block.x + dilation >= input_width ||
@@ -381,7 +487,8 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
          int2 pos_of_weight;
          pos_of_weight.x = (f_c / 4) * 3 + j % 3;
          pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3;
-            CL_DTYPE4 weight = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
+          CL_DTYPE4 weight =
+              READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
          int f_c_offset = f_c % 4;
          CL_DTYPE f_value;

--- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <cl_common.h>
 __kernel void conv2d_3x3_opt(__private const int item_ch,
                             __private const int item_w,
                             __private const int item_h,

--- a/lite/backends/opencl/cl_kernel/image/layout_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/layout_kernel.cl
@@ -18,7 +18,7 @@ limitations under the License. */
 ////////////////////////////////////////////////////////
 // buffer -> image2d
 ////////////////////////////////////////////////////////
-__kernel void buffer_to_image2d(__global CL_DTYPE *in,
+__kernel void buffer_to_image2d(__global CL_DTYPE* in,
                                __write_only image2d_t output_image,
                                __private const int out_H,
                                __private const int out_W,
@@ -26,11 +26,14 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
                                __private const int Stride0,
                                __private const int Stride1,
                                __private const int Stride2) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
+  if (out_c >= out_C || out_w >= out_W || out_nh >= out_H) {  // bounds guard
+    return;  // NOTE(review): out_nh is split by out_H just below, so past this
+  }  // guard out_n is 0 for ids >= 0 - confirm batch > 1; out_c is x4 later.
  const int out_n = out_nh / out_H;
  const int out_h = out_nh % out_H;
@@ -56,26 +59,41 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
  if (out_C - 4 * out_c >= 2) {
    output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE);
+  } else {
+    output.y = CONVERT_TYPE_TO(0.f, CL_COMPUTE_DTYPE);  // < 2 left: y = 0
  }
  if (out_C - 4 * out_c >= 3) {
    output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE);
+  } else {
+    output.z = CONVERT_TYPE_TO(0.f, CL_COMPUTE_DTYPE);  // < 3 left: z = 0
  }
  if (out_C - 4 * out_c >= 4) {
    output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE);
+  } else {
+    output.w = CONVERT_TYPE_TO(0.f, CL_COMPUTE_DTYPE);  // < 4 left: w = 0
  }
 #ifdef DEBUG
  if (out_w > 2045) {
-    printf("out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f %.2f\n",
+    printf(
+        "out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f "
+        "%.2f\n",
        out_w,
        out_C - 4 * out_c,
        (float)(in[input_pos0]),
        (float)(in[input_pos1]),
        (float)(in[input_pos2]),
        (float)(in[input_pos3]));
-    printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n", out_c, out_w, out_nh,
+    printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n",
-           output_pos.x, output_pos.y,
+           out_c,
-           (float)(output.x), (float)(output.y), (float)(output.z), (float)(output.w));
+           out_w,
+           out_nh,
+           output_pos.x,
+           output_pos.y,
+           (float)(output.x),
+           (float)(output.y),
+           (float)(output.z),
+           (float)(output.w));
  }
 #endif
@@ -104,30 +122,40 @@ __kernel void image2d_to_buffer(__read_only image2d_t input,
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  const int pos_x = mad24(in_c, in_width, in_w);
-  CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh));
+  CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(
+      CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh));
 #ifdef DEBUG
  if (in_w > 2045) {
-    printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n", in_c, in_w, in_nh,
+    printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n",
-            pos_x, in_nh,
+           in_c,
-           (float)(in.x), (float)(in.y), (float)(in.z), (float)(in.w));
+           in_w,
+           in_nh,
+           pos_x,
+           in_nh,
+           (float)(in.x),
+           (float)(in.y),
+           (float)(in.z),
+           (float)(in.w));
  }
 #endif
-  const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
+  const int index =
+      in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
  out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE);
  if (C - 4 * in_c >= 2) {
    out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE);
  }
-  if(C - 4 * in_c >= 3) {
+  if (C - 4 * in_c >= 3) {
    out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE);
  }
-  if(C - 4 * in_c >= 4) {
+  if (C - 4 * in_c >= 4) {
    out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE);
  }
 }
 #if 0  // NOTE(ysh329): keep, un-used from paddle-mobile
 ////////////////////////////////////////////////////////
 // buffer -> image2d_nw
@@ -182,7 +210,6 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in,
 }
 #endif
 #if 0  // NOTE(ysh329): keep, un-used from paddle-mobile
 // image2d -> buffer
 __kernel void image2d_to_buffer_2d(__private const int in_height,
@@ -208,15 +235,14 @@ __kernel void image2d_to_buffer_2d(__private const int in_height,
 ////////////////////////////////////////////////////////
 // buffer -> image2d (divide by 255 to normalize)
 ////////////////////////////////////////////////////////
-__kernel void buffer_to_image2d_with_pre255(__global uchar *in,
+__kernel void buffer_to_image2d_with_pre255(__global uchar* in,
                                            __write_only image2d_t output_image,
                                            __private const int out_H,
                                            __private const int out_W,
                                            __private const int out_C,
                                            __private const int Stride0,
                                            __private const int Stride1,
-                                            __private const int Stride2){
+                                            __private const int Stride2) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
@@ -231,7 +257,6 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in,
  const int in_h = out_h;
  const int in_w = out_w;
  int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w;
  int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w;
  int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w;
@@ -243,19 +268,18 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in,
  CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f;
  output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255;
-  if(out_C - 4 * out_c>=2){
+  if (out_C - 4 * out_c >= 2) {
    output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255;
  }
-  if(out_C - 4 * out_c>=3){
+  if (out_C - 4 * out_c >= 3) {
    output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255;
  }
-  if(out_C - 4 * out_c>=4){
+  if (out_C - 4 * out_c >= 4) {
    output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255;
  }
  WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output);
 }
 ////////////////////////////////////////////////////////
 // image2d -> buffer (multiply by 255 to de-normalize)
 ////////////////////////////////////////////////////////
@@ -277,22 +301,34 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input,
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  const int pos_x = mad24(in_c, in_width, in_w);
-  CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) * 255;
+  CL_COMPUTE_DTYPE4 in =
+      READ_IMG_TYPE(
+          CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) *
+      255;
 #ifdef DEBUG
  printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n",
-          in_c, in_w, in_nh, pos_x, in_nh, in.x, in.y, in.z, in.w);
+         in_c,
+         in_w,
+         in_nh,
+         pos_x,
+         in_nh,
+         in.x,
+         in.y,
+         in.z,
+         in.w);
 #endif
-  const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
+  const int index =
+      in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
  out[index] = convert_uchar_sat(in.x);
-  if(C - 4 * in_c>=2){
+  if (C - 4 * in_c >= 2) {
    out[index + size_ch] = convert_uchar_sat(in.y);
  }
-  if(C - 4 * in_c>=3){
+  if (C - 4 * in_c >= 3) {
    out[index + size_ch * 2] = convert_uchar_sat(in.z);
  }
-  if(C - 4 * in_c>=4){
+  if (C - 4 * in_c >= 4) {
    out[index + size_ch * 3] = convert_uchar_sat(in.w);
  }
 }
--- a/lite/backends/opencl/cl_runtime.cc
+++ b/lite/backends/opencl/cl_runtime.cc
@@ -45,6 +45,9 @@ bool CLRuntime::Init() {
  bool is_device_init = InitializeDevice();
  is_init_success_ = is_platform_init && is_device_init;
  initialized_ = true;
+  context_ = CreateContext();  // created here; context() no longer creates it
+  command_queue_ = CreateCommandQueue(context());  // NOTE(review): both run even if is_init_success_ is false; context() logs FATAL on a null context_ - confirm
  return initialized_;
 }
@@ -55,7 +58,7 @@ cl::Platform& CLRuntime::platform() {
 cl::Context& CLRuntime::context() {
  if (context_ == nullptr) {
-    context_ = CreateContext();
+    LOG(FATAL) << "context_ create failed. ";  // no on-demand creation here any more; a null context_ is reported as fatal
  }
  return *context_;
 }
@@ -67,7 +70,7 @@ cl::Device& CLRuntime::device() {
 cl::CommandQueue& CLRuntime::command_queue() {
  if (command_queue_ == nullptr) {
-    command_queue_ = CreateCommandQueue(context());
+    LOG(FATAL) << "command_queue_ create failed. ";  // no on-demand creation here any more; a null command_queue_ is reported as fatal
  }
  return *command_queue_;
 }
@@ -96,7 +99,7 @@ std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent(
 bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) {
  /* -I +CLRuntime::Global()->cl_path() + "/cl_kernel"*/
-  std::string build_option = options + " -cl-fast-relaxed-math ";
+  std::string build_option = options + " -cl-fast-relaxed-math -cl-mad-enable";  // NOTE(review): the OpenCL spec appears to list -cl-mad-enable as already implied by -cl-fast-relaxed-math - confirm it is needed
  VLOG(4) << "OpenCL build_option: " << build_option;
  status_ = program->build({*device_}, build_option.c_str());
  CL_CHECK_ERROR(status_);

--- a/lite/backends/opencl/target_wrapper.cc
+++ b/lite/backends/opencl/target_wrapper.cc
@@ -66,7 +66,8 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width,
  cl_int status;
  cl::Image2D *cl_image =
      new cl::Image2D(CLRuntime::Global()->context(),
-                      CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0),
+                      CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
+                                                    : CL_MEM_ALLOC_HOST_PTR),
                      img_format,
                      cl_image2d_width,
                      cl_image2d_height,
@@ -89,7 +90,8 @@ void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width,
  cl_int status;
  cl::Image2D *cl_image =
      new cl::Image2D(CLRuntime::Global()->context(),
-                      CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0),
+                      CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_USE_HOST_PTR  // NOTE(review): float/int32_t overloads pass CL_MEM_COPY_HOST_PTR here - confirm USE_HOST_PTR is intended
+                                                    : CL_MEM_ALLOC_HOST_PTR),  // null host_ptr now requests ALLOC_HOST_PTR instead of adding no flag
                      img_format,
                      cl_image2d_width,
                      cl_image2d_height,
@@ -112,7 +114,8 @@ void *TargetWrapperCL::MallocImage<int32_t>(const size_t cl_image2d_width,
  cl_int status;
  cl::Image2D *cl_image =
      new cl::Image2D(CLRuntime::Global()->context(),
-                      CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0),
+                      CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
+                                                    : CL_MEM_ALLOC_HOST_PTR),
                      img_format,
                      cl_image2d_width,
                      cl_image2d_height,
@@ -192,7 +195,6 @@ void TargetWrapperCL::MemcpySync(void *dst,
                                 size_t size,
                                 IoDirection dir) {
  cl_int status;
-  cl::Event event;
  auto stream = CLRuntime::Global()->command_queue();
  switch (dir) {
    case IoDirection::DtoD:
@@ -202,9 +204,9 @@ void TargetWrapperCL::MemcpySync(void *dst,
                                        0,
                                        size,
                                        nullptr,
-                                        &event);
+                                        nullptr);
      CL_CHECK_FATAL(status);
-      event.wait();
+      CLRuntime::Global()->command_queue().finish();  // queue-wide finish replaces the per-event wait; NOTE(review): finish() status is not checked
      break;
    case IoDirection::HtoD:
      status = stream.enqueueWriteBuffer(*static_cast<cl::Buffer *>(dst),
@@ -283,7 +285,6 @@ void TargetWrapperCL::ImgcpySync(void *dst,
  cl::array<size_t, 3> origin = {0, 0, 0};
  cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1};
  cl_int status;
-  cl::Event event;
  auto stream = CLRuntime::Global()->command_queue();
  switch (dir) {
    case IoDirection::DtoD:
@@ -293,9 +294,9 @@ void TargetWrapperCL::ImgcpySync(void *dst,
                                       origin,
                                       region,
                                       nullptr,
-                                       &event);
+                                       nullptr);
      CL_CHECK_FATAL(status);
-      event.wait();
+      CLRuntime::Global()->command_queue().finish();  // queue-wide finish replaces the per-event wait; NOTE(review): finish() status is not checked
      break;
    case IoDirection::HtoD:
      status = stream.enqueueWriteImage(*static_cast<cl::Image2D *>(dst),

--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -340,27 +340,17 @@ class Context<TargetType::kX86> {
 template <>
 class Context<TargetType::kOpenCL> {
  std::shared_ptr<CLContext> cl_context_;
-  using WaitListType =
-      std::unordered_map<decltype(static_cast<const void*>(nullptr)),
-                         std::shared_ptr<cl::Event>>;
-  std::shared_ptr<WaitListType> cl_wait_list_;
 public:
  CLContext* cl_context() { return cl_context_.get(); }
-  WaitListType* cl_wait_list() { return cl_wait_list_.get(); }
  void InitOnce() {
    // Init cl runtime.
    CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed";
    cl_context_ = std::make_shared<CLContext>();
-    cl_wait_list_ = std::make_shared<WaitListType>();
  }
-  void CopySharedTo(OpenCLContext* ctx) {
+  void CopySharedTo(OpenCLContext* ctx) { ctx->cl_context_ = cl_context_; }  // ctx receives a copy of the shared_ptr, so both refer to the same CLContext
-    ctx->cl_context_ = cl_context_;
-    ctx->cl_wait_list_ = cl_wait_list_;
-  }
 };
 #endif

--- a/lite/kernels/opencl/activation_buffer_compute.cc
+++ b/lite/kernels/opencl/activation_buffer_compute.cc
@@ -62,23 +62,21 @@ class ReluCompute
    CL_CHECK_FATAL(status);
    auto global_work_size = cl::NDRange{count};
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_buf, event_);
  }
 private:
  std::string kernel_func_name_{"relu"};
  std::string build_options_{"-DCL_DTYPE_float -DRELU"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 class SigmoidCompute
@@ -121,23 +119,21 @@ class SigmoidCompute
    CL_CHECK_FATAL(status);
    auto global_work_size = cl::NDRange{count};
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_buf, event_);
  }
 private:
  std::string kernel_func_name_{"sigmoid"};
  std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/activation_buffer_compute_test.cc
+++ b/lite/kernels/opencl/activation_buffer_compute_test.cc
@@ -85,16 +85,9 @@ TEST(opencl_relu_buffer, compute) {
  kernel->Launch();
-  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto *out_ptr = param.Out->data<float, cl::Buffer>();
-  auto it = wait_list->find(out_ptr);
-  if (it != wait_list->end()) {
+  CLRuntime::Global()->command_queue().finish();
-    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
-    auto &event = *(it->second);
-    event.wait();
-  } else {
-    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
-  }
  // run compute ref and check
  std::unique_ptr<float[]> out_ref(new float[x_dim.production()]);
@@ -145,16 +138,9 @@ TEST(opencl_sigmoid_buffer, compute) {
  kernel->Launch();
-  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto *out_ptr = param.Out->data<float, cl::Buffer>();
-  auto it = wait_list->find(out_ptr);
-  if (it != wait_list->end()) {
+  CLRuntime::Global()->command_queue().finish();
-    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
-    auto &event = *(it->second);
-    event.wait();
-  } else {
-    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
-  }
  // run compute ref and check
  std::unique_ptr<float[]> out_ref(new float[x_dim.production()]);

--- a/lite/kernels/opencl/activation_image_compute.cc
+++ b/lite/kernels/opencl/activation_image_compute.cc
@@ -147,16 +147,15 @@ class ActivationComputeImageDefault
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size_,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_img, event_);
  }
 private:
@@ -175,7 +174,6 @@ class ActivationComputeImageDefault
      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl
 }  // namespace kernels

--- a/lite/kernels/opencl/activation_image_compute_test.cc
+++ b/lite/kernels/opencl/activation_image_compute_test.cc
@@ -234,19 +234,9 @@ TEST(act_image2d_fp16, compute) {
                img_to_buf_kernel->Launch();
                // wait for opencl
-                auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
                auto *out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
-                auto it = wait_list->find(out_ptr);
-                if (it != wait_list->end()) {
+                CLRuntime::Global()->command_queue().finish();
-                  VLOG(4) << "--- Find the sync event for the target cl "
-                             "tensor. ---";
-                  auto &event = *(it->second);
-                  event.wait();
-                } else {
-                  LOG(FATAL) << "Could not find the sync event for the target "
-                                "cl tensor.";
-                }
                // compute ref cpu
                act_compute_ref<float>(

--- a/lite/kernels/opencl/bilinear_interp_image_compute.cc
+++ b/lite/kernels/opencl/bilinear_interp_image_compute.cc
@@ -142,16 +142,14 @@ class BilinearInterpImageCompute
                    static_cast<cl::size_type>(default_work_size[1]),
                    static_cast<cl::size_type>(default_work_size[2])};
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_img, event_);
 #ifndef LITE_SHUTDOWN_LOG
    VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
            << global_work_size[1] << " " << global_work_size[2];
@@ -163,7 +161,6 @@ class BilinearInterpImageCompute
  std::string kernel_func_name_{"bilinear_interp"};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/bilinear_interp_image_compute_test.cc
+++ b/lite/kernels/opencl/bilinear_interp_image_compute_test.cc
@@ -187,18 +187,7 @@ TEST(bilinear_interp_image2d, compute) {
                  // LOG(INFO) << "out_image:" << out_image;
                  kernel->Launch();
-                  auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+                  CLRuntime::Global()->command_queue().finish();
-                  auto* out_ptr = param.Out->data<half_t, cl::Image2D>();
-                  auto it = wait_list->find(out_ptr);
-                  if (it != wait_list->end()) {
-                    VLOG(4) << "--- Find the sync event for the target cl "
-                               "tensor. ---";
-                    auto& event = *(it->second);
-                    event.wait();
-                  } else {
-                    LOG(FATAL) << "Could not find the sync event for the "
-                                  "target cl tensor.";
-                  }
                  std::unique_ptr<float[]> out_ref(
                      new float[out_dim.production()]);

--- a/lite/kernels/opencl/box_coder_image_compute.cc
+++ b/lite/kernels/opencl/box_coder_image_compute.cc
@@ -47,8 +47,10 @@ class BoxCoderComputeImage : public KernelLite<TARGET(kOpenCL),
    }
    CHECK(context.cl_context() != nullptr);
    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
-    context.cl_context()->AddKernel(
+    context.cl_context()->AddKernel(kernel_func_name_,
-        kernel_func_name_, "image/box_coder_kernel.cl", build_options_);
+                                    "image/box_coder_kernel.cl",
+                                    build_options_,
+                                    time_stamp_);
  }
  void Run() override {
@@ -81,7 +83,7 @@ class BoxCoderComputeImage : public KernelLite<TARGET(kOpenCL),
      auto& context = ctx_->As<OpenCLContext>();
      CHECK(context.cl_context() != nullptr);
      STL::stringstream kernel_key;
-      kernel_key << kernel_func_name_ << build_options_;
+      kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
      auto kernel = context.cl_context()->GetKernel(kernel_key.str());
      auto default_work_size =
@@ -120,16 +122,14 @@ class BoxCoderComputeImage : public KernelLite<TARGET(kOpenCL),
          cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
                      static_cast<cl::size_type>(default_work_size[2])};
-      event_ = std::shared_ptr<cl::Event>(new cl::Event);
      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
          kernel,
          cl::NullRange,
          global_work_size,
          cl::NullRange,
          nullptr,
-          event_.get());
+          nullptr);
      CL_CHECK_FATAL(status);
-      context.cl_wait_list()->emplace(out_buf, event_);
 #ifndef LITE_SHUTDOWN_LOG
      VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
@@ -142,7 +142,7 @@ class BoxCoderComputeImage : public KernelLite<TARGET(kOpenCL),
  param_t* boxcoder_param_{nullptr};
  std::string kernel_func_name_{};
  std::string build_options_{" -DCL_DTYPE_half"};
-  std::shared_ptr<cl::Event> event_{nullptr};
+  std::string time_stamp_{GetTimeStamp()};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/box_coder_image_compute_test.cc
+++ b/lite/kernels/opencl/box_coder_image_compute_test.cc
@@ -216,18 +216,7 @@ TEST(box_coder_image2d, compute) {
                out_image_shape[0], out_image_shape[1]);
            kernel->Launch();
-            auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+            CLRuntime::Global()->command_queue().finish();
-            auto* out_ptr = param.proposals->data<half_t, cl::Image2D>();
-            auto it = wait_list->find(out_ptr);
-            if (it != wait_list->end()) {
-              VLOG(4) << "--- Find the sync event for the target cl "
-                         "tensor. ---";
-              auto& event = *(it->second);
-              event.wait();
-            } else {
-              LOG(FATAL) << "Could not find the sync event for the "
-                            "target cl tensor.";
-            }
            lite::Tensor out_ref_tensor;
            out_ref_tensor.Resize(out_dim);

--- a/lite/kernels/opencl/concat_buffer_compute.cc
+++ b/lite/kernels/opencl/concat_buffer_compute.cc
@@ -123,16 +123,15 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, total1);
      CL_CHECK_FATAL(status);
-      event_ = std::shared_ptr<cl::Event>(new cl::Event);
      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
          kernel,
          cl::NullRange,
          global_work_size,
          cl::NullRange,
          nullptr,
-          event_.get());
+          nullptr);
      CL_CHECK_FATAL(status);
-      context.cl_wait_list()->emplace(out_buf, event_);
    } else {
      auto start = 0;
      for (int i = 0; i < inputs.size(); i++) {
@@ -157,16 +156,15 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, total0);
        CL_CHECK_FATAL(status);
-        event_ = std::shared_ptr<cl::Event>(new cl::Event);
        status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
            kernel,
            cl::NullRange,
            global_work_size,
            cl::NullRange,
            nullptr,
-            event_.get());
+            nullptr);
        CL_CHECK_FATAL(status);
-        context.cl_wait_list()->emplace(out_buf, event_);
        start += size;
      }
    }
@@ -182,7 +180,6 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
  std::string kernel_func_name_{};
  std::string build_options_{"-DCL_DTYPE_float"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/concat_buffer_compute_test.cc
+++ b/lite/kernels/opencl/concat_buffer_compute_test.cc
@@ -142,16 +142,7 @@ TEST(opencl_concat_buffer, compute) {
  kernel->SetContext(std::move(concat_context));
  kernel->Launch();
-  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
+  CLRuntime::Global()->command_queue().finish();
-  auto *out_ptr = param.output->data<float, cl::Buffer>();
-  auto it = wait_list->find(out_ptr);
-  if (it != wait_list->end()) {
-    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
-    auto &event = *(it->second);
-    event.wait();
-  } else {
-    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
-  }
  // run compute ref and check
  auto *out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));

--- a/lite/kernels/opencl/concat_image_compute.cc
+++ b/lite/kernels/opencl/concat_image_compute.cc
@@ -187,16 +187,15 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, width_);
      CL_CHECK_FATAL(status);
-      event_ = std::shared_ptr<cl::Event>(new cl::Event);
      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
          kernel,
          cl::NullRange,
          global_work_size,
          cl::NullRange,
          nullptr,
-          event_.get());
+          nullptr);
      CL_CHECK_FATAL(status);
-      context.cl_wait_list()->emplace(out_buf, event_);
    } else {
      auto start = 0;
      for (int i = 0; i < inputs.size(); i++) {
@@ -231,16 +230,15 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
        status = kernel.setArg(++arg_idx, width_);
        CL_CHECK_FATAL(status);
        CL_CHECK_FATAL(status);
-        event_ = std::shared_ptr<cl::Event>(new cl::Event);
        status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
            kernel,
            cl::NullRange,
            global_work_size,
            cl::NullRange,
            nullptr,
-            event_.get());
+            nullptr);
        CL_CHECK_FATAL(status);
-        context.cl_wait_list()->emplace(out_buf, event_);
        start += inputs[i]->dims()[axis_];
      }
    }
@@ -256,7 +254,6 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
  std::string kernel_func_name_{};
  std::string build_options_{" -DCL_DTYPE_half"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/concat_image_compute_test.cc
+++ b/lite/kernels/opencl/concat_image_compute_test.cc
@@ -245,20 +245,7 @@ TEST(concat_image2d, compute) {
            LOG(INFO) << "run kernel: img_to_buf_kernel";
            img_to_buf_kernel->Launch();
-            // wait for opencl
+            CLRuntime::Global()->command_queue().finish();
-            auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
-            auto *out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
-            auto it = wait_list->find(out_ptr);
-            if (it != wait_list->end()) {
-              VLOG(4) << "--- Find the sync event for the target cl "
-                         "tensor. ---";
-              auto &event = *(it->second);
-              event.wait();
-            } else {
-              LOG(FATAL) << "Could not find the sync event for the target "
-                            "cl tensor.";
-            }
            // compute ref cp_u
            std::vector<const float *> ins_ptr;

--- a/lite/kernels/opencl/conv_buffer_compute.cc
+++ b/lite/kernels/opencl/conv_buffer_compute.cc
@@ -205,7 +205,7 @@ void ConvCompute::GemmlikeConv2d() {
    CL_CHECK_FATAL(status);
    auto global_work_size = cl::NDRange{static_cast<size_t>(out_stride)};
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        img2col_kernel,
        cl::NullRange,
@@ -301,17 +301,14 @@ void ConvCompute::GemmBatched(cl::Kernel& kernel,
  status = kernel.setArg(++arg_idx, batch_size);
  CL_CHECK_FATAL(status);
-  event_ = std::shared_ptr<cl::Event>(new cl::Event);
  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      local_work_size,
      nullptr,
-      event_.get());
+      nullptr);
  CL_CHECK_FATAL(status);
-  context.cl_wait_list()->emplace(output_d, event_);
 }
 void ConvCompute::Run() { (this->*impl_)(); }

--- a/lite/kernels/opencl/conv_buffer_compute.h
+++ b/lite/kernels/opencl/conv_buffer_compute.h
@@ -57,7 +57,6 @@ class ConvCompute
  std::vector<std::string> kernel_func_paths_{};
  std::vector<std::string> build_options_{};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/conv_buffer_compute_test.cc
+++ b/lite/kernels/opencl/conv_buffer_compute_test.cc
@@ -304,25 +304,14 @@ TEST(conv2d, compute_conv2d_1x1) {
                // run opencl kernel
                kernel->Launch();
-                auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+                CLRuntime::Global()->command_queue().finish();
-                auto* out_ptr = param.output->data<float, cl::Buffer>();
+                // double start_nanos =
-                auto it = wait_list->find(out_ptr);
+                //     event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
-                if (it != wait_list->end()) {
+                // double stop_nanos =
-                  VLOG(4) << "--- Find the sync event for the target cl "
+                //     event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
-                             "tensor. ---";
+                // double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
-                  auto& event = *(it->second);
+                // LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros
-                  event.wait();
+                //           << " us.";
-                  double start_nanos =
-                      event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
-                  double stop_nanos =
-                      event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
-                  double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
-                  LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros
-                            << " us.";
-                } else {
-                  LOG(FATAL) << "Could not find the sync event for the target "
-                                "cl tensor.";
-                }
                // run cpu ref
                auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
@@ -536,25 +525,15 @@ TEST(conv2d, compute_conv2d_gemm) {
                // run opencl kernel
                kernel->Launch();
-                auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+                CLRuntime::Global()->command_queue().finish();
-                auto* out_ptr = param.output->data<float, cl::Buffer>();
+                // double start_nanos =
-                auto it = wait_list->find(out_ptr);
+                //     event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
-                if (it != wait_list->end()) {
+                // double stop_nanos =
-                  VLOG(4) << "--- Find the sync event for the target cl "
+                //     event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
-                             "tensor. ---";
+                // double elapsed_micros = (stop_nanos - start_nanos) /
-                  auto& event = *(it->second);
+                // 1000.0;
-                  event.wait();
+                // LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros
-                  double start_nanos =
+                //           << " us.";
-                      event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
-                  double stop_nanos =
-                      event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
-                  double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
-                  LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros
-                            << " us.";
-                } else {
-                  LOG(FATAL) << "Could not find the sync event for the target "
-                                "cl tensor.";
-                }
                // run cpu ref
                auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));

--- a/lite/kernels/opencl/conv_image_compute.cc
+++ b/lite/kernels/opencl/conv_image_compute.cc
--- a/lite/kernels/opencl/conv_image_compute.h
+++ b/lite/kernels/opencl/conv_image_compute.h
@@ -58,9 +58,11 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
  std::vector<std::string> kernel_func_paths_{};
  std::vector<std::string> build_options_{};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
  std::unique_ptr<Tensor> filter_gpu_image_{nullptr};
  std::unique_ptr<Tensor> bias_gpu_image_{nullptr};
+  std::unique_ptr<Tensor> tensor_hold_filter_image_{nullptr};
+  std::unique_ptr<Tensor> tensor_hold_bias_image_{nullptr};
  cl::NDRange global_work_size_ = cl::NDRange{
      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
  int c_blk_ = 1;

--- a/lite/kernels/opencl/conv_image_compute_test.cc
+++ b/lite/kernels/opencl/conv_image_compute_test.cc
@@ -395,19 +395,7 @@ TEST(conv2d, compute_image2d_1x1) {
              auto* output_image2d = output.mutable_data<half_t, cl::Image2D>(
                  out_image_width, out_image_height);
-              auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+              CLRuntime::Global()->command_queue().finish();
-              auto* out_ptr = param.output->data<half_t, cl::Image2D>();
-              auto it = wait_list->find(out_ptr);
-              if (it != wait_list->end()) {
-                SHADOW_LOG << "--- Find the sync event for the target cl "
-                              "tensor. ---";
-                auto& event = *(it->second);
-                event.wait();
-              } else {
-                LOG(FATAL) << "Could not find the sync event for the target"
-                              "cl tensor.";
-              }
              TargetWrapperCL::ImgcpySync(out_image_v.data(),
                                          output.data<half_t, cl::Image2D>(),
@@ -530,11 +518,11 @@ const int stride = 2;
                const int iw = 3;
                const int oc = 2;
 #else  // big scale with group
-  const int stride = 1;
+  const int stride = 2;
-  const int group = 32 / 1;
+  const int group = 1;
-  const int batch_size = 2;
+  const int batch_size = 1;
-  const int ic = 32 / 1;
+  const int ic = 3 / 1;
-  const int ih = 112 / 1;
+  const int ih = 224 / 1;
  const int iw = 112 / 1;
  const int oc = 32 / 1;
 #endif
@@ -652,10 +640,10 @@ const int stride = 2;
              SHADOW_LOG << "gen input and filter ...";
              for (int i = 0; i < input_v.size(); ++i) {
-                input_v[i] = i * 0.001;  // gen(engine);
+                input_v[i] = gen(engine);
              }
              for (int i = 0; i < filter_v.size(); ++i) {
-                filter_v[i] = 1 * 0.001;  // gen(engine);
+                filter_v[i] = gen(engine);
              }
              SHADOW_LOG << "after gen input and filter ...";
@@ -763,20 +751,7 @@ const int stride = 2;
              auto* output_image2d = output.mutable_data<half_t, cl::Image2D>(
                  out_image_width, out_image_height);
-              auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+              CLRuntime::Global()->command_queue().finish();
-              auto* out_ptr = param.output->data<half_t, cl::Image2D>();
-              auto it = wait_list->find(out_ptr);
-              if (it != wait_list->end()) {
-                SHADOW_LOG << "--- Find the sync event for the target cl "
-                              "tensor. ---";
-                auto& event = *(it->second);
-                event.wait();
-              } else {
-                LOG(FATAL) << "Could not find the sync event for the target "
-                              "cl tensor.";
-              }
              TargetWrapperCL::ImgcpySync(out_image_v.data(),
                                          output.data<half_t, cl::Image2D>(),
                                          out_image_width,
@@ -848,8 +823,13 @@ const int stride = 2;
              for (int i = 0; i < out_dim.production(); i++) {
                auto relative_diff =
                    COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]);
-                EXPECT_LT(relative_diff, FP16_MAX_DIFF);
+                auto abs_diff = COMPUTE_ABS_DIFF(output_v[i], out_ref_data[i]);
-                if (relative_diff > FP16_MAX_DIFF) {
+                // EXPECT_LT(relative_diff, FP16_MAX_DIFF);
+                // EXPECT_LT(abs_diff, FP16_ABS_DIFF);
+                EXPECT_FALSE(relative_diff > FP16_MAX_DIFF &&
+                             abs_diff > FP16_ABS_DIFF);
+                if (relative_diff > FP16_MAX_DIFF && abs_diff > FP16_ABS_DIFF) {
                  LOG(FATAL) << "error idx:" << i << "output_v[" << i
                             << "]:" << output_v[i] << " "
                                                       "out_ref_data["
@@ -1115,19 +1095,7 @@ TEST(conv2d, compute_image2d_5x5) {
              auto* output_image2d = output.mutable_data<half_t, cl::Image2D>(
                  out_image_width, out_image_height);
-              auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+              CLRuntime::Global()->command_queue().finish();
-              auto* out_ptr = param.output->data<half_t, cl::Image2D>();
-              auto it = wait_list->find(out_ptr);
-              if (it != wait_list->end()) {
-                SHADOW_LOG << "--- Find the sync event for the target cl "
-                              "tensor. ---";
-                auto& event = *(it->second);
-                event.wait();
-              } else {
-                LOG(FATAL) << "Could not find the sync event for the target "
-                              "cl tensor.";
-              }
              TargetWrapperCL::ImgcpySync(out_image_v.data(),
                                          output.data<half_t, cl::Image2D>(),
@@ -1468,19 +1436,7 @@ TEST(conv2d, compute_image2d_7x7) {
              auto* output_image2d = output.mutable_data<half_t, cl::Image2D>(
                  out_image_width, out_image_height);
-              auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+              CLRuntime::Global()->command_queue().finish();
-              auto* out_ptr = param.output->data<half_t, cl::Image2D>();
-              auto it = wait_list->find(out_ptr);
-              if (it != wait_list->end()) {
-                SHADOW_LOG << "--- Find the sync event for the target cl "
-                              "tensor. ---";
-                auto& event = *(it->second);
-                event.wait();
-              } else {
-                LOG(FATAL) << "Could not find the sync event for the target "
-                              "cl tensor.";
-              }
              TargetWrapperCL::ImgcpySync(out_image_v.data(),
                                          output.data<half_t, cl::Image2D>(),

--- a/lite/kernels/opencl/depthwise_conv2d_buffer_compute.cc
+++ b/lite/kernels/opencl/depthwise_conv2d_buffer_compute.cc
@@ -108,23 +108,21 @@ class DepthwiseConv2dCompute
    status = kernel.setArg(++arg_idx, *bias_buf);
    CL_CHECK_FATAL(status);
    auto global_work_size = cl::NDRange(static_cast<size_t>(numel));
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(output_buf, event_);
  }
 private:
  std::string kernel_func_name_{"depthwise_conv2d"};
  std::string build_options_{"-DCL_DTYPE_float"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/depthwise_conv2d_buffer_compute_test.cc
+++ b/lite/kernels/opencl/depthwise_conv2d_buffer_compute_test.cc
@@ -137,16 +137,7 @@ TEST(depthwise_conv2d_buffer_fp32, compute) {
  output.Resize({4, 32, 110, 110});
  kernel->Launch();
-  auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+  CLRuntime::Global()->command_queue().finish();
-  auto* out_ptr = param.output->data<float, cl::Buffer>();
-  auto it = wait_list->find(out_ptr);
-  if (it != wait_list->end()) {
-    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
-    auto& event = *(it->second);
-    event.wait();
-  } else {
-    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
-  }
  lite::Tensor output_ref;
  output_ref.Resize({4, 32, 110, 110});

--- a/lite/kernels/opencl/depthwise_conv2d_image_compute_test.cc
+++ b/lite/kernels/opencl/depthwise_conv2d_image_compute_test.cc
@@ -312,19 +312,7 @@ TEST(depthwise_conv2d, compute_basic) {
          auto* output_image2d = output.mutable_data<half_t, cl::Image2D>(
              out_image_width, out_image_height);
-          auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+          CLRuntime::Global()->command_queue().finish();
-          auto* out_ptr = param.output->data<half_t, cl::Image2D>();
-          auto it = wait_list->find(out_ptr);
-          if (it != wait_list->end()) {
-            VLOG(4) << "--- Find the sync event for the target cl "
-                       "tensor. ---";
-            auto& event = *(it->second);
-            event.wait();
-          } else {
-            LOG(FATAL) << "Could not find the sync event for the target "
-                          "cl tensor.";
-          }
          TargetWrapperCL::ImgcpySync(out_image_v.data(),
                                      output.data<half_t, cl::Image2D>(),
@@ -503,20 +491,7 @@ TEST(depthwise_conv2d, compute_image2d_3x3) {
        kernel->Launch();
-        auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+        CLRuntime::Global()->command_queue().finish();
-        auto* out_ptr = param.output->data<half_t, cl::Image2D>();
-        auto it = wait_list->find(out_ptr);
-        if (it != wait_list->end()) {
-          VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
-          LOG(INFO) << "--- Find the sync event for the target cl tensor. ---";
-          auto& event = *(it->second);
-          event.wait();
-        } else {
-          LOG(FATAL)
-              << "Could not find the sync event for the target cl tensor.";
-          LOG(INFO)
-              << "Could not find the sync event for the target cl tensor.";
-        }
        lite::Tensor out_ref;
        out_ref.Resize(output_dim);

--- a/lite/kernels/opencl/dropout_image_compute.cc
+++ b/lite/kernels/opencl/dropout_image_compute.cc
@@ -89,23 +89,20 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL),
                    static_cast<cl::size_type>(default_work_size.data()[1]),
                    static_cast<cl::size_type>(default_work_size.data()[2])};
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_img, event_);
  }
 private:
  std::string kernel_func_name_{"dropout"};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/dropout_image_compute_test.cc
+++ b/lite/kernels/opencl/dropout_image_compute_test.cc
@@ -86,16 +86,7 @@ TEST(dropout_image2d_fp16, compute) {
  LOG(INFO) << "out_image:" << out_image;
  kernel->Launch();
-  auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+  CLRuntime::Global()->command_queue().finish();
-  auto* out_ptr = param.output->data<half_t, cl::Image2D>();
-  auto it = wait_list->find(out_ptr);
-  if (it != wait_list->end()) {
-    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
-    auto& event = *(it->second);
-    event.wait();
-  } else {
-    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
-  }
  std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
  dropout(input_v.data(), in_dim, out_ref.get(), 0.6);

--- a/lite/kernels/opencl/elementwise_add_buffer_compute.cc
+++ b/lite/kernels/opencl/elementwise_add_buffer_compute.cc
@@ -63,16 +63,10 @@ void ElementwiseAddCompute::Run() {
  CL_CHECK_FATAL(status);
  auto global_work_size = cl::NDRange{channels_, batch_};
-  event_ = std::shared_ptr<cl::Event>(new cl::Event);
  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-      kernel,
+      kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
-      cl::NullRange,
-      global_work_size,
-      cl::NullRange,
-      nullptr,
-      event_.get());
  CL_CHECK_FATAL(status);
-  context.cl_wait_list()->emplace(out_buf, event_);
 }
 void ElementwiseAddCompute::UpdateParams() {

--- a/lite/kernels/opencl/elementwise_add_buffer_compute.h
+++ b/lite/kernels/opencl/elementwise_add_buffer_compute.h
@@ -48,7 +48,6 @@ class ElementwiseAddCompute
  std::string kernel_func_name_{"elementwise_add"};
  std::string build_options_{"-DCL_DTYPE_float"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/elementwise_add_buffer_compute_test.cc
+++ b/lite/kernels/opencl/elementwise_add_buffer_compute_test.cc
@@ -144,16 +144,7 @@ TEST(elementwise_add_buffer, compute) {
  kernel->Launch();
-  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
+  CLRuntime::Global()->command_queue().finish();
-  auto *out_ptr = param.Out->data<float, cl::Buffer>();
-  auto it = wait_list->find(out_ptr);
-  if (it != wait_list->end()) {
-    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
-    auto &event = *(it->second);
-    event.wait();
-  } else {
-    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
-  }
  std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
  elementwise_compute_ref<float>(
@@ -225,16 +216,7 @@ TEST(fusion_elementwise_add_activation_buffer, compute) {
  kernel->Launch();
-  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
+  CLRuntime::Global()->command_queue().finish();
-  auto *out_ptr = param.Out->data<float, cl::Buffer>();
-  auto it = wait_list->find(out_ptr);
-  if (it != wait_list->end()) {
-    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
-    auto &event = *(it->second);
-    event.wait();
-  } else {
-    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
-  }
  std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
  elementwise_compute_ref<float>(

--- a/lite/kernels/opencl/elementwise_add_image_compute.cc
+++ b/lite/kernels/opencl/elementwise_add_image_compute.cc
@@ -153,16 +153,15 @@ void ElementwiseAddImageCompute::Run() {
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
-  event_ = std::shared_ptr<cl::Event>(new cl::Event);
  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size_,
      cl::NullRange,
      nullptr,
-      event_.get());
+      nullptr);
  CL_CHECK_FATAL(status);
-  context.cl_wait_list()->emplace(out_img, event_);
 }
 }  // namespace opencl

--- a/lite/kernels/opencl/elementwise_add_image_compute.h
+++ b/lite/kernels/opencl/elementwise_add_image_compute.h
@@ -63,7 +63,6 @@ class ElementwiseAddImageCompute
  cl::Kernel kernel_;
  cl::NDRange global_work_size_ = cl::NDRange{
      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/elementwise_mul_compute.cc
+++ b/lite/kernels/opencl/elementwise_mul_compute.cc
@@ -50,8 +50,10 @@ void ElementwiseMulFloatImageCompute::PrepareForRun() {
  VLOG(4) << "y_dims.size():" << y_dims.size();
  auto& context = ctx_->As<OpenCLContext>();
-  context.cl_context()->AddKernel(
+  context.cl_context()->AddKernel(kernel_func_name_,
-      kernel_func_name_, "image/elementwise_mul_kernel.cl", build_options_);
+                                  "image/elementwise_mul_kernel.cl",
+                                  build_options_,
+                                  time_stamp_);
 }
 void ElementwiseMulFloatImageCompute::Run() {
@@ -88,7 +90,7 @@ void ElementwiseMulFloatImageCompute::Run() {
          << out_img_shape[1];
  STL::stringstream kernel_key;
-  kernel_key << kernel_func_name_ << build_options_;
+  kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  int arg_idx = 0;
@@ -150,16 +152,16 @@ void ElementwiseMulFloatImageCompute::Run() {
  auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width),
                                      static_cast<cl::size_type>(x_img_height)};
-  event_ = std::shared_ptr<cl::Event>(new cl::Event);
  auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      cl::NullRange,
      nullptr,
-      event_.get());
+      nullptr);
  CL_CHECK_FATAL(status);
-  context.cl_wait_list()->emplace(out_img, event_);
+  std::string time_stamp_{GetTimeStamp()};
  VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
 }

--- a/lite/kernels/opencl/elementwise_mul_image_compute.cc
+++ b/lite/kernels/opencl/elementwise_mul_image_compute.cc
@@ -185,16 +185,15 @@ class ElementwiseMulImageCompute
    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(x_img_width),
                    static_cast<cl::size_type>(x_img_height)};
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_img, event_);
 #ifndef LITE_SHUTDOWN_LOG
    VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
 #endif
@@ -205,7 +204,6 @@ class ElementwiseMulImageCompute
  std::string kernel_func_name_{"elementwise_mul"};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/elementwise_sub_image_compute.cc
+++ b/lite/kernels/opencl/elementwise_sub_image_compute.cc
@@ -138,16 +138,9 @@ void ElementwiseSubImageCompute::Run() {
  VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
 #endif
-  event_ = std::shared_ptr<cl::Event>(new cl::Event);
  auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
-      kernel,
+      kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
-      cl::NullRange,
-      global_work_size,
-      cl::NullRange,
-      nullptr,
-      event_.get());
  CL_CHECK_FATAL(status);
-  context.cl_wait_list()->emplace(out_img, event_);
 }
 }  // namespace opencl

--- a/lite/kernels/opencl/elementwise_sub_image_compute.h
+++ b/lite/kernels/opencl/elementwise_sub_image_compute.h
@@ -46,7 +46,6 @@ class ElementwiseSubImageCompute
  std::string kernel_func_name_{"elementwise_sub"};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/fc_buffer_compute.cc
+++ b/lite/kernels/opencl/fc_buffer_compute.cc
@@ -123,16 +123,15 @@ class FcCompute
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size_,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_buf, event_);
  }
 private:
@@ -145,7 +144,6 @@ class FcCompute
  DDim last_x_dims_;
  cl::NDRange global_work_size_;
  cl::Kernel kernel_;
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/fc_buffer_compute_test.cc
+++ b/lite/kernels/opencl/fc_buffer_compute_test.cc
@@ -162,17 +162,8 @@ TEST(fc, compute) {
        // run opencl kernel
        kernel->Launch();
-        //       kernel->Launch();
+        CLRuntime::Global()->command_queue().finish();
-        auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
-        auto* out_ptr = param.output->data<float, cl::Buffer>();
-        auto it = wait_list->find(out_ptr);
-        if (it != wait_list->end()) {
-          VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
-          auto& event = *(it->second);
-          event.wait();
-          auto command_queue = CLRuntime::Global()->command_queue();
-          command_queue.finish();
 #if 0
          double start_nanos =
              event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
@@ -181,10 +172,6 @@ TEST(fc, compute) {
          double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
          LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us.";
 #endif
-        } else {
-          LOG(FATAL)
-              << "Could not find the sync event for the target cl tensor.";
-        }
        std::vector<float> out_data_from_gpu(out_dim.production());
        TargetWrapperCL::MemcpySync(

--- a/lite/kernels/opencl/grid_sampler_image_compute.cc
+++ b/lite/kernels/opencl/grid_sampler_image_compute.cc
@@ -130,16 +130,15 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size_,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_img, event_);
  }
 protected:
@@ -154,7 +153,6 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
      static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/grid_sampler_image_compute_test.cc
+++ b/lite/kernels/opencl/grid_sampler_image_compute_test.cc
@@ -191,17 +191,7 @@ TEST(grid_samler_image2d, compute) {
          // LOG(INFO) << "out_image:" << out_image;
          kernel->Launch();
-          auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+          CLRuntime::Global()->command_queue().finish();
-          auto* out_ptr = param.out->data<half_t, cl::Image2D>();
-          auto it = wait_list->find(out_ptr);
-          if (it != wait_list->end()) {
-            VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
-            auto& event = *(it->second);
-            event.wait();
-          } else {
-            LOG(FATAL)
-                << "Could not find the sync event for the target cl tensor.";
-          }
          std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
          gird_sampler_ref(

--- a/lite/kernels/opencl/instance_norm_image_compute.cc
+++ b/lite/kernels/opencl/instance_norm_image_compute.cc
@@ -137,16 +137,14 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
    status = kernel.setArg(7, *out_img);
    CL_CHECK_FATAL(status);
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        local_work_size,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_img, event_);
  }
 #else  // paddle version
@@ -260,16 +258,14 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
    status = kernel.setArg(arg_idx++, in_w);
    CL_CHECK_FATAL(status);
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        local_work_size,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_img, event_);
  }
 #endif
@@ -278,7 +274,7 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
  std::string kernel_func_name_{"instance_norm_onnx"};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
  Tensor scale_image_;
  Tensor bias_image_;
 };

--- a/lite/kernels/opencl/io_copy_buffer_compute.cc
+++ b/lite/kernels/opencl/io_copy_buffer_compute.cc
@@ -105,20 +105,11 @@ class IoCopykOpenCLToHostCompute
    }
    auto& context = ctx_->As<OpenCLContext>();
-    auto* wait_list = context.cl_wait_list();
-    auto it = wait_list->find(x_ptr);
-    if (it != wait_list->end()) {
 #ifndef LITE_SHUTDOWN_LOG
    VLOG(2) << "--- Find the sync event for the target cl tensor. ---";
 #endif
-      auto& event = *(it->second);
+    CLRuntime::Global()->command_queue().finish();
-      event.wait();
-      auto command_queue = CLRuntime::Global()->command_queue();
-      command_queue.finish();
-    } else {
-      LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
-    }
    CopyToHostSync(data, param.x->raw_data(), mem_size);
  }

--- a/lite/kernels/opencl/io_copy_buffer_compute_test.cc
+++ b/lite/kernels/opencl/io_copy_buffer_compute_test.cc
@@ -66,7 +66,6 @@ TEST(io_copy, compute) {
  h2d_kernel->Launch();
  auto* event_key = d_y.data<float, cl::Buffer>();
  std::shared_ptr<cl::Event> event(new cl::Event);
-  context->As<OpenCLContext>().cl_wait_list()->emplace(event_key, event);
  d2h_kernel->Launch();
  auto* h_y_data = h_y.data<float>();

--- a/lite/kernels/opencl/layout_image_compute.cc
+++ b/lite/kernels/opencl/layout_image_compute.cc
@@ -44,8 +44,10 @@ class LayoutComputeBufferChwToImageDefault
    }
    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
    auto& context = ctx_->As<OpenCLContext>();
-    context.cl_context()->AddKernel(
+    context.cl_context()->AddKernel(kernel_func_name_,
-        kernel_func_name_, "image/layout_kernel.cl", build_options_);
+                                    "image/layout_kernel.cl",
+                                    build_options_,
+                                    time_stamp_);
  }
  void Run() override {
@@ -95,7 +97,7 @@ class LayoutComputeBufferChwToImageDefault
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
-    kernel_key << kernel_func_name_ << build_options_;
+    kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
    int arg_idx = 0;
@@ -122,16 +124,15 @@ class LayoutComputeBufferChwToImageDefault
        cl::NDRange{static_cast<cl::size_type>((new_dims[1] + 3) / 4),
                    static_cast<cl::size_type>(new_dims[3]),
                    static_cast<cl::size_type>(new_dims[0] * new_dims[2])};
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(y_data, event_);
  }
  std::string doc() const override {
@@ -140,9 +141,9 @@ class LayoutComputeBufferChwToImageDefault
  }
 private:
+  std::string time_stamp_{GetTimeStamp()};
  std::string kernel_func_name_{"buffer_to_image2d"};
  std::string build_options_{"-DCL_DTYPE_float"};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 // [ImageDefault] -> [NCHW]
@@ -158,8 +159,10 @@ class LayoutComputeImageDefaultToBufferChw
    }
    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
    auto& context = ctx_->As<OpenCLContext>();
-    context.cl_context()->AddKernel(
+    context.cl_context()->AddKernel(kernel_func_name_,
-        kernel_func_name_, "image/layout_kernel.cl", build_options_);
+                                    "image/layout_kernel.cl",
+                                    build_options_,
+                                    time_stamp_);
  }
  void Run() override {
@@ -202,7 +205,7 @@ class LayoutComputeImageDefaultToBufferChw
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
-    kernel_key << kernel_func_name_ << build_options_;
+    kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
    int arg_idx = 0;
@@ -230,16 +233,15 @@ class LayoutComputeImageDefaultToBufferChw
        cl::NDRange{static_cast<cl::size_type>((new_dims[1] + 3) / 4),
                    static_cast<cl::size_type>(new_dims[3]),
                    static_cast<cl::size_type>(new_dims[0] * new_dims[2])};
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(y_data, event_);
  }
  std::string doc() const override {
@@ -248,9 +250,9 @@ class LayoutComputeImageDefaultToBufferChw
  }
 private:
+  std::string time_stamp_{GetTimeStamp()};
  std::string kernel_func_name_{"image2d_to_buffer"};
  std::string build_options_{"-DCL_DTYPE_float"};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 // [NCHW] -> [ImageDW]
@@ -263,8 +265,10 @@ class LayoutComputeBufferChwToImage2DNw
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
-    context.cl_context()->AddKernel(
+    context.cl_context()->AddKernel(kernel_func_name_,
-        kernel_func_name_, "buffer/layout_kernel.cl", build_options_);
+                                    "buffer/layout_kernel.cl",
+                                    build_options_,
+                                    time_stamp_);
  }
  void Run() override {
@@ -298,7 +302,7 @@ class LayoutComputeBufferChwToImage2DNw
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
-    kernel_key << kernel_func_name_ << build_options_;
+    kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
    int arg_idx = 0;
@@ -325,16 +329,15 @@ class LayoutComputeBufferChwToImage2DNw
        cl::NDRange{static_cast<cl::size_type>((out_N + 3) / 4),  // N blocks
                    static_cast<cl::size_type>(out_W),            // w
                    static_cast<cl::size_type>(out_C * out_H)};   // ch
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(y_data, event_);
  }
  std::string doc() const override {
@@ -342,9 +345,10 @@ class LayoutComputeBufferChwToImage2DNw
  }
 private:
+  std::string time_stamp_{GetTimeStamp()};
  std::string kernel_func_name_{"buffer_to_image2d_nw"};
  std::string build_options_{"-DCL_DTYPE_float "};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/layout_image_compute_test.cc
+++ b/lite/kernels/opencl/layout_image_compute_test.cc
@@ -246,20 +246,7 @@ TEST(layout_ImageDefault_With_Pre_Post, compute) {
          LOG(INFO) << "run kernel: image2d_to_buffer_with_post255";
          img_to_buf_kernel->Launch();
-          // wait for opencl
+          CLRuntime::Global()->command_queue().finish();
-          auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
-          auto* out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
-          auto it = wait_list->find(out_ptr);
-          if (it != wait_list->end()) {
-            VLOG(4) << "--- Find the sync event for the target cl "
-                       "tensor. ---";
-            auto& event = *(it->second);
-            event.wait();
-          } else {
-            LOG(FATAL) << "Could not find the sync event for the target "
-                          "cl tensor.";
-          }
 // result
 #ifdef PRINT_RESULT

--- a/lite/kernels/opencl/lrn_image_compute.cc
+++ b/lite/kernels/opencl/lrn_image_compute.cc
@@ -128,16 +128,14 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
                    static_cast<cl::size_type>(default_work_size[1]),
                    static_cast<cl::size_type>(default_work_size[2])};
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_img, event_);
 #ifndef LITE_SHUTDOWN_LOG
    VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
            << global_work_size[1] << " " << global_work_size[2];
@@ -154,7 +152,6 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
  std::string kernel_func_name_{"lrn"};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/lrn_image_compute_test.cc
+++ b/lite/kernels/opencl/lrn_image_compute_test.cc
@@ -181,19 +181,7 @@ TEST(lrn_image2d, compute) {
                    // LOG(INFO) << "out_image:" << out_image;
                    kernel->Launch();
-                    auto* wait_list =
+                    CLRuntime::Global()->command_queue().finish();
-                        context->As<OpenCLContext>().cl_wait_list();
-                    auto* out_ptr = param.Out->data<half_t, cl::Image2D>();
-                    auto it = wait_list->find(out_ptr);
-                    if (it != wait_list->end()) {
-                      VLOG(4) << "--- Find the sync event for the target cl "
-                                 "tensor. ---";
-                      auto& event = *(it->second);
-                      event.wait();
-                    } else {
-                      LOG(FATAL) << "Could not find the sync event for the "
-                                    "target cl tensor.";
-                    }
                    std::unique_ptr<float[]> out_ref(
                        new float[out_dim.production()]);

--- a/lite/kernels/opencl/mul_buffer_compute.cc
+++ b/lite/kernels/opencl/mul_buffer_compute.cc
@@ -91,16 +91,15 @@ class MulCompute
    auto global_work_size = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
                                        static_cast<size_t>((n_ + 3) / 4)};
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_buf, event_);
  }
 private:
@@ -108,7 +107,6 @@ class MulCompute
  std::string kernel_func_name_{"mat_mul"};
  std::string build_options_{"-DCL_DTYPE_float"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/mul_buffer_compute_test.cc
+++ b/lite/kernels/opencl/mul_buffer_compute_test.cc
@@ -123,17 +123,7 @@ TEST(mul, compute) {
        // run opencl kernel
        kernel->Launch();
-        auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+        CLRuntime::Global()->command_queue().finish();
-        auto* out_ptr = param.output->data<float, cl::Buffer>();
-        auto it = wait_list->find(out_ptr);
-        if (it != wait_list->end()) {
-          VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
-          auto& event = *(it->second);
-          event.wait();
-        } else {
-          LOG(FATAL)
-              << "Could not find the sync event for the target cl tensor.";
-        }
        // run cpu ref
        auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));

--- a/lite/kernels/opencl/nearest_interp_image_compute.cc
+++ b/lite/kernels/opencl/nearest_interp_image_compute.cc
@@ -109,23 +109,21 @@ class NearestInterpComputeImageDefault
        cl::NDRange{static_cast<cl::size_type>(default_work_size.data()[0]),
                    static_cast<cl::size_type>(default_work_size.data()[1]),
                    static_cast<cl::size_type>(default_work_size.data()[2])};
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_img, event_);
  }
 private:
  std::string kernel_func_name_{"nearest_interp"};
  std::string build_options_{" -DCL_DTYPE_half"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/nearest_interp_image_compute_test.cc
+++ b/lite/kernels/opencl/nearest_interp_image_compute_test.cc
@@ -208,20 +208,7 @@ TEST(nearest_interp_image2d, compute) {
              LOG(INFO) << "run kernel: img_to_buf_kernel";
              img_to_buf_kernel->Launch();
-              // wait for opencl
+              CLRuntime::Global()->command_queue().finish();
-              auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
-              auto *out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
-              auto it = wait_list->find(out_ptr);
-              if (it != wait_list->end()) {
-                VLOG(4) << "--- Find the sync event for the target cl "
-                           "tensor. ---";
-                auto &event = *(it->second);
-                event.wait();
-              } else {
-                LOG(FATAL) << "Could not find the sync event for the target "
-                              "cl tensor.";
-              }
              // compute ref cpu
              for (int nid = 0; nid < x_dim[0]; ++nid) {

--- a/lite/kernels/opencl/pad2d_image_compute.cc
+++ b/lite/kernels/opencl/pad2d_image_compute.cc
@@ -142,16 +142,14 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
                    static_cast<cl::size_type>(default_work_size[1]),
                    static_cast<cl::size_type>(default_work_size[2])};
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_img, event_);
 #ifndef LITE_SHUTDOWN_LOG
    VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
            << global_work_size[1] << " " << global_work_size[2];
@@ -163,7 +161,6 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
  std::string kernel_func_name_{};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/pad2d_image_compute_test.cc
+++ b/lite/kernels/opencl/pad2d_image_compute_test.cc
@@ -262,22 +262,8 @@ TEST(pad2d_image2d, compute) {
                      img_to_buf_kernel->Launch();
                      // wait for opencl
-                      auto *wait_list =
-                          context->As<OpenCLContext>().cl_wait_list();
-                      auto *out_ptr =
-                          ImageToBufferParam.y->data<float, cl::Buffer>();
-                      auto it = wait_list->find(out_ptr);
-                      if (it != wait_list->end()) {
+                      CLRuntime::Global()->command_queue().finish();
-                        VLOG(4) << "--- Find the sync event for the target cl "
-                                   "tensor. ---";
-                        auto &event = *(it->second);
-                        event.wait();
-                      } else {
-                        LOG(FATAL)
-                            << "Could not find the sync event for the target "
-                               "cl tensor.";
-                      }
                      // compute ref cpu
                      pad2d_ref(mapped_x,

--- a/lite/kernels/opencl/pool_buffer_compute.cc
+++ b/lite/kernels/opencl/pool_buffer_compute.cc
@@ -105,23 +105,21 @@ class PoolCompute
    status = kernel.setArg(++arg_idx, *output_buf);
    CL_CHECK_FATAL(status);
    auto global_work_size = cl::NDRange(static_cast<size_t>(numel));
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(output_buf, event_);
  }
 private:
  std::string kernel_func_name_{"pool_"};
  std::string build_options_{"-DCL_DTYPE_float"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/pool_buffer_compute_test.cc
+++ b/lite/kernels/opencl/pool_buffer_compute_test.cc
@@ -119,16 +119,7 @@ TEST(pool2d_buffer_fp32, compute) {
  kernel->Launch();
-  auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+  CLRuntime::Global()->command_queue().finish();
-  auto* out_ptr = param.output->data<float, cl::Buffer>();
-  auto it = wait_list->find(out_ptr);
-  if (it != wait_list->end()) {
-    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
-    auto& event = *(it->second);
-    event.wait();
-  } else {
-    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
-  }
  std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
  pool_avg(0, 0, 1, 1, 7, 7, mapped_x, in_dim, out_ref.get(), out_dim);

--- a/lite/kernels/opencl/pool_image_compute.cc
+++ b/lite/kernels/opencl/pool_image_compute.cc
@@ -150,23 +150,20 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
    status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[0]));
    CL_CHECK_FATAL(status);
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_img, event_);
  }
 private:
  std::string kernel_func_name_{"pool_"};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/pool_image_compute_test.cc
+++ b/lite/kernels/opencl/pool_image_compute_test.cc
@@ -137,16 +137,7 @@ TEST(pool2d_image2d, compute) {
  LOG(INFO) << "out_image:" << out_image;
  kernel->Launch();
-  auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+  CLRuntime::Global()->command_queue().finish();
-  auto* out_ptr = param.output->data<half_t, cl::Image2D>();
-  auto it = wait_list->find(out_ptr);
-  if (it != wait_list->end()) {
-    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
-    auto& event = *(it->second);
-    event.wait();
-  } else {
-    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
-  }
  std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
  pool_avg(0, 0, 1, 1, 7, 7, input_v.data(), in_dim, out_ref.get(), out_dim);

--- a/lite/kernels/opencl/reshape_image_compute.cc
+++ b/lite/kernels/opencl/reshape_image_compute.cc
@@ -154,23 +154,20 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
                    static_cast<size_t>(default_work_size.data()[1]),
                    static_cast<size_t>(default_work_size.data()[2])};
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_image, event_);
  }
 private:
  std::string kernel_func_name_{"reshape"};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/reshape_image_compute_test.cc
+++ b/lite/kernels/opencl/reshape_image_compute_test.cc
@@ -184,17 +184,7 @@ TEST(reshape_opencl, compute) {
  LOG(INFO) << "kernel launch ...";
  kernel->Launch();
-  auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+  CLRuntime::Global()->command_queue().finish();
-  auto* out_ptr = param.output->data<half_t, cl::Image2D>();
-  auto it = wait_list->find(out_image);
-  if (it != wait_list->end()) {
-    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
-    auto& event = *(it->second);
-    event.wait();
-  } else {
-    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
-  }
  half_t* out_image_data = new half_t[out_image_shape.production() * 4];
  TargetWrapperCL::ImgcpySync(out_image_data,

--- a/lite/kernels/opencl/scale_image_compute.cc
+++ b/lite/kernels/opencl/scale_image_compute.cc
@@ -93,23 +93,20 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
    status = kernel.setArg(3, bias);
    CL_CHECK_FATAL(status);
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size_,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_img, event_);
  }
 private:
  std::string kernel_func_name_{"scale"};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
  param_t* scale_param_{nullptr};
  cl::Kernel kernel_;

--- a/lite/kernels/opencl/scale_image_compute_test.cc
+++ b/lite/kernels/opencl/scale_image_compute_test.cc
@@ -88,16 +88,7 @@ TEST(scale_image2d_fp32, compute) {
  LOG(INFO) << "out_image:" << out_image;
  kernel->Launch();
-  auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+  CLRuntime::Global()->command_queue().finish();
-  auto* out_ptr = param.output->data<half_t, cl::Image2D>();
-  auto it = wait_list->find(out_ptr);
-  if (it != wait_list->end()) {
-    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
-    auto& event = *(it->second);
-    event.wait();
-  } else {
-    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
-  }
  std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
  scale(input_v.data(), in_dim, out_ref.get(), 1.5f, 0.3f);

--- a/lite/kernels/opencl/slice_image_compute.cc
+++ b/lite/kernels/opencl/slice_image_compute.cc
@@ -96,23 +96,20 @@ class SliceComputeImage2D : public KernelLite<TARGET(kOpenCL),
                    static_cast<cl::size_type>(default_work_size.data()[1]),
                    static_cast<cl::size_type>(default_work_size.data()[2])};
-    event_ = std::shared_ptr<cl::Event>(new cl::Event);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
-        event_.get());
+        nullptr);
    CL_CHECK_FATAL(status);
-    context.cl_wait_list()->emplace(out_img, event_);
  }
 private:
  std::string kernel_func_name_{"slice"};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::string time_stamp_{GetTimeStamp()};
-  std::shared_ptr<cl::Event> event_{nullptr};
 };
 }  // namespace opencl

--- a/lite/kernels/opencl/slice_image_compute_test.cc
+++ b/lite/kernels/opencl/slice_image_compute_test.cc
@@ -98,16 +98,7 @@ TEST(slice_image2d_fp16, compute) {
  LOG(INFO) << "out_image:" << out_image;
  kernel->Launch();
-  auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+  CLRuntime::Global()->command_queue().finish();
-  auto* out_ptr = param.Out->data<half_t, cl::Image2D>();
-  auto it = wait_list->find(out_ptr);
-  if (it != wait_list->end()) {
-    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
-    auto& event = *(it->second);
-    event.wait();
-  } else {
-    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
-  }
  std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
  slice_channel(input_v.data(), in_dim, out_ref.get(), 2, 5);