[LITE][OPENCL] replace vlog with log in pool, add each run duration print in test (#3010)

* replace vlog with log in pool, add each run duration print in test. test=develop * change layout file path. test=develop * fix act, layout image kernel. test=develop * fix spelling error. test=develop * remove clfinish in act, concat, layout, nearest_interp. test=develop * add RELU macro define in fc opencl kernel. test=develop * add cpu_ref print in activation opencl kernel. test=develop * fix layout ut. test=develop * replace log with vlog. test=develop * fix get output. test=develop

[LITE][OPENCL] replace vlog with log in pool, add each run duration print in test (#3010)
* replace vlog with log in pool, add each run duration print in test. test=develop * change layout file path. test=develop * fix act, layout image kernel. test=develop * fix spelling error. test=develop * remove clfinish in act, concat, layout, nearest_interp. test=develop * add RELU macro define in fc opencl kernel. test=develop * add cpu_ref print in activation opencl kernel. test=develop * fix layout ut. test=develop * replace log with vlog. test=develop * fix get output. test=develop
5014c3ce · Yuan Shuai · GitHub · 31858fb0 · 5014c3ce · 5014c3ce
18 changed files
--- a/lite/api/mobilenetv1_test.cc
+++ b/lite/api/mobilenetv1_test.cc
@@ -53,9 +53,13 @@ void TestModel(const std::vector<Place>& valid_places,
    predictor.Run();
  }

-  auto start = GetCurrentUS();
+  double sum_duration = 0.0;  // millisecond;
  for (int i = 0; i < FLAGS_repeats; ++i) {
+    auto start = GetCurrentUS();
    predictor.Run();
+    auto duration = (GetCurrentUS() - start) / 1000.0;
+    sum_duration += duration;
+    VLOG(1) << "run_idx:" << i << " " << duration << " ms";
  }

  if (save_model) {
@@ -68,8 +72,7 @@ void TestModel(const std::vector<Place>& valid_places,
  LOG(INFO) << "================== Speed Report ===================";
  LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads
            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
-            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
-            << " ms in average.";
+            << ", spend " << sum_duration / FLAGS_repeats << " ms in average.";

  std::vector<std::vector<float>> ref;
  ref.emplace_back(std::vector<float>(
@@ -115,13 +118,11 @@ void TestModel(const std::vector<Place>& valid_places,
  }

  // Get detailed result
-  auto* pred = &predictor;
-  size_t output_tensor_num = pred->GetOutputNames().size();
+  size_t output_tensor_num = predictor.GetOutputNames().size();
  VLOG(1) << "output tesnor num:" << output_tensor_num;

  for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
-    std::unique_ptr<const Tensor> output_tensor(
-        std::move(pred->GetOutput(tidx)));
+    auto* output_tensor = predictor.GetOutput(tidx);
    VLOG(1) << "============= output tensor " << tidx << " =============\n";
    auto out_dims = output_tensor->dims();
    VLOG(1) << "out_dims:" << out_dims;

--- a/lite/api/mobilenetv2_test.cc
+++ b/lite/api/mobilenetv2_test.cc
@@ -54,9 +54,13 @@ void TestModel(const std::vector<Place>& valid_places,
    predictor.Run();
  }

-  auto start = GetCurrentUS();
+  double sum_duration = 0.0;  // millisecond;
  for (int i = 0; i < FLAGS_repeats; ++i) {
+    auto start = GetCurrentUS();
    predictor.Run();
+    auto duration = (GetCurrentUS() - start) / 1000.0;
+    sum_duration += duration;
+    VLOG(1) << "run_idx:" << i << " " << duration << " ms";
  }

  if (save_model) {
@@ -69,8 +73,7 @@ void TestModel(const std::vector<Place>& valid_places,
  LOG(INFO) << "================== Speed Report ===================";
  LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads
            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
-            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
-            << " ms in average.";
+            << ", spend " << sum_duration / FLAGS_repeats << " ms in average.";

  std::vector<std::vector<float>> ref;
  // i = 1
@@ -117,13 +120,11 @@ void TestModel(const std::vector<Place>& valid_places,
  }

  // Get detailed result
-  auto* pred = &predictor;
-  size_t output_tensor_num = pred->GetOutputNames().size();
+  size_t output_tensor_num = predictor.GetOutputNames().size();
  VLOG(1) << "output tesnor num:" << output_tensor_num;

  for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
-    std::unique_ptr<const Tensor> output_tensor(
-        std::move(pred->GetOutput(tidx)));
+    auto* output_tensor = predictor.GetOutput(tidx);
    VLOG(1) << "============= output tensor " << tidx << " =============\n";
    auto out_dims = output_tensor->dims();
    VLOG(1) << "out_dims:" << out_dims;

--- a/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl
--- a/lite/kernels/opencl/CMakeLists.txt
+++ b/lite/kernels/opencl/CMakeLists.txt
@@ -18,7 +18,7 @@ add_kernel(pool_opencl OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_
 add_kernel(activation_opencl OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(reshape_opencl OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(conv_opencl OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps})
-add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})
+add_kernel(layout_opencl OPENCL basic SRCS layout_image_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps})
 add_kernel(scale_opencl OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps})
@@ -68,7 +68,7 @@ lite_cc_test(test_elementwise_mul_image_opencl SRCS elementwise_mul_image_comput
             DEPS elementwise_mul_opencl op_registry program context
             ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)

-lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc
+lite_cc_test(test_layout_image_opencl SRCS layout_image_compute_test.cc
             DEPS layout_opencl op_registry program context
             ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)


--- a/lite/kernels/opencl/activation_image_compute.cc
+++ b/lite/kernels/opencl/activation_image_compute.cc
@@ -44,9 +44,9 @@ class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL),
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.X->dims();
-    auto* x_buf = param.X->data<half_t, cl::Image2D>();
+    auto* x_img = param.X->data<half_t, cl::Image2D>();
    auto image_shape = InitImageDimInfoWith(x_dims);
-    auto* out_buf = param.Out->mutable_data<half_t, cl::Image2D>(
+    auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.Out->dims();  // useless: check dim only

@@ -57,9 +57,9 @@ class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL),
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    int arg_idx = 0;
-    cl_int status = kernel.setArg(arg_idx, *x_buf);
+    cl_int status = kernel.setArg(arg_idx, *x_img);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *out_buf);
+    status = kernel.setArg(++arg_idx, *out_img);
    CL_CHECK_FATAL(status);

    VLOG(4) << TargetToStr(param.X->target());
@@ -82,9 +82,7 @@ class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL),
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
-    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
-    // context.cl_wait_list()->emplace(out_buf, event_);
-    context.cl_context()->GetCommandQueue().finish();
+    context.cl_wait_list()->emplace(out_img, event_);
  }

 private:
@@ -112,9 +110,9 @@ class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.X->dims();
-    auto* x_buf = param.X->data<half_t, cl::Image2D>();
+    auto* x_img = param.X->data<half_t, cl::Image2D>();
    auto image_shape = InitImageDimInfoWith(x_dims);
-    auto* out_buf = param.Out->mutable_data<half_t, cl::Image2D>(
+    auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.Out->dims();  // useless: check dim only
    auto threshold = param.Relu_clipped_coef;
@@ -126,9 +124,9 @@ class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    int arg_idx = 0;
-    cl_int status = kernel.setArg(arg_idx, *x_buf);
+    cl_int status = kernel.setArg(arg_idx, *x_img);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *out_buf);
+    status = kernel.setArg(++arg_idx, *out_img);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, threshold);
    CL_CHECK_FATAL(status);
@@ -154,9 +152,7 @@ class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
-    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
-    // context.cl_wait_list()->emplace(out_buf, event_);
-    context.cl_context()->GetCommandQueue().finish();
+    context.cl_wait_list()->emplace(out_img, event_);
  }

 private:
@@ -185,11 +181,11 @@ class SigmoidComputeImageDefault
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.X->dims();
-    auto* x_buf =
+    auto* x_img =
        param.X->data<half_t,
                      cl::Image2D>();  // use half_t represents half float
    auto image_shape = InitImageDimInfoWith(x_dims);
-    auto* out_buf = param.Out->mutable_data<half_t, cl::Image2D>(  // use half_t
+    auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>(  // use half_t
        // represents half float
        image_shape["width"],
        image_shape["height"]);
@@ -202,9 +198,9 @@ class SigmoidComputeImageDefault
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    int arg_idx = 0;
-    cl_int status = kernel.setArg(arg_idx, *x_buf);
+    cl_int status = kernel.setArg(arg_idx, *x_img);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *out_buf);
+    status = kernel.setArg(++arg_idx, *out_img);
    CL_CHECK_FATAL(status);

    VLOG(4) << TargetToStr(param.X->target());
@@ -227,9 +223,7 @@ class SigmoidComputeImageDefault
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
-    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
-    // context.cl_wait_list()->emplace(out_buf, event_);
-    context.cl_context()->GetCommandQueue().finish();
+    context.cl_wait_list()->emplace(out_img, event_);
  }

 private:

--- a/lite/kernels/opencl/activation_image_compute_test.cc
+++ b/lite/kernels/opencl/activation_image_compute_test.cc
@@ -18,6 +18,9 @@
 #include "lite/core/op_registry.h"
 #include "lite/core/tensor.h"
 #include "lite/kernels/opencl/image_helper.h"
+#include "lite/kernels/opencl/test_helper.h"
+
+#define FP16_MAX_DIFF (1e0)

 namespace paddle {
 namespace lite {
@@ -58,8 +61,8 @@ TEST(relu_image2d_fp16, compute) {
               "-> host";

 #ifdef RELU_FP16_LOOP_TEST
-  for (int n = 1; n <= 100; n += 33) {
-    for (auto c : {1, 3}) {
+  for (int n = 1; n <= 2; n += 1) {
+    for (auto c : {1}) {
      for (int h = 12; h <= 100; h += 13) {
        for (int w = 12; w <= 100; w += 25) {
 #else
@@ -169,6 +172,21 @@ TEST(relu_image2d_fp16, compute) {
          LOG(INFO) << "run kernel: img_to_buf_kernel";
          img_to_buf_kernel->Launch();

+          // wait for opencl
+          auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
+          auto *out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
+          auto it = wait_list->find(out_ptr);
+
+          if (it != wait_list->end()) {
+            VLOG(4) << "--- Find the sync event for the target cl "
+                       "tensor. ---";
+            auto &event = *(it->second);
+            event.wait();
+          } else {
+            LOG(FATAL) << "Could not find the sync event for the target "
+                          "cl tensor.";
+          }
+
          // compute ref cpu
          relu_compute_ref<float>(mapped_x, x_dim, y_data_ref);
 // result
@@ -176,18 +194,24 @@ TEST(relu_image2d_fp16, compute) {
          LOG(INFO) << "---- print kernel result (input -> output) ----";
          for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
            std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
-                      << std::endl;
+                      << ", ref: " << y_data_ref[eidx] << std::endl;
          }
 #endif  // RELU_FP16_PRINT_RESULT

          // check result: compare kernel output and cpu output(y_data_ref)
-          for (int eidx = 0; eidx < x_dim.production(); eidx++) {
-            EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
-            if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
-              LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
-                        << " / " << x_dim.production() << ", y_data_ref["
-                        << eidx << "]:" << y_data_ref[eidx] << ", mapped_y["
-                        << eidx << "]:" << mapped_y[eidx];
+          for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
+            auto abs_diff = COMPUTE_ABS_DIFF(y_data_ref[eidx], mapped_y[eidx]);
+            auto relative_diff =
+                COMPUTE_RELATIVE_DIFF(y_data_ref[eidx], mapped_y[eidx]);
+            EXPECT_EQ(
+                (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
+                true);
+            if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
+              LOG(ERROR) << "error idx:" << eidx << ", y_data_ref[" << eidx
+                         << "]:" << y_data_ref[eidx] << ", mapped_y[" << eidx
+                         << "]:" << mapped_y[eidx] << " abs_diff:" << abs_diff
+                         << " relative_diff:" << relative_diff
+                         << " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
              break;
            }
          }
@@ -206,7 +230,7 @@ TEST(relu_image2d_fp16, compute) {
 #endif
 }

-// #define RELU6_FP16_LOOP_TEST
+//  #define RELU6_FP16_LOOP_TEST
 // #define RELU6_FP16_PRINT_RESULT
 TEST(relu6_image2d_fp16, compute) {
  LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu6(img) -> "
@@ -287,7 +311,7 @@ TEST(relu6_image2d_fp16, compute) {
          auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
              y_data, 0, sizeof(float) * x_dim.production()));
          for (int i = 0; i < x_dim.production(); ++i) {
-            mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2;
+            mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2 * 0.1;
            mapped_y[i] = static_cast<int>(0);
          }
          auto *relu_in_data = relu_in.mutable_data<half_t, cl::Image2D>(
@@ -326,6 +350,21 @@ TEST(relu6_image2d_fp16, compute) {
          LOG(INFO) << "run kernel: img_to_buf_kernel";
          img_to_buf_kernel->Launch();

+          // wait for opencl
+          auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
+          auto *out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
+          auto it = wait_list->find(out_ptr);
+
+          if (it != wait_list->end()) {
+            VLOG(4) << "--- Find the sync event for the target cl "
+                       "tensor. ---";
+            auto &event = *(it->second);
+            event.wait();
+          } else {
+            LOG(FATAL) << "Could not find the sync event for the target "
+                          "cl tensor.";
+          }
+
          // compute ref cpu
          relu_compute_ref<float>(mapped_x, x_dim, y_data_ref, 6.f);
 // result
@@ -333,14 +372,14 @@ TEST(relu6_image2d_fp16, compute) {
          LOG(INFO) << "---- print kernel result (input -> output) ----";
          for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
            std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
-                      << std::endl;
+                      << ", ref: " << y_data_ref[eidx] << std::endl;
          }
 #endif  // RELU6_FP16_PRINT_RESULT

          // check result: compare kernel output and cpu output(y_data_ref)
          for (int eidx = 0; eidx < x_dim.production(); eidx++) {
-            EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
-            if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
+            EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], FP16_MAX_DIFF);
+            if (abs(y_data_ref[eidx] - mapped_y[eidx]) > FP16_MAX_DIFF) {
              LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
                        << " / " << x_dim.production() << ", y_data_ref["
                        << eidx << "]:" << y_data_ref[eidx] << ", mapped_y["
@@ -485,6 +524,21 @@ TEST(sigmoid_image2d_fp16, compute) {
          LOG(INFO) << "run kernel: img_to_buf_kernel";
          img_to_buf_kernel->Launch();

+          // wait for opencl
+          auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
+          auto *out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
+          auto it = wait_list->find(out_ptr);
+
+          if (it != wait_list->end()) {
+            VLOG(4) << "--- Find the sync event for the target cl "
+                       "tensor. ---";
+            auto &event = *(it->second);
+            event.wait();
+          } else {
+            LOG(FATAL) << "Could not find the sync event for the target "
+                          "cl tensor.";
+          }
+
          // compute ref cpu
          sigmoid_compute_ref<float>(mapped_x, x_dim, y_data_ref);
 // result
@@ -492,14 +546,14 @@ TEST(sigmoid_image2d_fp16, compute) {
          LOG(INFO) << "---- print kernel result (input -> output) ----";
          for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
            std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
-                      << std::endl;
+                      << ", ref:" << y_data_ref[eidx] << std::endl;
          }
 #endif  // SIGMOID_FP16_PRINT_RESULT

          // check result: compare kernel output and cpu output(y_data_ref)
          for (int eidx = 0; eidx < x_dim.production(); eidx++) {
-            EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-3);
-            if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-3) {
+            EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], FP16_MAX_DIFF);
+            if (abs(y_data_ref[eidx] - mapped_y[eidx]) > FP16_MAX_DIFF) {
              LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
                        << " / " << x_dim.production() << ", y_data_ref["
                        << eidx << "]: " << y_data_ref[eidx] << ", mapped_y["

--- a/lite/kernels/opencl/concat_image_compute.cc
+++ b/lite/kernels/opencl/concat_image_compute.cc
@@ -109,25 +109,28 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
    int arg_idx = 0;
    int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];

-    LOG(INFO) << "concat 输入尺寸:  ";
+    VLOG(4) << "concat 输入尺寸:  ";
    for (size_t i = 0; i < inputs.size(); i++) {
-      LOG(INFO) << "inputs [" << i << "]"
-                << "[" << inputs[i]->dims().size() << "D]:"
-                << "   dims:" << inputs[i]->dims()[0] << " "
-                << inputs[i]->dims()[1] << " " << inputs[i]->dims()[2] << " "
-                << inputs[i]->dims()[3];
+      VLOG(4) << "inputs [" << i << "]"
+              << "[" << inputs[i]->dims().size() << "D]:"
+              << "   dims:" << inputs[i]->dims()[0] << " "
+              << inputs[i]->dims()[1] << " " << inputs[i]->dims()[2] << " "
+              << inputs[i]->dims()[3];
    }
-    LOG(INFO) << "concat 输出尺寸:  ";
-    LOG(INFO) << " out  dims:  "
-              << "[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1]
-              << " " << x_dims[2] << " " << x_dims[3];
-    LOG(INFO) << "axis_: " << axis_;
-    LOG(INFO) << "flag_: " << flag_;
+
+    VLOG(4) << "concat 输出尺寸:  ";
+    VLOG(4) << " out  dims:  "
+            << "[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1]
+            << " " << x_dims[2] << " " << x_dims[3];
+    VLOG(4) << "axis_: " << axis_;
+    VLOG(4) << "flag_: " << flag_;
+
    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(x_dims[x_dims.size() - 1]),
                    static_cast<cl::size_type>(image_shape["width"] /
                                               x_dims[x_dims.size() - 1]),
                    static_cast<cl::size_type>(image_shape["height"])};
+
    VLOG(4) << TargetToStr(param.output->target());
    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
            << image_shape["height"];
@@ -136,16 +139,17 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
            << "x_dims[x_dims.size() - 1]" << x_dims[x_dims.size() - 1];
    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
-    LOG(INFO) << "width_: " << width_ << ", flag_: " << flag_;
+    VLOG(4) << "width_: " << width_ << ", flag_: " << flag_;
    VLOG(4) << "global_work_size: " << x_dims[x_dims.size() - 1] << "  "
            << (image_shape["width"] / x_dims[x_dims.size() - 1]) << "  "
            << (image_shape["height"]);
+
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
    int out_w = x_dims[x_dims.size() - 1];
    int out_c = x_dims[1];
    if (inputs.size() == 2) {
-      auto* x_buf0 = inputs[0]->data<float, cl::Image2D>();
-      auto* x_buf1 = inputs[1]->data<float, cl::Image2D>();
+      auto* x_buf0 = inputs[0]->data<half_t, cl::Image2D>();
+      auto* x_buf1 = inputs[1]->data<half_t, cl::Image2D>();
      cl_int status = kernel.setArg(arg_idx, *x_buf0);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *x_buf1);
@@ -171,14 +175,14 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
          nullptr,
          event_.get());
      CL_CHECK_FATAL(status);
-      context.cl_context()->GetCommandQueue().finish();
+      context.cl_wait_list()->emplace(out_buf, event_);
    } else {
      auto start = 0;
      for (int i = 0; i < inputs.size(); i++) {
        arg_idx = 0;
        auto in_dims = inputs[i]->dims();
        image_shape = InitImageDimInfoWith(in_dims);
-        auto* x_buf = inputs[i]->data<float, cl::Image2D>();
+        auto* x_buf = inputs[i]->data<half_t, cl::Image2D>();
        int in_w = in_dims[in_dims.size() - 1];
        VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
                << image_shape["height"];
@@ -212,7 +216,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
            nullptr,
            event_.get());
        CL_CHECK_FATAL(status);
-        context.cl_context()->GetCommandQueue().finish();
+        context.cl_wait_list()->emplace(out_buf, event_);
        start += inputs[i]->dims()[axis_];
      }
    }

--- a/lite/kernels/opencl/concat_image_compute_test.cc
+++ b/lite/kernels/opencl/concat_image_compute_test.cc
@@ -245,6 +245,21 @@ TEST(concat_image2d, compute) {
            LOG(INFO) << "run kernel: img_to_buf_kernel";
            img_to_buf_kernel->Launch();

+            // wait for opencl
+            auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
+            auto *out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
+            auto it = wait_list->find(out_ptr);
+
+            if (it != wait_list->end()) {
+              VLOG(4) << "--- Find the sync event for the target cl "
+                         "tensor. ---";
+              auto &event = *(it->second);
+              event.wait();
+            } else {
+              LOG(FATAL) << "Could not find the sync event for the target "
+                            "cl tensor.";
+            }
+
            // compute ref cp_u
            std::vector<const float *> ins_ptr;
            std::vector<const DDim> in_dim;

--- a/lite/kernels/opencl/conv_image_compute_test.cc
+++ b/lite/kernels/opencl/conv_image_compute_test.cc
@@ -471,7 +471,7 @@ TEST(conv2d, compute_image2d_1x1) {
              for (int i = 0; i < out_dim.production(); i++) {
                auto relative_diff =
                    COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]);
-                auto abs_diff = COMPTUE_ABS_DIFF(output_v[i], out_ref_data[i]);
+                auto abs_diff = COMPUTE_ABS_DIFF(output_v[i], out_ref_data[i]);
                // EXPECT_LT(relative_diff, FP16_MAX_DIFF);
                EXPECT_FALSE(relative_diff > FP16_MAX_DIFF &&
                             abs_diff > FP16_ABS_DIFF);
@@ -1191,7 +1191,7 @@ TEST(conv2d, compute_image2d_5x5) {
              for (int i = 0; i < out_dim.production(); i++) {
                auto relative_diff =
                    COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]);
-                auto abs_diff = COMPTUE_ABS_DIFF(output_v[i], out_ref_data[i]);
+                auto abs_diff = COMPUTE_ABS_DIFF(output_v[i], out_ref_data[i]);
                EXPECT_FALSE(relative_diff > FP16_MAX_DIFF &&
                             abs_diff > FP16_ABS_DIFF);
                if (relative_diff > FP16_MAX_DIFF && abs_diff > FP16_ABS_DIFF) {
@@ -1540,7 +1540,7 @@ TEST(conv2d, compute_image2d_7x7) {
              for (int i = 0; i < out_dim.production(); i++) {
                auto relative_diff =
                    COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]);
-                auto abs_diff = COMPTUE_ABS_DIFF(output_v[i], out_ref_data[i]);
+                auto abs_diff = COMPUTE_ABS_DIFF(output_v[i], out_ref_data[i]);
                EXPECT_FALSE(relative_diff > FP16_MAX_DIFF &&
                             abs_diff > FP16_ABS_DIFF);
                if (relative_diff > FP16_MAX_DIFF && abs_diff > FP16_ABS_DIFF) {

--- a/lite/kernels/opencl/fc_buffer_compute.cc
+++ b/lite/kernels/opencl/fc_buffer_compute.cc
@@ -57,6 +57,10 @@ class FcCompute
      global_work_size_ = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
                                      static_cast<size_t>((n_ + 3) / 4)};
    }
+
+    if (param.activation_type == "relu") {
+      build_options_ += "-DRELU";
+    }
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "buffer/fc_kernel.cl", build_options_);
@@ -107,7 +111,7 @@ class FcCompute
 private:
  int m_, n_, k_;
  std::string kernel_func_name_{};
-  std::string build_options_{"-DCL_DTYPE=float"};
+  std::string build_options_{"-DCL_DTYPE_float "};
  cl::NDRange global_work_size_;
  std::shared_ptr<cl::Event> event_{new cl::Event};
 };

--- a/lite/kernels/opencl/io_copy_buffer_compute.cc
+++ b/lite/kernels/opencl/io_copy_buffer_compute.cc
@@ -103,9 +103,6 @@ class IoCopykOpenCLToHostCompute
    auto* wait_list = context.cl_wait_list();
    auto* x_ptr = param.x->data<float, cl::Buffer>();

-    /* TODO(ysh329): io_copy(device->host) jammed if `it` emplaced to
-    `cl_wait_list`
-    in kernel and `wait_list` enabled
    auto it = wait_list->find(x_ptr);
    if (it != wait_list->end()) {
      VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
@@ -114,7 +111,6 @@ class IoCopykOpenCLToHostCompute
    } else {
      LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
    }
-    */

    CopyToHostSync(data, param.x->raw_data(), mem_size);
  }

--- a/lite/kernels/opencl/layout_compute.cc
+++ b/lite/kernels/opencl/layout_compute.cc
@@ -44,7 +44,7 @@ class LayoutComputeBufferChwToImageDefault
    }
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
-        kernel_func_name_, "buffer/layout_kernel.cl", build_options_);
+        kernel_func_name_, "image/layout_kernel.cl", build_options_);
  }

  void Run() override {
@@ -126,9 +126,7 @@ class LayoutComputeBufferChwToImageDefault
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
-    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
-    // context.cl_wait_list()->emplace(y_data, event_);
-    context.cl_context()->GetCommandQueue().finish();
+    context.cl_wait_list()->emplace(y_data, event_);
  }

  std::string doc() const override {
@@ -155,7 +153,7 @@ class LayoutComputeImageDefaultToBufferChw
    }
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
-        kernel_func_name_, "buffer/layout_kernel.cl", build_options_);
+        kernel_func_name_, "image/layout_kernel.cl", build_options_);
  }

  void Run() override {
@@ -229,9 +227,7 @@ class LayoutComputeImageDefaultToBufferChw
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
-    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
-    // context.cl_wait_list()->emplace(y_data, event_);
-    context.cl_context()->GetCommandQueue().finish();
+    context.cl_wait_list()->emplace(y_data, event_);
  }

  std::string doc() const override {
@@ -325,10 +321,7 @@ class LayoutComputeBufferChwToImage2DNw
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
-    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
-    // context.cl_wait_list()->emplace(y_data, event_);
-    context.cl_context()->GetCommandQueue().finish();
-    //    auto image_shape = InitImageDimInfoWith(x_dims);
+    context.cl_wait_list()->emplace(y_data, event_);
  }

  std::string doc() const override {

--- a/lite/kernels/opencl/layout_compute_test.cc
+++ b/lite/kernels/opencl/layout_compute_test.cc
@@ -18,6 +18,9 @@
 #include "lite/core/op_registry.h"
 #include "lite/core/tensor.h"
 #include "lite/kernels/opencl/image_helper.h"
+#include "lite/kernels/opencl/test_helper.h"
+
+#define FP16_MAX_DIFF (1e0)

 namespace paddle {
 namespace lite {
@@ -86,7 +89,7 @@ TEST(layout_ImageDefault, compute) {
          auto* mapped_y = static_cast<float*>(TargetWrapperCL::Map(
              y_data, 0, sizeof(float) * x_dim.production()));
          for (int i = 0; i < x_dim.production(); ++i) {
-            mapped_x[i] = static_cast<float>(i) * 2;
+            mapped_x[i] = static_cast<float>(i) * 0.01;
          }

          // set context and kernel args
@@ -122,14 +125,19 @@ TEST(layout_ImageDefault, compute) {
 #endif  // PRINT_RESULT

          // check result: compare input and output
-          float MAX_PASS_DIFF = 1e-4;
-          for (int eidx = 0; eidx < x_dim.production(); eidx++) {
-            EXPECT_NEAR(mapped_x[eidx], mapped_y[eidx], MAX_PASS_DIFF);
-            if (abs(mapped_x[eidx] - mapped_y[eidx]) > MAX_PASS_DIFF) {
-              LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
-                        << " / " << x_dim.production() << ", mapped_x[" << eidx
-                        << "]:" << mapped_x[eidx] << ", mapped_y[" << eidx
-                        << "]:" << mapped_y[eidx];
+          for (int i = 0; i < x_dim.production(); i++) {
+            auto abs_diff = COMPUTE_ABS_DIFF(mapped_x[i], mapped_y[i]);
+            auto relative_diff =
+                COMPUTE_RELATIVE_DIFF(mapped_x[i], mapped_y[i]);
+            EXPECT_EQ(
+                (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
+                true);
+            if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
+              LOG(ERROR) << "error idx:" << i << " mapped_x[" << i
+                         << "]:" << mapped_x[i] << " mapped_y[" << i
+                         << "]:" << mapped_y[i] << " abs_diff:" << abs_diff
+                         << " relative_diff:" << relative_diff
+                         << " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
              break;
            }
          }
@@ -238,12 +246,27 @@ TEST(layout_ImageDefault_With_Pre_Post, compute) {
          LOG(INFO) << "run kernel: image2d_to_buffer_with_post255";
          img_to_buf_kernel->Launch();

+          // wait for opencl
+          auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
+          auto* out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
+          auto it = wait_list->find(out_ptr);
+
+          if (it != wait_list->end()) {
+            VLOG(4) << "--- Find the sync event for the target cl "
+                       "tensor. ---";
+            auto& event = *(it->second);
+            event.wait();
+          } else {
+            LOG(FATAL) << "Could not find the sync event for the target "
+                          "cl tensor.";
+          }
+
 // result
 #ifdef PRINT_RESULT
          LOG(INFO) << "---- print result ----";
          for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
-            std::cout << mapped_x[eidx] << " -> "
-                      << static_cast<uint8_t>(mapped_y[eidx]) << std::endl;
+            std::cout << +mapped_x[eidx] << " -> "
+                      << +static_cast<uint8_t>(mapped_y[eidx]) << std::endl;
          }
 #endif  // PRINT_RESULT


--- a/lite/kernels/opencl/nearest_interp_image_compute.cc
+++ b/lite/kernels/opencl/nearest_interp_image_compute.cc
@@ -46,11 +46,11 @@ class NearestInterpComputeImageDefault
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.X->dims();
    const auto& y_dims = param.Out->dims();
-    auto* x_buf =
+    auto* x_img =
        param.X->data<half_t,
                      cl::Image2D>();  // use half_t represents half float
    auto out_image_shape = InitImageDimInfoWith(y_dims);
-    auto* out_buf = param.Out->mutable_data<half_t, cl::Image2D>(  // use half_t
+    auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>(  // use half_t
        // represents half float
        out_image_shape["width"],
        out_image_shape["height"]);
@@ -69,9 +69,9 @@ class NearestInterpComputeImageDefault
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    int arg_idx = 0;
-    cl_int status = kernel.setArg(arg_idx, *x_buf);
+    cl_int status = kernel.setArg(arg_idx, *x_img);
    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *out_buf);
+    status = kernel.setArg(++arg_idx, *out_img);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const float>(scale_h));
    CL_CHECK_FATAL(status);
@@ -112,9 +112,7 @@ class NearestInterpComputeImageDefault
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
-    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
-    // context.cl_wait_list()->emplace(out_buf, event_);
-    context.cl_context()->GetCommandQueue().finish();
+    context.cl_wait_list()->emplace(out_img, event_);
  }

 private:

--- a/lite/kernels/opencl/nearest_interp_image_compute_test.cc
+++ b/lite/kernels/opencl/nearest_interp_image_compute_test.cc
@@ -208,6 +208,21 @@ TEST(nearest_interp_image2d, compute) {
              LOG(INFO) << "run kernel: img_to_buf_kernel";
              img_to_buf_kernel->Launch();

+              // wait for opencl
+              auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
+              auto *out_ptr = ImageToBufferParam.y->data<float, cl::Buffer>();
+              auto it = wait_list->find(out_ptr);
+
+              if (it != wait_list->end()) {
+                VLOG(4) << "--- Find the sync event for the target cl "
+                           "tensor. ---";
+                auto &event = *(it->second);
+                event.wait();
+              } else {
+                LOG(FATAL) << "Could not find the sync event for the target "
+                              "cl tensor.";
+              }
+
              // compute ref cpu
              for (int nid = 0; nid < x_dim[0]; ++nid) {
                for (int cid = 0; cid < x_dim[1]; ++cid) {

--- a/lite/kernels/opencl/pool_image_compute.cc
+++ b/lite/kernels/opencl/pool_image_compute.cc
@@ -69,14 +69,14 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
    CHECK(context.cl_context() != nullptr);

    auto* x_img = param.x->data<half_t, cl::Image2D>();
-    LOG(INFO) << "x_image" << x_img;
+    VLOG(4) << "x_image" << x_img;

    auto out_image_shape = InitImageDimInfoWith(out_dims);
-    LOG(INFO) << "out_image_shape = " << out_image_shape["width"] << " "
-              << out_image_shape["height"];
+    VLOG(4) << "out_image_shape = " << out_image_shape["width"] << " "
+            << out_image_shape["height"];
    auto* out_img = param.output->mutable_data<half_t, cl::Image2D>(
        out_image_shape["width"], out_image_shape["height"]);
-    LOG(INFO) << "out_image" << out_img;
+    VLOG(4) << "out_image" << out_img;

    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;

--- a/lite/kernels/opencl/reshape_image_compute.cc
+++ b/lite/kernels/opencl/reshape_image_compute.cc
@@ -63,7 +63,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
        InitImageDimInfoWith(out_dims);
    cl::Image2D* const out_image = output->mutable_data<half_t, cl::Image2D>(
        out_image_shape.at("width"), out_image_shape.at("height"));
-    LOG(INFO) << "out_dims=   " << out_dims;
+    VLOG(4) << "out_dims=   " << out_dims;

    const std::vector<size_t>& default_work_size = DefaultWorkSize(
        out_dims,

--- a/lite/kernels/opencl/test_helper.h
+++ b/lite/kernels/opencl/test_helper.h
@@ -14,7 +14,7 @@

 #pragma once

-#define COMPTUE_ABS_DIFF(res0, res1) abs(res0 - res1)
+#define COMPUTE_ABS_DIFF(res0, res1) abs((res0) - (res1))  // |res0 - res1|

 #define COMPUTE_RELATIVE_DIFF(res0, res1) abs(abs(res0 - res1) / (res1 + 1e-5))