未验证 提交 8b90a0c7 编写于 作者: Y Yuan Shuai 提交者: GitHub

[LITE][OPENCL] support fp16 for cl_image_converter, layout, activation all...

[LITE][OPENCL] support fp16 for cl_image_converter, layout, activation all OpenCL image kernel. test=develop (#2964)

* [LITE][OPENCL] support fp16 for cl_image_converter, layout, activation image kernel. test=develop

* add conv, depthwise and UT. test=develop

* add pool, conv, nearest_interp kernel. test=develop

* support fp16 for scale, reshape, concat, fc buffer opencl kernel. test=develop

* refactor for mul opencl buffer kernel. test=develop

* support fp16 for elementwise_mul opecl image kernel. test=develop

* support fp16 for elementwise_mul opencl image kernel. test=develop

* support fp16 for ele_add, fuse_ele_add_act opencl kernel. test=develop

* rename io_copy. test=develop

* mobilenetv1,v2 passed on 855. test=develop

* fix opt for opencl. test=develop
上级 6fcad721
...@@ -81,7 +81,16 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -81,7 +81,16 @@ void TestModel(const std::vector<Place>& valid_places,
auto* out = predictor.GetOutput(0); auto* out = predictor.GetOutput(0);
const auto* pdata = out->data<float>(); const auto* pdata = out->data<float>();
int step = 50; int step = 50;
#ifdef LITE_WITH_NPU
// Get target and check result
VLOG(1) << "valid_places.size():" << valid_places.size();
for (int i = 0; i < valid_places.size(); ++i) {
auto p = valid_places[i];
VLOG(1) << "valid_places[" << i << "]:" << p.DebugString();
}
auto first_target = valid_places[0].target;
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000); ASSERT_EQ(out->dims().production(), 1000);
double eps = 0.1; double eps = 0.1;
for (int i = 0; i < ref.size(); ++i) { for (int i = 0; i < ref.size(); ++i) {
...@@ -92,7 +101,7 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -92,7 +101,7 @@ void TestModel(const std::vector<Place>& valid_places,
EXPECT_LT(diff, eps); EXPECT_LT(diff, eps);
} }
} }
#else } else {
ASSERT_EQ(out->dims().size(), 2); ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1); ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000); ASSERT_EQ(out->dims()[1], 1000);
...@@ -103,7 +112,34 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -103,7 +112,34 @@ void TestModel(const std::vector<Place>& valid_places,
EXPECT_NEAR(result, ref[i][j], eps); EXPECT_NEAR(result, ref[i][j], eps);
} }
} }
#endif }
// Get detailed result
auto* pred = &predictor;
size_t output_tensor_num = pred->GetOutputNames().size();
VLOG(1) << "output tesnor num:" << output_tensor_num;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
std::unique_ptr<const Tensor> output_tensor(
std::move(pred->GetOutput(tidx)));
VLOG(1) << "============= output tensor " << tidx << " =============\n";
auto out_dims = output_tensor->dims();
VLOG(1) << "out_dims:" << out_dims;
float sum = 0.f;
for (int i = 0; i < out_dims.production(); ++i) {
sum += output_tensor->data<float>()[i];
}
VLOG(1) << "out_dims.production():" << out_dims.production();
VLOG(1) << "output tensor sum value:" << sum;
VLOG(1) << "output tensor mean value:" << sum / out_dims.production();
// print result
for (int i = 0; i < out_dims.production(); ++i) {
VLOG(2) << "output_tensor->data<float>()[" << i
<< "]:" << output_tensor->data<float>()[i];
}
}
} }
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
...@@ -130,7 +166,7 @@ TEST(MobileNetV1, test_arm) { ...@@ -130,7 +166,7 @@ TEST(MobileNetV1, test_arm) {
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
TEST(MobileNetV1, test_opencl) { TEST(MobileNetV1, test_opencl) {
std::vector<Place> valid_places({ std::vector<Place> valid_places({
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
......
...@@ -83,7 +83,16 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -83,7 +83,16 @@ void TestModel(const std::vector<Place>& valid_places,
auto* out = predictor.GetOutput(0); auto* out = predictor.GetOutput(0);
const auto* pdata = out->data<float>(); const auto* pdata = out->data<float>();
int step = 50; int step = 50;
#ifdef LITE_WITH_NPU
// Get target and check result
VLOG(1) << "valid_places.size():" << valid_places.size();
for (int i = 0; i < valid_places.size(); ++i) {
auto p = valid_places[i];
VLOG(1) << "valid_places[" << i << "]:" << p.DebugString();
}
auto first_target = valid_places[0].target;
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000); ASSERT_EQ(out->dims().production(), 1000);
double eps = 0.1; double eps = 0.1;
for (int i = 0; i < ref.size(); ++i) { for (int i = 0; i < ref.size(); ++i) {
...@@ -94,16 +103,45 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -94,16 +103,45 @@ void TestModel(const std::vector<Place>& valid_places,
EXPECT_LT(diff, eps); EXPECT_LT(diff, eps);
} }
} }
#else } else {
ASSERT_EQ(out->dims().size(), 2); ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1); ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000); ASSERT_EQ(out->dims()[1], 1000);
double eps = 1e-6;
for (int i = 0; i < ref.size(); ++i) { for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) { for (int j = 0; j < ref[i].size(); ++j) {
EXPECT_NEAR(pdata[j * step + (out->dims()[1] * i)], ref[i][j], 1e-6); auto result = pdata[j * step + (out->dims()[1] * i)];
EXPECT_NEAR(result, ref[i][j], eps);
}
}
}
// Get detailed result
auto* pred = &predictor;
size_t output_tensor_num = pred->GetOutputNames().size();
VLOG(1) << "output tesnor num:" << output_tensor_num;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
std::unique_ptr<const Tensor> output_tensor(
std::move(pred->GetOutput(tidx)));
VLOG(1) << "============= output tensor " << tidx << " =============\n";
auto out_dims = output_tensor->dims();
VLOG(1) << "out_dims:" << out_dims;
float sum = 0.f;
for (int i = 0; i < out_dims.production(); ++i) {
sum += output_tensor->data<float>()[i];
}
VLOG(1) << "out_dims.production():" << out_dims.production();
VLOG(1) << "output tensor sum value:" << sum;
VLOG(1) << "output tensor mean value:" << sum / out_dims.production();
// print result
for (int i = 0; i < out_dims.production(); ++i) {
VLOG(2) << "output_tensor->data<float>()[" << i
<< "]:" << output_tensor->data<float>()[i];
} }
} }
#endif
} }
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
...@@ -130,7 +168,7 @@ TEST(MobileNetV2, test_arm) { ...@@ -130,7 +168,7 @@ TEST(MobileNetV2, test_arm) {
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
TEST(MobileNetV2, test_opencl) { TEST(MobileNetV2, test_opencl) {
std::vector<Place> valid_places({ std::vector<Place> valid_places({
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
......
...@@ -91,7 +91,7 @@ std::vector<Place> ParserValidPlaces() { ...@@ -91,7 +91,7 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(TARGET(kARM)); valid_places.emplace_back(TARGET(kARM));
} else if (target_repr == "opencl") { } else if (target_repr == "opencl") {
valid_places.emplace_back( valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}); Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)});
valid_places.emplace_back( valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}); Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
valid_places.emplace_back( valid_places.emplace_back(
......
...@@ -6,7 +6,8 @@ lite_cc_library(cl_wrapper SRCS cl_wrapper.cc) ...@@ -6,7 +6,8 @@ lite_cc_library(cl_wrapper SRCS cl_wrapper.cc)
lite_cc_library(cl_utility SRCS cl_utility.cc DEPS cl_wrapper) lite_cc_library(cl_utility SRCS cl_utility.cc DEPS cl_wrapper)
lite_cc_library(cl_runtime SRCS cl_runtime.cc DEPS cl_utility) lite_cc_library(cl_runtime SRCS cl_runtime.cc DEPS cl_utility)
lite_cc_library(cl_context SRCS cl_context.cc DEPS cl_runtime) lite_cc_library(cl_context SRCS cl_context.cc DEPS cl_runtime)
lite_cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS tensor) lite_cc_library(cl_half SRCS cl_half.cc)
lite_cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS tensor cl_half)
lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runtime) lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runtime)
lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image) lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image)
lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime) lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime)
......
...@@ -30,7 +30,7 @@ static void CopyImageData(CLContext* context, ...@@ -30,7 +30,7 @@ static void CopyImageData(CLContext* context,
int width = cl_image.image_dims()[0]; int width = cl_image.image_dims()[0];
int height = cl_image.image_dims()[1]; int height = cl_image.image_dims()[1];
float* image_data = new float[height * width * 4]; uint16_t* image_data = new uint16_t[height * width * 4];
cl::Image* image = cl_image.cl_image(); cl::Image* image = cl_image.cl_image();
cl::array<size_t, 3> origin = {0, 0, 0}; cl::array<size_t, 3> origin = {0, 0, 0};
cl::array<size_t, 3> region = { cl::array<size_t, 3> region = {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/opencl/cl_half.h"
namespace paddle {
namespace lite {
// ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
static const uint32_t mantissatable[2048] = {
0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000,
0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000,
0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000,
0x35900000, 0x35980000, 0x35a00000, 0x35a80000, 0x35b00000, 0x35b80000,
0x35c00000, 0x35c80000, 0x35d00000, 0x35d80000, 0x35e00000, 0x35e80000,
0x35f00000, 0x35f80000, 0x36000000, 0x36040000, 0x36080000, 0x360c0000,
0x36100000, 0x36140000, 0x36180000, 0x361c0000, 0x36200000, 0x36240000,
0x36280000, 0x362c0000, 0x36300000, 0x36340000, 0x36380000, 0x363c0000,
0x36400000, 0x36440000, 0x36480000, 0x364c0000, 0x36500000, 0x36540000,
0x36580000, 0x365c0000, 0x36600000, 0x36640000, 0x36680000, 0x366c0000,
0x36700000, 0x36740000, 0x36780000, 0x367c0000, 0x36800000, 0x36820000,
0x36840000, 0x36860000, 0x36880000, 0x368a0000, 0x368c0000, 0x368e0000,
0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369a0000,
0x369c0000, 0x369e0000, 0x36a00000, 0x36a20000, 0x36a40000, 0x36a60000,
0x36a80000, 0x36aa0000, 0x36ac0000, 0x36ae0000, 0x36b00000, 0x36b20000,
0x36b40000, 0x36b60000, 0x36b80000, 0x36ba0000, 0x36bc0000, 0x36be0000,
0x36c00000, 0x36c20000, 0x36c40000, 0x36c60000, 0x36c80000, 0x36ca0000,
0x36cc0000, 0x36ce0000, 0x36d00000, 0x36d20000, 0x36d40000, 0x36d60000,
0x36d80000, 0x36da0000, 0x36dc0000, 0x36de0000, 0x36e00000, 0x36e20000,
0x36e40000, 0x36e60000, 0x36e80000, 0x36ea0000, 0x36ec0000, 0x36ee0000,
0x36f00000, 0x36f20000, 0x36f40000, 0x36f60000, 0x36f80000, 0x36fa0000,
0x36fc0000, 0x36fe0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000,
0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000,
0x370a0000, 0x370b0000, 0x370c0000, 0x370d0000, 0x370e0000, 0x370f0000,
0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000,
0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371a0000, 0x371b0000,
0x371c0000, 0x371d0000, 0x371e0000, 0x371f0000, 0x37200000, 0x37210000,
0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000,
0x37280000, 0x37290000, 0x372a0000, 0x372b0000, 0x372c0000, 0x372d0000,
0x372e0000, 0x372f0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000,
0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000,
0x373a0000, 0x373b0000, 0x373c0000, 0x373d0000, 0x373e0000, 0x373f0000,
0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000,
0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374a0000, 0x374b0000,
0x374c0000, 0x374d0000, 0x374e0000, 0x374f0000, 0x37500000, 0x37510000,
0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000,
0x37580000, 0x37590000, 0x375a0000, 0x375b0000, 0x375c0000, 0x375d0000,
0x375e0000, 0x375f0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000,
0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000,
0x376a0000, 0x376b0000, 0x376c0000, 0x376d0000, 0x376e0000, 0x376f0000,
0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000,
0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377a0000, 0x377b0000,
0x377c0000, 0x377d0000, 0x377e0000, 0x377f0000, 0x37800000, 0x37808000,
0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000,
0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000,
0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000,
0x378a0000, 0x378a8000, 0x378b0000, 0x378b8000, 0x378c0000, 0x378c8000,
0x378d0000, 0x378d8000, 0x378e0000, 0x378e8000, 0x378f0000, 0x378f8000,
0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000,
0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000,
0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000,
0x37990000, 0x37998000, 0x379a0000, 0x379a8000, 0x379b0000, 0x379b8000,
0x379c0000, 0x379c8000, 0x379d0000, 0x379d8000, 0x379e0000, 0x379e8000,
0x379f0000, 0x379f8000, 0x37a00000, 0x37a08000, 0x37a10000, 0x37a18000,
0x37a20000, 0x37a28000, 0x37a30000, 0x37a38000, 0x37a40000, 0x37a48000,
0x37a50000, 0x37a58000, 0x37a60000, 0x37a68000, 0x37a70000, 0x37a78000,
0x37a80000, 0x37a88000, 0x37a90000, 0x37a98000, 0x37aa0000, 0x37aa8000,
0x37ab0000, 0x37ab8000, 0x37ac0000, 0x37ac8000, 0x37ad0000, 0x37ad8000,
0x37ae0000, 0x37ae8000, 0x37af0000, 0x37af8000, 0x37b00000, 0x37b08000,
0x37b10000, 0x37b18000, 0x37b20000, 0x37b28000, 0x37b30000, 0x37b38000,
0x37b40000, 0x37b48000, 0x37b50000, 0x37b58000, 0x37b60000, 0x37b68000,
0x37b70000, 0x37b78000, 0x37b80000, 0x37b88000, 0x37b90000, 0x37b98000,
0x37ba0000, 0x37ba8000, 0x37bb0000, 0x37bb8000, 0x37bc0000, 0x37bc8000,
0x37bd0000, 0x37bd8000, 0x37be0000, 0x37be8000, 0x37bf0000, 0x37bf8000,
0x37c00000, 0x37c08000, 0x37c10000, 0x37c18000, 0x37c20000, 0x37c28000,
0x37c30000, 0x37c38000, 0x37c40000, 0x37c48000, 0x37c50000, 0x37c58000,
0x37c60000, 0x37c68000, 0x37c70000, 0x37c78000, 0x37c80000, 0x37c88000,
0x37c90000, 0x37c98000, 0x37ca0000, 0x37ca8000, 0x37cb0000, 0x37cb8000,
0x37cc0000, 0x37cc8000, 0x37cd0000, 0x37cd8000, 0x37ce0000, 0x37ce8000,
0x37cf0000, 0x37cf8000, 0x37d00000, 0x37d08000, 0x37d10000, 0x37d18000,
0x37d20000, 0x37d28000, 0x37d30000, 0x37d38000, 0x37d40000, 0x37d48000,
0x37d50000, 0x37d58000, 0x37d60000, 0x37d68000, 0x37d70000, 0x37d78000,
0x37d80000, 0x37d88000, 0x37d90000, 0x37d98000, 0x37da0000, 0x37da8000,
0x37db0000, 0x37db8000, 0x37dc0000, 0x37dc8000, 0x37dd0000, 0x37dd8000,
0x37de0000, 0x37de8000, 0x37df0000, 0x37df8000, 0x37e00000, 0x37e08000,
0x37e10000, 0x37e18000, 0x37e20000, 0x37e28000, 0x37e30000, 0x37e38000,
0x37e40000, 0x37e48000, 0x37e50000, 0x37e58000, 0x37e60000, 0x37e68000,
0x37e70000, 0x37e78000, 0x37e80000, 0x37e88000, 0x37e90000, 0x37e98000,
0x37ea0000, 0x37ea8000, 0x37eb0000, 0x37eb8000, 0x37ec0000, 0x37ec8000,
0x37ed0000, 0x37ed8000, 0x37ee0000, 0x37ee8000, 0x37ef0000, 0x37ef8000,
0x37f00000, 0x37f08000, 0x37f10000, 0x37f18000, 0x37f20000, 0x37f28000,
0x37f30000, 0x37f38000, 0x37f40000, 0x37f48000, 0x37f50000, 0x37f58000,
0x37f60000, 0x37f68000, 0x37f70000, 0x37f78000, 0x37f80000, 0x37f88000,
0x37f90000, 0x37f98000, 0x37fa0000, 0x37fa8000, 0x37fb0000, 0x37fb8000,
0x37fc0000, 0x37fc8000, 0x37fd0000, 0x37fd8000, 0x37fe0000, 0x37fe8000,
0x37ff0000, 0x37ff8000, 0x38000000, 0x38004000, 0x38008000, 0x3800c000,
0x38010000, 0x38014000, 0x38018000, 0x3801c000, 0x38020000, 0x38024000,
0x38028000, 0x3802c000, 0x38030000, 0x38034000, 0x38038000, 0x3803c000,
0x38040000, 0x38044000, 0x38048000, 0x3804c000, 0x38050000, 0x38054000,
0x38058000, 0x3805c000, 0x38060000, 0x38064000, 0x38068000, 0x3806c000,
0x38070000, 0x38074000, 0x38078000, 0x3807c000, 0x38080000, 0x38084000,
0x38088000, 0x3808c000, 0x38090000, 0x38094000, 0x38098000, 0x3809c000,
0x380a0000, 0x380a4000, 0x380a8000, 0x380ac000, 0x380b0000, 0x380b4000,
0x380b8000, 0x380bc000, 0x380c0000, 0x380c4000, 0x380c8000, 0x380cc000,
0x380d0000, 0x380d4000, 0x380d8000, 0x380dc000, 0x380e0000, 0x380e4000,
0x380e8000, 0x380ec000, 0x380f0000, 0x380f4000, 0x380f8000, 0x380fc000,
0x38100000, 0x38104000, 0x38108000, 0x3810c000, 0x38110000, 0x38114000,
0x38118000, 0x3811c000, 0x38120000, 0x38124000, 0x38128000, 0x3812c000,
0x38130000, 0x38134000, 0x38138000, 0x3813c000, 0x38140000, 0x38144000,
0x38148000, 0x3814c000, 0x38150000, 0x38154000, 0x38158000, 0x3815c000,
0x38160000, 0x38164000, 0x38168000, 0x3816c000, 0x38170000, 0x38174000,
0x38178000, 0x3817c000, 0x38180000, 0x38184000, 0x38188000, 0x3818c000,
0x38190000, 0x38194000, 0x38198000, 0x3819c000, 0x381a0000, 0x381a4000,
0x381a8000, 0x381ac000, 0x381b0000, 0x381b4000, 0x381b8000, 0x381bc000,
0x381c0000, 0x381c4000, 0x381c8000, 0x381cc000, 0x381d0000, 0x381d4000,
0x381d8000, 0x381dc000, 0x381e0000, 0x381e4000, 0x381e8000, 0x381ec000,
0x381f0000, 0x381f4000, 0x381f8000, 0x381fc000, 0x38200000, 0x38204000,
0x38208000, 0x3820c000, 0x38210000, 0x38214000, 0x38218000, 0x3821c000,
0x38220000, 0x38224000, 0x38228000, 0x3822c000, 0x38230000, 0x38234000,
0x38238000, 0x3823c000, 0x38240000, 0x38244000, 0x38248000, 0x3824c000,
0x38250000, 0x38254000, 0x38258000, 0x3825c000, 0x38260000, 0x38264000,
0x38268000, 0x3826c000, 0x38270000, 0x38274000, 0x38278000, 0x3827c000,
0x38280000, 0x38284000, 0x38288000, 0x3828c000, 0x38290000, 0x38294000,
0x38298000, 0x3829c000, 0x382a0000, 0x382a4000, 0x382a8000, 0x382ac000,
0x382b0000, 0x382b4000, 0x382b8000, 0x382bc000, 0x382c0000, 0x382c4000,
0x382c8000, 0x382cc000, 0x382d0000, 0x382d4000, 0x382d8000, 0x382dc000,
0x382e0000, 0x382e4000, 0x382e8000, 0x382ec000, 0x382f0000, 0x382f4000,
0x382f8000, 0x382fc000, 0x38300000, 0x38304000, 0x38308000, 0x3830c000,
0x38310000, 0x38314000, 0x38318000, 0x3831c000, 0x38320000, 0x38324000,
0x38328000, 0x3832c000, 0x38330000, 0x38334000, 0x38338000, 0x3833c000,
0x38340000, 0x38344000, 0x38348000, 0x3834c000, 0x38350000, 0x38354000,
0x38358000, 0x3835c000, 0x38360000, 0x38364000, 0x38368000, 0x3836c000,
0x38370000, 0x38374000, 0x38378000, 0x3837c000, 0x38380000, 0x38384000,
0x38388000, 0x3838c000, 0x38390000, 0x38394000, 0x38398000, 0x3839c000,
0x383a0000, 0x383a4000, 0x383a8000, 0x383ac000, 0x383b0000, 0x383b4000,
0x383b8000, 0x383bc000, 0x383c0000, 0x383c4000, 0x383c8000, 0x383cc000,
0x383d0000, 0x383d4000, 0x383d8000, 0x383dc000, 0x383e0000, 0x383e4000,
0x383e8000, 0x383ec000, 0x383f0000, 0x383f4000, 0x383f8000, 0x383fc000,
0x38400000, 0x38404000, 0x38408000, 0x3840c000, 0x38410000, 0x38414000,
0x38418000, 0x3841c000, 0x38420000, 0x38424000, 0x38428000, 0x3842c000,
0x38430000, 0x38434000, 0x38438000, 0x3843c000, 0x38440000, 0x38444000,
0x38448000, 0x3844c000, 0x38450000, 0x38454000, 0x38458000, 0x3845c000,
0x38460000, 0x38464000, 0x38468000, 0x3846c000, 0x38470000, 0x38474000,
0x38478000, 0x3847c000, 0x38480000, 0x38484000, 0x38488000, 0x3848c000,
0x38490000, 0x38494000, 0x38498000, 0x3849c000, 0x384a0000, 0x384a4000,
0x384a8000, 0x384ac000, 0x384b0000, 0x384b4000, 0x384b8000, 0x384bc000,
0x384c0000, 0x384c4000, 0x384c8000, 0x384cc000, 0x384d0000, 0x384d4000,
0x384d8000, 0x384dc000, 0x384e0000, 0x384e4000, 0x384e8000, 0x384ec000,
0x384f0000, 0x384f4000, 0x384f8000, 0x384fc000, 0x38500000, 0x38504000,
0x38508000, 0x3850c000, 0x38510000, 0x38514000, 0x38518000, 0x3851c000,
0x38520000, 0x38524000, 0x38528000, 0x3852c000, 0x38530000, 0x38534000,
0x38538000, 0x3853c000, 0x38540000, 0x38544000, 0x38548000, 0x3854c000,
0x38550000, 0x38554000, 0x38558000, 0x3855c000, 0x38560000, 0x38564000,
0x38568000, 0x3856c000, 0x38570000, 0x38574000, 0x38578000, 0x3857c000,
0x38580000, 0x38584000, 0x38588000, 0x3858c000, 0x38590000, 0x38594000,
0x38598000, 0x3859c000, 0x385a0000, 0x385a4000, 0x385a8000, 0x385ac000,
0x385b0000, 0x385b4000, 0x385b8000, 0x385bc000, 0x385c0000, 0x385c4000,
0x385c8000, 0x385cc000, 0x385d0000, 0x385d4000, 0x385d8000, 0x385dc000,
0x385e0000, 0x385e4000, 0x385e8000, 0x385ec000, 0x385f0000, 0x385f4000,
0x385f8000, 0x385fc000, 0x38600000, 0x38604000, 0x38608000, 0x3860c000,
0x38610000, 0x38614000, 0x38618000, 0x3861c000, 0x38620000, 0x38624000,
0x38628000, 0x3862c000, 0x38630000, 0x38634000, 0x38638000, 0x3863c000,
0x38640000, 0x38644000, 0x38648000, 0x3864c000, 0x38650000, 0x38654000,
0x38658000, 0x3865c000, 0x38660000, 0x38664000, 0x38668000, 0x3866c000,
0x38670000, 0x38674000, 0x38678000, 0x3867c000, 0x38680000, 0x38684000,
0x38688000, 0x3868c000, 0x38690000, 0x38694000, 0x38698000, 0x3869c000,
0x386a0000, 0x386a4000, 0x386a8000, 0x386ac000, 0x386b0000, 0x386b4000,
0x386b8000, 0x386bc000, 0x386c0000, 0x386c4000, 0x386c8000, 0x386cc000,
0x386d0000, 0x386d4000, 0x386d8000, 0x386dc000, 0x386e0000, 0x386e4000,
0x386e8000, 0x386ec000, 0x386f0000, 0x386f4000, 0x386f8000, 0x386fc000,
0x38700000, 0x38704000, 0x38708000, 0x3870c000, 0x38710000, 0x38714000,
0x38718000, 0x3871c000, 0x38720000, 0x38724000, 0x38728000, 0x3872c000,
0x38730000, 0x38734000, 0x38738000, 0x3873c000, 0x38740000, 0x38744000,
0x38748000, 0x3874c000, 0x38750000, 0x38754000, 0x38758000, 0x3875c000,
0x38760000, 0x38764000, 0x38768000, 0x3876c000, 0x38770000, 0x38774000,
0x38778000, 0x3877c000, 0x38780000, 0x38784000, 0x38788000, 0x3878c000,
0x38790000, 0x38794000, 0x38798000, 0x3879c000, 0x387a0000, 0x387a4000,
0x387a8000, 0x387ac000, 0x387b0000, 0x387b4000, 0x387b8000, 0x387bc000,
0x387c0000, 0x387c4000, 0x387c8000, 0x387cc000, 0x387d0000, 0x387d4000,
0x387d8000, 0x387dc000, 0x387e0000, 0x387e4000, 0x387e8000, 0x387ec000,
0x387f0000, 0x387f4000, 0x387f8000, 0x387fc000, 0x38000000, 0x38002000,
0x38004000, 0x38006000, 0x38008000, 0x3800a000, 0x3800c000, 0x3800e000,
0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801a000,
0x3801c000, 0x3801e000, 0x38020000, 0x38022000, 0x38024000, 0x38026000,
0x38028000, 0x3802a000, 0x3802c000, 0x3802e000, 0x38030000, 0x38032000,
0x38034000, 0x38036000, 0x38038000, 0x3803a000, 0x3803c000, 0x3803e000,
0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804a000,
0x3804c000, 0x3804e000, 0x38050000, 0x38052000, 0x38054000, 0x38056000,
0x38058000, 0x3805a000, 0x3805c000, 0x3805e000, 0x38060000, 0x38062000,
0x38064000, 0x38066000, 0x38068000, 0x3806a000, 0x3806c000, 0x3806e000,
0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807a000,
0x3807c000, 0x3807e000, 0x38080000, 0x38082000, 0x38084000, 0x38086000,
0x38088000, 0x3808a000, 0x3808c000, 0x3808e000, 0x38090000, 0x38092000,
0x38094000, 0x38096000, 0x38098000, 0x3809a000, 0x3809c000, 0x3809e000,
0x380a0000, 0x380a2000, 0x380a4000, 0x380a6000, 0x380a8000, 0x380aa000,
0x380ac000, 0x380ae000, 0x380b0000, 0x380b2000, 0x380b4000, 0x380b6000,
0x380b8000, 0x380ba000, 0x380bc000, 0x380be000, 0x380c0000, 0x380c2000,
0x380c4000, 0x380c6000, 0x380c8000, 0x380ca000, 0x380cc000, 0x380ce000,
0x380d0000, 0x380d2000, 0x380d4000, 0x380d6000, 0x380d8000, 0x380da000,
0x380dc000, 0x380de000, 0x380e0000, 0x380e2000, 0x380e4000, 0x380e6000,
0x380e8000, 0x380ea000, 0x380ec000, 0x380ee000, 0x380f0000, 0x380f2000,
0x380f4000, 0x380f6000, 0x380f8000, 0x380fa000, 0x380fc000, 0x380fe000,
0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810a000,
0x3810c000, 0x3810e000, 0x38110000, 0x38112000, 0x38114000, 0x38116000,
0x38118000, 0x3811a000, 0x3811c000, 0x3811e000, 0x38120000, 0x38122000,
0x38124000, 0x38126000, 0x38128000, 0x3812a000, 0x3812c000, 0x3812e000,
0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813a000,
0x3813c000, 0x3813e000, 0x38140000, 0x38142000, 0x38144000, 0x38146000,
0x38148000, 0x3814a000, 0x3814c000, 0x3814e000, 0x38150000, 0x38152000,
0x38154000, 0x38156000, 0x38158000, 0x3815a000, 0x3815c000, 0x3815e000,
0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816a000,
0x3816c000, 0x3816e000, 0x38170000, 0x38172000, 0x38174000, 0x38176000,
0x38178000, 0x3817a000, 0x3817c000, 0x3817e000, 0x38180000, 0x38182000,
0x38184000, 0x38186000, 0x38188000, 0x3818a000, 0x3818c000, 0x3818e000,
0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819a000,
0x3819c000, 0x3819e000, 0x381a0000, 0x381a2000, 0x381a4000, 0x381a6000,
0x381a8000, 0x381aa000, 0x381ac000, 0x381ae000, 0x381b0000, 0x381b2000,
0x381b4000, 0x381b6000, 0x381b8000, 0x381ba000, 0x381bc000, 0x381be000,
0x381c0000, 0x381c2000, 0x381c4000, 0x381c6000, 0x381c8000, 0x381ca000,
0x381cc000, 0x381ce000, 0x381d0000, 0x381d2000, 0x381d4000, 0x381d6000,
0x381d8000, 0x381da000, 0x381dc000, 0x381de000, 0x381e0000, 0x381e2000,
0x381e4000, 0x381e6000, 0x381e8000, 0x381ea000, 0x381ec000, 0x381ee000,
0x381f0000, 0x381f2000, 0x381f4000, 0x381f6000, 0x381f8000, 0x381fa000,
0x381fc000, 0x381fe000, 0x38200000, 0x38202000, 0x38204000, 0x38206000,
0x38208000, 0x3820a000, 0x3820c000, 0x3820e000, 0x38210000, 0x38212000,
0x38214000, 0x38216000, 0x38218000, 0x3821a000, 0x3821c000, 0x3821e000,
0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822a000,
0x3822c000, 0x3822e000, 0x38230000, 0x38232000, 0x38234000, 0x38236000,
0x38238000, 0x3823a000, 0x3823c000, 0x3823e000, 0x38240000, 0x38242000,
0x38244000, 0x38246000, 0x38248000, 0x3824a000, 0x3824c000, 0x3824e000,
0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825a000,
0x3825c000, 0x3825e000, 0x38260000, 0x38262000, 0x38264000, 0x38266000,
0x38268000, 0x3826a000, 0x3826c000, 0x3826e000, 0x38270000, 0x38272000,
0x38274000, 0x38276000, 0x38278000, 0x3827a000, 0x3827c000, 0x3827e000,
0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828a000,
0x3828c000, 0x3828e000, 0x38290000, 0x38292000, 0x38294000, 0x38296000,
0x38298000, 0x3829a000, 0x3829c000, 0x3829e000, 0x382a0000, 0x382a2000,
0x382a4000, 0x382a6000, 0x382a8000, 0x382aa000, 0x382ac000, 0x382ae000,
0x382b0000, 0x382b2000, 0x382b4000, 0x382b6000, 0x382b8000, 0x382ba000,
0x382bc000, 0x382be000, 0x382c0000, 0x382c2000, 0x382c4000, 0x382c6000,
0x382c8000, 0x382ca000, 0x382cc000, 0x382ce000, 0x382d0000, 0x382d2000,
0x382d4000, 0x382d6000, 0x382d8000, 0x382da000, 0x382dc000, 0x382de000,
0x382e0000, 0x382e2000, 0x382e4000, 0x382e6000, 0x382e8000, 0x382ea000,
0x382ec000, 0x382ee000, 0x382f0000, 0x382f2000, 0x382f4000, 0x382f6000,
0x382f8000, 0x382fa000, 0x382fc000, 0x382fe000, 0x38300000, 0x38302000,
0x38304000, 0x38306000, 0x38308000, 0x3830a000, 0x3830c000, 0x3830e000,
0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831a000,
0x3831c000, 0x3831e000, 0x38320000, 0x38322000, 0x38324000, 0x38326000,
0x38328000, 0x3832a000, 0x3832c000, 0x3832e000, 0x38330000, 0x38332000,
0x38334000, 0x38336000, 0x38338000, 0x3833a000, 0x3833c000, 0x3833e000,
0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834a000,
0x3834c000, 0x3834e000, 0x38350000, 0x38352000, 0x38354000, 0x38356000,
0x38358000, 0x3835a000, 0x3835c000, 0x3835e000, 0x38360000, 0x38362000,
0x38364000, 0x38366000, 0x38368000, 0x3836a000, 0x3836c000, 0x3836e000,
0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837a000,
0x3837c000, 0x3837e000, 0x38380000, 0x38382000, 0x38384000, 0x38386000,
0x38388000, 0x3838a000, 0x3838c000, 0x3838e000, 0x38390000, 0x38392000,
0x38394000, 0x38396000, 0x38398000, 0x3839a000, 0x3839c000, 0x3839e000,
0x383a0000, 0x383a2000, 0x383a4000, 0x383a6000, 0x383a8000, 0x383aa000,
0x383ac000, 0x383ae000, 0x383b0000, 0x383b2000, 0x383b4000, 0x383b6000,
0x383b8000, 0x383ba000, 0x383bc000, 0x383be000, 0x383c0000, 0x383c2000,
0x383c4000, 0x383c6000, 0x383c8000, 0x383ca000, 0x383cc000, 0x383ce000,
0x383d0000, 0x383d2000, 0x383d4000, 0x383d6000, 0x383d8000, 0x383da000,
0x383dc000, 0x383de000, 0x383e0000, 0x383e2000, 0x383e4000, 0x383e6000,
0x383e8000, 0x383ea000, 0x383ec000, 0x383ee000, 0x383f0000, 0x383f2000,
0x383f4000, 0x383f6000, 0x383f8000, 0x383fa000, 0x383fc000, 0x383fe000,
0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840a000,
0x3840c000, 0x3840e000, 0x38410000, 0x38412000, 0x38414000, 0x38416000,
0x38418000, 0x3841a000, 0x3841c000, 0x3841e000, 0x38420000, 0x38422000,
0x38424000, 0x38426000, 0x38428000, 0x3842a000, 0x3842c000, 0x3842e000,
0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843a000,
0x3843c000, 0x3843e000, 0x38440000, 0x38442000, 0x38444000, 0x38446000,
0x38448000, 0x3844a000, 0x3844c000, 0x3844e000, 0x38450000, 0x38452000,
0x38454000, 0x38456000, 0x38458000, 0x3845a000, 0x3845c000, 0x3845e000,
0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846a000,
0x3846c000, 0x3846e000, 0x38470000, 0x38472000, 0x38474000, 0x38476000,
0x38478000, 0x3847a000, 0x3847c000, 0x3847e000, 0x38480000, 0x38482000,
0x38484000, 0x38486000, 0x38488000, 0x3848a000, 0x3848c000, 0x3848e000,
0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849a000,
0x3849c000, 0x3849e000, 0x384a0000, 0x384a2000, 0x384a4000, 0x384a6000,
0x384a8000, 0x384aa000, 0x384ac000, 0x384ae000, 0x384b0000, 0x384b2000,
0x384b4000, 0x384b6000, 0x384b8000, 0x384ba000, 0x384bc000, 0x384be000,
0x384c0000, 0x384c2000, 0x384c4000, 0x384c6000, 0x384c8000, 0x384ca000,
0x384cc000, 0x384ce000, 0x384d0000, 0x384d2000, 0x384d4000, 0x384d6000,
0x384d8000, 0x384da000, 0x384dc000, 0x384de000, 0x384e0000, 0x384e2000,
0x384e4000, 0x384e6000, 0x384e8000, 0x384ea000, 0x384ec000, 0x384ee000,
0x384f0000, 0x384f2000, 0x384f4000, 0x384f6000, 0x384f8000, 0x384fa000,
0x384fc000, 0x384fe000, 0x38500000, 0x38502000, 0x38504000, 0x38506000,
0x38508000, 0x3850a000, 0x3850c000, 0x3850e000, 0x38510000, 0x38512000,
0x38514000, 0x38516000, 0x38518000, 0x3851a000, 0x3851c000, 0x3851e000,
0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852a000,
0x3852c000, 0x3852e000, 0x38530000, 0x38532000, 0x38534000, 0x38536000,
0x38538000, 0x3853a000, 0x3853c000, 0x3853e000, 0x38540000, 0x38542000,
0x38544000, 0x38546000, 0x38548000, 0x3854a000, 0x3854c000, 0x3854e000,
0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855a000,
0x3855c000, 0x3855e000, 0x38560000, 0x38562000, 0x38564000, 0x38566000,
0x38568000, 0x3856a000, 0x3856c000, 0x3856e000, 0x38570000, 0x38572000,
0x38574000, 0x38576000, 0x38578000, 0x3857a000, 0x3857c000, 0x3857e000,
0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858a000,
0x3858c000, 0x3858e000, 0x38590000, 0x38592000, 0x38594000, 0x38596000,
0x38598000, 0x3859a000, 0x3859c000, 0x3859e000, 0x385a0000, 0x385a2000,
0x385a4000, 0x385a6000, 0x385a8000, 0x385aa000, 0x385ac000, 0x385ae000,
0x385b0000, 0x385b2000, 0x385b4000, 0x385b6000, 0x385b8000, 0x385ba000,
0x385bc000, 0x385be000, 0x385c0000, 0x385c2000, 0x385c4000, 0x385c6000,
0x385c8000, 0x385ca000, 0x385cc000, 0x385ce000, 0x385d0000, 0x385d2000,
0x385d4000, 0x385d6000, 0x385d8000, 0x385da000, 0x385dc000, 0x385de000,
0x385e0000, 0x385e2000, 0x385e4000, 0x385e6000, 0x385e8000, 0x385ea000,
0x385ec000, 0x385ee000, 0x385f0000, 0x385f2000, 0x385f4000, 0x385f6000,
0x385f8000, 0x385fa000, 0x385fc000, 0x385fe000, 0x38600000, 0x38602000,
0x38604000, 0x38606000, 0x38608000, 0x3860a000, 0x3860c000, 0x3860e000,
0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861a000,
0x3861c000, 0x3861e000, 0x38620000, 0x38622000, 0x38624000, 0x38626000,
0x38628000, 0x3862a000, 0x3862c000, 0x3862e000, 0x38630000, 0x38632000,
0x38634000, 0x38636000, 0x38638000, 0x3863a000, 0x3863c000, 0x3863e000,
0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864a000,
0x3864c000, 0x3864e000, 0x38650000, 0x38652000, 0x38654000, 0x38656000,
0x38658000, 0x3865a000, 0x3865c000, 0x3865e000, 0x38660000, 0x38662000,
0x38664000, 0x38666000, 0x38668000, 0x3866a000, 0x3866c000, 0x3866e000,
0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867a000,
0x3867c000, 0x3867e000, 0x38680000, 0x38682000, 0x38684000, 0x38686000,
0x38688000, 0x3868a000, 0x3868c000, 0x3868e000, 0x38690000, 0x38692000,
0x38694000, 0x38696000, 0x38698000, 0x3869a000, 0x3869c000, 0x3869e000,
0x386a0000, 0x386a2000, 0x386a4000, 0x386a6000, 0x386a8000, 0x386aa000,
0x386ac000, 0x386ae000, 0x386b0000, 0x386b2000, 0x386b4000, 0x386b6000,
0x386b8000, 0x386ba000, 0x386bc000, 0x386be000, 0x386c0000, 0x386c2000,
0x386c4000, 0x386c6000, 0x386c8000, 0x386ca000, 0x386cc000, 0x386ce000,
0x386d0000, 0x386d2000, 0x386d4000, 0x386d6000, 0x386d8000, 0x386da000,
0x386dc000, 0x386de000, 0x386e0000, 0x386e2000, 0x386e4000, 0x386e6000,
0x386e8000, 0x386ea000, 0x386ec000, 0x386ee000, 0x386f0000, 0x386f2000,
0x386f4000, 0x386f6000, 0x386f8000, 0x386fa000, 0x386fc000, 0x386fe000,
0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870a000,
0x3870c000, 0x3870e000, 0x38710000, 0x38712000, 0x38714000, 0x38716000,
0x38718000, 0x3871a000, 0x3871c000, 0x3871e000, 0x38720000, 0x38722000,
0x38724000, 0x38726000, 0x38728000, 0x3872a000, 0x3872c000, 0x3872e000,
0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873a000,
0x3873c000, 0x3873e000, 0x38740000, 0x38742000, 0x38744000, 0x38746000,
0x38748000, 0x3874a000, 0x3874c000, 0x3874e000, 0x38750000, 0x38752000,
0x38754000, 0x38756000, 0x38758000, 0x3875a000, 0x3875c000, 0x3875e000,
0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876a000,
0x3876c000, 0x3876e000, 0x38770000, 0x38772000, 0x38774000, 0x38776000,
0x38778000, 0x3877a000, 0x3877c000, 0x3877e000, 0x38780000, 0x38782000,
0x38784000, 0x38786000, 0x38788000, 0x3878a000, 0x3878c000, 0x3878e000,
0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879a000,
0x3879c000, 0x3879e000, 0x387a0000, 0x387a2000, 0x387a4000, 0x387a6000,
0x387a8000, 0x387aa000, 0x387ac000, 0x387ae000, 0x387b0000, 0x387b2000,
0x387b4000, 0x387b6000, 0x387b8000, 0x387ba000, 0x387bc000, 0x387be000,
0x387c0000, 0x387c2000, 0x387c4000, 0x387c6000, 0x387c8000, 0x387ca000,
0x387cc000, 0x387ce000, 0x387d0000, 0x387d2000, 0x387d4000, 0x387d6000,
0x387d8000, 0x387da000, 0x387dc000, 0x387de000, 0x387e0000, 0x387e2000,
0x387e4000, 0x387e6000, 0x387e8000, 0x387ea000, 0x387ec000, 0x387ee000,
0x387f0000, 0x387f2000, 0x387f4000, 0x387f6000, 0x387f8000, 0x387fa000,
0x387fc000, 0x387fe000};
// Maps the top 6 bits of a half value (sign + 5-bit exponent, i.e. `h >> 10`
// as used in Half2Float) to an offset added to the half's 10-bit mantissa
// before indexing mantissatable. Indices 0 and 32 -- a zero exponent field
// for either sign -- select the table's leading segment (offset 0), while all
// other exponents share the segment at offset 0x0400. Layout follows the
// classic table-based fp16->fp32 scheme (van der Zijp, "Fast Half Float
// Conversions") -- presumably the zero-exponent segment handles +/-0 and
// subnormals; confirm against the full mantissatable above.
static const uint16_t offsettable[64] = {
    0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
    0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
    0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
    0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
    0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
    0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
    0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
    0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400};
// Indexed by the half's sign+exponent field (`h >> 10` in Half2Float); the
// entry is added to the mantissatable value to rebuild the float's sign and
// exponent bits. The first 32 entries cover positive halves, the last 32
// negative ones (note the 0x80000000 sign bit from index 32 onward).
// Indices 31 and 63 (all-ones half exponent) map to 0x47800000/0xc7800000,
// which re-biases the Inf/NaN exponent into float form.
static const uint32_t exponenttable[64] = {
    0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000,
    0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000,
    0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000,
    0x09000000, 0x09800000, 0x0a000000, 0x0a800000, 0x0b000000, 0x0b800000,
    0x0c000000, 0x0c800000, 0x0d000000, 0x0d800000, 0x0e000000, 0x0e800000,
    0x0f000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000,
    0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000,
    0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000,
    0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8a000000, 0x8a800000,
    0x8b000000, 0x8b800000, 0x8c000000, 0x8c800000, 0x8d000000, 0x8d800000,
    0x8e000000, 0x8e800000, 0x8f000000, 0xc7800000};
// Indexed by the float's sign + 8-bit exponent (`(v >> 23) & 0x1ff` in
// Float2Half); the entry supplies the half result's sign/exponent base, onto
// which the shifted mantissa is added. Tiny exponents collapse to 0x0000 /
// 0x8000 (signed zero / subnormal range), mid-range exponents step through
// the representable half exponents, and large exponents saturate at
// 0x7c00 / 0xfc00 -- the IEEE-754 half +/-Inf bit patterns.
static const uint16_t basetable[512] = {
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010,
    0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0c00, 0x1000,
    0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400,
    0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000, 0x5400, 0x5800,
    0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
    0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001,
    0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200,
    0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400,
    0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800,
    0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00,
    0xf000, 0xf400, 0xf800, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
    0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00};
// Companion to basetable, indexed the same way (`(v >> 23) & 0x1ff` in
// Float2Half): how far to shift the float's 23-bit mantissa right to fit the
// half's 10-bit mantissa. 0x0d (13) covers the normal range (23 - 10 bits);
// values 0x17..0x0e handle the subnormal transition; 0x18 (24) discards the
// mantissa entirely (underflow-to-zero and overflow-to-Inf cases). The lone
// 0x0d at index 255 (and 511) is the float Inf/NaN exponent -- presumably it
// keeps top mantissa bits so NaN payloads survive; confirm against the
// table-based conversion reference.
static const uint8_t shifttable[512] = {
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13,
    0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
    0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
    0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17,
    0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d,
    0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
    0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
    0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
    0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d};
// Converts one fp32 value to its fp16 (half) bit pattern via the
// basetable/shifttable lookup scheme: the float's sign+exponent field picks
// the half's base bits and the shift needed to narrow the 23-bit mantissa.
// NOTE(review): dereferencing a reinterpret_cast'ed pointer technically
// violates strict aliasing; std::memcpy (or C++20 std::bit_cast) would be
// the conforming alternative -- confirm build flags before relying on this.
half_t Float2Half(float f) {
  const uint32_t bits = *reinterpret_cast<uint32_t *>(&f);
  const uint32_t exp_index = (bits >> 23) & 0x1ff;  // sign + 8 exponent bits
  const uint32_t mantissa = bits & 0x007fffff;
  return basetable[exp_index] + (mantissa >> shifttable[exp_index]);
}
// Converts one fp16 (half) bit pattern back to fp32: the half's
// sign+exponent field selects a mantissatable segment (via offsettable) and
// an exponent correction (exponenttable); their sum is the float's bits.
float Half2Float(half_t h) {
  const uint32_t exp_index = h >> 10;  // sign + 5 exponent bits
  const uint32_t mantissa_index = offsettable[exp_index] + (h & 0x3ff);
  uint32_t bits = mantissatable[mantissa_index] + exponenttable[exp_index];
  // NOTE(review): same strict-aliasing caveat as Float2Half.
  return *reinterpret_cast<float *>(&bits);
}
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) {
for (int i = 0; i < count; ++i) {
h_array[i] = Float2Half(f_array[i]);
}
}
// Converts `count` fp16 bit patterns from h_array into fp32 values in
// f_array, element by element. A non-positive count converts nothing.
// The two buffers are assumed not to overlap -- TODO confirm with callers.
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) {
  half_t *src = h_array;
  float *dst = f_array;
  for (int remaining = count; remaining > 0; --remaining) {
    *dst++ = Half2Float(*src++);
  }
}
} // namespace lite
} // namespace paddle
...@@ -12,19 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,19 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cl_common.h> #pragma once
#include <cstdint>
__kernel void relu(__read_only image2d_t input, namespace paddle {
__write_only image2d_t output) { namespace lite {
const int x = get_global_id(0); // image_width typedef uint16_t half_t;
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | half_t Float2Half(float f);
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); float Half2Float(half_t h);
in = max((CL_DTYPE4)(0.0f), in);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
}
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
} // namespace lite
} // namespace paddle
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "lite/backends/opencl/cl_image.h" #include "lite/backends/opencl/cl_image.h"
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_runtime.h" #include "lite/backends/opencl/cl_runtime.h"
#include "lite/backends/opencl/cl_utility.h" #include "lite/backends/opencl/cl_utility.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
...@@ -24,7 +25,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) { ...@@ -24,7 +25,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
int width = cl_image.image_dims_[0]; int width = cl_image.image_dims_[0];
int height = cl_image.image_dims_[1]; int height = cl_image.image_dims_[1];
float* image_data = new float[height * width * 4]; uint16_t* image_data = new uint16_t[height * width * 4];
cl::Image* image = cl_image.cl_image(); cl::Image* image = cl_image.cl_image();
cl::array<size_t, 3> origin = {0, 0, 0}; cl::array<size_t, 3> origin = {0, 0, 0};
...@@ -123,7 +124,7 @@ void CLImage::InitCLImage(const cl::Context& context, ...@@ -123,7 +124,7 @@ void CLImage::InitCLImage(const cl::Context& context,
VLOG(3) << " begin init cl image "; VLOG(3) << " begin init cl image ";
image_dims_ = converter->InitImageDimInfoWith(tensor_dims_); image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
float* image_data = new float[image_dims_.production() * 4]; uint16_t* image_data = new uint16_t[image_dims_.production() * 4];
VLOG(3) << " convert to image "; VLOG(3) << " convert to image ";
converter->NCHWToImage(tensor_data_.get(), image_data, tensor_dims_); converter->NCHWToImage(tensor_data_.get(), image_data, tensor_dims_);
......
...@@ -37,7 +37,7 @@ DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -37,7 +37,7 @@ DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) {
} }
void CLImageConverterDefault::NCHWToImage(float *nchw, void CLImageConverterDefault::NCHWToImage(float *nchw,
float *image, half_t *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1}; size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) { for (size_t j = 0; j < tensor_dim.size(); ++j) {
...@@ -69,7 +69,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, ...@@ -69,7 +69,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw,
if (c < C) { if (c < C) {
// size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4); // (c % 4);
image[i2] = *p; image[i2] = Float2Half(*p);
i2 += 4; i2 += 4;
p++; p++;
} else { } else {
...@@ -84,7 +84,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, ...@@ -84,7 +84,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw,
} }
} }
void CLImageConverterDefault::ImageToNCHW(float *image, void CLImageConverterDefault::ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
...@@ -109,7 +109,7 @@ void CLImageConverterDefault::ImageToNCHW(float *image, ...@@ -109,7 +109,7 @@ void CLImageConverterDefault::ImageToNCHW(float *image,
for (size_t h = 0; h < H; h++) { for (size_t h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4; size_t i2 = (i1 << 2) + c % 4;
for (size_t w = 0; w < W; w++) { for (size_t w = 0; w < W; w++) {
*p = image[i2]; *p = Half2Float(image[i2]);
i2 += 4; i2 += 4;
p++; p++;
} }
...@@ -164,7 +164,7 @@ DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -164,7 +164,7 @@ DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) {
} }
void CLImageConverterFolder::NCHWToImage(float *tensor, void CLImageConverterFolder::NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
<< " Tensor dim is not support!"; << " Tensor dim is not support!";
...@@ -187,13 +187,14 @@ void CLImageConverterFolder::NCHWToImage(float *tensor, ...@@ -187,13 +187,14 @@ void CLImageConverterFolder::NCHWToImage(float *tensor,
for (size_t h = 0; h < tdim[0]; h++) { for (size_t h = 0; h < tdim[0]; h++) {
for (size_t w = 0; w < tdim[1]; w++) { for (size_t w = 0; w < tdim[1]; w++) {
image[(h * width + w / 4) * 4 + (w % 4)] = tensor[h * tdim[1] + w]; image[(h * width + w / 4) * 4 + (w % 4)] =
Float2Half(tensor[h * tdim[1] + w]);
} }
} }
} }
} }
void CLImageConverterFolder::ImageToNCHW(float *image, void CLImageConverterFolder::ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
...@@ -216,7 +217,7 @@ void CLImageConverterFolder::ImageToNCHW(float *image, ...@@ -216,7 +217,7 @@ void CLImageConverterFolder::ImageToNCHW(float *image,
for (size_t h = 0; h < H; h++) { for (size_t h = 0; h < H; h++) {
for (size_t w = 0; w < W; w++) { for (size_t w = 0; w < W; w++) {
p[h * W + w] = image[(h * width + w / 4) * 4 + (w % 4)]; p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]);
} }
} }
} }
...@@ -237,7 +238,7 @@ DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -237,7 +238,7 @@ DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
} }
void CLImageConverterNWBlock::NCHWToImage(float *tensor, void CLImageConverterNWBlock::NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
auto image_dim = InitImageDimInfoWith(tensor_dim); auto image_dim = InitImageDimInfoWith(tensor_dim);
...@@ -257,7 +258,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, ...@@ -257,7 +258,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor,
size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4; w * 4 + n % 4;
if (n < N) { if (n < N) {
image[index] = *p; image[index] = Float2Half(*p);
p++; p++;
} else { } else {
image[index] = 0.0; image[index] = 0.0;
...@@ -272,7 +273,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, ...@@ -272,7 +273,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor,
VLOG(3) << " init done"; VLOG(3) << " init done";
} }
void CLImageConverterNWBlock::ImageToNCHW(float *image, void CLImageConverterNWBlock::ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
...@@ -291,7 +292,7 @@ void CLImageConverterNWBlock::ImageToNCHW(float *image, ...@@ -291,7 +292,7 @@ void CLImageConverterNWBlock::ImageToNCHW(float *image,
for (size_t w = 0; w < W; ++w) { for (size_t w = 0; w < W; ++w) {
size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4; w * 4 + n % 4;
*p = image[index]; *p = Half2Float(image[index]);
p++; p++;
if (index >= (width * height * 4)) { if (index >= (width * height * 4)) {
LOG(INFO) << " index out of range "; LOG(INFO) << " index out of range ";
...@@ -318,7 +319,7 @@ DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -318,7 +319,7 @@ DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
} }
void CLImageConverterDWBlock::NCHWToImage(float *tensor, void CLImageConverterDWBlock::NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1}; size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) { for (size_t j = 0; j < tensor_dim.size(); ++j) {
...@@ -350,7 +351,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, ...@@ -350,7 +351,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor,
if (c < C) { if (c < C) {
// size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4); // (c % 4);
image[i2] = *p; image[i2] = Float2Half(*p);
i2 += 4; i2 += 4;
p++; p++;
} else { } else {
...@@ -365,7 +366,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, ...@@ -365,7 +366,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor,
} }
} }
void CLImageConverterDWBlock::ImageToNCHW(float *image, void CLImageConverterDWBlock::ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
...@@ -384,7 +385,7 @@ void CLImageConverterDWBlock::ImageToNCHW(float *image, ...@@ -384,7 +385,7 @@ void CLImageConverterDWBlock::ImageToNCHW(float *image,
for (size_t h = 0; h < H; h++) { for (size_t h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4; size_t i2 = (i1 << 2) + c % 4;
for (size_t w = 0; w < W; w++) { for (size_t w = 0; w < W; w++) {
*p = image[i2]; *p = Half2Float(image[i2]);
i2 += 4; i2 += 4;
p++; p++;
} }
...@@ -418,7 +419,7 @@ DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) { ...@@ -418,7 +419,7 @@ DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) {
} }
void CLImageConverterNormal::NCHWToImage(float *tensor, void CLImageConverterNormal::NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) { const DDim &tensor_dim) {
CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
<< " Tensor dim is not support!"; << " Tensor dim is not support!";
...@@ -427,7 +428,7 @@ void CLImageConverterNormal::NCHWToImage(float *tensor, ...@@ -427,7 +428,7 @@ void CLImageConverterNormal::NCHWToImage(float *tensor,
default_converter.NCHWToImage(tensor, image, tensor_dim); default_converter.NCHWToImage(tensor, image, tensor_dim);
} }
void CLImageConverterNormal::ImageToNCHW(float *image, void CLImageConverterNormal::ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) { const DDim &tensor_dim) {
...@@ -449,10 +450,10 @@ DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith( ...@@ -449,10 +450,10 @@ DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith(
} }
void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) {} const DDim &tensor_dim) {}
void CLImageConverterWinoTransWeight::ImageToNCHW(float *image, void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) {} const DDim &tensor_dim) {}
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include "lite/backends/opencl/cl_half.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
namespace paddle { namespace paddle {
...@@ -24,10 +25,10 @@ class CLImageConverterBase { ...@@ -24,10 +25,10 @@ class CLImageConverterBase {
virtual ~CLImageConverterBase() {} virtual ~CLImageConverterBase() {}
virtual void NCHWToImage(float *nchw, virtual void NCHWToImage(float *nchw,
float *image, half_t *image,
const DDim &tensor_dim) = 0; const DDim &tensor_dim) = 0;
virtual void ImageToNCHW(float *image, virtual void ImageToNCHW(half_t *image,
float *nchw, float *nchw,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) = 0; const DDim &tensor_dim) = 0;
...@@ -37,8 +38,8 @@ class CLImageConverterBase { ...@@ -37,8 +38,8 @@ class CLImageConverterBase {
class CLImageConverterDefault : public CLImageConverterBase { class CLImageConverterDefault : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override; DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *nchw, float *image, const DDim &tensor_dim) override; void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim) override;
void ImageToNCHW(float *image, void ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
...@@ -48,9 +49,9 @@ class CLImageConverterFolder : public CLImageConverterBase { ...@@ -48,9 +49,9 @@ class CLImageConverterFolder : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override; DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor, void NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
void ImageToNCHW(float *image, void ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
...@@ -77,9 +78,9 @@ class CLImageConverterNormal : public CLImageConverterBase { ...@@ -77,9 +78,9 @@ class CLImageConverterNormal : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override; DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor, void NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
void ImageToNCHW(float *image, void ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
...@@ -106,9 +107,9 @@ class CLImageConverterNWBlock : public CLImageConverterBase { ...@@ -106,9 +107,9 @@ class CLImageConverterNWBlock : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override; DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor, void NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
void ImageToNCHW(float *image, void ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
...@@ -117,9 +118,9 @@ class CLImageConverterDWBlock : public CLImageConverterBase { ...@@ -117,9 +118,9 @@ class CLImageConverterDWBlock : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override; DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor, void NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
void ImageToNCHW(float *image, void ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
...@@ -129,9 +130,9 @@ class CLImageConverterWinoTransWeight : public CLImageConverterBase { ...@@ -129,9 +130,9 @@ class CLImageConverterWinoTransWeight : public CLImageConverterBase {
public: public:
DDim InitImageDimInfoWith(const DDim &tensor_dim) override; DDim InitImageDimInfoWith(const DDim &tensor_dim) override;
void NCHWToImage(float *tensor, void NCHWToImage(float *tensor,
float *image, half_t *image,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
void ImageToNCHW(float *image, void ImageToNCHW(half_t *image,
float *tensor, float *tensor,
const DDim &image_dim, const DDim &image_dim,
const DDim &tensor_dim) override; const DDim &tensor_dim) override;
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include <cl_common.h> #include <cl_common.h>
// #define DEBUG
// buffer -> image2d // buffer -> image2d
__kernel void buffer_to_image2d(__global CL_DTYPE *in, __kernel void buffer_to_image2d(__global CL_DTYPE *in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
...@@ -27,6 +28,7 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -27,6 +28,7 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
const int out_n = out_nh / out_H; const int out_n = out_nh / out_H;
const int out_h = out_nh % out_H; const int out_h = out_nh % out_H;
...@@ -47,20 +49,83 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -47,20 +49,83 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
output_pos.x = out_c * out_W + out_w; output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh; output_pos.y = out_nh;
CL_DTYPE4 output = (CL_DTYPE4)0.0f; CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)(0.f, 0.f, 0.f, 0.f);
output.x = convert_float(in[input_pos0]); output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE);
if(out_C - 4 * out_c >= 2){
output.y = convert_float(in[input_pos1]); if (out_C - 4 * out_c >= 2) {
output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE);
} }
if(out_C - 4 * out_c >= 3){ if (out_C - 4 * out_c >= 3) {
output.z = convert_float(in[input_pos2]); output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE);
} }
if(out_C - 4 * out_c >= 4){ if (out_C - 4 * out_c >= 4) {
output.w = convert_float(in[input_pos3]); output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE);
}
#ifdef DEBUG
if (out_w > 2045) {
printf("out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f %.2f\n",
out_w,
out_C - 4 * out_c,
(float)(in[input_pos0]),
(float)(in[input_pos1]),
(float)(in[input_pos2]),
(float)(in[input_pos3]));
printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n", out_c, out_w, out_nh,
output_pos.x, output_pos.y,
(float)(output.x), (float)(output.y), (float)(output.z), (float)(output.w));
} }
write_imagef(output_image, output_pos, output); #endif
WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output);
} }
// image2d -> buffer
__kernel void image2d_to_buffer(__read_only image2d_t input,
__private const int in_width,
__private const int in_height,
__global CL_DTYPE* out,
__private const int size_ch,
__private const int size_block,
__private const int size_batch,
__private const int C) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
const int in_n = in_nh / in_height;
const int in_h = in_nh % in_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w);
CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh));
#ifdef DEBUG
if (in_w > 2045) {
printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n", in_c, in_w, in_nh,
pos_x, in_nh,
(float)(in.x), (float)(in.y), (float)(in.z), (float)(in.w));
}
#endif
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE);
if (C - 4 * in_c >= 2) {
out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE);
}
if(C - 4 * in_c >= 3) {
out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE);
}
if(C - 4 * in_c >= 4) {
out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE);
}
}
#if 0
// buffer -> image2d_nw // buffer -> image2d_nw
__kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
...@@ -97,55 +162,23 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, ...@@ -97,55 +162,23 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in,
output_pos.y = out_ch; output_pos.y = out_ch;
CL_DTYPE4 output = (CL_DTYPE4)0.0f; CL_DTYPE4 output = (CL_DTYPE4)0.0f;
output.x = convert_float(in[input_pos0]); output.x = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos0]);
if (out_N - 4 * out_n >= 2) { if (out_N - 4 * out_n >= 2) {
output.y = convert_float(in[input_pos1]); output.y = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos1]);
} }
if (out_N - 4 * out_n >= 3) { if (out_N - 4 * out_n >= 3) {
output.z = convert_float(in[input_pos2]); output.z = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos2]);
} }
if (out_N - 4 * out_n >= 4) { if (out_N - 4 * out_n >= 4) {
output.w = convert_float(in[input_pos3]); output.w = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos3]);
} }
write_imagef(output_image, output_pos, output);
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
// image2d -> buffer
__kernel void image2d_to_buffer(__read_only image2d_t input,
__private const int in_width,
__private const int in_height,
__global CL_DTYPE* out,
__private const int size_ch,
__private const int size_block,
__private const int size_batch,
__private const int C) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
const int in_n = in_nh / in_height;
const int in_h = in_nh % in_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w);
CL_DTYPE4 in = read_imagef(input, sampler, (int2)(pos_x, in_nh));
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = convert_float(in.x);
if (C - 4 * in_c >= 2) {
out[index + size_ch] = convert_float(in.y);
}
if(C - 4 * in_c >= 3) {
out[index + size_ch * 2] = convert_float(in.z);
}
if(C - 4 * in_c >= 4) {
out[index + size_ch * 3] = convert_float(in.w);
}
} }
#endif
#if 0
// image2d -> buffer // image2d -> buffer
__kernel void image2d_to_buffer_2d(__private const int in_height, __kernel void image2d_to_buffer_2d(__private const int in_height,
__private const int in_width, __private const int in_width,
...@@ -157,11 +190,12 @@ __kernel void image2d_to_buffer_2d(__private const int in_height, ...@@ -157,11 +190,12 @@ __kernel void image2d_to_buffer_2d(__private const int in_height,
const sampler_t sampler = const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 in = read_imagef(input, sampler, (int2)(in_w, in_h)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(in_w, in_h));
const int index = (in_h * in_width + in_w) * 4; const int index = (in_h * in_width + in_w) * 4;
out[index] = convert_float(in.x); out[index] = CONVERT_TYPE_TO(CL_DTYPE, in.x);
out[index + 1] = convert_float(in.y); out[index + 1] = CONVERT_TYPE_TO(CL_DTYPE, in.y);
out[index + 2] = convert_float(in.z); out[index + 2] = CONVERT_TYPE_TO(CL_DTYPE, in.z);
out[index + 3] = convert_float(in.w); out[index + 3] = CONVERT_TYPE_TO(CL_DTYPE, in.w);
} }
#endif
...@@ -29,11 +29,15 @@ limitations under the License. */ ...@@ -29,11 +29,15 @@ limitations under the License. */
#ifdef CL_DTYPE_float #ifdef CL_DTYPE_float
#define CL_DTYPE float #define CL_DTYPE float
#define CL_DTYPE_CHAR f #define CL_DTYPE_CHAR f
#define CL_COMPUTE_DTYPE half
#define CL_COMPUTE_DTYPE_CHAR h
#endif #endif
#ifdef CL_DTYPE_half #ifdef CL_DTYPE_half
#define CL_DTYPE half #define CL_DTYPE half
#define CL_DTYPE_CHAR h #define CL_DTYPE_CHAR h
#define CL_COMPUTE_DTYPE half
#define CL_COMPUTE_DTYPE_CHAR h
#endif #endif
///////////////////////////////// /////////////////////////////////
...@@ -43,6 +47,7 @@ limitations under the License. */ ...@@ -43,6 +47,7 @@ limitations under the License. */
#define GET_VEC_TYPE(type__, size__) type__##size__ #define GET_VEC_TYPE(type__, size__) type__##size__
#define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__) #define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__)
#define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4) #define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4)
#define CL_COMPUTE_DTYPE4 VECTORIZED_TYPE(CL_COMPUTE_DTYPE, 4)
///////////////////////////////// /////////////////////////////////
// CONVERT_TYPE_TO // CONVERT_TYPE_TO
......
...@@ -14,6 +14,23 @@ limitations under the License. */ ...@@ -14,6 +14,23 @@ limitations under the License. */
#include <cl_common.h> #include <cl_common.h>
// Element-wise ReLU over a 2D image: out(x, y) = max(0, in(x, y)).
// Each work-item processes one RGBA texel (4 packed channel values).
__kernel void relu(__read_only image2d_t input,
                   __write_only image2d_t output) {
  const int col = get_global_id(0);  // image_width
  const int row = get_global_id(1);  // image_height

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int2 pos = (int2)(col, row);
  CL_DTYPE4 value = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, pos);
  value = max((CL_DTYPE4)(0.0f), value);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, pos, value);
}
__kernel void relu6(__read_only image2d_t input, __kernel void relu6(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold){ __private const float threshold){
...@@ -30,3 +47,19 @@ __kernel void relu6(__read_only image2d_t input, ...@@ -30,3 +47,19 @@ __kernel void relu6(__read_only image2d_t input,
in = min((CL_DTYPE4)(threshold, threshold, threshold, threshold), in); in = min((CL_DTYPE4)(threshold, threshold, threshold, threshold), in);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
// Element-wise sigmoid over a 2D image: out = 1 / (1 + exp(-in)).
// Each work-item processes one RGBA texel (4 packed channel values).
__kernel void sigmoid(__read_only image2d_t input,
                      __write_only image2d_t output) {
  const int col = get_global_id(0);  // image_width
  const int row = get_global_id(1);  // image_height

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int2 pos = (int2)(col, row);
  CL_DTYPE4 value = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, pos);
  CL_DTYPE4 result = 1 / (1 + exp(-value));
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, pos, result);
}
...@@ -12,19 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,19 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #include <cl_common.h>
__kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t output,
__private const float scale_h, __private const float scale_w,
__private const int in_dims_h, __private const int out_dims_h, __kernel void nearest_interp(__read_only image2d_t input,
__private const int in_dims_w, __private const int out_dims_w) { __write_only image2d_t output,
__private const float scale_h,
__private const float scale_w,
__private const int in_dims_h,
__private const int out_dims_h,
__private const int in_dims_w,
__private const int out_dims_w) {
const int c = get_global_id(0); const int c = get_global_id(0);
const int w = get_global_id(1); const int w = get_global_id(1);
const int nh = get_global_id(2); const int nh = get_global_id(2);
int2 output_pos; int2 output_pos;
output_pos.x = c * out_dims_w + w; output_pos.x = c * out_dims_w + w;
output_pos.y = nh; output_pos.y = nh;
int out_n = nh / out_dims_h; int out_n = nh / out_dims_h;
int out_h = nh % out_dims_h; int out_h = nh % out_dims_h;
int2 input_pos; int2 input_pos;
input_pos.x = c * in_dims_w + w / scale_w; input_pos.x = c * in_dims_w + w / scale_w;
input_pos.y = out_n * in_dims_h + out_h / scale_h; input_pos.y = out_n * in_dims_h + out_h / scale_h;
...@@ -32,6 +42,7 @@ __kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t ...@@ -32,6 +42,7 @@ __kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP | CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST; CLK_FILTER_NEAREST;
half4 input_data = read_imageh(input, sampler, (int2)(input_pos.x, input_pos.y)); CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(input_pos.x, input_pos.y));
write_imageh(output, (int2)(output_pos.x , output_pos.y), input_data);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(output_pos.x , output_pos.y), input_data);
} }
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
// Sigmoid activation on an image2d tensor: out = 1 / (1 + exp(-in)).
// One work-item per texel; x indexes image width, y indexes image height.
__kernel void sigmoid(__read_only image2d_t input,
                      __write_only image2d_t output) {
  const int w_idx = get_global_id(0);  // image_width
  const int h_idx = get_global_id(1);  // image_height

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(w_idx, h_idx));
  CL_DTYPE4 out = 1 / (1 + exp(-in));
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(w_idx, h_idx), out);
}
...@@ -81,8 +81,8 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width, ...@@ -81,8 +81,8 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width,
return cl_image; return cl_image;
} }
template <> // use int16_t represents half float template <> // use uint16_t represents half float
void *TargetWrapperCL::MallocImage<int16_t>(const size_t cl_image2d_width, void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width,
const size_t cl_image2d_height, const size_t cl_image2d_height,
void *host_ptr) { void *host_ptr) {
cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFP16))); cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFP16)));
......
...@@ -178,5 +178,6 @@ void PrecisionCastPass::SetValidPlaces(const std::vector<Place>& valid_places) { ...@@ -178,5 +178,6 @@ void PrecisionCastPass::SetValidPlaces(const std::vector<Place>& valid_places) {
REGISTER_MIR_PASS(type_precision_cast_pass, REGISTER_MIR_PASS(type_precision_cast_pass,
paddle::lite::mir::PrecisionCastPass) paddle::lite::mir::PrecisionCastPass)
.BindTargets({TARGET(kAny)}) .BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kOpenCL)})
.BindKernel("calib_once") .BindKernel("calib_once")
.BindKernel("calib"); .BindKernel("calib");
...@@ -103,8 +103,8 @@ const cl::Image2D *TensorLite::data<float, cl::Image2D>() const { ...@@ -103,8 +103,8 @@ const cl::Image2D *TensorLite::data<float, cl::Image2D>() const {
return static_cast<const cl::Image2D *>(buffer_->data()); return static_cast<const cl::Image2D *>(buffer_->data());
} }
template <> // use int16_t represent half float template <> // use uint16_t represent half float
const cl::Image2D *TensorLite::data<int16_t, cl::Image2D>() const { const cl::Image2D *TensorLite::data<uint16_t, cl::Image2D>() const {
if (nullptr == buffer_->data()) return nullptr; if (nullptr == buffer_->data()) return nullptr;
return static_cast<const cl::Image2D *>(buffer_->data()); return static_cast<const cl::Image2D *>(buffer_->data());
} }
......
...@@ -260,8 +260,8 @@ bool TensorCompareWith(const TensorT &a, const TensorT &b) { ...@@ -260,8 +260,8 @@ bool TensorCompareWith(const TensorT &a, const TensorT &b) {
template <> template <>
const cl::Image2D *TensorLite::data<float, cl::Image2D>() const; const cl::Image2D *TensorLite::data<float, cl::Image2D>() const;
template <> // use int16_t represent half float template <> // use uint16_t represent half float
const cl::Image2D *TensorLite::data<int16_t, cl::Image2D>() const; const cl::Image2D *TensorLite::data<uint16_t, cl::Image2D>() const;
#endif #endif
} // namespace lite } // namespace lite
......
...@@ -4,91 +4,136 @@ endif() ...@@ -4,91 +4,136 @@ endif()
set(cl_kernel_deps op_params cl_runtime cl_context cl_wrapper cl_target_wrapper cl_image_converter) set(cl_kernel_deps op_params cl_runtime cl_context cl_wrapper cl_target_wrapper cl_image_converter)
add_kernel(fc_opencl OPENCL basic SRCS fc_compute.cc DEPS ${cl_kernel_deps}) #####################
add_kernel(mul_opencl OPENCL basic SRCS mul_compute.cc DEPS ${cl_kernel_deps}) # image kernel #
add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_compute.cc DEPS ${cl_kernel_deps}) #####################
add_kernel(elementwise_mul_opencl OPENCL basic SRCS elementwise_mul_compute.cc DEPS ${cl_kernel_deps}) # basic
add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_mul_opencl OPENCL basic SRCS elementwise_mul_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fusion_elementwise_add_activation_opencl add_kernel(fusion_elementwise_add_activation_opencl
OPENCL basic SRCS fusion_elementwise_add_activation_compute.cc OPENCL basic SRCS fusion_elementwise_add_activation_image_compute.cc
DEPS elementwise_add_opencl ${cl_kernel_deps}) DEPS elementwise_add_opencl ${cl_kernel_deps})
add_kernel(pool_opencl OPENCL basic SRCS pool_compute.cc DEPS ${cl_kernel_deps})
add_kernel(io_copy_compute_opencl OPENCL basic SRCS io_copy_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps}) add_kernel(pool_opencl OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps}) add_kernel(activation_opencl OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(sigmoid_opencl OPENCL basic SRCS sigmoid_compute.cc DEPS ${cl_kernel_deps}) add_kernel(reshape_opencl OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps}) add_kernel(conv_opencl OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps} cl_image_converter)
add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps}) add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})
add_kernel(concat_opencl OPENCL basic SRCS concat_compute.cc DEPS ${cl_kernel_deps}) add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_compute.cc DEPS ${cl_kernel_deps}) add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl OPENCL basic SRCS scale_compute.cc DEPS ${cl_kernel_deps}) add_kernel(scale_opencl OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps})
lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc # extra
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context # wait to add ...
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_elementwise_mul_opencl SRCS elementwise_mul_compute_test.cc
DEPS elementwise_mul_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_pool_opencl SRCS pool_compute_test.cc
DEPS pool_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_fc_opencl SRCS fc_compute_test.cc
DEPS fc_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
# TODO(ysh329): comment for buffer-impl mul ######################
#lite_cc_test(test_mul_opencl SRCS mul_compute_test.cc # image kernel test #
# DEPS mul_opencl op_registry program context ######################
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) lite_cc_test(test_activation_image_opencl SRCS activation_image_compute_test.cc
DEPS activation_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_io_copy_compute_opencl SRCS io_copy_compute_test.cc lite_cc_test(test_conv_image_opencl SRCS conv_image_compute_test.cc
DEPS io_copy_compute_opencl op_registry program context DEPS conv_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#TODO(ysh329): comment buffer-impl relu lite_cc_test(test_depthwise_conv2d_image_opencl SRCS depthwise_conv2d_image_compute_test.cc
lite_cc_test(test_relu_opencl SRCS relu_compute_test.cc DEPS conv_opencl op_registry program context
DEPS relu_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_sigmoid_opencl SRCS sigmoid_compute_test.cc lite_cc_test(test_nearest_interp_image_opencl SRCS nearest_interp_image_compute_test.cc
DEPS sigmoid_opencl layout_opencl op_registry program context DEPS nearest_interp_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc lite_cc_test(test_pool_image_opencl SRCS pool_image_compute_test.cc
DEPS depthwise_conv2d_opencl op_registry program context DEPS pool_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_depthwise_conv2d_image2d_opencl SRCS depthwise_conv2d_image2d_compute_test.cc lite_cc_test(test_scale_image_opencl SRCS scale_image_compute_test.cc
DEPS conv_opencl op_registry program context DEPS scale_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_reshape_opencl SRCS reshape_compute_test.cc lite_cc_test(test_reshape_image_opencl SRCS reshape_image_compute_test.cc
DEPS reshape_opencl op_registry program context DEPS reshape_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc lite_cc_test(test_concat_image_opencl SRCS concat_image_compute_test.cc
DEPS conv_opencl op_registry program context DEPS concat_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_conv_image2d_opencl SRCS conv_image2d_compute_test.cc lite_cc_test(test_elementwise_mul_image_opencl SRCS elementwise_mul_image_compute_test.cc
DEPS conv_opencl op_registry program context cl_image_converter DEPS elementwise_mul_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc
DEPS layout_opencl op_registry program context cl_image_converter DEPS layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_concat_opencl SRCS concat_compute_test.cc lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc
DEPS concat_opencl layout_opencl op_registry program context DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_nearest_interp_opencl SRCS nearest_interp_compute_test.cc
DEPS nearest_interp_opencl layout_opencl op_registry program context cl_image_converter ######################
# buffer kernel #
######################
# basic
#add_kernel(activation_opencl OPENCL basic SRCS activation_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(conv_opencl OPENCL basic SRCS conv_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(pool_opencl OPENCL basic SRCS pool_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(concat_opencl OPENCL basic SRCS concat_buffer_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fc_opencl OPENCL basic SRCS fc_buffer_compute.cc DEPS ${cl_kernel_deps})
add_kernel(mul_opencl OPENCL basic SRCS mul_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(fusion_elementwise_add_activation_opencl
# OPENCL basic SRCS fusion_elementwise_add_activation_buffer_compute.cc
# DEPS elementwise_add_opencl ${cl_kernel_deps})
add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
# extra
# wait to add ...
######################
# buffer kernel test #
######################
#lite_cc_test(test_activation_buffer_opencl SRCS activation_buffer_compute_test.cc
# DEPS activation_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_conv_buffer_opencl SRCS conv_buffer_compute_test.cc
# DEPS conv_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_depthwise_conv2d_buffer_opencl SRCS depthwise_conv2d_buffer_compute_test.cc
# DEPS depthwise_conv2d_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_pool_buffer_opencl SRCS pool_buffer_compute_test.cc
# DEPS pool_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_concat_buffer_opencl SRCS concat_buffer_compute_test.cc
# DEPS concat_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_fc_buffer_opencl SRCS fc_buffer_compute_test.cc
DEPS fc_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_scale_opencl SRCS scale_compute_test.cc lite_cc_test(test_mul_buffer_opencl SRCS mul_buffer_compute_test.cc
DEPS scale_opencl op_registry program context DEPS mul_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_elementwise_add_buffer_opencl SRCS elementwise_add__buffer_compute_test.cc
# DEPS elementwise_add_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_io_copy_buffer_opencl SRCS io_copy_buffer_compute_test.cc
DEPS io_copy_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Buffer-based ReLU kernel for the OpenCL backend (kFloat, NCHW).
// Launches "relu" from buffer/relu_kernel.cl with one work-item per element.
class ReluCompute
    : public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override { return "Relu using cl::Buffer, kFloat"; }

  // Compile/register the OpenCL program once before the first Run().
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "buffer/relu_kernel.cl", build_options_);
  }

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& input_dims = param.X->dims();
    const size_t element_count = input_dims.production();

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);

    auto* input_buf = param.X->data<float, cl::Buffer>();
    auto* output_buf =
        param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));

    // Compiled kernels are cached under "<function name><build options>".
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    VLOG(4) << TargetToStr(param.X->target());
    VLOG(4) << TargetToStr(param.Out->target());

    // Kernel signature: (input buffer, element count, output buffer).
    int arg_idx = 0;
    cl_int status = kernel.setArg(arg_idx, *input_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, (const int)element_count);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *output_buf);
    CL_CHECK_FATAL(status);

    // One work-item per element; let the driver pick the local size.
    auto global_work_size = cl::NDRange{element_count};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // Publish the completion event so consumers of the output can wait on it.
    context.cl_wait_list()->emplace(output_buf, event_);
  }

 private:
  std::string kernel_func_name_{"relu"};
  std::string build_options_{"-DCL_DTYPE_float -DRELU"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
// Buffer-based sigmoid kernel for the OpenCL backend (kFloat, NCHW).
// Launches "sigmoid" from buffer/sigmoid_kernel.cl, one work-item per element.
class SigmoidCompute
    : public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override {
    return "Sigmoid using cl::Buffer, kFloat";
  }

  // Compile/register the OpenCL program once before the first Run().
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "buffer/sigmoid_kernel.cl", build_options_);
  }

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& input_dims = param.X->dims();
    const size_t element_count = input_dims.production();

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);

    auto* input_buf = param.X->data<float, cl::Buffer>();
    auto* output_buf =
        param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));

    // Compiled kernels are cached under "<function name><build options>".
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    VLOG(4) << TargetToStr(param.X->target());
    VLOG(4) << TargetToStr(param.Out->target());

    // Kernel signature: (input buffer, element count, output buffer).
    int arg_idx = 0;
    cl_int status = kernel.setArg(arg_idx, *input_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, (const int)element_count);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *output_buf);
    CL_CHECK_FATAL(status);

    // One work-item per element; let the driver pick the local size.
    auto global_work_size = cl::NDRange{element_count};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // Publish the completion event so consumers of the output can wait on it.
    context.cl_wait_list()->emplace(output_buf, event_);
  }

 private:
  std::string kernel_func_name_{"sigmoid"};
  std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Relu
// Registers the buffer-based ReLU kernel above for the OpenCL target
// (kFloat precision, NCHW layout); both X and Out are tensors living in
// kOpenCL (cl::Buffer) memory.
REGISTER_LITE_KERNEL(relu,
                     kOpenCL,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::opencl::ReluCompute,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .Finalize();
// Sigmoid
// Registers the buffer-based sigmoid kernel above for the OpenCL target
// (kFloat precision, NCHW layout); both X and Out are tensors living in
// kOpenCL (cl::Buffer) memory.
REGISTER_LITE_KERNEL(sigmoid,
                     kOpenCL,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::opencl::SigmoidCompute,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .Finalize();  // fix: the original dropped the terminating semicolon
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>

#include <cmath>
#include <random>

#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
namespace paddle {
namespace lite {
// Reference CPU implementation of ReLU / clipped ReLU used to validate the
// OpenCL kernel output.
//
//   threshold == 0 : plain ReLU,    out = max(x, 0)
//   threshold != 0 : clipped ReLU,  out = min(max(x, 0), threshold)
template <typename dtype>
void relu_compute_ref(const dtype *x_data,
                      const DDim &x_dim,
                      dtype *out_data,
                      float threshold = 0.f) {
  // fix: the unqualified abs() previously used here may bind to the C
  // integer overload (implementation-dependent), truncating the float
  // threshold; std::fabs keeps the comparison in floating point.
  if (std::fabs(threshold) < 1e-5) {
    // relu
    for (int i = 0; i < x_dim.production(); ++i) {
      out_data[i] = (x_data[i] > threshold) ? x_data[i] : threshold;
    }
  } else {
    // relu6 or relu with threshold
    for (int i = 0; i < x_dim.production(); ++i) {
      auto out_tmp = (x_data[i] > 0) ? x_data[i] : 0;
      out_data[i] = (out_tmp < threshold) ? out_tmp : threshold;
    }
  }
}
// Reference CPU sigmoid used to validate the OpenCL kernel output:
// out[i] = 1 / (1 + e^{-x[i]}), element-wise.
template <typename dtype>
void sigmoid_compute_ref(const dtype *x_data,
                         const DDim &x_dim,
                         dtype *out_data) {
  const int total = x_dim.production();
  for (int idx = 0; idx < total; ++idx) {
    out_data[idx] = 1 / (1 + expf(-x_data[idx]));
  }
}
// End-to-end check of the buffer-based OpenCL "relu" kernel: fill a mapped
// cl::Buffer with random floats, launch the kernel through the registry,
// wait on its completion event, and compare against relu_compute_ref.
TEST(opencl_relu_buffer, compute) {
  // prepare data
  const DDim x_dim = DDim(std::vector<DDim::value_type>{3, 6, 10, 10});
  lite::Tensor x, out;
  x.Resize(x_dim);
  out.Resize(x_dim);

  // Map the device buffer into host memory and fill with uniform [-10, 10).
  auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-10, 10);
  auto *mapped_x = static_cast<float *>(
      TargetWrapperCL::Map(x_data, 0, sizeof(float) * x_dim.production()));
  for (int i = 0; i < x_dim.production(); i++) {
    mapped_x[i] = dist(engine);
  }

  // set param and kernel, then run
  operators::ActivationParam param;
  param.X = &x;
  param.Out = &out;

  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();

  // Look up the registered OpenCL kFloat/NCHW relu kernel.
  auto kernels = KernelRegistry::Global().Create(
      "relu", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
  ASSERT_FALSE(kernels.empty());

  auto kernel = std::move(kernels.front());
  kernel->SetParam(param);
  std::unique_ptr<KernelContext> relu_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(relu_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(relu_context));

  kernel->Launch();

  // The kernel enqueues asynchronously; block on the event it registered
  // for the output buffer before reading results.
  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto *out_ptr = param.Out->data<float, cl::Buffer>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
    auto &event = *(it->second);
    event.wait();
  } else {
    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
  }

  // run compute ref and check
  std::unique_ptr<float[]> out_ref(new float[x_dim.production()]);
  relu_compute_ref<float>(mapped_x, x_dim, out_ref.get());

  auto *out_data = out.mutable_data<float, cl::Buffer>();
  auto *mapped_out = static_cast<float *>(
      TargetWrapperCL::Map(out_data, 0, sizeof(float) * x_dim.production()));
  for (int i = 0; i < x_dim.production(); i++) {
    EXPECT_NEAR(mapped_out[i], out_ref[i], 1e-6);
  }
  TargetWrapperCL::Unmap(out_data, mapped_out);
  TargetWrapperCL::Unmap(x_data, mapped_x);
}
// End-to-end check of the buffer-based OpenCL "sigmoid" kernel: fill a
// mapped cl::Buffer with random floats, launch the kernel through the
// registry, wait on its completion event, and compare against
// sigmoid_compute_ref.
TEST(opencl_sigmoid_buffer, compute) {
  // prepare data
  const DDim x_dim = DDim(std::vector<DDim::value_type>{3, 6, 10, 10});
  lite::Tensor x, out;
  x.Resize(x_dim);
  out.Resize(x_dim);

  // Map the device buffer into host memory and fill with uniform [-10, 10).
  auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-10, 10);
  auto *mapped_x = static_cast<float *>(
      TargetWrapperCL::Map(x_data, 0, sizeof(float) * x_dim.production()));
  for (int i = 0; i < x_dim.production(); i++) {
    mapped_x[i] = dist(engine);
  }

  // set param and kernel, then run
  operators::ActivationParam param;
  param.X = &x;
  param.Out = &out;

  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();

  // Look up the registered OpenCL kFloat/NCHW sigmoid kernel.
  auto kernels = KernelRegistry::Global().Create(
      "sigmoid", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
  ASSERT_FALSE(kernels.empty());

  auto kernel = std::move(kernels.front());
  kernel->SetParam(param);
  std::unique_ptr<KernelContext> sigmoid_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(sigmoid_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(sigmoid_context));

  kernel->Launch();

  // The kernel enqueues asynchronously; block on the event it registered
  // for the output buffer before reading results.
  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto *out_ptr = param.Out->data<float, cl::Buffer>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
    auto &event = *(it->second);
    event.wait();
  } else {
    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
  }

  // run compute ref and check
  std::unique_ptr<float[]> out_ref(new float[x_dim.production()]);
  sigmoid_compute_ref<float>(mapped_x, x_dim, out_ref.get());

  auto *out_data = out.mutable_data<float, cl::Buffer>();
  auto *mapped_out = static_cast<float *>(
      TargetWrapperCL::Map(out_data, 0, sizeof(float) * x_dim.production()));
  for (int i = 0; i < x_dim.production(); i++) {
    EXPECT_NEAR(mapped_out[i], out_ref[i], 1e-6);
  }
  TargetWrapperCL::Unmap(out_data, mapped_out);
  TargetWrapperCL::Unmap(x_data, mapped_x);
}
} // namespace lite
} // namespace paddle
// Force-link the buffer-layout OpenCL kernels tested above so their static
// registrars are not dropped by the linker from this test binary.
// sigmoid buffer
USE_LITE_KERNEL(sigmoid, kOpenCL, kFloat, kNCHW, def);
// relu buffer
USE_LITE_KERNEL(relu, kOpenCL, kFloat, kNCHW, def);
...@@ -24,44 +24,55 @@ namespace lite { ...@@ -24,44 +24,55 @@ namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
class SigmoidCompute class ReluComputeImageDefault : public KernelLite<TARGET(kOpenCL),
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> { PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ActivationParam; using param_t = operators::ActivationParam;
std::string doc() const override { std::string doc() const override {
return "Sigmoid using cl::Buffer, kFloat"; return "Relu using cl::Image2D(ImageDefault/RGBA), kFP16";
} }
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "buffer/sigmoid_kernel.cl", build_options_); kernel_func_name_, "image/activation_kernel.cl", build_options_);
} }
void Run() override { void Run() override {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims(); const auto& x_dims = param.X->dims();
size_t count = x_dims.production(); auto* x_buf = param.X->data<uint16_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.Out->mutable_data<uint16_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
auto* x_buf = param.X->data<float, cl::Buffer>();
auto* out_buf = param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
int arg_idx = 0; int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf); cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)count);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf); status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
auto global_work_size = cl::NDRange{count}; VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
static_cast<cl::size_type>(image_shape["height"])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
...@@ -70,40 +81,42 @@ class SigmoidCompute ...@@ -70,40 +81,42 @@ class SigmoidCompute
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_); // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
} }
private: private:
std::string kernel_func_name_{"sigmoid"}; std::string kernel_func_name_{"relu"};
std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"}; std::string build_options_{"-DCL_DTYPE_half -DRELU"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
class SigmoidComputeFloatImageDefault class Relu6ComputeImageDefault : public KernelLite<TARGET(kOpenCL),
: public KernelLite<TARGET(kOpenCL), PRECISION(kFP16),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ActivationParam; using param_t = operators::ActivationParam;
std::string doc() const override { std::string doc() const override {
return "Sigmoid using cl::Image2D(ImageDefault/RGBA), kFloat"; return "Relu6 using cl::Image2D(ImageDefault/RGBA), kFP16";
} }
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "image/sigmoid_kernel.cl", build_options_); kernel_func_name_, "image/activation_kernel.cl", build_options_);
} }
void Run() override { void Run() override {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims(); const auto& x_dims = param.X->dims();
auto* x_buf = param.X->data<float, cl::Image2D>(); auto* x_buf = param.X->data<uint16_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(x_dims); auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = param.Out->mutable_data<float, cl::Image2D>( auto* out_buf = param.Out->mutable_data<uint16_t, cl::Image2D>(
image_shape["width"], image_shape["height"]); image_shape["width"], image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only const auto& y_dims = param.Out->dims(); // useless: check dim only
auto threshold = param.Relu_clipped_coef;
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
...@@ -116,6 +129,8 @@ class SigmoidComputeFloatImageDefault ...@@ -116,6 +129,8 @@ class SigmoidComputeFloatImageDefault
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf); status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, threshold);
CL_CHECK_FATAL(status);
VLOG(4) << TargetToStr(param.X->target()); VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target()); VLOG(4) << TargetToStr(param.Out->target());
...@@ -125,6 +140,7 @@ class SigmoidComputeFloatImageDefault ...@@ -125,6 +140,7 @@ class SigmoidComputeFloatImageDefault
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3]; << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " " VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3]; << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
VLOG(4) << "threshold:" << threshold;
auto global_work_size = auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]), cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
...@@ -143,12 +159,12 @@ class SigmoidComputeFloatImageDefault ...@@ -143,12 +159,12 @@ class SigmoidComputeFloatImageDefault
} }
private: private:
std::string kernel_func_name_{"sigmoid"}; std::string kernel_func_name_{"relu6"};
std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"}; std::string build_options_{"-DCL_DTYPE_half -DRELU6"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
class SigmoidComputeFP16ImageDefault class SigmoidComputeImageDefault
: public KernelLite<TARGET(kOpenCL), : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
...@@ -162,18 +178,18 @@ class SigmoidComputeFP16ImageDefault ...@@ -162,18 +178,18 @@ class SigmoidComputeFP16ImageDefault
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "image/sigmoid_kernel.cl", build_options_); kernel_func_name_, "image/activation_kernel.cl", build_options_);
} }
void Run() override { void Run() override {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims(); const auto& x_dims = param.X->dims();
auto* x_buf = auto* x_buf =
param.X->data<int16_t, param.X->data<uint16_t,
cl::Image2D>(); // use int16_t represents half float cl::Image2D>(); // use uint16_t represents half float
auto image_shape = InitImageDimInfoWith(x_dims); auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = auto* out_buf =
param.Out->mutable_data<int16_t, cl::Image2D>( // use int16_t param.Out->mutable_data<uint16_t, cl::Image2D>( // use uint16_t
// represents half float // represents half float
image_shape["width"], image_shape["width"],
image_shape["height"]); image_shape["height"]);
...@@ -227,39 +243,46 @@ class SigmoidComputeFP16ImageDefault ...@@ -227,39 +243,46 @@ class SigmoidComputeFP16ImageDefault
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// REGISTER_LITE_KERNEL(sigmoid, // Relu
// kOpenCL, REGISTER_LITE_KERNEL(relu,
// kFloat,
// kNCHW,
// paddle::lite::kernels::opencl::SigmoidCompute,
// def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
REGISTER_LITE_KERNEL(
sigmoid,
kOpenCL, kOpenCL,
kFloat, kFP16,
kImageDefault, kImageDefault,
paddle::lite::kernels::opencl::SigmoidComputeFloatImageDefault, paddle::lite::kernels::opencl::ReluComputeImageDefault,
ImageDefault) ImageDefault)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// Relu6
REGISTER_LITE_KERNEL(relu6,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::Relu6ComputeImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL( // Sigmoid
sigmoid, REGISTER_LITE_KERNEL(sigmoid,
kOpenCL, kOpenCL,
kFP16, kFP16,
kImageDefault, kImageDefault,
paddle::lite::kernels::opencl::SigmoidComputeFP16ImageDefault, paddle::lite::kernels::opencl::SigmoidComputeImageDefault,
ImageDefault) ImageDefault)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
......
...@@ -41,224 +41,17 @@ void relu_compute_ref(const dtype *x_data, ...@@ -41,224 +41,17 @@ void relu_compute_ref(const dtype *x_data,
} }
} }
#if 0 // relu_buffer template <typename dtype>
TEST(opencl_relu_buffer, compute) { void sigmoid_compute_ref(const dtype *x_data,
// prepare data const DDim &x_dim,
const DDim x_dim = DDim(std::vector<DDim::value_type>{3, 6, 10, 10}); dtype *out_data) {
lite::Tensor x, out;
x.Resize(x_dim);
out.Resize(x_dim);
auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-10, 10);
auto *mapped_x = static_cast<float *>(
TargetWrapperCL::Map(x_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); i++) {
mapped_x[i] = dist(engine);
}
// set param and kernel, then run
operators::ActivationParam param;
param.X = &x;
param.Out = &out;
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
auto kernels = KernelRegistry::Global().Create(
"relu", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
kernel->SetParam(param);
std::unique_ptr<KernelContext> relu_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(relu_context->As<OpenCLContext>()));
kernel->SetContext(std::move(relu_context));
kernel->Launch();
auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
auto *out_ptr = param.Out->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
auto &event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
}
// run compute ref and check
std::unique_ptr<float[]> out_ref(new float[x_dim.production()]);
relu_compute_ref<float>(mapped_x, x_dim, out_ref.get());
auto *out_data = out.mutable_data<float, cl::Buffer>();
auto *mapped_out = static_cast<float *>(
TargetWrapperCL::Map(out_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); i++) {
EXPECT_NEAR(mapped_out[i], out_ref[i], 1e-6);
}
TargetWrapperCL::Unmap(out_data, mapped_out);
TargetWrapperCL::Unmap(x_data, mapped_x);
}
#endif // relu_buffer
// #define LOOP_TEST
// #define PRINT_RESULT
TEST(relu_image2d_fp32, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu(img) -> "
"layout(img2buf) "
"-> host";
#ifdef LOOP_TEST
for (int n = 1; n <= 100; n += 33) {
for (auto c : {1, 3}) {
for (int h = 12; h <= 100; h += 13) {
for (int w = 12; w <= 100; w += 25) {
#else
const int n = 1;
const int c = 2;
const int h = 3;
const int w = 4;
#endif // LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
<< h << " " << w << " ========";
// set layout kernels
auto buf_to_img_kernels =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault));
auto img_to_buf_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
auto relu_img_kernels =
KernelRegistry::Global().Create("relu",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(relu_img_kernels.empty());
auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
auto relu_img_kernel = std::move(relu_img_kernels.front());
LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
LOG(INFO) << "get 3rd kernel: " << relu_img_kernel->doc();
// set tensors about op param
LOG(INFO) << "set tensors about op param";
// layout(buf->img): x -> relu_in
// relu(img): relu_in -> relu_out
// layout(img->buf): relu_out -> y
lite::Tensor x, y, relu_in, relu_out, y_ref;
operators::LayoutParam BufferToImageParam;
operators::LayoutParam ImageToBufferParam;
BufferToImageParam.x = &x;
BufferToImageParam.y = &relu_in;
ImageToBufferParam.x = &relu_out;
ImageToBufferParam.y = &y;
operators::ActivationParam ReluParam;
ReluParam.X = &relu_in;
ReluParam.Out = &relu_out;
const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
x.Resize(x_dim);
y.Resize(x_dim);
relu_in.Resize(x_dim);
relu_out.Resize(x_dim);
y_ref.Resize(x_dim);
auto relu_image2d_shape =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);
// initialize tensors
LOG(INFO) << "initialize tensors";
auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
auto *mapped_x = static_cast<float *>(TargetWrapperCL::Map(
x_data, 0, sizeof(float) * x_dim.production()));
auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); ++i) { for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2; out_data[i] = 1 / (1 + expf(-x_data[i]));
mapped_y[i] = static_cast<int>(0);
}
auto *relu_in_data = relu_in.mutable_data<float, cl::Image2D>(
relu_image2d_shape["width"], relu_image2d_shape["height"]);
auto *relu_out_data = relu_out.mutable_data<float, cl::Image2D>(
relu_image2d_shape["width"], relu_image2d_shape["height"]);
// set context and kernel args
LOG(INFO) << "set context and kernel args";
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
buf_to_img_kernel->SetParam(BufferToImageParam);
std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(buf_to_img_context->As<OpenCLContext>()));
buf_to_img_kernel->SetContext(std::move(buf_to_img_context));
img_to_buf_kernel->SetParam(ImageToBufferParam);
std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(img_to_buf_context->As<OpenCLContext>()));
img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
relu_img_kernel->SetParam(ReluParam);
std::unique_ptr<KernelContext> relu_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(relu_img_context->As<OpenCLContext>()));
relu_img_kernel->SetContext(std::move(relu_img_context));
// run kernels
LOG(INFO) << "run kernel: buf_to_img_kernel";
buf_to_img_kernel->Launch();
LOG(INFO) << "run kernel: relu_img_kernel";
relu_img_kernel->Launch();
LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch();
// compute ref cpu
relu_compute_ref<float>(mapped_x, x_dim, y_data_ref);
// result
#ifdef PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl;
}
#endif // PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < x_dim.production(); eidx++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", y_data_ref["
<< eidx << "]:" << y_data_ref[eidx] << ", mapped_y["
<< eidx << "]:" << mapped_y[eidx];
break;
}
} }
// free
LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data, mapped_x);
TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef LOOP_TEST
} // w
} // h
} // c
} // n
#else
// nothing to do.
#endif
} }
// #define RELU_FP16_LOOP_TEST
// #define RELU_FP16_PRINT_RESULT
TEST(relu_image2d_fp16, compute) { TEST(relu_image2d_fp16, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu(img) -> " LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu(img) -> "
"layout(img2buf) " "layout(img2buf) "
...@@ -340,9 +133,9 @@ TEST(relu_image2d_fp16, compute) { ...@@ -340,9 +133,9 @@ TEST(relu_image2d_fp16, compute) {
mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2; mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2;
mapped_y[i] = static_cast<int>(0); mapped_y[i] = static_cast<int>(0);
} }
auto *relu_in_data = relu_in.mutable_data<int16_t, cl::Image2D>( auto *relu_in_data = relu_in.mutable_data<uint16_t, cl::Image2D>(
relu_image2d_shape["width"], relu_image2d_shape["height"]); relu_image2d_shape["width"], relu_image2d_shape["height"]);
auto *relu_out_data = relu_out.mutable_data<int16_t, cl::Image2D>( auto *relu_out_data = relu_out.mutable_data<uint16_t, cl::Image2D>(
relu_image2d_shape["width"], relu_image2d_shape["height"]); relu_image2d_shape["width"], relu_image2d_shape["height"]);
// set context and kernel args // set context and kernel args
...@@ -413,14 +206,14 @@ TEST(relu_image2d_fp16, compute) { ...@@ -413,14 +206,14 @@ TEST(relu_image2d_fp16, compute) {
#endif #endif
} }
// #define RELU6_FP32_LOOP_TEST // #define RELU6_FP16_LOOP_TEST
// #define RELU6_FP32_PRINT_RESULT // #define RELU6_FP16_PRINT_RESULT
TEST(relu6_image2d_fp32, compute) { TEST(relu6_image2d_fp16, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu6(img) -> " LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu6(img) -> "
"layout(img2buf) " "layout(img2buf) "
"-> host"; "-> host";
#ifdef RELU6_FP32_LOOP_TEST #ifdef RELU6_FP16_LOOP_TEST
for (int n = 1; n <= 100; n += 33) { for (int n = 1; n <= 100; n += 33) {
for (auto c : {1, 3}) { for (auto c : {1, 3}) {
for (int h = 12; h <= 100; h += 13) { for (int h = 12; h <= 100; h += 13) {
...@@ -430,7 +223,7 @@ TEST(relu6_image2d_fp32, compute) { ...@@ -430,7 +223,7 @@ TEST(relu6_image2d_fp32, compute) {
const int c = 2; const int c = 2;
const int h = 3; const int h = 3;
const int w = 4; const int w = 4;
#endif // RELU6_FP32_LOOP_TEST #endif // RELU6_FP16_LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " " LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
<< h << " " << w << " ========"; << h << " " << w << " ========";
...@@ -445,7 +238,7 @@ TEST(relu6_image2d_fp32, compute) { ...@@ -445,7 +238,7 @@ TEST(relu6_image2d_fp32, compute) {
auto relu_img_kernels = auto relu_img_kernels =
KernelRegistry::Global().Create("relu6", KernelRegistry::Global().Create("relu6",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty()); ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels.empty()); ASSERT_FALSE(buf_to_img_kernels.empty());
...@@ -497,9 +290,9 @@ TEST(relu6_image2d_fp32, compute) { ...@@ -497,9 +290,9 @@ TEST(relu6_image2d_fp32, compute) {
mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2; mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2;
mapped_y[i] = static_cast<int>(0); mapped_y[i] = static_cast<int>(0);
} }
auto *relu_in_data = relu_in.mutable_data<float, cl::Image2D>( auto *relu_in_data = relu_in.mutable_data<uint16_t, cl::Image2D>(
relu_image2d_shape["width"], relu_image2d_shape["height"]); relu_image2d_shape["width"], relu_image2d_shape["height"]);
auto *relu_out_data = relu_out.mutable_data<float, cl::Image2D>( auto *relu_out_data = relu_out.mutable_data<uint16_t, cl::Image2D>(
relu_image2d_shape["width"], relu_image2d_shape["height"]); relu_image2d_shape["width"], relu_image2d_shape["height"]);
// set context and kernel args // set context and kernel args
...@@ -536,13 +329,13 @@ TEST(relu6_image2d_fp32, compute) { ...@@ -536,13 +329,13 @@ TEST(relu6_image2d_fp32, compute) {
// compute ref cpu // compute ref cpu
relu_compute_ref<float>(mapped_x, x_dim, y_data_ref, 6.f); relu_compute_ref<float>(mapped_x, x_dim, y_data_ref, 6.f);
// result // result
#ifdef RELU6_FP32_PRINT_RESULT #ifdef RELU6_FP16_PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----"; LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) { for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx] std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl; << std::endl;
} }
#endif // RELU6_FP32_PRINT_RESULT #endif // RELU6_FP16_PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref) // check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < x_dim.production(); eidx++) { for (int eidx = 0; eidx < x_dim.production(); eidx++) {
...@@ -560,7 +353,7 @@ TEST(relu6_image2d_fp32, compute) { ...@@ -560,7 +353,7 @@ TEST(relu6_image2d_fp32, compute) {
LOG(INFO) << "free: unmap x, y"; LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data, mapped_x); TargetWrapperCL::Unmap(x_data, mapped_x);
TargetWrapperCL::Unmap(y_data, mapped_y); TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef RELU6_FP32_LOOP_TEST #ifdef RELU6_FP16_LOOP_TEST
} // w } // w
} // h } // h
} // c } // c
...@@ -570,14 +363,14 @@ TEST(relu6_image2d_fp32, compute) { ...@@ -570,14 +363,14 @@ TEST(relu6_image2d_fp32, compute) {
#endif #endif
} }
// #define RELU6_FP16_LOOP_TEST // #define SIGMOID_FP16_LOOP_TEST
// #define RELU6_FP16_PRINT_RESULT // #define SIGMOID_FP16_PRINT_RESULT
TEST(relu6_image2d_fp16, compute) { TEST(sigmoid_image2d_fp16, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu6(img) -> " LOG(INFO) << "main steps of test: host -> layout(buf2img) -> sigmoid(img) -> "
"layout(img2buf) " "layout(img2buf) "
"-> host"; "-> host";
#ifdef RELU6_FP16_LOOP_TEST #ifdef SIGMOID_FP16_LOOP_TEST
for (int n = 1; n <= 100; n += 33) { for (int n = 1; n <= 100; n += 33) {
for (auto c : {1, 3}) { for (auto c : {1, 3}) {
for (int h = 12; h <= 100; h += 13) { for (int h = 12; h <= 100; h += 13) {
...@@ -587,7 +380,7 @@ TEST(relu6_image2d_fp16, compute) { ...@@ -587,7 +380,7 @@ TEST(relu6_image2d_fp16, compute) {
const int c = 2; const int c = 2;
const int h = 3; const int h = 3;
const int w = 4; const int w = 4;
#endif // RELU6_FP16_LOOP_TEST #endif // SIGMOID_FP16_LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " " LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
<< h << " " << w << " ========"; << h << " " << w << " ========";
...@@ -599,46 +392,45 @@ TEST(relu6_image2d_fp16, compute) { ...@@ -599,46 +392,45 @@ TEST(relu6_image2d_fp16, compute) {
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
auto img_to_buf_kernels = KernelRegistry::Global().Create( auto img_to_buf_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)); "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
auto relu_img_kernels = auto sigmoid_img_kernels =
KernelRegistry::Global().Create("relu6", KernelRegistry::Global().Create("sigmoid",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty()); ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels.empty()); ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(relu_img_kernels.empty()); ASSERT_FALSE(sigmoid_img_kernels.empty());
auto buf_to_img_kernel = std::move(buf_to_img_kernels.front()); auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
auto img_to_buf_kernel = std::move(img_to_buf_kernels.front()); auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
auto relu_img_kernel = std::move(relu_img_kernels.front()); auto sigmoid_img_kernel = std::move(sigmoid_img_kernels.front());
LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc(); LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc(); LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
LOG(INFO) << "get 3rd kernel: " << relu_img_kernel->doc(); LOG(INFO) << "get 3rd kernel: " << sigmoid_img_kernel->doc();
// set tensors about op param // set tensors about op param
LOG(INFO) << "set tensors about op param"; LOG(INFO) << "set tensors about op param";
// layout(buf->img): x -> relu_in // layout(buf->img): x -> sigmoid_in
// relu(img): relu_in -> relu_out // sigmoid(img): sigmoid_in -> sigmoid_out
// layout(img->buf): relu_out -> y // layout(img->buf): sigmoid_out -> y
lite::Tensor x, y, relu_in, relu_out, y_ref; lite::Tensor x, y, sigmoid_in, sigmoid_out, y_ref;
operators::LayoutParam BufferToImageParam; operators::LayoutParam BufferToImageParam;
operators::LayoutParam ImageToBufferParam; operators::LayoutParam ImageToBufferParam;
BufferToImageParam.x = &x; BufferToImageParam.x = &x;
BufferToImageParam.y = &relu_in; BufferToImageParam.y = &sigmoid_in;
ImageToBufferParam.x = &relu_out; ImageToBufferParam.x = &sigmoid_out;
ImageToBufferParam.y = &y; ImageToBufferParam.y = &y;
operators::ActivationParam ReluParam; operators::ActivationParam SigmoidParam;
ReluParam.X = &relu_in; SigmoidParam.X = &sigmoid_in;
ReluParam.Out = &relu_out; SigmoidParam.Out = &sigmoid_out;
ReluParam.Relu_clipped_coef = 6.f;
const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w}); const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
x.Resize(x_dim); x.Resize(x_dim);
y.Resize(x_dim); y.Resize(x_dim);
relu_in.Resize(x_dim); sigmoid_in.Resize(x_dim);
relu_out.Resize(x_dim); sigmoid_out.Resize(x_dim);
y_ref.Resize(x_dim); y_ref.Resize(x_dim);
auto relu_image2d_shape = auto sigmoid_image2d_shape =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim); paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);
// initialize tensors // initialize tensors
...@@ -650,14 +442,19 @@ TEST(relu6_image2d_fp16, compute) { ...@@ -650,14 +442,19 @@ TEST(relu6_image2d_fp16, compute) {
x_data, 0, sizeof(float) * x_dim.production())); x_data, 0, sizeof(float) * x_dim.production()));
auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map( auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * x_dim.production())); y_data, 0, sizeof(float) * x_dim.production()));
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-1, 1);
for (int i = 0; i < x_dim.production(); ++i) { for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2; mapped_x[i] = static_cast<float>(dist(engine));
mapped_y[i] = static_cast<int>(0);
} }
auto *relu_in_data = relu_in.mutable_data<int16_t, cl::Image2D>( auto *sigmoid_in_data =
relu_image2d_shape["width"], relu_image2d_shape["height"]); sigmoid_in.mutable_data<uint16_t, cl::Image2D>(
auto *relu_out_data = relu_out.mutable_data<int16_t, cl::Image2D>( sigmoid_image2d_shape["width"],
relu_image2d_shape["width"], relu_image2d_shape["height"]); sigmoid_image2d_shape["height"]);
auto *sigmoid_out_data =
sigmoid_out.mutable_data<uint16_t, cl::Image2D>(
sigmoid_image2d_shape["width"],
sigmoid_image2d_shape["height"]);
// set context and kernel args // set context and kernel args
LOG(INFO) << "set context and kernel args"; LOG(INFO) << "set context and kernel args";
...@@ -676,39 +473,40 @@ TEST(relu6_image2d_fp16, compute) { ...@@ -676,39 +473,40 @@ TEST(relu6_image2d_fp16, compute) {
&(img_to_buf_context->As<OpenCLContext>())); &(img_to_buf_context->As<OpenCLContext>()));
img_to_buf_kernel->SetContext(std::move(img_to_buf_context)); img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
relu_img_kernel->SetParam(ReluParam); sigmoid_img_kernel->SetParam(SigmoidParam);
std::unique_ptr<KernelContext> relu_img_context(new KernelContext); std::unique_ptr<KernelContext> sigmoid_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo( context->As<OpenCLContext>().CopySharedTo(
&(relu_img_context->As<OpenCLContext>())); &(sigmoid_img_context->As<OpenCLContext>()));
relu_img_kernel->SetContext(std::move(relu_img_context)); sigmoid_img_kernel->SetContext(std::move(sigmoid_img_context));
// run kernels // run kernels
LOG(INFO) << "run kernel: buf_to_img_kernel"; LOG(INFO) << "run kernel: buf_to_img_kernel";
buf_to_img_kernel->Launch(); buf_to_img_kernel->Launch();
LOG(INFO) << "run kernel: relu_img_kernel"; LOG(INFO) << "run kernel: sigmoid_img_kernel";
relu_img_kernel->Launch(); sigmoid_img_kernel->Launch();
LOG(INFO) << "run kernel: img_to_buf_kernel"; LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch(); img_to_buf_kernel->Launch();
// compute ref cpu // compute ref cpu
relu_compute_ref<float>(mapped_x, x_dim, y_data_ref, 6.f); sigmoid_compute_ref<float>(mapped_x, x_dim, y_data_ref);
// result // result
#ifdef RELU6_FP16_PRINT_RESULT #ifdef SIGMOID_FP16_PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----"; LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) { for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx] std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
<< std::endl; << std::endl;
} }
#endif // RELU6_FP16_PRINT_RESULT #endif // SIGMOID_FP16_PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref) // check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < x_dim.production(); eidx++) { for (int eidx = 0; eidx < x_dim.production(); eidx++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6); EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-3);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) { if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-3) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", y_data_ref[" << " / " << x_dim.production() << ", y_data_ref["
<< eidx << "]:" << y_data_ref[eidx] << ", mapped_y[" << eidx << "]: " << y_data_ref[eidx] << ", mapped_y["
<< eidx << "]:" << mapped_y[eidx]; << eidx << "]: " << mapped_y[eidx] << ", mapped_x["
<< eidx << "]: " << mapped_x[eidx];
break; break;
} }
} }
...@@ -717,7 +515,7 @@ TEST(relu6_image2d_fp16, compute) { ...@@ -717,7 +515,7 @@ TEST(relu6_image2d_fp16, compute) {
LOG(INFO) << "free: unmap x, y"; LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data, mapped_x); TargetWrapperCL::Unmap(x_data, mapped_x);
TargetWrapperCL::Unmap(y_data, mapped_y); TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef RELU6_FP16_LOOP_TEST #ifdef SIGMOID_FP16_LOOP_TEST
} // w } // w
} // h } // h
} // c } // c
...@@ -730,17 +528,15 @@ TEST(relu6_image2d_fp16, compute) { ...@@ -730,17 +528,15 @@ TEST(relu6_image2d_fp16, compute) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// relu buffer // layout
// USE_LITE_KERNEL(relu, kOpenCL, kFloat, kNCHW, def);
// relu image2d fp32
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault); USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW); USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(relu, kOpenCL, kFloat, kImageDefault, ImageDefault);
// relu image2d fp16 // relu image2d fp16
USE_LITE_KERNEL(relu, kOpenCL, kFP16, kImageDefault, ImageDefault); USE_LITE_KERNEL(relu, kOpenCL, kFP16, kImageDefault, ImageDefault);
// relu6 image2d fp32 // relu6 image2d fp16
USE_LITE_KERNEL(relu6, kOpenCL, kFloat, kImageDefault, ImageDefault);
USE_LITE_KERNEL(relu6, kOpenCL, kFP16, kImageDefault, ImageDefault); USE_LITE_KERNEL(relu6, kOpenCL, kFP16, kImageDefault, ImageDefault);
// sigmoid image2d fp16
USE_LITE_KERNEL(sigmoid, kOpenCL, kFP16, kImageDefault, ImageDefault);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Concat kernel operating on fp32 cl::Buffer data (NCHW layout).
//
// NOTE(review): this class originally derived from
// KernelLite<TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)>
// even though every data access below is a float cl::Buffer, the build
// option is -DCL_DTYPE_float, doc() reports "kFloat" and the kernel is
// registered as (kFloat, kNCHW). The template arguments are corrected here
// to match the actual implementation and its registration.
class ConcatCompute : public KernelLite<TARGET(kOpenCL),
                                        PRECISION(kFloat),
                                        DATALAYOUT(kNCHW)> {
 public:
  using param_t = operators::ConcatParam;

  // Compiles the proper CL function ("concat2" for exactly two inputs,
  // "concat_mul" otherwise) and caches the concat geometry:
  //   pre_size_  = product of extents before `axis`
  //   post_size_ = product of extents after `axis`
  //   axis_size_ = output extent along `axis`
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    concat_param_ = param_.get_mutable<param_t>();
    if (concat_param_->x.size() == 2) {
      kernel_func_name_ = "concat2";
    } else {
      kernel_func_name_ = "concat_mul";
    }
    context.cl_context()->AddKernel(
        kernel_func_name_, "buffer/concat_kernel.cl", build_options_);

    auto axis = concat_param_->axis;
    auto inputs = concat_param_->x;
    auto out_dims = concat_param_->output->dims();
    auto* axis_tensor = concat_param_->axis_tensor;
    if (axis_tensor != nullptr) {
      // TODO(review): the axis carried by AxisTensor is not read yet; the
      // attribute value above is used instead.
      // auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
      // axis = axis_tensor_data[0];
    }
    auto in_dims = inputs[0]->dims();
    axis_size_ = out_dims[axis];
    axis_ = axis;
    for (int i = 0; i < axis; i++) {
      pre_size_ *= in_dims[i];
    }
    for (int i = axis + 1; i < in_dims.size(); i++) {
      post_size_ *= in_dims[i];
    }
    // All inputs must share rank and every extent except along `axis`.
    for (int i = 1; i < inputs.size(); i++) {
      auto dims = inputs[i]->dims();
      if (in_dims.size() != dims.size()) {
        printf("input shape must be same \n");
        return;
      }
      for (int i = 0; i < dims.size(); i++) {
        if (i != axis) {
          if (in_dims[i] != dims[i]) {
            printf("input shape must be same \n");
            return;
          }
        }
      }
    }
  }

  // Launches the buffer concat kernel. With exactly two inputs a single
  // "concat2" launch consumes both; otherwise "concat_mul" is launched once
  // per input, each copying its slice starting at offset `start` along the
  // concat axis.
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.output->dims();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_buf =
        param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
    const auto& y_dims = param.output->dims();  // useless: check dim only

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;

    auto inputs = param.x;
    int arg_idx = 0;
    auto global_work_size = cl::NDRange{axis_size_};
    int total = axis_size_ * post_size_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
    if (inputs.size() == 2) {
      auto* x_buf0 = inputs[0]->data<float, cl::Buffer>();
      auto* x_buf1 = inputs[1]->data<float, cl::Buffer>();
      auto axis0 = inputs[0]->dims()[axis_];
      int total0 = axis0 * post_size_;
      int total1 = (axis_size_ - axis0) * post_size_;
      cl_int status = kernel.setArg(arg_idx, *x_buf0);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *x_buf1);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, static_cast<int>(axis0));
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, axis_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, pre_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, post_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, total);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, total0);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, total1);
      CL_CHECK_FATAL(status);
      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
          kernel,
          cl::NullRange,
          global_work_size,
          cl::NullRange,
          nullptr,
          event_.get());
      CL_CHECK_FATAL(status);
      // Consumers synchronize on out_buf via the wait list.
      context.cl_wait_list()->emplace(out_buf, event_);
    } else {
      auto start = 0;
      for (int i = 0; i < inputs.size(); i++) {
        arg_idx = 0;
        int size = inputs[i]->dims()[axis_];
        auto* x_buf = inputs[i]->data<float, cl::Buffer>();
        global_work_size = cl::NDRange{static_cast<size_t>(size)};
        int total0 = size * post_size_;
        cl_int status = kernel.setArg(arg_idx, *x_buf);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, *out_buf);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, static_cast<int>(size));
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, pre_size_);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, post_size_);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, start);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, total);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, total0);
        CL_CHECK_FATAL(status);
        status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
            kernel,
            cl::NullRange,
            global_work_size,
            cl::NullRange,
            nullptr,
            event_.get());
        CL_CHECK_FATAL(status);
        context.cl_wait_list()->emplace(out_buf, event_);
        start += size;  // next input writes after this one's slice
      }
    }
  }

  std::string doc() { return "Concat using cl::Buffer, kFloat"; }

  int axis_size_ = 1;   // output extent along the concat axis
  int post_size_ = 1;   // product of extents after the axis
  int pre_size_ = 1;    // product of extents before the axis
  int axis_ = 1;        // concat axis
  param_t* concat_param_{nullptr};
  std::string kernel_func_name_{};
  std::string build_options_{"-DCL_DTYPE_float"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Register the buffer-based concat kernel: fp32 NCHW cl::Buffer tensors on
// the OpenCL target. "AxisTensor" is an optional int32 tensor input carrying
// the concat axis (its value is not read yet -- see PrepareForRun).
typedef paddle::lite::kernels::opencl::ConcatCompute Concat_buffer;

REGISTER_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, Concat_buffer, def)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kNCHW))})
    .BindInput("AxisTensor",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kInt32),
                                      DATALAYOUT(kNCHW))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kNCHW))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
// //
// // Licensed under the Apache License, Version 2.0 (the "License");
// // you may not use this file except in compliance with the License.
// // You may obtain a copy of the License at
// //
// // http://www.apache.org/licenses/LICENSE-2.0
// //
// // Unless required by applicable law or agreed to in writing, software
// // distributed under the License is distributed on an "AS IS" BASIS,
// // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// // See the License for the specific language governing permissions and
// // limitations under the License.
#include <gtest/gtest.h>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
namespace paddle {
namespace lite {
// Reference CPU implementation of concatenating exactly two tensors along
// `axis` into `out_data`. Output layout per pre-slice is: all of in0's
// extent along the axis, then all of in1's.
//
// BUGFIX: the original never copied from `in1` at all -- the `j >=
// in0_dim[axis]` positions of the output were left untouched and the `in1`
// parameter was unused.
template <typename dtype>
void concat2_compute_ref(const dtype *in0,
                         const dtype *in1,
                         const int axis,
                         const DDim in0_dim,
                         const DDim in1_dim,
                         const DDim out_dim,
                         dtype *out_data) {
  int pre_size = 1;
  int post_size = 1;
  for (int i = 0; i < axis; i++) {
    pre_size *= in0_dim[i];
  }
  for (int i = axis + 1; i < in0_dim.size(); i++) {
    post_size *= in0_dim[i];
  }
  int axis_size = out_dim[axis];
  for (int i = 0; i < pre_size; i++) {
    for (int j = 0; j < axis_size; j++) {
      if (j < in0_dim[axis]) {
        // First input fills the leading part of the axis.
        memcpy(out_data, in0, sizeof(dtype) * post_size);
        in0 += post_size;
      } else {
        // Second input fills the remainder.
        memcpy(out_data, in1, sizeof(dtype) * post_size);
        in1 += post_size;
      }
      out_data += post_size;
    }
  }
}
// Reference CPU implementation of concatenating N tensors along `axis`.
//
// BUGFIX: the input pointers were never advanced, so when pre_size > 1
// every pre-slice of the output re-copied the *first* slice of each input.
// `ins_data` is received by value, so the local pointers can be advanced
// safely. (NOTE(review): std::vector<const DDim> is kept to match callers,
// although a vector of const elements is non-conforming C++.)
template <typename dtype>
void concat_mul_compute_ref(std::vector<const dtype *> ins_data,
                            std::vector<const DDim> ins_dim,
                            int axis,
                            const DDim out_dim,
                            dtype *out_data) {
  int pre_size = 1;
  int post_size = 1;
  for (int i = 0; i < axis; i++) {
    pre_size *= ins_dim[0][i];
  }
  for (int i = axis + 1; i < ins_dim[0].size(); i++) {
    post_size *= ins_dim[0][i];
  }
  for (int i = 0; i < pre_size; i++) {
    for (int j = 0; j < ins_data.size(); j++) {
      // Per pre-slice, input j contributes ins_dim[j][axis] * post_size
      // contiguous elements.
      int size = post_size * ins_dim[j][axis];
      memcpy(out_data, ins_data[j], sizeof(dtype) * size);
      ins_data[j] += size;  // advance to this input's next pre-slice
      out_data += size;
    }
  }
}
// End-to-end test of the buffer-based OpenCL concat kernel: three randomly
// filled 1x2x3x4 inputs concatenated along axis 1 into a 1x6x3x4 output,
// checked against concat_mul_compute_ref within 1e-6.
TEST(opencl_concat_buffer, compute) {
  // prepare data
  const DDim x0_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim x1_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim x2_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim out_dim = DDim(std::vector<DDim::value_type>{1, 6, 3, 4});
  lite::Tensor x0, x1, x2, out, out_ref;
  x0.Resize(x0_dim);
  x1.Resize(x1_dim);
  x2.Resize(x2_dim);
  out.Resize(out_dim);
  out_ref.Resize(out_dim);

  // Allocate device buffers, then map them so the host can fill random
  // values in the range [-10, 10).
  auto *x0_data = x0.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  auto *x1_data = x1.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  auto *x2_data = x2.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-10, 10);
  auto *mapped_x0 = static_cast<float *>(
      TargetWrapperCL::Map(x0_data, 0, sizeof(float) * x0_dim.production()));
  auto *mapped_x1 = static_cast<float *>(
      TargetWrapperCL::Map(x1_data, 0, sizeof(float) * x1_dim.production()));
  auto *mapped_x2 = static_cast<float *>(
      TargetWrapperCL::Map(x2_data, 0, sizeof(float) * x2_dim.production()));
  for (int i = 0; i < x0_dim.production(); i++) {
    mapped_x0[i] = dist(engine);
  }
  for (int i = 0; i < x1_dim.production(); i++) {
    mapped_x1[i] = dist(engine);
  }
  for (int i = 0; i < x2_dim.production(); i++) {
    mapped_x2[i] = dist(engine);
  }

  // set param and kernel, then run
  operators::ConcatParam param;
  std::vector<lite::Tensor *> ins;
  ins.push_back(&x0);
  ins.push_back(&x1);
  ins.push_back(&x2);
  auto axis = 1;
  param.x = ins;
  param.output = &out;
  param.axis = axis;

  // Mirror the inputs for the CPU reference computation below.
  std::vector<const float *> ins_data;
  std::vector<const DDim> ins_dim;
  ins_data.push_back(mapped_x0);
  ins_data.push_back(mapped_x1);
  ins_data.push_back(mapped_x2);
  ins_dim.push_back(x0_dim);
  ins_dim.push_back(x1_dim);
  ins_dim.push_back(x2_dim);

  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();

  auto kernels = KernelRegistry::Global().Create(
      "concat", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
  ASSERT_FALSE(kernels.empty());
  auto kernel = std::move(kernels.front());
  kernel->SetParam(param);
  std::unique_ptr<KernelContext> concat_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(concat_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(concat_context));
  kernel->Launch();

  // The kernel is asynchronous: block on the event the kernel registered
  // for its output buffer before reading results.
  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto *out_ptr = param.output->data<float, cl::Buffer>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
    auto &event = *(it->second);
    event.wait();
  } else {
    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
  }

  // run compute ref and check
  auto *out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
  concat_mul_compute_ref<float>(ins_data, ins_dim, axis, out_dim, out_ref_data);
  auto *out_data = out.mutable_data<float, cl::Buffer>();
  auto *mapped_out = static_cast<float *>(
      TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production()));
  for (int i = 0; i < out_dim.production(); i++) {
    EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6);
  }
  TargetWrapperCL::Unmap(out_data, mapped_out);
  TargetWrapperCL::Unmap(x0_data, mapped_x0);
  TargetWrapperCL::Unmap(x1_data, mapped_x1);
  TargetWrapperCL::Unmap(x2_data, mapped_x2);
}
} // namespace lite
} // namespace paddle
// concat buffer
// Pull in the fp32 NCHW buffer concat kernel so this test binary links it.
USE_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, def);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/opencl/concat_compute.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// fp32 image-layout specialization: compiles "concat2" (two inputs) or
// "concat_mul" (general case) from image/concat_kernel.cl and caches the
// concat geometry (pre/post/axis sizes) from the first input's dims.
template <>
void ConcatCompute<PRECISION(kFloat),
                   DATALAYOUT(kImageDefault)>::PrepareForRun() {
  auto& context = ctx_->As<OpenCLContext>();
  concat_param_ = param_.get_mutable<param_t>();
  if (concat_param_->x.size() == 2) {
    kernel_func_name_ = "concat2";
  } else {
    kernel_func_name_ = "concat_mul";
  }
  context.cl_context()->AddKernel(
      kernel_func_name_, "image/concat_kernel.cl", build_options_);

  // UpdateParams<kFloat, kImageDefault>();
  auto axis = concat_param_->axis;
  auto inputs = concat_param_->x;
  auto out_dims = concat_param_->output->dims();
  auto* axis_tensor = concat_param_->axis_tensor;
  if (axis_tensor != nullptr) {
    // NOTE(review): the axis carried by AxisTensor is not read yet; the
    // attribute value above is used instead.
    // auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
    // axis = axis_tensor_data[0];
  }
  auto in_dims = inputs[0]->dims();
  axis_size_ = out_dims[axis];
  axis_ = axis;
  // pre_size_ = product of extents before the axis,
  // post_size_ = product of extents after it.
  for (int i = 0; i < axis; i++) {
    pre_size_ *= in_dims[i];
  }
  for (int i = axis + 1; i < in_dims.size(); i++) {
    post_size_ *= in_dims[i];
  }
  // All inputs must share rank and every extent except along the axis.
  for (int i = 1; i < inputs.size(); i++) {
    auto dims = inputs[i]->dims();
    // auto flag = CHECK_EQ_OR_FALSE(in_dims.size(), dims.size());
    if (in_dims.size() != dims.size()) {
      printf("input shape must be same \n");
      return;
    }
    for (int i = 0; i < dims.size(); i++) {
      if (i != axis) {
        if (in_dims[i] != dims[i]) {
          printf("input shape must be same \n");
          return;
        }
      }
    }
  }
}
// fp32 image-layout Run: launches "concat2" once for two inputs, or
// "concat_mul" once per input otherwise. `flag` selects the image mapping
// used by the CL kernel (1 for c/w axes, 0 for n/h axes) and `width` the
// matching extent.
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kImageDefault)>::Run() {
  auto& param = *param_.get_mutable<param_t>();
  const auto& x_dims = param.output->dims();
  auto image_shape = InitImageDimInfoWith(x_dims);
  auto* out_buf = param.output->mutable_data<float, cl::Image2D>(
      image_shape["width"], image_shape["height"]);
  const auto& y_dims = param.output->dims();  // useless: check dim only

  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_name_ << build_options_;

  auto inputs = param.x;
  int arg_idx = 0;
  // BUGFIX: the original initialized `width` with dims()[-1], an
  // out-of-bounds index. All supported axes overwrite `width` in the switch
  // below, so defaulting to the last extent only makes the unsupported-axis
  // path well-defined.
  int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];
  auto global_work_size =
      cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                  static_cast<cl::size_type>(image_shape["height"])};
  VLOG(4) << TargetToStr(param.output->target());
  VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
          << image_shape["height"];
  VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
          << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
  VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
          << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];

  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  int flag = 1;  // cxw
  switch (axis_) {
    case 0:
      width = x_dims[2];  // n
      flag = 0;
      break;
    case 1:
      width = x_dims[3];  // c
      break;
    case 2:
      width = x_dims[0];  // h
      flag = 0;
      break;
    case 3:
    case -1:
      width = x_dims[1];  // w
      break;
    default:
      printf("this axis: %d does not support \n", axis_);
  }
  if (inputs.size() == 2) {
    auto* x_buf0 = inputs[0]->data<float, cl::Image2D>();
    auto* x_buf1 = inputs[1]->data<float, cl::Image2D>();
    cl_int status = kernel.setArg(arg_idx, *x_buf0);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *x_buf1);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *out_buf);
    CL_CHECK_FATAL(status);
    status =
        kernel.setArg(++arg_idx, static_cast<int>(inputs[0]->dims()[axis_]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, flag);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, width);
    CL_CHECK_FATAL(status);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // Block until the launch completes (no wait-list entry on this path).
    context.cl_context()->GetCommandQueue().finish();
  } else {
    auto start = 0;
    for (int i = 0; i < inputs.size(); i++) {
      arg_idx = 0;
      auto* x_buf = inputs[i]->data<float, cl::Image2D>();
      cl_int status = kernel.setArg(arg_idx, *x_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, axis_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, start);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, flag);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, width);
      CL_CHECK_FATAL(status);
      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
          kernel,
          cl::NullRange,
          global_work_size,
          cl::NullRange,
          nullptr,
          event_.get());
      CL_CHECK_FATAL(status);
      context.cl_context()->GetCommandQueue().finish();
      start += inputs[i]->dims()[axis_];  // next input's offset on the axis
    }
  }
}
// Human-readable kernel description used for logging/debugging.
template <>
std::string ConcatCompute<PRECISION(kFloat), DATALAYOUT(kImageDefault)>::doc() {
  return "Concat using cl::Image, kFloat";
}
// fp32 buffer-layout specialization: compiles "concat2" (two inputs) or
// "concat_mul" (general case) from buffer/concat_kernel.cl and caches the
// concat geometry (pre/post/axis sizes) from the first input's dims.
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::PrepareForRun() {
  auto& context = ctx_->As<OpenCLContext>();
  concat_param_ = param_.get_mutable<param_t>();
  if (concat_param_->x.size() == 2) {
    kernel_func_name_ = "concat2";
  } else {
    kernel_func_name_ = "concat_mul";
  }
  context.cl_context()->AddKernel(
      kernel_func_name_, "buffer/concat_kernel.cl", build_options_);

  // UpdateParams<kFloat, kImageDefault>();
  auto axis = concat_param_->axis;
  auto inputs = concat_param_->x;
  auto out_dims = concat_param_->output->dims();
  auto* axis_tensor = concat_param_->axis_tensor;
  if (axis_tensor != nullptr) {
    // NOTE(review): the axis carried by AxisTensor is not read yet; the
    // attribute value above is used instead.
    // auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
    // axis = axis_tensor_data[0];
  }
  auto in_dims = inputs[0]->dims();
  axis_size_ = out_dims[axis];
  axis_ = axis;
  // pre_size_ = product of extents before the axis,
  // post_size_ = product of extents after it.
  for (int i = 0; i < axis; i++) {
    pre_size_ *= in_dims[i];
  }
  for (int i = axis + 1; i < in_dims.size(); i++) {
    post_size_ *= in_dims[i];
  }
  // All inputs must share rank and every extent except along the axis.
  for (int i = 1; i < inputs.size(); i++) {
    auto dims = inputs[i]->dims();
    if (in_dims.size() != dims.size()) {
      printf("input shape must be same \n");
      return;
    }
    for (int i = 0; i < dims.size(); i++) {
      if (i != axis) {
        if (in_dims[i] != dims[i]) {
          printf("input shape must be same \n");
          return;
        }
      }
    }
  }
}
// fp32 buffer-layout Run: with two inputs a single "concat2" launch handles
// both; otherwise "concat_mul" is launched once per input, each copying its
// slice starting at offset `start` along the concat axis. Synchronization
// with consumers goes through the cl_wait_list keyed by out_buf.
template <>
void ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::Run() {
  auto& param = *param_.get_mutable<param_t>();
  const auto& x_dims = param.output->dims();
  auto image_shape = InitImageDimInfoWith(x_dims);
  auto* out_buf =
      param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  const auto& y_dims = param.output->dims();  // useless: check dim only

  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_name_ << build_options_;

  auto inputs = param.x;
  int arg_idx = 0;
  auto global_work_size = cl::NDRange{axis_size_};
  int total = axis_size_ * post_size_;
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  if (inputs.size() == 2) {
    auto* x_buf0 = inputs[0]->data<float, cl::Buffer>();
    auto* x_buf1 = inputs[1]->data<float, cl::Buffer>();
    auto axis0 = inputs[0]->dims()[axis_];
    // total0/total1: element counts contributed by each input per launch.
    int total0 = axis0 * post_size_;
    int total1 = (axis_size_ - axis0) * post_size_;
    cl_int status = kernel.setArg(arg_idx, *x_buf0);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *x_buf1);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *out_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<int>(axis0));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, axis_size_);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, pre_size_);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, post_size_);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, total);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, total0);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, total1);
    CL_CHECK_FATAL(status);
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    context.cl_wait_list()->emplace(out_buf, event_);
  } else {
    auto start = 0;
    for (int i = 0; i < inputs.size(); i++) {
      arg_idx = 0;
      int size = inputs[i]->dims()[axis_];
      auto* x_buf = inputs[i]->data<float, cl::Buffer>();
      global_work_size = cl::NDRange{static_cast<size_t>(size)};
      int total0 = size * post_size_;
      cl_int status = kernel.setArg(arg_idx, *x_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_buf);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, static_cast<int>(size));
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, pre_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, post_size_);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, start);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, total);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, total0);
      CL_CHECK_FATAL(status);
      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
          kernel,
          cl::NullRange,
          global_work_size,
          cl::NullRange,
          nullptr,
          event_.get());
      CL_CHECK_FATAL(status);
      context.cl_wait_list()->emplace(out_buf, event_);
      start += size;  // next input writes after this one's slice
    }
  }
}
// Human-readable kernel description used for logging/debugging.
template <>
std::string ConcatCompute<PRECISION(kFloat), DATALAYOUT(kNCHW)>::doc() {
  return "Concat using cl::Buffer, kFloat";
}
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Aliases for the two fp32 concat specializations defined above.
typedef paddle::lite::kernels::opencl::ConcatCompute<PRECISION(kFloat),
                                                     DATALAYOUT(kNCHW)>
    Concat_buffer;

typedef paddle::lite::kernels::opencl::ConcatCompute<PRECISION(kFloat),
                                                     DATALAYOUT(kImageDefault)>
    Concat_image;

// Register the image-layout fp32 kernel. "AxisTensor" is an optional int32
// tensor input (its value is not read yet -- see PrepareForRun).
REGISTER_LITE_KERNEL(
    concat, kOpenCL, kFloat, kImageDefault, Concat_image, ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kImageDefault))})
    .BindInput("AxisTensor",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kInt32),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();

// Buffer-layout registration is intentionally kept disabled.
// REGISTER_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, Concat_buffer, def)
//     .BindInput("X",
//                {LiteType::GetTensorTy(TARGET(kOpenCL),
//                                       PRECISION(kFloat),
//                                       DATALAYOUT(kNCHW))})
//     .BindInput("AxisTensor",
//                {LiteType::GetTensorTy(TARGET(kOpenCL),
//                                       PRECISION(kInt32),
//                                       DATALAYOUT(kNCHW))})
//     .BindOutput("Out",
//                 {LiteType::GetTensorTy(TARGET(kOpenCL),
//                                        PRECISION(kFloat),
//                                        DATALAYOUT(kNCHW))})
//     .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Concat kernel operating on fp16 cl::Image2D data (half values handled as
// uint16_t on the host side; the CL program is built with -DCL_DTYPE_half).
class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
                                             PRECISION(kFP16),
                                             DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ConcatParam;

  // Compiles "concat2" (two inputs) or "concat_mul" (general case) from
  // image/concat_kernel.cl and caches the concat geometry:
  //   pre_size_  = product of extents before `axis`
  //   post_size_ = product of extents after `axis`
  //   axis_size_ = output extent along `axis`
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    concat_param_ = param_.get_mutable<param_t>();
    if (concat_param_->x.size() == 2) {
      kernel_func_name_ = "concat2";
    } else {
      kernel_func_name_ = "concat_mul";
    }
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/concat_kernel.cl", build_options_);

    auto axis = concat_param_->axis;
    auto inputs = concat_param_->x;
    auto out_dims = concat_param_->output->dims();
    auto* axis_tensor = concat_param_->axis_tensor;
    if (axis_tensor != nullptr) {
      // TODO(review): the axis carried by AxisTensor is not read yet; the
      // attribute value above is used instead.
      // auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
      // axis = axis_tensor_data[0];
    }
    auto in_dims = inputs[0]->dims();
    axis_size_ = out_dims[axis];
    axis_ = axis;
    for (int i = 0; i < axis; i++) {
      pre_size_ *= in_dims[i];
    }
    for (int i = axis + 1; i < in_dims.size(); i++) {
      post_size_ *= in_dims[i];
    }
    // All inputs must share rank and every extent except along `axis`.
    for (int i = 1; i < inputs.size(); i++) {
      auto dims = inputs[i]->dims();
      // auto flag = CHECK_EQ_OR_FALSE(in_dims.size(), dims.size());
      if (in_dims.size() != dims.size()) {
        printf("input shape must be same \n");
        return;
      }
      for (int i = 0; i < dims.size(); i++) {
        if (i != axis) {
          if (in_dims[i] != dims[i]) {
            printf("input shape must be same \n");
            return;
          }
        }
      }
    }
  }

  // Launches "concat2" once for two inputs, or "concat_mul" once per input
  // otherwise. `flag` selects the image mapping used by the CL kernel
  // (1 for c/w axes, 0 for n/h axes) and `width` the matching extent.
  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.output->dims();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_buf = param.output->mutable_data<uint16_t, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.output->dims();  // useless: check dim only

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;

    auto inputs = param.x;
    int arg_idx = 0;
    // BUGFIX: the original initialized `width` with dims()[-1], an
    // out-of-bounds index. All supported axes overwrite `width` in the
    // switch below, so defaulting to the last extent only makes the
    // unsupported-axis path well-defined.
    int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];
    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                    static_cast<cl::size_type>(image_shape["height"])};
    VLOG(4) << TargetToStr(param.output->target());
    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
            << image_shape["height"];
    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];

    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
    int flag = 1;  // cxw
    switch (axis_) {
      case 0:
        width = x_dims[2];  // n
        flag = 0;
        break;
      case 1:
        width = x_dims[3];  // c
        break;
      case 2:
        width = x_dims[0];  // h
        flag = 0;
        break;
      case 3:
      case -1:
        width = x_dims[1];  // w
        break;
      default:
        printf("this axis: %d does not support \n", axis_);
    }
    if (inputs.size() == 2) {
      auto* x_buf0 = inputs[0]->data<uint16_t, cl::Image2D>();
      auto* x_buf1 = inputs[1]->data<uint16_t, cl::Image2D>();
      cl_int status = kernel.setArg(arg_idx, *x_buf0);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *x_buf1);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_buf);
      CL_CHECK_FATAL(status);
      status =
          kernel.setArg(++arg_idx, static_cast<int>(inputs[0]->dims()[axis_]));
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, flag);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, width);
      CL_CHECK_FATAL(status);
      status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
          kernel,
          cl::NullRange,
          global_work_size,
          cl::NullRange,
          nullptr,
          event_.get());
      CL_CHECK_FATAL(status);
      // Block until the launch completes (no wait-list entry on this path).
      context.cl_context()->GetCommandQueue().finish();
    } else {
      auto start = 0;
      for (int i = 0; i < inputs.size(); i++) {
        arg_idx = 0;
        auto* x_buf = inputs[i]->data<uint16_t, cl::Image2D>();
        cl_int status = kernel.setArg(arg_idx, *x_buf);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, *out_buf);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, axis_size_);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, start);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, flag);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, width);
        CL_CHECK_FATAL(status);
        status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
            kernel,
            cl::NullRange,
            global_work_size,
            cl::NullRange,
            nullptr,
            event_.get());
        CL_CHECK_FATAL(status);
        context.cl_context()->GetCommandQueue().finish();
        start += inputs[i]->dims()[axis_];  // next input's axis offset
      }
    }
  }

  std::string doc() { return "Concat using cl::Image, kFP16"; }

  int axis_size_ = 1;   // output extent along the concat axis
  int post_size_ = 1;   // product of extents after the axis
  int pre_size_ = 1;    // product of extents before the axis
  int axis_ = 1;        // concat axis
  param_t* concat_param_{nullptr};
  std::string kernel_func_name_{};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Register the image-layout fp16 concat kernel. "AxisTensor" is an optional
// int32 tensor input (its value is not read yet -- see PrepareForRun).
typedef paddle::lite::kernels::opencl::ConcatComputeImage Concat_image;

REGISTER_LITE_KERNEL(
    concat, kOpenCL, kFP16, kImageDefault, Concat_image, ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindInput("AxisTensor",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kInt32),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
...@@ -18,6 +18,9 @@ ...@@ -18,6 +18,9 @@
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h" #include "lite/kernels/opencl/image_helper.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -73,106 +76,10 @@ void concat_mul_compute_ref(std::vector<const dtype *> ins_data, ...@@ -73,106 +76,10 @@ void concat_mul_compute_ref(std::vector<const dtype *> ins_data,
} }
} }
} }
#if 0 // concat_buffer
// Disabled buffer-path concat test (the caller wraps this block in
// `#if 0`): concatenates three 1x2x3x4 float tensors along axis 1 using
// the OpenCL buffer concat kernel, then checks the mapped GPU result
// against a CPU reference with a 1e-6 tolerance.
TEST(opencl_concat_buffer, compute) {
  // prepare data
  const DDim x0_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim x1_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim x2_dim = DDim(std::vector<DDim::value_type>{1, 2, 3, 4});
  const DDim out_dim = DDim(std::vector<DDim::value_type>{1, 6, 3, 4});
  lite::Tensor x0, x1, x2, out, out_ref;
  x0.Resize(x0_dim);
  x1.Resize(x1_dim);
  x2.Resize(x2_dim);
  out.Resize(out_dim);
  out_ref.Resize(out_dim);

  // Allocate device buffers, then map them so the host can fill them
  // with uniform random values in [-10, 10).
  auto *x0_data = x0.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  auto *x1_data = x1.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  auto *x2_data = x2.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-10, 10);
  auto *mapped_x0 = static_cast<float *>(
      TargetWrapperCL::Map(x0_data, 0, sizeof(float) * x0_dim.production()));
  auto *mapped_x1 = static_cast<float *>(
      TargetWrapperCL::Map(x1_data, 0, sizeof(float) * x1_dim.production()));
  auto *mapped_x2 = static_cast<float *>(
      TargetWrapperCL::Map(x2_data, 0, sizeof(float) * x2_dim.production()));
  for (int i = 0; i < x0_dim.production(); i++) {
    mapped_x0[i] = dist(engine);
  }
  for (int i = 0; i < x1_dim.production(); i++) {
    mapped_x1[i] = dist(engine);
  }
  for (int i = 0; i < x2_dim.production(); i++) {
    mapped_x2[i] = dist(engine);
  }

  // set param and kernel, then run
  operators::ConcatParam param;
  std::vector<lite::Tensor *> ins;
  ins.push_back(&x0);
  ins.push_back(&x1);
  ins.push_back(&x2);
  auto axis = 1;
  param.x = ins;
  param.output = &out;
  param.axis = axis;

  // NOTE(review): std::vector<const DDim> is ill-formed per the C++
  // standard (allocators require a non-const value type) and newer
  // standard libraries reject it; consider std::vector<DDim>.
  std::vector<const float *> ins_data;
  std::vector<const DDim> ins_dim;
  ins_data.push_back(mapped_x0);
  ins_data.push_back(mapped_x1);
  ins_data.push_back(mapped_x2);
  ins_dim.push_back(x0_dim);
  ins_dim.push_back(x1_dim);
  ins_dim.push_back(x2_dim);

  // Create the OpenCL context and the buffer-layout concat kernel.
  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();
  auto kernels = KernelRegistry::Global().Create(
      "concat", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
  ASSERT_FALSE(kernels.empty());
  auto kernel = std::move(kernels.front());
  kernel->SetParam(param);
  std::unique_ptr<KernelContext> concat_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(concat_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(concat_context));
  kernel->Launch();

  // Block on the event registered for the output buffer so the kernel
  // has finished before the result is read back.
  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto *out_ptr = param.output->data<float, cl::Buffer>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
    auto &event = *(it->second);
    event.wait();
  } else {
    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
  }

  // run compute ref and check
  auto *out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
  concat_mul_compute_ref<float>(ins_data, ins_dim, axis, out_dim, out_ref_data);
  auto *out_data = out.mutable_data<float, cl::Buffer>();
  auto *mapped_out = static_cast<float *>(
      TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production()));
  for (int i = 0; i < out_dim.production(); i++) {
    EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6);
  }

  // Unmap everything that was mapped above.
  TargetWrapperCL::Unmap(out_data, mapped_out);
  TargetWrapperCL::Unmap(x0_data, mapped_x0);
  TargetWrapperCL::Unmap(x1_data, mapped_x1);
  TargetWrapperCL::Unmap(x2_data, mapped_x2);
}
#endif // concat_buffer
// #define LOOP_TEST // #define LOOP_TEST
// #define PRINT_RESULT // #define PRINT_RESULT
TEST(concat_image2d_fp32, compute) { TEST(concat_image2d, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> concat(img) -> " LOG(INFO) << "main steps of test: host -> layout(buf2img) -> concat(img) -> "
"layout(img2buf) " "layout(img2buf) "
"-> host"; "-> host";
...@@ -209,7 +116,7 @@ TEST(concat_image2d_fp32, compute) { ...@@ -209,7 +116,7 @@ TEST(concat_image2d_fp32, compute) {
auto concat_img_kernels = auto concat_img_kernels =
KernelRegistry::Global().Create("concat", KernelRegistry::Global().Create("concat",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty()); ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels1.empty()); ASSERT_FALSE(buf_to_img_kernels1.empty());
...@@ -284,14 +191,18 @@ TEST(concat_image2d_fp32, compute) { ...@@ -284,14 +191,18 @@ TEST(concat_image2d_fp32, compute) {
for (int i = 0; i < out_dim.production(); ++i) { for (int i = 0; i < out_dim.production(); ++i) {
mapped_y[i] = static_cast<int>(0); mapped_y[i] = static_cast<int>(0);
} }
auto *concat_in_data0 = concat_in0.mutable_data<float, cl::Image2D>( auto *concat_in_data0 =
concat_in0.mutable_data<uint16_t, cl::Image2D>(
concat_image2d_shape_in0["width"], concat_image2d_shape_in0["width"],
concat_image2d_shape_in0["height"]); concat_image2d_shape_in0["height"]);
auto *concat_in_data1 = concat_in1.mutable_data<float, cl::Image2D>( auto *concat_in_data1 =
concat_in1.mutable_data<uint16_t, cl::Image2D>(
concat_image2d_shape_in1["width"], concat_image2d_shape_in1["width"],
concat_image2d_shape_in1["height"]); concat_image2d_shape_in1["height"]);
auto *concat_out_data = concat_out.mutable_data<float, cl::Image2D>( auto *concat_out_data =
concat_image2d_shape["width"], concat_image2d_shape["height"]); concat_out.mutable_data<uint16_t, cl::Image2D>(
concat_image2d_shape["width"],
concat_image2d_shape["height"]);
// set context and kernel args // set context and kernel args
LOG(INFO) << "set context and kernel args"; LOG(INFO) << "set context and kernel args";
...@@ -347,22 +258,35 @@ TEST(concat_image2d_fp32, compute) { ...@@ -347,22 +258,35 @@ TEST(concat_image2d_fp32, compute) {
#ifdef PRINT_RESULT #ifdef PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----"; LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < out_dim.production(); ++eidx) { for (int eidx = 0; eidx < out_dim.production(); ++eidx) {
std::cout << mapped_x0[eidx] << ", " << mapped_x1[eidx] << " -> " std::cout << "x0[" << eidx << "]:" << mapped_x0[eidx] << ",\t x1["
<< mapped_y[eidx] << std::endl; << eidx << "]:" << mapped_x1[eidx] << " -> y[" << eidx
<< "]:" << mapped_y[eidx] << "\t, y_ref[" << eidx
<< "]:" << y_data_ref[eidx] << ",\t IS_DIFF_PASSED:"
<< IS_DIFF_PASSED(
y_data_ref[eidx], mapped_y[eidx], FP16_MAX_DIFF)
<< std::endl;
} }
#endif // PRINT_RESULT #endif // PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref) // check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < out_dim.production(); eidx++) { for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6); auto abs_diff = abs(y_data_ref[i] - mapped_y[i]);
if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) { auto relative_diff =
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx COMPUTE_RELATIVE_DIFF(y_data_ref[i], mapped_y[i]);
<< " / " << x0_dim.production() << ", y_data_ref[" EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) ||
<< eidx << "]:" << y_data_ref[eidx] << ", mapped_y[" (abs_diff <= FP16_MAX_DIFF),
<< eidx << "]:" << mapped_y[eidx]; true);
if ((relative_diff > FP16_MAX_DIFF) &&
(abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << " mapped_y[" << i
<< "]:" << mapped_y[i] << " y_data_ref[" << i
<< "]:" << y_data_ref[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
break; break;
} }
} }
// free // free
LOG(INFO) << "free: unmap x, y"; LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data0, mapped_x0); TargetWrapperCL::Unmap(x_data0, mapped_x0);
...@@ -382,9 +306,9 @@ TEST(concat_image2d_fp32, compute) { ...@@ -382,9 +306,9 @@ TEST(concat_image2d_fp32, compute) {
} // namespace paddle } // namespace paddle
// concat buffer // concat buffer
// USE_LITE_KERNEL(concat, kOpenCL, kFloat, kNCHW, def); // USE_LITE_KERNEL(concat, kOpenCL, kFP16, kNCHW, def);
// concat image2d fp32 // concat image2d fp32
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault); USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW); USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(concat, kOpenCL, kFloat, kImageDefault, ImageDefault); USE_LITE_KERNEL(concat, kOpenCL, kFP16, kImageDefault, ImageDefault);
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "lite/kernels/opencl/conv_compute.h" #include "lite/kernels/opencl/conv_buffer_compute.h"
#include <sstream> #include <sstream>
...@@ -1431,50 +1431,14 @@ void ConvImageCompute::Run() { (this->*impl_)(); } ...@@ -1431,50 +1431,14 @@ void ConvImageCompute::Run() { (this->*impl_)(); }
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// REGISTER_LITE_KERNEL(conv2d,
// kOpenCL,
// kFloat,
// kNCHW,
// paddle::lite::kernels::opencl::ConvCompute,
// def)
// .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
REGISTER_LITE_KERNEL(conv2d, REGISTER_LITE_KERNEL(conv2d,
kOpenCL, kOpenCL,
kFloat, kFloat,
kImageDefault, kNCHW,
paddle::lite::kernels::opencl::ConvImageCompute, paddle::lite::kernels::opencl::ConvCompute,
image2d) def)
.BindInput("Input", .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
{LiteType::GetTensorTy(TARGET(kOpenCL), .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))})
PRECISION(kFloat), .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))})
DATALAYOUT(kImageDefault))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
REGISTER_LITE_KERNEL(depthwise_conv2d,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::ConvImageCompute,
image2d)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
...@@ -58,34 +58,6 @@ class ConvCompute ...@@ -58,34 +58,6 @@ class ConvCompute
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ConvParam;
using kernel_t = void (ConvImageCompute::*)();
void PrepareForRun() override;
void Run() override;
private:
void Conv2d1x1();
void Conv2d3x3();
void Conv2d5x5();
void Conv2d7x7();
void DepthwiseConv2d3x3s1();
void DepthwiseConv2d3x3();
void DepthwiseConv2d();
kernel_t impl_;
std::vector<std::string> kernel_func_names_{};
std::vector<std::string> kernel_func_paths_{};
std::vector<std::string> build_options_{};
std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor filter_gpu_image_;
Tensor bias_gpu_image_;
};
} // namespace opencl } // namespace opencl
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
......
...@@ -167,7 +167,6 @@ void PrintData(std::string name, ...@@ -167,7 +167,6 @@ void PrintData(std::string name,
} }
// buffer // buffer
#if 0
// #define PRINT_RESULT // #define PRINT_RESULT
#define LOOP_TEST #define LOOP_TEST
TEST(conv2d, compute_conv2d_1x1) { TEST(conv2d, compute_conv2d_1x1) {
...@@ -625,9 +624,8 @@ TEST(conv2d, compute_conv2d_gemm) { ...@@ -625,9 +624,8 @@ TEST(conv2d, compute_conv2d_gemm) {
} // batch_size } // batch_size
#endif #endif
} }
#endif
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kNCHW, def); USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kNCHW, def);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/opencl/conv_image_compute.h"
#include <sstream>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
/* image kernel*/
// Selects the conv image2d kernel variant (1x1 / depthwise / 3x3 / 5x5 /
// 7x7) from the filter geometry, converts the CPU filter (and optional
// bias) buffers into FP16 (half, stored as uint16_t) cl::Image2D data,
// assembles the activation/bias compile-time defines and compiles the
// selected OpenCL kernel. Must run before the first Run().
void ConvImageCompute::PrepareForRun() {
  const auto& param = this->Param<param_t>();
  auto x_dims = param.x->dims();
  auto filter_dims = param.filter->dims();
  auto output_dims = param.output->dims();

  float* filter_cpu = param.filter->mutable_data<float>();
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);

  int kernel_h = filter_dims[2];  // filter layout is OIHW
  int kernel_w = filter_dims[3];
  auto paddings = *param.paddings;
  auto dilations = *param.dilations;
  int stride_h = param.strides[0];
  int stride_w = param.strides[1];
  int pad_h = paddings[0];
  int pad_w = paddings[2];
  int groups = param.groups;
  bool relu_fused = param.fuse_relu;

  bool pad_equal =
      ((paddings[0] == paddings[1]) && (paddings[1] == paddings[2]) &&
       (paddings[2] == paddings[3]));
  bool stride_equal = stride_h == stride_w;
  bool dilation_equal = dilations[0] == dilations[1];
  // Only symmetric padding, square stride and square dilation are
  // supported by the image kernels below.
  CHECK(pad_equal && stride_equal && dilation_equal);

  VLOG(3) << "Is relu fused? / " << (relu_fused ? "Yes" : "No");
  // Fixed: the last field used to log kernel_h twice instead of kernel_w.
  VLOG(3) << "groups:" << groups << " stride_h:" << stride_h
          << " stride_w:" << stride_w << " pad_h:" << pad_h
          << " pad_w:" << pad_w << " kernel_h:" << kernel_h
          << " kernel_w:" << kernel_w;
  VLOG(3) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2]
          << " " << x_dims[3];
  VLOG(3) << "output_dims:" << output_dims[0] << " " << output_dims[1] << " "
          << output_dims[2] << " " << output_dims[3];
  VLOG(3) << "filter_dims:" << filter_dims[0] << " " << filter_dims[1] << " "
          << filter_dims[2] << " " << filter_dims[3];

  if (kernel_h == 1 && kernel_w == 1) {
    // conv2d_1x1: the "simple" variant requires input channels % 4 == 0.
    if (param.x->dims()[1] % 4 == 0) {
      kernel_func_names_.push_back("conv2d_1x1_simple");
    } else {
      kernel_func_names_.push_back("conv2d_1x1");
    }
    kernel_func_paths_.push_back("image/conv2d_1x1_kernel.cl");

    // Convert the CPU filter buffer to an FP16 image.
    CLImageConverterNWBlock converter;
    const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
    std::vector<uint16_t> filter_image_v(filter_image_dims[0] *
                                         filter_image_dims[1] * 4);  // 4 : RGBA
    converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
    filter_gpu_image_.mutable_data<uint16_t, cl::Image2D>(
        filter_image_dims[0], filter_image_dims[1], filter_image_v.data());

    impl_ = &ConvImageCompute::Conv2d1x1;
#if 1  // TODO(ysh329): enable general dwconv
  } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1]) {
#else   // TODO(ysh329): remove dwconv3x3s1 and dwconv3x3 temporarily, need fix
  } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] &&
             kernel_h == 3 && kernel_w == 3 && groups > 1) {
    // depth_conv2d_3x3s1, depth_conv2d_3x3
    if (stride_h == 1 && dilations[0] == 1) {
      kernel_func_names_.push_back("depth_conv2d_3x3s1");
      impl_ = &ConvImageCompute::DepthwiseConv2d3x3s1;
    } else {
      kernel_func_names_.push_back("depth_conv2d_3x3");
      impl_ = &ConvImageCompute::DepthwiseConv2d3x3;
    }
    kernel_func_paths_.push_back("image/depthwise_conv2d_kernel.cl");

    CLImageConverterNWBlock converter;
    const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
    std::vector<uint16_t> filter_image_v(filter_image_dims[0] *
                                         filter_image_dims[1] * 4);  // 4 : RGBA
    converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
    filter_gpu_image_.mutable_data<uint16_t, cl::Image2D>(
        filter_image_dims[0], filter_image_dims[1], filter_image_v.data());

  } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] &&
             kernel_h != 3) {
#endif
    // depth_conv2d (generic depthwise)
    kernel_func_names_.push_back("depth_conv2d");
    kernel_func_paths_.push_back("image/depthwise_conv2d_basic_kernel.cl");

    CLImageConverterNWBlock converter;
    const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
    std::vector<uint16_t> filter_image_v(filter_image_dims[0] *
                                         filter_image_dims[1] * 4);  // 4 : RGBA
    converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
    filter_gpu_image_.mutable_data<uint16_t, cl::Image2D>(
        filter_image_dims[0], filter_image_dims[1], filter_image_v.data());

    impl_ = &ConvImageCompute::DepthwiseConv2d;
  } else if (kernel_h == 3 && kernel_w == 3) {
    // conv2d_3x3. Fixed: the condition used to test `kernel_h == 3` twice,
    // so a 3xN (N != 3) filter would wrongly select the 3x3 kernel.
    kernel_func_names_.push_back("conv2d_3x3");
    kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");

    CLImageConverterFolder converter;
    const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
    std::vector<uint16_t> filter_image_v(filter_image_dims[0] *
                                         filter_image_dims[1] * 4);  // 4 : RGBA
    converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
    filter_gpu_image_.mutable_data<uint16_t, cl::Image2D>(
        filter_image_dims[0], filter_image_dims[1], filter_image_v.data());

    impl_ = &ConvImageCompute::Conv2d3x3;
  } else if (kernel_h == 5 && kernel_w == 5) {
    // conv2d_5x5
    kernel_func_names_.push_back("conv2d_5x5");
    kernel_func_paths_.push_back("image/conv2d_5x5_kernel.cl");

    CLImageConverterFolder converter;
    const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
    std::vector<uint16_t> filter_image_v(filter_image_dims[0] *
                                         filter_image_dims[1] * 4);  // 4 : RGBA
    converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
    filter_gpu_image_.mutable_data<uint16_t, cl::Image2D>(
        filter_image_dims[0], filter_image_dims[1], filter_image_v.data());

    impl_ = &ConvImageCompute::Conv2d5x5;
  } else if (kernel_h == 7 && kernel_w == 7) {
    // conv2d_7x7
    kernel_func_names_.push_back("conv2d_7x7");
    kernel_func_paths_.push_back("image/conv2d_7x7_kernel.cl");

    CLImageConverterFolder converter;
    const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
    std::vector<uint16_t> filter_image_v(filter_image_dims[0] *
                                         filter_image_dims[1] * 4);  // 4 : RGBA
    converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
    filter_gpu_image_.mutable_data<uint16_t, cl::Image2D>(
        filter_image_dims[0], filter_image_dims[1], filter_image_v.data());

    impl_ = &ConvImageCompute::Conv2d7x7;
  } else {
    LOG(FATAL) << "conv image compute not support this condition yet! ";
  }
  VLOG(1) << "kernel_func_names_[0]:" << kernel_func_names_[0]
          << " kernel_func_paths_[0]:" << kernel_func_paths_[0];

  // All image kernels are compiled for half precision.
  std::string build_options_single(" -DCL_DTYPE_half");
  // relu options
  if (relu_fused) {
    build_options_single += " -DRELU";
  } else if (param.activation_param.active_type ==
             lite_api::ActivationType::kRelu6) {
    build_options_single += " -DRELU6";
  } else {
    // do nothing, may add more activation fuse
  }
  // bias options: element-wise when bias dims equal output dims,
  // otherwise per-channel.
  const bool has_bias = param.bias != nullptr;
  const bool is_element_wise_bias =
      has_bias && param.output->dims() == param.bias->dims();
  if (has_bias) {
    build_options_single +=
        is_element_wise_bias ? " -DBIASE_ELE" : " -DBIASE_CH";

    // convert cpu buffer bias --> gpu image
    CLImageConverterFolder bias_converter;
    const DDim& bias_image_dims =
        bias_converter.InitImageDimInfoWith(param.bias->dims());
    std::vector<uint16_t> bias_image_v(bias_image_dims[0] * bias_image_dims[1] *
                                       4);
    float* bias_cpu_data = param.bias->mutable_data<float>();
    bias_converter.NCHWToImage(
        bias_cpu_data, bias_image_v.data(), param.bias->dims());
    bias_gpu_image_.mutable_data<uint16_t, cl::Image2D>(
        bias_image_dims[0], bias_image_dims[1], bias_image_v.data());
    // convert cpu buffer bias --> gpu image --- end ----
  }

  build_options_.push_back(build_options_single);
  for (size_t i = 0; i < kernel_func_names_.size(); i++) {
    context.cl_context()->AddKernel(
        kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]);
  }
}
// Runs the 1x1 convolution image kernel prepared in PrepareForRun():
// sets all kernel arguments (input/filter/optional bias images, output
// image, geometry scalars), enqueues the NDRange and registers the
// completion event on the wait list for the output image.
// Fixed: removed the dead `const cl::Buffer* bias_buf` local -- the bias
// is always passed as a cl::Image2D.
void ConvImageCompute::Conv2d1x1() {
  const auto& param = *param_.get_mutable<param_t>();
  auto input_dims = param.x->dims();
  auto paddings = *param.paddings;
  auto strides = param.strides;
  auto* input_image = param.x->data<uint16_t, cl::Image2D>();
  auto* filter_image = filter_gpu_image_.data<uint16_t, cl::Image2D>();
  auto filter_dims = param.filter->dims();
  auto output_dims = param.output->dims();

  int input_width = input_dims[3];
  int input_height = input_dims[2];
  int output_width = output_dims[3];
  int output_height = output_dims[2];
  auto out_image_shape = InitImageDimInfoWith(output_dims);
  auto* out_image = param.output->mutable_data<uint16_t, cl::Image2D>(
      out_image_shape["width"], out_image_shape["height"]);

  const bool has_bias = param.bias != nullptr;
  const bool is_element_wise_bias =
      has_bias && param.output->dims() == param.bias->dims();
  int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
               static_cast<int>(paddings[0]);

  // calc input_c_block
  auto input_image_shape = InitImageDimInfoWith(input_dims);
  int input_c_block = input_image_shape["width"] / input_dims[3];
  int input_c = input_dims[1];
  auto dilations = *param.dilations;

  const std::vector<size_t>& default_work_size =
      DefaultWorkSize(output_dims,
                      DDim(std::vector<DDim::value_type>{
                          static_cast<int64_t>(out_image_shape["width"]),
                          static_cast<int64_t>(out_image_shape["height"])}));
  int c_block = default_work_size[0];
  int w = default_work_size[1];
  int nh = default_work_size[2];

  VLOG(4) << "============ conv2d_1x1 params ============";
  VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
          << input_image_shape["height"];
  VLOG(4) << "input_c_block: " << input_c_block;
  VLOG(4) << "input_c: " << input_c;
  VLOG(4) << "input_image: " << input_image;
  VLOG(4) << "filter_dims: " << filter_dims;
  VLOG(4) << "filter_image: " << filter_image;
  VLOG(4) << "output_dims: " << output_dims;
  VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
          << out_image_shape["height"];
  VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
  VLOG(4) << "has bias: " << has_bias;
  VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
  VLOG(4) << "strides: " << strides[0] << "," << strides[1];
  VLOG(4) << "offset: " << offset;
  VLOG(4) << "dilations.size : " << dilations.size();
  VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
  VLOG(4) << "default work size{c_block, w, nh}: "
          << "{" << c_block << ", " << w << ", " << nh << ""
          << "}";

  // The 1x1 kernel only supports square stride/dilation/padding.
  CHECK_GE(dilations.size(), 2);
  CHECK(dilations[0] == dilations[1]);
  CHECK_GE(input_dims.size(), 4);
  CHECK_GE(paddings.size(), 2);
  CHECK(paddings[0] == paddings[1]);
  CHECK_GE(strides.size(), 2);
  CHECK(strides[0] == strides[1]);

  // Bias (if any) was uploaded as an image in PrepareForRun().
  const cl::Image2D* bias_image = nullptr;
  if (has_bias) {
    bias_image = bias_gpu_image_.data<uint16_t, cl::Image2D>();
  }

  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  std::stringstream kernel_key;
  kernel_key << kernel_func_names_[0] << build_options_[0];
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());

  // The kernel processes 4 output columns per work item.
  int maped_w = maptofactor(w, 4);
  VLOG(4) << "kernel_key: " << kernel_key.str();
  VLOG(4) << "kernel ready ... " << kernel_key.str();
  VLOG(4) << "maped_w: " << maped_w;
  VLOG(4) << "hasbias: " << has_bias;

  cl_int status;
  int arg_idx = 0;
  status = kernel.setArg(arg_idx, c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, maped_w);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, nh);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *input_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *filter_image);
  CL_CHECK_FATAL(status);
  // The bias argument exists only when the kernel was built with a
  // -DBIASE_* define; the argument order below must match the .cl file.
  if (has_bias) {
    status = kernel.setArg(++arg_idx, *bias_image);
    CL_CHECK_FATAL(status);
  }
  status = kernel.setArg(++arg_idx, *out_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, strides[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, offset);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_c);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, dilations[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_height);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_height);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, w);
  CL_CHECK_FATAL(status);

  auto global_work_size =
      cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
                  static_cast<size_t>(maped_w),
                  static_cast<size_t>(default_work_size.data()[2])};

  VLOG(4) << "out_image: " << out_image;
  VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
          << global_work_size[1] << "," << global_work_size[2] << "}";

  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      cl::NullRange,
      nullptr,
      event_.get());
  CL_CHECK_FATAL(status);
  // Register the event so downstream consumers of the output image can
  // synchronize on kernel completion.
  context.cl_wait_list()->emplace(out_image, event_);
}
// Runs the 3x3 convolution image kernel prepared in PrepareForRun():
// recomputes an effective group count, sets all kernel arguments
// (input/filter/optional bias images, output image, geometry scalars),
// enqueues the NDRange and registers the completion event for the
// output image on the wait list.
void ConvImageCompute::Conv2d3x3() {
  const auto& param = *param_.get_mutable<param_t>();
  auto input_dims = param.x->dims();
  auto paddings = *param.paddings;
  auto strides = param.strides;
  auto* input_image = param.x->data<uint16_t, cl::Image2D>();
  auto* filter_image = filter_gpu_image_.data<uint16_t, cl::Image2D>();
  auto filter_dims = param.filter->dims();
  auto output_dims = param.output->dims();

  int input_width = input_dims[3];
  int input_height = input_dims[2];
  int input_channel = input_dims[1];
  int output_width = output_dims[3];
  int output_height = output_dims[2];
  int output_channel = output_dims[1];
  int filter_width = filter_dims[3];
  int filter_height = filter_dims[2];
  int filter_channel = filter_dims[1];
  auto out_image_shape = InitImageDimInfoWith(output_dims);
  auto* out_image = param.output->mutable_data<uint16_t, cl::Image2D>(
      out_image_shape["width"], out_image_shape["height"]);

  const bool has_bias = param.bias != nullptr;
  const bool is_element_wise_bias =
      has_bias && param.output->dims() == param.bias->dims();
  int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
               static_cast<int>(paddings[0]);

  // calc input_c_block
  auto input_image_shape = InitImageDimInfoWith(input_dims);
  int input_c_block = input_image_shape["width"] / input_dims[3];
  int input_c = input_dims[1];
  auto dilations = *param.dilations;

  // re-calc group: 1 for an ordinary (non-grouped) conv, otherwise
  // derived from channel ratio; depthwise (filter_channel == 1 with
  // matching in/out channels) keeps param.groups.
  int new_groups{param.groups};
  if (filter_dims[0] == output_dims[1] && filter_dims[1] == input_dims[1]) {
    new_groups = 1;
  } else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) {
    new_groups = input_channel / filter_channel;
  }
  /* TODO(ysh329): mobile has no case below
     else {
      LOG(FATAL) << "Not support conv3x3 case with"
                 << " input_dims:" << input_dims << " output_dims:" <<
     output_dims
                 << " filter_dims:" << filter_dims;
    }
  */

  const std::vector<size_t>& default_work_size =
      DefaultWorkSize(output_dims,
                      DDim(std::vector<DDim::value_type>{
                          static_cast<int64_t>(out_image_shape["width"]),
                          static_cast<int64_t>(out_image_shape["height"])}));
  int c_block = default_work_size[0];
  int w = default_work_size[1];
  int nh = default_work_size[2];

  VLOG(4) << "============ conv2d params ============";
  VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
          << input_image_shape["height"];
  VLOG(4) << "input_c_block: " << input_c_block;
  VLOG(4) << "input_c: " << input_c;
  VLOG(4) << "input_image: " << input_image;
  VLOG(4) << "input_dims: " << input_dims;
  VLOG(4) << "filter_dims: " << filter_dims;
  VLOG(4) << "filter_image: " << filter_image;
  VLOG(4) << "output_dims: " << output_dims;
  VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
          << out_image_shape["height"];
  VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
  VLOG(4) << "has bias: " << has_bias;
  VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
  VLOG(4) << "strides: " << strides[0] << "," << strides[1];
  VLOG(4) << "offset: " << offset;
  VLOG(4) << "dilations.size : " << dilations.size();
  VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
  VLOG(4) << "param.groups(groups):" << param.groups;
  VLOG(4) << "new_groups:" << new_groups;
  VLOG(4) << "default work size{c_block, w, nh}: "
          << "{" << c_block << ", " << w << ", " << nh << ""
          << "}";

  // The kernel only supports square stride/dilation/padding.
  CHECK_GE(dilations.size(), 2);
  CHECK(dilations[0] == dilations[1]);
  CHECK_GE(input_dims.size(), 4);
  CHECK_GE(paddings.size(), 2);
  CHECK(paddings[0] == paddings[1]);
  CHECK_GE(strides.size(), 2);
  CHECK(strides[0] == strides[1]);

  // Bias (if any) was uploaded as an image in PrepareForRun().
  const cl::Image2D* bias_image = nullptr;
  if (has_bias) {
    bias_image = bias_gpu_image_.data<uint16_t, cl::Image2D>();
  }

  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_names_[0] << build_options_[0];
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  VLOG(4) << "kernel_key: " << kernel_key.str();
  VLOG(4) << "kernel ready ... " << kernel_key.str();
  VLOG(4) << "w: " << w;

  // Argument order below must match image/conv2d_3x3_kernel.cl; the bias
  // argument is present only when a -DBIASE_* define was used.
  cl_int status;
  int arg_idx = 0;
  status = kernel.setArg(arg_idx, c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, w);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, nh);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *input_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *filter_image);
  CL_CHECK_FATAL(status);
  if (has_bias) {
    VLOG(4) << "set bias_image: ";
    status = kernel.setArg(++arg_idx, *bias_image);
    CL_CHECK_FATAL(status);
  }
  status = kernel.setArg(++arg_idx, *out_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, strides[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, offset);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, dilations[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_height);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_height);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_channel);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, filter_channel);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, filter_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, filter_height);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, new_groups);
  CL_CHECK_FATAL(status);

  auto global_work_size =
      cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
                  static_cast<size_t>(default_work_size.data()[1]),
                  static_cast<size_t>(default_work_size.data()[2])};

  VLOG(4) << "out_image: " << out_image;
  VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
          << global_work_size[1] << "," << global_work_size[2] << "}";

  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      cl::NullRange,
      nullptr,
      event_.get());
  CL_CHECK_FATAL(status);
  // Register the event so consumers of the output image can synchronize.
  context.cl_wait_list()->emplace(out_image, event_);
}
// Enqueues the 5x5 convolution OpenCL image kernel.
//
// Binds {c_block, w, nh}, the input/filter(/bias) images, the output image
// and the scalar shape parameters in the exact order expected by the
// conv2d_5x5 CL kernel, then launches it over the default 3-D work size.
// FP16 image data is addressed as uint16_t (half) on the host side.
// (Removed dead locals filter_width/filter_height: the 5x5 kernel does not
// take filter-size arguments.)
void ConvImageCompute::Conv2d5x5() {
  const auto& param = *param_.get_mutable<param_t>();
  auto input_dims = param.x->dims();
  auto paddings = *param.paddings;
  auto strides = param.strides;
  auto* input_image = param.x->data<uint16_t, cl::Image2D>();
  auto* filter_image = filter_gpu_image_.data<uint16_t, cl::Image2D>();
  auto filter_dims = param.filter->dims();
  auto output_dims = param.output->dims();
  int input_width = input_dims[3];
  int input_height = input_dims[2];
  int output_width = output_dims[3];
  int output_height = output_dims[2];
  auto out_image_shape = InitImageDimInfoWith(output_dims);
  auto* out_image = param.output->mutable_data<uint16_t, cl::Image2D>(
      out_image_shape["width"], out_image_shape["height"]);
  const bool has_bias = param.bias != nullptr;
  const bool is_element_wise_bias =
      has_bias && param.output->dims() == param.bias->dims();
  // Distance from the filter center to its edge, shifted by the (symmetric)
  // padding; passed to the kernel as the sampling offset.
  int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
               static_cast<int>(paddings[0]);
  // calc input_c_block: channel blocks of 4 folded into the image width
  auto input_image_shape = InitImageDimInfoWith(input_dims);
  int input_c_block = input_image_shape["width"] / input_dims[3];
  int input_c = input_dims[1];
  auto dilations = *param.dilations;
  const std::vector<size_t>& default_work_size =
      DefaultWorkSize(output_dims,
                      DDim(std::vector<DDim::value_type>{
                          static_cast<int64_t>(out_image_shape["width"]),
                          static_cast<int64_t>(out_image_shape["height"])}));
  int c_block = default_work_size[0];
  int w = default_work_size[1];
  int nh = default_work_size[2];
  VLOG(4) << "============ conv2d params ============";
  VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
          << input_image_shape["height"];
  VLOG(4) << "input_c_block: " << input_c_block;
  VLOG(4) << "input_c: " << input_c;
  VLOG(4) << "input_image: " << input_image;
  VLOG(4) << "input_dims: " << input_dims;
  VLOG(4) << "filter_dims: " << filter_dims;
  VLOG(4) << "filter_image: " << filter_image;
  VLOG(4) << "output_dims: " << output_dims;
  VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
          << out_image_shape["height"];
  VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
  VLOG(4) << "has bias: " << has_bias;
  VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
  VLOG(4) << "strides: " << strides[0] << "," << strides[1];
  VLOG(4) << "offset: " << offset;
  VLOG(4) << "dilations.size : " << dilations.size();
  VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
  VLOG(4) << "default work size{c_block, w, nh}: "
          << "{" << c_block << ", " << w << ", " << nh << ""
          << "}";
  // The kernel only receives one stride/padding/dilation value, so the
  // two spatial dimensions must agree.
  CHECK_GE(dilations.size(), 2);
  CHECK(dilations[0] == dilations[1]);
  CHECK_GE(input_dims.size(), 4);
  CHECK_GE(paddings.size(), 2);
  CHECK(paddings[0] == paddings[1]);
  CHECK_GE(strides.size(), 2);
  CHECK(strides[0] == strides[1]);
  const cl::Image2D* bias_image = nullptr;
  if (has_bias) {
    bias_image = bias_gpu_image_.data<uint16_t, cl::Image2D>();
  }
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_names_[0] << build_options_[0];
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  VLOG(4) << "kernel_key: " << kernel_key.str();
  VLOG(4) << "kernel ready ... " << kernel_key.str();
  VLOG(4) << "w: " << w;
  cl_int status;
  int arg_idx = 0;
  // Argument order must match the CL kernel signature exactly; the bias
  // image is only bound when the kernel was built with a bias define.
  status = kernel.setArg(arg_idx, c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, w);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, nh);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *input_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *filter_image);
  CL_CHECK_FATAL(status);
  if (has_bias) {
    VLOG(4) << "set bias_image: ";
    status = kernel.setArg(++arg_idx, *bias_image);
    CL_CHECK_FATAL(status);
  }
  status = kernel.setArg(++arg_idx, *out_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, strides[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, offset);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, dilations[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_height);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_height);
  CL_CHECK_FATAL(status);
  auto global_work_size =
      cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
                  static_cast<size_t>(default_work_size.data()[1]),
                  static_cast<size_t>(default_work_size.data()[2])};
  VLOG(4) << "out_image: " << out_image;
  VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
          << global_work_size[1] << "," << global_work_size[2] << "}";
  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      cl::NullRange,
      nullptr,
      event_.get());
  CL_CHECK_FATAL(status);
  // Register the event so readers of out_image can wait for completion.
  context.cl_wait_list()->emplace(out_image, event_);
}
// Enqueues the 7x7 convolution OpenCL image kernel.
//
// Mirrors Conv2d5x5: binds {c_block, w, nh}, the input/filter(/bias) images,
// the output image and the scalar shape parameters in the exact order the
// conv2d_7x7 CL kernel expects, then launches over the default 3-D work size.
// FP16 image data is addressed as uint16_t (half) on the host side.
// (Removed dead locals filter_width/filter_height: the 7x7 kernel does not
// take filter-size arguments.)
void ConvImageCompute::Conv2d7x7() {
  const auto& param = *param_.get_mutable<param_t>();
  auto input_dims = param.x->dims();
  auto paddings = *param.paddings;
  auto strides = param.strides;
  auto* input_image = param.x->data<uint16_t, cl::Image2D>();
  auto* filter_image = filter_gpu_image_.data<uint16_t, cl::Image2D>();
  auto filter_dims = param.filter->dims();
  auto output_dims = param.output->dims();
  int input_width = input_dims[3];
  int input_height = input_dims[2];
  int output_width = output_dims[3];
  int output_height = output_dims[2];
  auto out_image_shape = InitImageDimInfoWith(output_dims);
  auto* out_image = param.output->mutable_data<uint16_t, cl::Image2D>(
      out_image_shape["width"], out_image_shape["height"]);
  const bool has_bias = param.bias != nullptr;
  const bool is_element_wise_bias =
      has_bias && param.output->dims() == param.bias->dims();
  // Distance from the filter center to its edge, shifted by the (symmetric)
  // padding; passed to the kernel as the sampling offset.
  int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
               static_cast<int>(paddings[0]);
  // calc input_c_block: channel blocks of 4 folded into the image width
  auto input_image_shape = InitImageDimInfoWith(input_dims);
  int input_c_block = input_image_shape["width"] / input_dims[3];
  int input_c = input_dims[1];
  auto dilations = *param.dilations;
  const std::vector<size_t>& default_work_size =
      DefaultWorkSize(output_dims,
                      DDim(std::vector<DDim::value_type>{
                          static_cast<int64_t>(out_image_shape["width"]),
                          static_cast<int64_t>(out_image_shape["height"])}));
  int c_block = default_work_size[0];
  int w = default_work_size[1];
  int nh = default_work_size[2];
  VLOG(4) << "============ conv2d params ============";
  VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
          << input_image_shape["height"];
  VLOG(4) << "input_c_block: " << input_c_block;
  VLOG(4) << "input_c: " << input_c;
  VLOG(4) << "input_image: " << input_image;
  VLOG(4) << "input_dims: " << input_dims;
  VLOG(4) << "filter_dims: " << filter_dims;
  VLOG(4) << "filter_image: " << filter_image;
  VLOG(4) << "output_dims: " << output_dims;
  VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
          << out_image_shape["height"];
  VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
  VLOG(4) << "has bias: " << has_bias;
  VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
  VLOG(4) << "strides: " << strides[0] << "," << strides[1];
  VLOG(4) << "offset: " << offset;
  VLOG(4) << "dilations.size : " << dilations.size();
  VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
  VLOG(4) << "default work size{c_block, w, nh}: "
          << "{" << c_block << ", " << w << ", " << nh << ""
          << "}";
  // The kernel only receives one stride/padding/dilation value, so the
  // two spatial dimensions must agree.
  CHECK_GE(dilations.size(), 2);
  CHECK(dilations[0] == dilations[1]);
  CHECK_GE(input_dims.size(), 4);
  CHECK_GE(paddings.size(), 2);
  CHECK(paddings[0] == paddings[1]);
  CHECK_GE(strides.size(), 2);
  CHECK(strides[0] == strides[1]);
  const cl::Image2D* bias_image = nullptr;
  if (has_bias) {
    bias_image = bias_gpu_image_.data<uint16_t, cl::Image2D>();
  }
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_names_[0] << build_options_[0];
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  VLOG(4) << "kernel_key: " << kernel_key.str();
  VLOG(4) << "kernel ready ... " << kernel_key.str();
  VLOG(4) << "w: " << w;
  cl_int status;
  int arg_idx = 0;
  // Argument order must match the CL kernel signature exactly; the bias
  // image is only bound when the kernel was built with a bias define.
  status = kernel.setArg(arg_idx, c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, w);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, nh);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *input_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *filter_image);
  CL_CHECK_FATAL(status);
  if (has_bias) {
    VLOG(4) << "set bias_image: ";
    status = kernel.setArg(++arg_idx, *bias_image);
    CL_CHECK_FATAL(status);
  }
  status = kernel.setArg(++arg_idx, *out_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, strides[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, offset);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, dilations[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_height);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_height);
  CL_CHECK_FATAL(status);
  auto global_work_size =
      cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
                  static_cast<size_t>(default_work_size.data()[1]),
                  static_cast<size_t>(default_work_size.data()[2])};
  VLOG(4) << "out_image: " << out_image;
  VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
          << global_work_size[1] << "," << global_work_size[2] << "}";
  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      cl::NullRange,
      nullptr,
      event_.get());
  CL_CHECK_FATAL(status);
  // Register the event so readers of out_image can wait for completion.
  context.cl_wait_list()->emplace(out_image, event_);
}
// Enqueues the stride-1 3x3 depthwise convolution image kernel.
// Global work size is {output-channel blocks of 4, output width in blocks
// of 2, N * output_height}; image data is FP16, addressed as uint16_t.
void ConvImageCompute::DepthwiseConv2d3x3s1() {
  const auto& param = *param_.get_mutable<param_t>();
  auto x_dims = param.x->dims();
  auto filter_dims = param.filter->dims();
  auto output_dims = param.output->dims();
  auto paddings = *param.paddings;
  auto strides = param.strides;
  auto dilations = *param.dilations;
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  auto* input_img = param.x->data<uint16_t, cl::Image2D>();
  auto* filter_img = filter_gpu_image_.data<uint16_t, cl::Image2D>();
  const cl::Image2D* bias_img = nullptr;
  if (param.bias) {
    // NOTE(review): bias_img is fetched here but never bound with setArg
    // below -- confirm whether the depthwise 3x3s1 CL kernel takes a bias
    // argument (missing setArg?) or whether this lookup is dead code.
    bias_img = bias_gpu_image_.data<uint16_t, cl::Image2D>();
  }
  auto image_shape = InitImageDimInfoWith(output_dims);
  auto* output_img = param.output->mutable_data<uint16_t, cl::Image2D>(
      image_shape["width"], image_shape["height"]);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_names_[0] << build_options_[0];
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  int c_block = (output_dims[1] + 3) / 4;  // channels packed 4-per-pixel
  int w = output_dims[3];
  int nh = output_dims[0] * output_dims[2];
  // Each work item computes a block of 2 output columns.
  int w_blk_size = 2;
  int w_blk = (w + w_blk_size - 1) / w_blk_size;
  auto global_work_size = cl::NDRange(c_block, w_blk, nh);
  cl_int status;
  int arg_idx = 0;
  // Argument order must match the CL kernel signature exactly.
  status = kernel.setArg(arg_idx, static_cast<const int>(c_block));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(w_blk));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(nh));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *input_img);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *filter_img);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *output_img);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(strides[0]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[0]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(dilations[0]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[1]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[3]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[2]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[3]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2]));
  CL_CHECK_FATAL(status);
  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      cl::NullRange,
      nullptr,
      event_.get());
  CL_CHECK_FATAL(status);
  // Register the event so readers of output_img can wait for completion.
  context.cl_wait_list()->emplace(output_img, event_);
}
// Enqueues the general (any-stride) 3x3 depthwise convolution image kernel.
// Global work size is {output-channel blocks of 4, output width,
// N * output_height}; image data is FP16, addressed as uint16_t.
void ConvImageCompute::DepthwiseConv2d3x3() {
  const auto& param = *param_.get_mutable<param_t>();
  auto x_dims = param.x->dims();
  auto filter_dims = param.filter->dims();
  auto output_dims = param.output->dims();
  auto paddings = *param.paddings;
  auto strides = param.strides;
  auto dilations = *param.dilations;
  // Sampling offset: distance from the filter center minus the padding.
  int offset = filter_dims[2] / 2 - paddings[0];
  int input_c_block = (x_dims[1] + 3) / 4;  // input channels packed 4-per-pixel
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  auto* input_img = param.x->data<uint16_t, cl::Image2D>();
  auto* filter_img = filter_gpu_image_.data<uint16_t, cl::Image2D>();
  const cl::Image2D* bias_img = nullptr;
  if (param.bias) {
    // NOTE(review): bias_img is fetched here but never bound with setArg
    // below -- confirm whether the depthwise 3x3 CL kernel takes a bias
    // argument (missing setArg?) or whether this lookup is dead code.
    bias_img = bias_gpu_image_.data<uint16_t, cl::Image2D>();
  }
  auto image_shape = InitImageDimInfoWith(output_dims);
  auto* output_img = param.output->mutable_data<uint16_t, cl::Image2D>(
      image_shape["width"], image_shape["height"]);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_names_[0] << build_options_[0];
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  int c_block = (output_dims[1] + 3) / 4;  // output channels packed 4-per-pixel
  int w = output_dims[3];
  int nh = output_dims[0] * output_dims[2];
  auto global_work_size = cl::NDRange(c_block, w, nh);
  VLOG(4) << "setArg";
  VLOG(4) << "c_block = " << c_block;
  VLOG(4) << "w = " << w;
  VLOG(4) << "nh = " << nh;
  VLOG(4) << "strides = " << strides[0];
  VLOG(4) << "offset = " << offset;
  VLOG(4) << "dilations = " << dilations[0];
  VLOG(4) << "input_c_block = " << input_c_block;
  VLOG(4) << "x_dims[3] = " << x_dims[3];
  VLOG(4) << "x_dims[2] = " << x_dims[2];
  VLOG(4) << "output_dims[3] = " << output_dims[3];
  VLOG(4) << "output_dims[2] = " << output_dims[2];
  cl_int status;
  int arg_idx = 0;
  // Argument order must match the CL kernel signature exactly.
  status = kernel.setArg(arg_idx, static_cast<const int>(c_block));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(w));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(nh));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *input_img);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *filter_img);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *output_img);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(strides[0]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(offset));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(dilations[0]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(input_c_block));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[3]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(x_dims[2]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[3]));
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2]));
  CL_CHECK_FATAL(status);
  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      cl::NullRange,
      nullptr,
      event_.get());
  CL_CHECK_FATAL(status);
  // Register the event so readers of output_img can wait for completion.
  context.cl_wait_list()->emplace(output_img, event_);
}
// Enqueues the general depthwise convolution OpenCL image kernel (any
// filter size).
//
// Binds {c_block, w, nh}, the input/filter(/bias) images, the output image
// and the scalar shape parameters (including filter width/height) in the
// exact order expected by the CL kernel, then launches it over the default
// 3-D work size. FP16 image data is addressed as uint16_t (half).
// (Removed dead local bias_buf: bias is always supplied as an image here.)
void ConvImageCompute::DepthwiseConv2d() {
  const auto& param = *param_.get_mutable<param_t>();
  auto input_dims = param.x->dims();
  auto paddings = *param.paddings;
  auto strides = param.strides;
  auto* input_image = param.x->data<uint16_t, cl::Image2D>();
  auto* filter_image = filter_gpu_image_.data<uint16_t, cl::Image2D>();
  auto filter_dims = param.filter->dims();
  auto output_dims = param.output->dims();
  int input_width = input_dims[3];
  int input_height = input_dims[2];
  int output_width = output_dims[3];
  int output_height = output_dims[2];
  int filter_width = filter_dims[3];
  int filter_height = filter_dims[2];
  auto out_image_shape = InitImageDimInfoWith(output_dims);
  auto* out_image = param.output->mutable_data<uint16_t, cl::Image2D>(
      out_image_shape["width"], out_image_shape["height"]);
  const bool has_bias = param.bias != nullptr;
  const bool is_element_wise_bias =
      has_bias && param.output->dims() == param.bias->dims();
  // Distance from the filter center to its edge, shifted by the (symmetric)
  // padding; passed to the kernel as the sampling offset.
  int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
               static_cast<int>(paddings[0]);
  // calc input_c_block: channel blocks of 4 folded into the image width
  auto input_image_shape = InitImageDimInfoWith(input_dims);
  int input_c_block = input_image_shape["width"] / input_dims[3];
  int input_c = input_dims[1];
  auto dilations = *param.dilations;
  const std::vector<size_t>& default_work_size =
      DefaultWorkSize(output_dims,
                      DDim(std::vector<DDim::value_type>{
                          static_cast<int64_t>(out_image_shape["width"]),
                          static_cast<int64_t>(out_image_shape["height"])}));
  int c_block = default_work_size[0];
  int w = default_work_size[1];
  int nh = default_work_size[2];
  VLOG(4) << "============ depthwise conv2d params ============";
  VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
          << input_image_shape["height"];
  VLOG(4) << "input_c_block: " << input_c_block;
  VLOG(4) << "input_c: " << input_c;
  VLOG(4) << "input_image: " << input_image;
  VLOG(4) << "filter_dims: " << filter_dims;
  VLOG(4) << "filter_image: " << filter_image;
  VLOG(4) << "output_dims: " << output_dims;
  VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
          << out_image_shape["height"];
  VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
  VLOG(4) << "has bias: " << has_bias;
  VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
  VLOG(4) << "strides: " << strides[0] << "," << strides[1];
  VLOG(4) << "offset: " << offset;
  VLOG(4) << "dilations.size : " << dilations.size();
  VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
  VLOG(4) << "default work size{c_block, w, nh}: "
          << "{" << c_block << ", " << w << ", " << nh << ""
          << "}";
  // The kernel only receives one stride/padding/dilation value, so the
  // two spatial dimensions must agree.
  CHECK_GE(dilations.size(), 2);
  CHECK(dilations[0] == dilations[1]);
  CHECK_GE(input_dims.size(), 4);
  CHECK_GE(paddings.size(), 2);
  CHECK(paddings[0] == paddings[1]);
  CHECK_GE(strides.size(), 2);
  CHECK(strides[0] == strides[1]);
  const cl::Image2D* bias_image = nullptr;
  if (has_bias) {
    bias_image = bias_gpu_image_.data<uint16_t, cl::Image2D>();
  }
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  STL::stringstream kernel_key;
  kernel_key << kernel_func_names_[0] << build_options_[0];
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  VLOG(4) << "kernel_key: " << kernel_key.str();
  VLOG(4) << "kernel ready ... " << kernel_key.str();
  VLOG(4) << "w: " << w;
  cl_int status;
  int arg_idx = 0;
  // Argument order must match the CL kernel signature exactly; the bias
  // image is only bound when the kernel was built with a bias define.
  status = kernel.setArg(arg_idx, c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, w);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, nh);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *input_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *filter_image);
  CL_CHECK_FATAL(status);
  if (has_bias) {
    VLOG(4) << "set bias_image: ";
    status = kernel.setArg(++arg_idx, *bias_image);
    CL_CHECK_FATAL(status);
  }
  status = kernel.setArg(++arg_idx, *out_image);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, strides[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, offset);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_c_block);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, dilations[0]);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, input_height);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, output_height);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, filter_width);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, filter_height);
  CL_CHECK_FATAL(status);
  auto global_work_size =
      cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
                  static_cast<size_t>(default_work_size.data()[1]),
                  static_cast<size_t>(default_work_size.data()[2])};
  VLOG(4) << "out_image: " << out_image;
  VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
          << global_work_size[1] << "," << global_work_size[2] << "}";
  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      cl::NullRange,
      nullptr,
      event_.get());
  CL_CHECK_FATAL(status);
  // Register the event so readers of out_image can wait for completion.
  context.cl_wait_list()->emplace(out_image, event_);
}
// Dispatch to the concrete conv implementation (member-function pointer)
// selected in PrepareForRun().
void ConvImageCompute::Run() {
  (this->*impl_)();
}
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Register the image2d-based conv2d kernel: OpenCL target, FP16 precision,
// ImageDefault layout. Filter and Bias remain host (ARM) tensors; they are
// converted to device images by the kernel itself.
REGISTER_LITE_KERNEL(conv2d,
                     kOpenCL,
                     kFP16,
                     kImageDefault,
                     paddle::lite::kernels::opencl::ConvImageCompute,
                     image2d)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Output",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
// Same compute class also serves depthwise_conv2d; the dispatch between
// dense and depthwise implementations happens inside ConvImageCompute.
REGISTER_LITE_KERNEL(depthwise_conv2d,
                     kOpenCL,
                     kFP16,
                     kImageDefault,
                     paddle::lite::kernels::opencl::ConvImageCompute,
                     image2d)
    .BindInput("Input",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Output",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
...@@ -11,41 +11,50 @@ ...@@ -11,41 +11,50 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/core/tensor.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
template <PrecisionType Ptype, DataLayoutType layout> class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
class ConcatCompute : public KernelLite<TARGET(kOpenCL), Ptype, layout> { PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ConcatParam; using param_t = operators::ConvParam;
using kernel_t = void (ConvImageCompute::*)();
void PrepareForRun() override; void PrepareForRun() override;
void Run() override; void Run() override;
std::string doc(); // override; private:
void Conv2d1x1();
// protected: void Conv2d3x3();
// void UpdateParams(); void Conv2d5x5();
void Conv2d7x7();
int axis_size_ = 1; void DepthwiseConv2d3x3s1();
int post_size_ = 1; void DepthwiseConv2d3x3();
int pre_size_ = 1; void DepthwiseConv2d();
int axis_ = 1;
param_t* concat_param_{nullptr}; kernel_t impl_;
std::string kernel_func_name_{}; std::vector<std::string> kernel_func_names_{};
std::string build_options_{"-DCL_DTYPE_float"}; std::vector<std::string> kernel_func_paths_{};
std::vector<std::string> build_options_{};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor filter_gpu_image_;
Tensor bias_gpu_image_;
}; };
} // namespace opencl } // namespace opencl
......
...@@ -15,16 +15,17 @@ ...@@ -15,16 +15,17 @@
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <random> #include <random>
#include "lite/backends/opencl/cl_image_converter.h" #include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/target_wrapper.h" #include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/opencl/test_helper.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
// #define SHADOW_LOG LOG(INFO) // #define SHADOW_LOG LOG(INFO)
#define SHADOW_LOG VLOG(4) #define SHADOW_LOG VLOG(4)
#define FP16_MAX_DIFF (1e0)
template <typename Dtype1, typename Dtype2> template <typename Dtype1, typename Dtype2>
static void conv_basic(const Dtype1* din, static void conv_basic(const Dtype1* din,
...@@ -162,7 +163,7 @@ TEST(conv2d, compute_image2d_1x1) { ...@@ -162,7 +163,7 @@ TEST(conv2d, compute_image2d_1x1) {
auto kernels = auto kernels =
KernelRegistry::Global().Create("conv2d", KernelRegistry::Global().Create("conv2d",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
...@@ -283,13 +284,13 @@ TEST(conv2d, compute_image2d_1x1) { ...@@ -283,13 +284,13 @@ TEST(conv2d, compute_image2d_1x1) {
paddle::lite::CLImageConverterDefault default_convertor; paddle::lite::CLImageConverterDefault default_convertor;
SHADOW_LOG << "set mapped input ..."; SHADOW_LOG << "set mapped input ...";
std::vector<float> x_image_v( std::vector<uint16_t> x_image_v(
input_image_width * input_image_height * 4); // 4 : RGBA input_image_width * input_image_height * 4); // 4 : RGBA
std::vector<float> filter_image_v( std::vector<uint16_t> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 :RGBA filter_image_width * filter_image_height * 4); // 4 :RGBA
std::vector<float> bias_image_v( std::vector<uint16_t> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v( std::vector<uint16_t> out_image_v(
out_image_width * out_image_height * 4); // 4 : RGBA out_image_width * out_image_height * 4); // 4 : RGBA
default_convertor.NCHWToImage( default_convertor.NCHWToImage(
...@@ -300,13 +301,13 @@ TEST(conv2d, compute_image2d_1x1) { ...@@ -300,13 +301,13 @@ TEST(conv2d, compute_image2d_1x1) {
nw_convertor.NCHWToImage( nw_convertor.NCHWToImage(
filter_v.data(), filter_image_v.data(), filter_dim); filter_v.data(), filter_image_v.data(), filter_dim);
auto* input_image2d = input.mutable_data<float, cl::Image2D>( auto* input_image2d = input.mutable_data<uint16_t, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data()); input_image_width, input_image_height, x_image_v.data());
// assign filter as target arm // assign filter as target arm
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(), filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
filter_dim); filter_dim);
// auto* filter_image2d = // auto* filter_image2d =
// filter.mutable_data<float, cl::Image2D>( // filter.mutable_data<uint16_t, cl::Image2D>(
// filter_image_width, // filter_image_width,
// filter_image_height, // filter_image_height,
// filter_image_v.data()); // filter_image_v.data());
...@@ -356,11 +357,12 @@ TEST(conv2d, compute_image2d_1x1) { ...@@ -356,11 +357,12 @@ TEST(conv2d, compute_image2d_1x1) {
SHADOW_LOG << "kernel launch ..."; SHADOW_LOG << "kernel launch ...";
kernel->Launch(); kernel->Launch();
SHADOW_LOG << "mutable output ..."; SHADOW_LOG << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>( auto* output_image2d =
output.mutable_data<uint16_t, cl::Image2D>(
out_image_width, out_image_height); out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr); auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
...@@ -373,8 +375,9 @@ TEST(conv2d, compute_image2d_1x1) { ...@@ -373,8 +375,9 @@ TEST(conv2d, compute_image2d_1x1) {
"cl tensor."; "cl tensor.";
} }
TargetWrapperCL::ImgcpySync(out_image_v.data(), TargetWrapperCL::ImgcpySync(
output.data<float, cl::Image2D>(), out_image_v.data(),
output.data<uint16_t, cl::Image2D>(),
out_image_width, out_image_width,
out_image_height, out_image_height,
cl_image2d_row_pitch, cl_image2d_row_pitch,
...@@ -425,12 +428,16 @@ TEST(conv2d, compute_image2d_1x1) { ...@@ -425,12 +428,16 @@ TEST(conv2d, compute_image2d_1x1) {
static_cast<int64_t>(out_image_height)})}; static_cast<int64_t>(out_image_height)})};
for (int i = 0; i < out_dim.production(); i++) { for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2); auto relative_diff =
if (abs(output_v[i] - out_ref_data[i]) > 1e-2) { COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]);
LOG(FATAL) << "error idx:" << i; EXPECT_LT(relative_diff, FP16_MAX_DIFF);
if (relative_diff > FP16_MAX_DIFF) {
LOG(FATAL) << "error idx:" << i << "output_v[" << i
<< "]:" << output_v[i] << " "
"out_ref_data["
<< i << "]:" << out_ref_data[i];
} }
} }
#ifdef LOOP_TEST #ifdef LOOP_TEST
} }
} }
...@@ -479,12 +486,12 @@ TEST(conv2d, compute_image2d_3x3) { ...@@ -479,12 +486,12 @@ TEST(conv2d, compute_image2d_3x3) {
const int oc = 2; const int oc = 2;
#else // big scale with group #else // big scale with group
const int stride = 1; const int stride = 1;
const int group = 32; const int group = 32 / 1;
const int batch_size = 1; const int batch_size = 1;
const int ic = 32; const int ic = 32 / 1;
const int ih = 112; const int ih = 112 / 1;
const int iw = 112; const int iw = 112 / 1;
const int oc = 32; const int oc = 32 / 1;
#endif #endif
const bool bias_flag = false; const bool bias_flag = false;
...@@ -503,7 +510,7 @@ TEST(conv2d, compute_image2d_3x3) { ...@@ -503,7 +510,7 @@ TEST(conv2d, compute_image2d_3x3) {
auto kernels = auto kernels =
KernelRegistry::Global().Create("conv2d", KernelRegistry::Global().Create("conv2d",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
CHECK(batch_size == 1) << "conv3x3 only supprt batch_size == 1"; CHECK(batch_size == 1) << "conv3x3 only supprt batch_size == 1";
...@@ -599,10 +606,10 @@ TEST(conv2d, compute_image2d_3x3) { ...@@ -599,10 +606,10 @@ TEST(conv2d, compute_image2d_3x3) {
SHADOW_LOG << "gen input and filter ..."; SHADOW_LOG << "gen input and filter ...";
for (int i = 0; i < input_v.size(); ++i) { for (int i = 0; i < input_v.size(); ++i) {
input_v[i] = i; // gen(engine); input_v[i] = i * 0.001; // gen(engine);
} }
for (int i = 0; i < filter_v.size(); ++i) { for (int i = 0; i < filter_v.size(); ++i) {
filter_v[i] = 1; // gen(engine); filter_v[i] = 1 * 0.001; // gen(engine);
} }
SHADOW_LOG << "after gen input and filter ..."; SHADOW_LOG << "after gen input and filter ...";
...@@ -634,14 +641,14 @@ TEST(conv2d, compute_image2d_3x3) { ...@@ -634,14 +641,14 @@ TEST(conv2d, compute_image2d_3x3) {
paddle::lite::CLImageConverterDefault default_convertor; paddle::lite::CLImageConverterDefault default_convertor;
SHADOW_LOG << "set mapped input ..."; SHADOW_LOG << "set mapped input ...";
std::vector<float> x_image_v(input_image_width * std::vector<uint16_t> x_image_v(
input_image_height * 4); // 4 :RGBA input_image_width * input_image_height * 4); // 4 :RGBA
std::vector<float> filter_image_v( std::vector<uint16_t> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 : RGBA filter_image_width * filter_image_height * 4); // 4 : RGBA
std::vector<float> bias_image_v( std::vector<uint16_t> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(out_image_width * std::vector<uint16_t> out_image_v(
out_image_height * 4); // 4 :RGBA out_image_width * out_image_height * 4); // 4 :RGBA
default_convertor.NCHWToImage( default_convertor.NCHWToImage(
input_v.data(), x_image_v.data(), input_dim); input_v.data(), x_image_v.data(), input_dim);
...@@ -666,7 +673,7 @@ TEST(conv2d, compute_image2d_3x3) { ...@@ -666,7 +673,7 @@ TEST(conv2d, compute_image2d_3x3) {
for (int i = 0; i < filter_image_v.size(); i++) { for (int i = 0; i < filter_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_image_v[i]; SHADOW_LOG << "(" << i << ")" << filter_image_v[i];
} }
auto* input_image2d = input.mutable_data<float, cl::Image2D>( auto* input_image2d = input.mutable_data<uint16_t, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data()); input_image_width, input_image_height, x_image_v.data());
// assign filter as target arm // assign filter as target arm
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(), filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
...@@ -707,11 +714,11 @@ TEST(conv2d, compute_image2d_3x3) { ...@@ -707,11 +714,11 @@ TEST(conv2d, compute_image2d_3x3) {
SHADOW_LOG << "kernel launch ..."; SHADOW_LOG << "kernel launch ...";
kernel->Launch(); kernel->Launch();
SHADOW_LOG << "mutable output ..."; SHADOW_LOG << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>( auto* output_image2d = output.mutable_data<uint16_t, cl::Image2D>(
out_image_width, out_image_height); out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr); auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
...@@ -725,7 +732,7 @@ TEST(conv2d, compute_image2d_3x3) { ...@@ -725,7 +732,7 @@ TEST(conv2d, compute_image2d_3x3) {
} }
TargetWrapperCL::ImgcpySync(out_image_v.data(), TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(), output.data<uint16_t, cl::Image2D>(),
out_image_width, out_image_width,
out_image_height, out_image_height,
cl_image2d_row_pitch, cl_image2d_row_pitch,
...@@ -793,9 +800,14 @@ TEST(conv2d, compute_image2d_3x3) { ...@@ -793,9 +800,14 @@ TEST(conv2d, compute_image2d_3x3) {
#endif #endif
for (int i = 0; i < out_dim.production(); i++) { for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2); auto relative_diff =
if (abs(output_v[i] - out_ref_data[i]) > 1e-2) { COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]);
LOG(FATAL) << "error idx:" << i; EXPECT_LT(relative_diff, FP16_MAX_DIFF);
if (relative_diff > FP16_MAX_DIFF) {
LOG(FATAL) << "error idx:" << i << "output_v[" << i
<< "]:" << output_v[i] << " "
"out_ref_data["
<< i << "]:" << out_ref_data[i];
} }
} }
...@@ -850,7 +862,7 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -850,7 +862,7 @@ TEST(conv2d, compute_image2d_5x5) {
auto kernels = auto kernels =
KernelRegistry::Global().Create("conv2d", KernelRegistry::Global().Create("conv2d",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
...@@ -944,10 +956,10 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -944,10 +956,10 @@ TEST(conv2d, compute_image2d_5x5) {
SHADOW_LOG << "gen input and filter ..."; SHADOW_LOG << "gen input and filter ...";
for (auto& i : input_v) { for (auto& i : input_v) {
i = gen(engine); i = 0.01 * gen(engine);
} }
for (auto& f : filter_v) { for (auto& f : filter_v) {
f = gen(engine); f = 0.01 * gen(engine);
} }
SHADOW_LOG << "after gen input and filter ..."; SHADOW_LOG << "after gen input and filter ...";
...@@ -975,14 +987,14 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -975,14 +987,14 @@ TEST(conv2d, compute_image2d_5x5) {
paddle::lite::CLImageConverterDefault default_convertor; paddle::lite::CLImageConverterDefault default_convertor;
SHADOW_LOG << "set mapped input ..."; SHADOW_LOG << "set mapped input ...";
std::vector<float> x_image_v(input_image_width * std::vector<uint16_t> x_image_v(
input_image_height * 4); // 4 :RGBA input_image_width * input_image_height * 4); // 4 :RGBA
std::vector<float> filter_image_v( std::vector<uint16_t> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 : RGBA filter_image_width * filter_image_height * 4); // 4 : RGBA
std::vector<float> bias_image_v( std::vector<uint16_t> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(out_image_width * std::vector<uint16_t> out_image_v(
out_image_height * 4); // 4 :RGBA out_image_width * out_image_height * 4); // 4 :RGBA
default_convertor.NCHWToImage( default_convertor.NCHWToImage(
input_v.data(), x_image_v.data(), input_dim); input_v.data(), x_image_v.data(), input_dim);
...@@ -1007,7 +1019,7 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -1007,7 +1019,7 @@ TEST(conv2d, compute_image2d_5x5) {
for (int i = 0; i < filter_image_v.size(); i++) { for (int i = 0; i < filter_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_image_v[i]; SHADOW_LOG << "(" << i << ")" << filter_image_v[i];
} }
auto* input_image2d = input.mutable_data<float, cl::Image2D>( auto* input_image2d = input.mutable_data<uint16_t, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data()); input_image_width, input_image_height, x_image_v.data());
// assign filter as target arm // assign filter as target arm
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(), filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
...@@ -1048,11 +1060,11 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -1048,11 +1060,11 @@ TEST(conv2d, compute_image2d_5x5) {
SHADOW_LOG << "kernel launch ..."; SHADOW_LOG << "kernel launch ...";
kernel->Launch(); kernel->Launch();
SHADOW_LOG << "mutable output ..."; SHADOW_LOG << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>( auto* output_image2d = output.mutable_data<uint16_t, cl::Image2D>(
out_image_width, out_image_height); out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr); auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
...@@ -1066,7 +1078,7 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -1066,7 +1078,7 @@ TEST(conv2d, compute_image2d_5x5) {
} }
TargetWrapperCL::ImgcpySync(out_image_v.data(), TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(), output.data<uint16_t, cl::Image2D>(),
out_image_width, out_image_width,
out_image_height, out_image_height,
cl_image2d_row_pitch, cl_image2d_row_pitch,
...@@ -1127,9 +1139,14 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -1127,9 +1139,14 @@ TEST(conv2d, compute_image2d_5x5) {
static_cast<int64_t>(out_image_height)})}; static_cast<int64_t>(out_image_height)})};
for (int i = 0; i < out_dim.production(); i++) { for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2); auto relative_diff =
if (abs(output_v[i] - out_ref_data[i]) > 1e-2) { COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]);
LOG(FATAL) << "error idx:" << i; EXPECT_LT(relative_diff, FP16_MAX_DIFF);
if (relative_diff > FP16_MAX_DIFF) {
LOG(FATAL) << "error idx:" << i << "output_v[" << i
<< "]:" << output_v[i] << " "
"out_ref_data["
<< i << "]:" << out_ref_data[i];
} }
} }
...@@ -1183,7 +1200,7 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1183,7 +1200,7 @@ TEST(conv2d, compute_image2d_7x7) {
auto kernels = auto kernels =
KernelRegistry::Global().Create("conv2d", KernelRegistry::Global().Create("conv2d",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
...@@ -1308,14 +1325,14 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1308,14 +1325,14 @@ TEST(conv2d, compute_image2d_7x7) {
paddle::lite::CLImageConverterDefault default_convertor; paddle::lite::CLImageConverterDefault default_convertor;
SHADOW_LOG << "set mapped input ..."; SHADOW_LOG << "set mapped input ...";
std::vector<float> x_image_v(input_image_width * std::vector<uint16_t> x_image_v(
input_image_height * 4); // 4 : RGBA input_image_width * input_image_height * 4); // 4 : RGBA
std::vector<float> filter_image_v( std::vector<uint16_t> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 : RGBA filter_image_width * filter_image_height * 4); // 4 : RGBA
std::vector<float> bias_image_v( std::vector<uint16_t> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(out_image_width * std::vector<uint16_t> out_image_v(
out_image_height * 4); // 4 : RGBA out_image_width * out_image_height * 4); // 4 : RGBA
default_convertor.NCHWToImage( default_convertor.NCHWToImage(
input_v.data(), x_image_v.data(), input_dim); input_v.data(), x_image_v.data(), input_dim);
...@@ -1340,7 +1357,7 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1340,7 +1357,7 @@ TEST(conv2d, compute_image2d_7x7) {
for (int i = 0; i < filter_image_v.size(); i++) { for (int i = 0; i < filter_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_image_v[i]; SHADOW_LOG << "(" << i << ")" << filter_image_v[i];
} }
auto* input_image2d = input.mutable_data<float, cl::Image2D>( auto* input_image2d = input.mutable_data<uint16_t, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data()); input_image_width, input_image_height, x_image_v.data());
// assign filter as target arm // assign filter as target arm
...@@ -1382,11 +1399,11 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1382,11 +1399,11 @@ TEST(conv2d, compute_image2d_7x7) {
SHADOW_LOG << "kernel launch ..."; SHADOW_LOG << "kernel launch ...";
kernel->Launch(); kernel->Launch();
SHADOW_LOG << "mutable output ..."; SHADOW_LOG << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>( auto* output_image2d = output.mutable_data<uint16_t, cl::Image2D>(
out_image_width, out_image_height); out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr); auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
...@@ -1400,7 +1417,7 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1400,7 +1417,7 @@ TEST(conv2d, compute_image2d_7x7) {
} }
TargetWrapperCL::ImgcpySync(out_image_v.data(), TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(), output.data<uint16_t, cl::Image2D>(),
out_image_width, out_image_width,
out_image_height, out_image_height,
cl_image2d_row_pitch, cl_image2d_row_pitch,
...@@ -1461,9 +1478,14 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1461,9 +1478,14 @@ TEST(conv2d, compute_image2d_7x7) {
static_cast<int64_t>(out_image_height)})}; static_cast<int64_t>(out_image_height)})};
for (int i = 0; i < out_dim.production(); i++) { for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2); auto relative_diff =
if (abs(output_v[i] - out_ref_data[i]) > 1e-2) { COMPUTE_RELATIVE_DIFF(output_v[i], out_ref_data[i]);
LOG(FATAL) << "error idx:" << i; EXPECT_LT(relative_diff, FP16_MAX_DIFF);
if (relative_diff > FP16_MAX_DIFF) {
LOG(FATAL) << "error idx:" << i << "output_v[" << i
<< "]:" << output_v[i] << " "
"out_ref_data["
<< i << "]:" << out_ref_data[i];
} }
} }
...@@ -1485,4 +1507,4 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1485,4 +1507,4 @@ TEST(conv2d, compute_image2d_7x7) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kImageDefault, image2d); USE_LITE_KERNEL(conv2d, kOpenCL, kFP16, kImageDefault, image2d);
...@@ -105,6 +105,7 @@ int ConvOutputSize(int input_size, ...@@ -105,6 +105,7 @@ int ConvOutputSize(int input_size,
return output_size; return output_size;
} }
// #define LOOP_TEST
TEST(depthwise_conv2d_basic, compute) { TEST(depthwise_conv2d_basic, compute) {
// conv infos // conv infos
// const int ksize = 1; // const int ksize = 1;
...@@ -144,7 +145,7 @@ TEST(depthwise_conv2d_basic, compute) { ...@@ -144,7 +145,7 @@ TEST(depthwise_conv2d_basic, compute) {
auto kernels = auto kernels =
KernelRegistry::Global().Create("depthwise_conv2d", KernelRegistry::Global().Create("depthwise_conv2d",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
...@@ -252,13 +253,13 @@ TEST(depthwise_conv2d_basic, compute) { ...@@ -252,13 +253,13 @@ TEST(depthwise_conv2d_basic, compute) {
paddle::lite::CLImageConverterDefault default_convertor; paddle::lite::CLImageConverterDefault default_convertor;
VLOG(4) << "set mapped input ..."; VLOG(4) << "set mapped input ...";
std::vector<float> x_image_v(input_image_width * input_image_height * std::vector<uint16_t> x_image_v(input_image_width *
4); // 4 : RGBA input_image_height * 4); // 4 : RGBA
std::vector<float> filter_image_v( std::vector<uint16_t> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 : RGBA filter_image_width * filter_image_height * 4); // 4 : RGBA
std::vector<float> bias_image_v(bias_image_width * bias_image_height * std::vector<uint16_t> bias_image_v(
4); // 4 : RGBA bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(out_image_width * out_image_height * std::vector<uint16_t> out_image_v(out_image_width * out_image_height *
4); // 4 : RGBA 4); // 4 : RGBA
default_convertor.NCHWToImage( default_convertor.NCHWToImage(
...@@ -269,9 +270,9 @@ TEST(depthwise_conv2d_basic, compute) { ...@@ -269,9 +270,9 @@ TEST(depthwise_conv2d_basic, compute) {
nw_convertor.NCHWToImage( nw_convertor.NCHWToImage(
filter_v.data(), filter_image_v.data(), filter_dim); filter_v.data(), filter_image_v.data(), filter_dim);
auto* input_image2d = input.mutable_data<float, cl::Image2D>( auto* input_image2d = input.mutable_data<uint16_t, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data()); input_image_width, input_image_height, x_image_v.data());
auto* filter_image2d = filter.mutable_data<float, cl::Image2D>( auto* filter_image2d = filter.mutable_data<uint16_t, cl::Image2D>(
filter_image_width, filter_image_height, filter_image_v.data()); filter_image_width, filter_image_height, filter_image_v.data());
if (bias_flag) { if (bias_flag) {
...@@ -284,7 +285,7 @@ TEST(depthwise_conv2d_basic, compute) { ...@@ -284,7 +285,7 @@ TEST(depthwise_conv2d_basic, compute) {
CLImageConverterFolder folder_convertor; CLImageConverterFolder folder_convertor;
folder_convertor.NCHWToImage( folder_convertor.NCHWToImage(
bias_v.data(), bias_image_v.data(), bias_dim); bias_v.data(), bias_image_v.data(), bias_dim);
auto* bias_data = bias.mutable_data<float, cl::Image2D>( auto* bias_data = bias.mutable_data<uint16_t, cl::Image2D>(
bias_image_width, bias_image_height, bias_image_v.data()); bias_image_width, bias_image_height, bias_image_v.data());
} }
...@@ -300,11 +301,11 @@ TEST(depthwise_conv2d_basic, compute) { ...@@ -300,11 +301,11 @@ TEST(depthwise_conv2d_basic, compute) {
VLOG(4) << "kernel launch ..."; VLOG(4) << "kernel launch ...";
kernel->Launch(); kernel->Launch();
VLOG(4) << "mutable output ..."; VLOG(4) << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>( auto* output_image2d = output.mutable_data<uint16_t, cl::Image2D>(
out_image_width, out_image_height); out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr); auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
...@@ -318,7 +319,7 @@ TEST(depthwise_conv2d_basic, compute) { ...@@ -318,7 +319,7 @@ TEST(depthwise_conv2d_basic, compute) {
} }
TargetWrapperCL::ImgcpySync(out_image_v.data(), TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(), output.data<uint16_t, cl::Image2D>(),
out_image_width, out_image_width,
out_image_height, out_image_height,
cl_image2d_row_pitch, cl_image2d_row_pitch,
...@@ -387,7 +388,7 @@ TEST(depthwise_conv2d_image2d_fp16, compute) { ...@@ -387,7 +388,7 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
LOG(INFO) << "to get kernel ..."; LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create("depthwise_conv2d", auto kernels = KernelRegistry::Global().Create("depthwise_conv2d",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
...@@ -433,11 +434,11 @@ TEST(depthwise_conv2d_image2d_fp16, compute) { ...@@ -433,11 +434,11 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
default_converter->InitImageDimInfoWith(input.dims()); default_converter->InitImageDimInfoWith(input.dims());
LOG(INFO) << "input_image_shape = " << input_image_shape[0] << " " LOG(INFO) << "input_image_shape = " << input_image_shape[0] << " "
<< input_image_shape[1]; << input_image_shape[1];
std::vector<float> input_image_data(input_image_shape.production() * std::vector<uint16_t> input_image_data(input_image_shape.production() *
4); // 4 : RGBA 4); // 4 : RGBA
default_converter->NCHWToImage( default_converter->NCHWToImage(
input_v.data(), input_image_data.data(), input.dims()); input_v.data(), input_image_data.data(), input.dims());
auto* input_image = input.mutable_data<int16_t, cl::Image2D>( auto* input_image = input.mutable_data<uint16_t, cl::Image2D>(
input_image_shape[0], input_image_shape[1], input_image_data.data()); input_image_shape[0], input_image_shape[1], input_image_data.data());
LOG(INFO) << "prepare kernel"; LOG(INFO) << "prepare kernel";
...@@ -446,11 +447,11 @@ TEST(depthwise_conv2d_image2d_fp16, compute) { ...@@ -446,11 +447,11 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
DDim filter_image_shape = nw_converter->InitImageDimInfoWith(filter.dims()); DDim filter_image_shape = nw_converter->InitImageDimInfoWith(filter.dims());
LOG(INFO) << "filter_image_shape = " << filter_image_shape[0] << " " LOG(INFO) << "filter_image_shape = " << filter_image_shape[0] << " "
<< filter_image_shape[1]; << filter_image_shape[1];
std::vector<float> filter_image_data(filter_image_shape.production() * std::vector<uint16_t> filter_image_data(filter_image_shape.production() *
4); // 4 : RGBA 4); // 4 : RGBA
nw_converter->NCHWToImage( nw_converter->NCHWToImage(
filter_v.data(), filter_image_data.data(), filter.dims()); filter_v.data(), filter_image_data.data(), filter.dims());
auto* filter_image = filter.mutable_data<int16_t, cl::Image2D>( auto* filter_image = filter.mutable_data<uint16_t, cl::Image2D>(
filter_image_shape[0], filter_image_shape[1], filter_image_data.data()); filter_image_shape[0], filter_image_shape[1], filter_image_data.data());
LOG(INFO) << "launch"; LOG(INFO) << "launch";
...@@ -459,13 +460,13 @@ TEST(depthwise_conv2d_image2d_fp16, compute) { ...@@ -459,13 +460,13 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
default_converter->InitImageDimInfoWith(output.dims()); default_converter->InitImageDimInfoWith(output.dims());
LOG(INFO) << "output_image_shape = " << output_image_shape[0] << " " LOG(INFO) << "output_image_shape = " << output_image_shape[0] << " "
<< output_image_shape[1]; << output_image_shape[1];
auto* output_image = output.mutable_data<int16_t, cl::Image2D>( auto* output_image = output.mutable_data<uint16_t, cl::Image2D>(
output_image_shape[0], output_image_shape[1]); output_image_shape[0], output_image_shape[1]);
kernel->Launch(); kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<int16_t, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr); auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---"; VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
...@@ -490,7 +491,8 @@ TEST(depthwise_conv2d_image2d_fp16, compute) { ...@@ -490,7 +491,8 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
const size_t cl_image2d_row_pitch{0}; const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0}; const size_t cl_image2d_slice_pitch{0};
float* output_image_data = new float[output_image_shape.production() * 4]; uint16_t* output_image_data =
new uint16_t[output_image_shape.production() * 4];
TargetWrapperCL::ImgcpySync(output_image_data, TargetWrapperCL::ImgcpySync(output_image_data,
output_image, output_image,
output_image_shape[0], output_image_shape[0],
...@@ -512,4 +514,4 @@ TEST(depthwise_conv2d_image2d_fp16, compute) { ...@@ -512,4 +514,4 @@ TEST(depthwise_conv2d_image2d_fp16, compute) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFloat, kImageDefault, image2d); USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFP16, kImageDefault, image2d);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/opencl/elementwise_add_buffer_compute.h"
#include <memory>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
void ElementwiseAddCompute::PrepareForRun() {
  // Resolve the op parameters first so UpdateParams() can derive the
  // batch/channels/num factors, then build (or fetch from cache) the
  // OpenCL buffer kernel used by Run().
  ele_param_ = param_.get_mutable<param_t>();
  UpdateParams();
  auto& ocl_ctx = ctx_->As<OpenCLContext>();
  ocl_ctx.cl_context()->AddKernel(
      kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_);
}
// Enqueues the elementwise-add buffer kernel: Out = X + Y (with Y broadcast
// according to batch_/channels_/num_ computed in UpdateParams()).
// NOTE: the setArg order below must match the kernel's parameter list in
// buffer/elementwise_add_kernel.cl exactly -- do not reorder.
void ElementwiseAddCompute::Run() {
  auto& context = ctx_->As<OpenCLContext>();
  CHECK(context.cl_context() != nullptr);
  // Device-side buffers; Out is (re)allocated on the OpenCL target.
  auto* x_buf = ele_param_->X->template data<float, cl::Buffer>();
  auto* y_buf = ele_param_->Y->template data<float, cl::Buffer>();
  auto* out_buf = ele_param_->Out->template mutable_data<float, cl::Buffer>(
      TARGET(kOpenCL));
  // Kernel cache key: function name + build options (set in PrepareForRun).
  STL::stringstream kernel_key;
  kernel_key << kernel_func_name_ << build_options_;
  auto kernel = context.cl_context()->GetKernel(kernel_key.str());
  VLOG(4) << TargetToStr(ele_param_->X->target());
  VLOG(4) << TargetToStr(ele_param_->Y->target());
  VLOG(4) << TargetToStr(ele_param_->Out->target());
  // Bind arguments in kernel-signature order: x, y, out, batch, channels, num.
  int arg_idx = 0;
  cl_int status = kernel.setArg(arg_idx, *x_buf);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *y_buf);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, *out_buf);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, (const int)batch_);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, (const int)channels_);
  CL_CHECK_FATAL(status);
  status = kernel.setArg(++arg_idx, (const int)num_);
  CL_CHECK_FATAL(status);
  // One work-item per (channel, batch) pair; the kernel iterates over the
  // innermost `num_` elements itself.
  auto global_work_size = cl::NDRange{channels_, batch_};
  status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
      kernel,
      cl::NullRange,
      global_work_size,
      cl::NullRange,
      nullptr,
      event_.get());
  CL_CHECK_FATAL(status);
  // Register the completion event so downstream readers of out_buf can sync.
  context.cl_wait_list()->emplace(out_buf, event_);
}
void ElementwiseAddCompute::UpdateParams() {
auto axis = ele_param_->axis;
const auto& x_dims = ele_param_->X->dims();
const auto& y_dims = ele_param_->Y->dims();
const auto& out_dims = ele_param_->Out->dims();
if (axis < 0) {
axis = static_cast<int>(x_dims.size() - y_dims.size());
}
for (int i = 0; i < axis; ++i) {
batch_ *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels_ *= y_dims[i];
}
for (int i = static_cast<int>(y_dims.size() + axis); i < x_dims.size(); ++i) {
num_ *= x_dims[i];
}
VLOG(4) << "axis: " << axis;
VLOG(4) << "batch: " << batch_;
VLOG(4) << "channels: " << channels_;
VLOG(4) << "num: " << num_;
}
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
namespace ocl = paddle::lite::kernels::opencl;
// Register the buffer-based fp32 elementwise_add kernel for the OpenCL target
// (NCHW layout); X, Y, and Out all live in OpenCL device memory.
REGISTER_LITE_KERNEL(
    elementwise_add, kOpenCL, kFloat, kNCHW, ocl::ElementwiseAddCompute, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .Finalize();
...@@ -49,28 +49,6 @@ class ElementwiseAddCompute ...@@ -49,28 +49,6 @@ class ElementwiseAddCompute
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
class ElementwiseAddImageCompute
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ElementwiseParam;
void PrepareForRun() override;
void Run() override;
std::string doc() const override {
return "ElementwiseAdd using cl::Image2D, kFloat";
}
protected:
param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_add"};
std::string build_options_{" -DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl } // namespace opencl
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "lite/kernels/opencl/elementwise_add_compute.h" #include "lite/kernels/opencl/elementwise_add_image_compute.h"
#include <memory> #include <memory>
#include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
...@@ -23,80 +23,6 @@ namespace lite { ...@@ -23,80 +23,6 @@ namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
/* Buffer */
#if 0
void ElementwiseAddCompute::PrepareForRun() {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_);
ele_param_ = param_.get_mutable<param_t>();
UpdateParams();
}
void ElementwiseAddCompute::Run() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x_buf = ele_param_->X->template data<float, cl::Buffer>();
auto* y_buf = ele_param_->Y->template data<float, cl::Buffer>();
auto* out_buf = ele_param_->Out->template mutable_data<float, cl::Buffer>(
TARGET(kOpenCL));
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << TargetToStr(ele_param_->X->target());
VLOG(4) << TargetToStr(ele_param_->Y->target());
VLOG(4) << TargetToStr(ele_param_->Out->target());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)batch_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)channels_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, (const int)num_);
CL_CHECK_FATAL(status);
auto global_work_size = cl::NDRange{channels_, batch_};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_buf, event_);
}
void ElementwiseAddCompute::UpdateParams() {
auto axis = ele_param_->axis;
const auto& x_dims = ele_param_->X->dims();
const auto& y_dims = ele_param_->Y->dims();
const auto& out_dims = ele_param_->Out->dims();
if (axis < 0) {
axis = static_cast<int>(x_dims.size() - y_dims.size());
}
for (int i = 0; i < axis; ++i) {
batch_ *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels_ *= y_dims[i];
}
for (int i = static_cast<int>(y_dims.size() + axis); i < x_dims.size(); ++i) {
num_ *= x_dims[i];
}
VLOG(4) << "axis: " << axis;
VLOG(4) << "batch: " << batch_;
VLOG(4) << "channels: " << channels_;
VLOG(4) << "num: " << num_;
}
#endif
/* Image2D */
void ElementwiseAddImageCompute::PrepareForRun() { void ElementwiseAddImageCompute::PrepareForRun() {
ele_param_ = param_.get_mutable<param_t>(); ele_param_ = param_.get_mutable<param_t>();
auto* x = ele_param_->X; auto* x = ele_param_->X;
...@@ -152,10 +78,10 @@ void ElementwiseAddImageCompute::Run() { ...@@ -152,10 +78,10 @@ void ElementwiseAddImageCompute::Run() {
default_convertor.InitImageDimInfoWith(out->dims()); // w, h default_convertor.InitImageDimInfoWith(out->dims()); // w, h
auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims()); auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims());
auto* x_img = x->data<float, cl::Image2D>(); auto* x_img = x->data<uint16_t, cl::Image2D>();
auto* y_img = y->data<float, cl::Image2D>(); auto* y_img = y->data<uint16_t, cl::Image2D>();
auto* out_img = auto* out_img = out->mutable_data<uint16_t, cl::Image2D>(out_img_shape[0],
out->mutable_data<float, cl::Image2D>(out_img_shape[0], out_img_shape[1]); out_img_shape[1]);
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height; VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1]; VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
...@@ -220,14 +146,7 @@ void ElementwiseAddImageCompute::Run() { ...@@ -220,14 +146,7 @@ void ElementwiseAddImageCompute::Run() {
namespace ocl = paddle::lite::kernels::opencl; namespace ocl = paddle::lite::kernels::opencl;
// REGISTER_LITE_KERNEL( // TODO(ysh329): May need fix.
// elementwise_add, kOpenCL, kFloat, kNCHW, ocl::ElementwiseAddCompute, def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindInput("Y", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
// TODO(ysh329): Not fix.
// "Y" may from constant value like conv bias (kARM, need do cl_image_converter // "Y" may from constant value like conv bias (kARM, need do cl_image_converter
// on CPU); // on CPU);
// may from anther branch like "X" (kOpenCL, nothing to do). // may from anther branch like "X" (kOpenCL, nothing to do).
...@@ -235,20 +154,20 @@ namespace ocl = paddle::lite::kernels::opencl; ...@@ -235,20 +154,20 @@ namespace ocl = paddle::lite::kernels::opencl;
// set target of "Y" as kOpenCL temporarily. // set target of "Y" as kOpenCL temporarily.
REGISTER_LITE_KERNEL(elementwise_add, REGISTER_LITE_KERNEL(elementwise_add,
kOpenCL, kOpenCL,
kFloat, kFP16,
kImageDefault, kImageDefault,
ocl::ElementwiseAddImageCompute, ocl::ElementwiseAddImageCompute,
def) def)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindInput("Y", .BindInput("Y",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
...@@ -25,25 +24,25 @@ namespace lite { ...@@ -25,25 +24,25 @@ namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
class ElementwiseMulFloatImageCompute class ElementwiseAddImageCompute
: public KernelLite<TARGET(kOpenCL), : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ElementwiseParam; using param_t = operators::ElementwiseParam;
std::string doc() const override {
return "ElementwiseMul using cl::Image2D(ImageDefault/RGBA), kFP32";
}
void PrepareForRun() override; void PrepareForRun() override;
void Run() override; void Run() override;
std::string doc() const override {
return "ElementwiseAdd using cl::Image2D, kFP16";
}
protected: protected:
param_t* ele_param_{nullptr}; param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_mul"}; std::string kernel_func_name_{"elementwise_add"};
std::string build_options_{"-DCL_DTYPE_float"}; std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <algorithm>
#include <cmath>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
// Fill x[0..length): with ascending indices when set_value is -1 (the
// default), otherwise with the constant set_value.
template <typename dtype>
void fill_data(dtype *x, const int length, int set_value = -1) {
  // Use a signed index to match `length` and avoid the signed/unsigned
  // comparison the previous size_t loop variable produced.
  if (set_value == -1) {
    for (int idx = 0; idx < length; ++idx) {
      x[idx] = idx;
    }
  } else {
    for (int idx = 0; idx < length; ++idx) {
      x[idx] = set_value;
    }
  }
}
template <typename dtype>
void elementwise_compute_ref(const dtype *x_data,
const dtype *y_data,
dtype *out_data,
const DDim &x_dims,
const DDim &y_dims,
int axis,
const std::string elt_type,
bool use_relu = false) {
if (axis < 0) {
axis = x_dims.size() - y_dims.size();
}
int batch = 1;
int channels = 1;
int num = 1;
for (int i = 0; i < axis; ++i) {
batch *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels *= y_dims[i];
}
for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
num *= x_dims[i];
}
VLOG(4) << "axis:" << axis;
VLOG(4) << "batch:" << batch;
VLOG(4) << "cahnnels:" << channels;
VLOG(4) << "num:" << num;
// do elementwise add/sub/max/...
if (elt_type == "add" && axis == 1 && y_dims.size() == 1) {
for (int i = 0; i < x_dims.production(); ++i) {
auto w = i % y_dims.production();
out_data[i] = x_data[i] + y_data[w];
}
} else if (elt_type == "add") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype *din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype *dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *din_ptr + diny_data;
if (use_relu) {
*dout_ptr = std::max(*dout_ptr, static_cast<dtype>(0));
}
dout_ptr++;
din_ptr++;
}
}
}
} else {
LOG(FATAL) << "unsupported Elementwise type: " << elt_type << std::endl;
}
}
// #define PRINT_RESULT
// image
// End-to-end check of the FP16 image2d elementwise_add kernel family:
// build fp16 RGBA images on the host, launch the kernel, download and
// compare against a float CPU reference.
TEST(elementwise_add_image, compute) {
  LOG(INFO) << "main steps of test: host -> layout(buf2img on cpu) -> "
               "elementwise_add(img) -> "
               "layout(img2buf on cpu) "
               "-> host";

  // elementwise_add's 3 kernels selection routing strategy:
  // --------------------------------------------------------
  // 1. elementwise_add: Need y_dim.size() == 4
  // 2. elementwise_add (used by fuse_elementwise_activation op):
  //    Need y_dim.size() == 4 && act_type == "relu"
  // 3. width_add: Need y_dim.size() == 1 && x_dim.size() == 4 && axis == 3
  // 4. channel_add: Need y_dim.size() == 1 && x_dim.size() == 4 && axis == 1

  // dims
  const int n = 1;
  const int c = 3;
  const int h = 2;
  const int w = 2;

  const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
  auto out_dim = x_dim;

  // y_dim / axis / relu_flag: one entry per routing case above.
  std::vector<DDim> y_dim_v{DDim(std::vector<DDim::value_type>{n, c, h, w}),
                            DDim(std::vector<DDim::value_type>{n, c, h, w}),
                            DDim(std::vector<DDim::value_type>{w}),
                            DDim(std::vector<DDim::value_type>{w})};
  std::vector<int> axis_v{-1, -1, 3, 1};
  std::vector<bool> relu_flag_v{false, true, false, false};
  CHECK(y_dim_v.size() == axis_v.size() && axis_v.size() == relu_flag_v.size())
      << "y_dim_v.size() == axis_v.size() == relu_flag_v.size() should be "
         "same, and be corresponding "
         "one by one";

  // start loop
  for (size_t case_idx = 0; case_idx < y_dim_v.size(); ++case_idx) {
    auto y_dim = y_dim_v[case_idx];
    auto axis = axis_v[case_idx];
    auto relu_flag = relu_flag_v[case_idx];
    LOG(INFO) << "================== elementwise_add, case_idx:" << case_idx + 1
              << "/" << y_dim_v.size() << " ===================";
    LOG(INFO) << "x_dim:" << x_dim;
    LOG(INFO) << "y_dim:" << y_dim;
    LOG(INFO) << "out_dim:" << out_dim;
    LOG(INFO) << "axis:" << axis;
    LOG(INFO) << "relu_flag:" << relu_flag;

    // tensor
    VLOG(4) << "set tensors about op param";
    lite::Tensor eleadd_x, eleadd_y, eleadd_out;
    eleadd_x.Resize(x_dim);
    eleadd_y.Resize(y_dim);
    eleadd_out.Resize(out_dim);

    // initialize tensors
    VLOG(4) << "initialize tensors";
    paddle::lite::CLImageConverterDefault default_convertor;

    // x: ascending values converted to an fp16 (uint16_t) RGBA image.
    std::vector<float> x_v(x_dim.production());
    fill_data<float>(x_v.data(), x_v.size());  // fill with index value
    auto x_img_shape = default_convertor.InitImageDimInfoWith(x_dim);  // w, h
    auto x_img_w = x_img_shape[0];
    auto x_img_h = x_img_shape[1];
    std::vector<uint16_t> x_img_v(x_img_w * x_img_h * 4);  // 4: RGBA
    default_convertor.NCHWToImage(x_v.data(), x_img_v.data(), x_dim);
    eleadd_x.mutable_data<uint16_t, cl::Image2D>(
        x_img_w, x_img_h, x_img_v.data());

    // y
    std::vector<float> y_v(y_dim.production());
    fill_data<float>(y_v.data(), y_v.size());  // fill with index value
    auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dim);  // w, h
    auto y_img_w = y_img_shape[0];
    auto y_img_h = y_img_shape[1];
    std::vector<uint16_t> y_img_v(y_img_shape[0] * y_img_shape[1] *
                                  4);  // 4: RGBA
    default_convertor.NCHWToImage(y_v.data(), y_img_v.data(), y_dim);
    eleadd_y.mutable_data<uint16_t, cl::Image2D>(
        y_img_w, y_img_h, y_img_v.data());

    // out: zero-filled fp16 image plus a float buffer for the NCHW result.
    auto out_img_shape =
        default_convertor.InitImageDimInfoWith(out_dim);  // w, h
    auto out_img_w = out_img_shape[0];
    auto out_img_h = out_img_shape[1];
    eleadd_out.mutable_data<uint16_t, cl::Image2D>(out_img_w, out_img_h);
    std::vector<uint16_t> out_img_v(out_img_w * out_img_h * 4);
    fill_data<uint16_t>(
        out_img_v.data(), out_img_v.size(), 0);  // fill with zero value
    std::vector<float> out_v(out_dim.production());

    // operator param
    // NOTE(review): the ternary below converts
    // FusionElementwiseActivationParam down to its ElementwiseParam base
    // (slicing off act_type) -- presumably the image kernel applies relu
    // via build options rather than the param; confirm.
    operators::FusionElementwiseActivationParam
        fuseEleaddParam;  // enabled if relu_flag is true
    fuseEleaddParam.X = &eleadd_x;
    fuseEleaddParam.Y = &eleadd_y;
    fuseEleaddParam.Out = &eleadd_out;
    fuseEleaddParam.axis = axis;
    fuseEleaddParam.act_type = relu_flag ? "relu" : "";
    operators::ElementwiseParam eleaddParam;
    eleaddParam.X = &eleadd_x;
    eleaddParam.Y = &eleadd_y;
    eleaddParam.Out = &eleadd_out;
    eleaddParam.axis = axis;

    auto op_param = relu_flag ? fuseEleaddParam : eleaddParam;

    // set kernel
    auto eleadd_img_kernels =
        KernelRegistry::Global().Create("elementwise_add",
                                        TARGET(kOpenCL),
                                        PRECISION(kFP16),
                                        DATALAYOUT(kImageDefault));
    ASSERT_FALSE(eleadd_img_kernels.empty());
    auto eleadd_img_kernel = std::move(eleadd_img_kernels.front());
    VLOG(4) << "get eleadd kernel: " << eleadd_img_kernel->doc();

    // set context and kernel args
    VLOG(4) << "set context and kernel args";
    std::unique_ptr<KernelContext> context(new KernelContext);
    context->As<OpenCLContext>().InitOnce();
    eleadd_img_kernel->SetParam(op_param);
    std::unique_ptr<KernelContext> eleadd_img_context(new KernelContext);
    context->As<OpenCLContext>().CopySharedTo(
        &(eleadd_img_context->As<OpenCLContext>()));
    eleadd_img_kernel->SetContext(std::move(eleadd_img_context));

    // run kernel
    VLOG(4) << "run kernel";
    eleadd_img_kernel->Launch();

    // download gpu result to cpu
    const size_t cl_image2d_row_pitch{0};
    const size_t cl_image2d_slice_pitch{0};
    TargetWrapperCL::ImgcpySync(out_img_v.data(),
                                eleadd_out.data<uint16_t, cl::Image2D>(),
                                out_img_w,
                                out_img_h,
                                cl_image2d_row_pitch,
                                cl_image2d_slice_pitch,
                                IoDirection::DtoH);
    default_convertor.ImageToNCHW(
        out_img_v.data(), out_v.data(), out_img_shape, out_dim);

    // compute cpu reference
    std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
    elementwise_compute_ref<float>(x_v.data(),
                                   y_v.data(),
                                   out_ref.get(),
                                   x_dim,
                                   y_dim,
                                   op_param.axis,
                                   "add",
                                   relu_flag);

#ifdef PRINT_RESULT  // enable to check value of x and y
    for (int eidx = 0; eidx < out_dim.production(); eidx++) {
      auto value = out_v[eidx];
      auto ref_value = out_ref.get()[eidx];
      LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx << " / "
                << out_dim.production() << ", x_v[" << eidx << "]:" << x_v[eidx]
                << ", value[" << eidx << "]:" << value << ", ref_value[" << eidx
                << "]:" << ref_value;
    }
    for (int i = 0; i < y_v.size(); i++) {
      LOG(INFO) << "y_v[" << i << "]:" << y_v[i];
    }
#endif

    // Compare. std::abs keeps the diff in float arithmetic: the previous
    // unqualified abs() could bind to the int overload and truncate any
    // sub-1.0 mismatch to zero, silencing the diagnostic below.
    for (int eidx = 0; eidx < out_dim.production(); eidx++) {
      auto value = out_v[eidx];
      auto ref_value = out_ref.get()[eidx];
      EXPECT_NEAR(value, ref_value, 1e-6);
      if (std::abs(value - ref_value) > 1e-6) {
        LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx << " / "
                  << out_dim.production() << ", value[" << eidx << "]:" << value
                  << ", ref_value[" << eidx << "]:" << ref_value;
        break;
      }
    }
  }
}
} // namespace lite
} // namespace paddle
// Pull in the FP16 image kernels exercised by this test.
USE_LITE_KERNEL(elementwise_add, kOpenCL, kFP16, kImageDefault, def);
USE_LITE_KERNEL(
    fusion_elementwise_add_activation, kOpenCL, kFP16, kImageDefault, def);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/logging.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// FP16 elementwise multiply on cl::Image2D (ImageDefault/RGBA layout).
// PrepareForRun() routes to a specialized OpenCL kernel according to how
// Y broadcasts against X; Run() binds the images and launches one
// work-item per pixel of the X-sized image.
class ElementwiseMulImageCompute
    : public KernelLite<TARGET(kOpenCL),
                        PRECISION(kFP16),
                        DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ElementwiseParam;

  std::string doc() const override {
    // Fixed: the kernel is built with -DCL_DTYPE_half / PRECISION(kFP16);
    // the old string wrongly claimed kFP32.
    return "ElementwiseMul using cl::Image2D(ImageDefault/RGBA), kFP16";
  }

  void PrepareForRun() override {
    ele_param_ = param_.get_mutable<param_t>();
    auto* y = ele_param_->Y;
    auto* x = ele_param_->X;
    auto y_dims = y->dims();
    auto x_dims = x->dims();
    // Kernel routing by Y's shape:
    //   same shape as X          -> elementwise_mul
    //   1-D Y                    -> channel_mul_d1
    //   2-D Y matching X's N,C   -> channel_mul_d2_nc
    //   2-D Y otherwise (H,W)    -> channel_mul_d2_hw
    //   4-D Y                    -> channel_mul_d4
    if (y_dims == x_dims) {
      kernel_func_name_ = "elementwise_mul";
    } else if (y_dims.size() == 1) {
      kernel_func_name_ = "channel_mul_d1";
    } else if (y_dims.size() == 2) {
      if (x_dims[0] == y_dims[0] && x_dims[1] == y_dims[1]) {
        kernel_func_name_ = "channel_mul_d2_nc";
      } else {
        kernel_func_name_ = "channel_mul_d2_hw";
      }
    } else if (y_dims.size() == 4) {
      kernel_func_name_ = "channel_mul_d4";
    } else {
      LOG(FATAL) << "ElementwiseMul not supported y_dims.size():"
                 << y_dims.size()
                 << ", x_dims.size():" << ele_param_->X->dims().size();
    }
    VLOG(4) << "kernel_func_name_:" << kernel_func_name_;
    VLOG(4) << "y_dims:" << y_dims;
    VLOG(4) << "y_dims.size():" << y_dims.size();

    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/elementwise_mul_kernel.cl", build_options_);
  }

  void Run() override {
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);

    auto* x = ele_param_->X;
    auto* y = ele_param_->Y;
    auto* out = ele_param_->Out;

    VLOG(4) << "x->target():" << TargetToStr(x->target());
    VLOG(4) << "y->target():" << TargetToStr(y->target());
    VLOG(4) << "out->target():" << TargetToStr(out->target());
    VLOG(4) << "x->dims():" << x->dims();
    VLOG(4) << "y->dims():" << y->dims();
    VLOG(4) << "out->dims():" << out->dims();

    paddle::lite::CLImageConverterDefault default_convertor;
    auto x_img_shape =
        default_convertor.InitImageDimInfoWith(x->dims());  // w, h
    auto x_img_width = x_img_shape[0];
    auto x_img_height = x_img_shape[1];
    auto out_img_shape =
        default_convertor.InitImageDimInfoWith(out->dims());  // w, h
    auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims());

    // FP16 kernels store image texels as half; host side views them as
    // uint16_t.
    auto* x_img = x->data<uint16_t, cl::Image2D>();
    auto* y_img = y->data<uint16_t, cl::Image2D>();
    auto* out_img = out->mutable_data<uint16_t, cl::Image2D>(out_img_shape[0],
                                                             out_img_shape[1]);

    VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
    VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
    VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
            << out_img_shape[1];

    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    int arg_idx = 0;
    auto y_dims = y->dims();
    auto x_dims = x->dims();
    if (y_dims == x_dims) {
      // kernel: elementwise_mul(channel_mul_d4)
      cl_int status = kernel.setArg(arg_idx, *x_img);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *y_img);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_img);
      CL_CHECK_FATAL(status);
    } else if (y_dims.size() == 1 || y_dims.size() == 4) {
      auto tensor_w = x_dims[x_dims.size() - 1];
      VLOG(4) << "tensor_w:" << tensor_w;
      // kernel: channel_mul_d1 / channel_mul_d4
      cl_int status = kernel.setArg(arg_idx, *x_img);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *y_img);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, *out_img);
      CL_CHECK_FATAL(status);
      status = kernel.setArg(++arg_idx, static_cast<const int>(tensor_w));
      CL_CHECK_FATAL(status);
    } else if (y_dims.size() == 2) {
      if (x_dims[0] == y_dims[0] && x_dims[1] == y_dims[1]) {
        auto tensor_w = x_dims[x_dims.size() - 1];
        VLOG(4) << "tensor_w:" << tensor_w;
        // kernel: channel_mul_d2_nc
        cl_int status = kernel.setArg(arg_idx, *x_img);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, *y_img);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, *out_img);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, static_cast<const int>(tensor_w));
        CL_CHECK_FATAL(status);
      } else {
        auto y_tensor_h = y->dims()[0];
        auto y_tensor_w = y->dims()[1];
        VLOG(4) << "y_tensor_w:" << y_tensor_w << " y_tensor_h:" << y_tensor_h;
        // kernel: channel_mul_d2_hw
        cl_int status = kernel.setArg(arg_idx, *x_img);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, *y_img);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, *out_img);
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, static_cast<const int>(y_tensor_w));
        CL_CHECK_FATAL(status);
        status = kernel.setArg(++arg_idx, static_cast<const int>(y_tensor_h));
        CL_CHECK_FATAL(status);
      }
    } else {
      LOG(FATAL) << "ElementwiseMul not supported y_dims.size():"
                 << y_dims.size();
    }

    // One work-item per pixel of the X-sized image.
    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(x_img_width),
                    static_cast<cl::size_type>(x_img_height)};
    auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // Asynchronous launch: the event is resolved when out_img is consumed.
    context.cl_wait_list()->emplace(out_img, event_);
    VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
  }

 protected:
  param_t* ele_param_{nullptr};
  // Overwritten in PrepareForRun() with the routed kernel name.
  std::string kernel_func_name_{"elementwise_mul"};
  std::string build_options_{"-DCL_DTYPE_half"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
namespace ocl = paddle::lite::kernels::opencl;
// Register the FP16 image kernel for elementwise_mul: X, Y and Out all
// live on OpenCL as kFP16 ImageDefault (RGBA) images.
REGISTER_LITE_KERNEL(elementwise_mul,
                     kOpenCL,
                     kFP16,
                     kImageDefault,
                     ocl::ElementwiseMulImageCompute,
                     def)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindInput("Y",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
...@@ -111,7 +111,7 @@ void elementwise_compute_ref(const dtype *x_data, ...@@ -111,7 +111,7 @@ void elementwise_compute_ref(const dtype *x_data,
} }
// #define PRINT_RESULT // #define PRINT_RESULT
TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) { TEST(elementwise_mul_image, compute) {
LOG(INFO) LOG(INFO)
<< "main steps of test: host -> layout(buf2img on cpu) -> elemul(img) -> " << "main steps of test: host -> layout(buf2img on cpu) -> elemul(img) -> "
"layout(img2buf on cpu) " "layout(img2buf on cpu) "
...@@ -151,9 +151,10 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) { ...@@ -151,9 +151,10 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
auto x_img_shape = default_convertor.InitImageDimInfoWith(x_dim); // w, h auto x_img_shape = default_convertor.InitImageDimInfoWith(x_dim); // w, h
auto x_img_w = x_img_shape[0]; auto x_img_w = x_img_shape[0];
auto x_img_h = x_img_shape[1]; auto x_img_h = x_img_shape[1];
std::vector<float> x_img_v(x_img_w * x_img_h * 4); // 4: RGBA std::vector<uint16_t> x_img_v(x_img_w * x_img_h * 4); // 4: RGBA
default_convertor.NCHWToImage(x_v.data(), x_img_v.data(), x_dim); default_convertor.NCHWToImage(x_v.data(), x_img_v.data(), x_dim);
elemul_x.mutable_data<float, cl::Image2D>(x_img_w, x_img_h, x_img_v.data()); elemul_x.mutable_data<uint16_t, cl::Image2D>(
x_img_w, x_img_h, x_img_v.data());
// y // y
std::vector<float> y_v(y_dim.production()); std::vector<float> y_v(y_dim.production());
...@@ -161,19 +162,21 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) { ...@@ -161,19 +162,21 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dim); // w, h auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dim); // w, h
auto y_img_w = y_img_shape[0]; auto y_img_w = y_img_shape[0];
auto y_img_h = y_img_shape[1]; auto y_img_h = y_img_shape[1];
std::vector<float> y_img_v(y_img_shape[0] * y_img_shape[1] * 4); // 4: RGBA std::vector<uint16_t> y_img_v(y_img_shape[0] * y_img_shape[1] *
4); // 4: RGBA
default_convertor.NCHWToImage(y_v.data(), y_img_v.data(), y_dim); default_convertor.NCHWToImage(y_v.data(), y_img_v.data(), y_dim);
elemul_y.mutable_data<float, cl::Image2D>(y_img_w, y_img_h, y_img_v.data()); elemul_y.mutable_data<uint16_t, cl::Image2D>(
y_img_w, y_img_h, y_img_v.data());
// out // out
auto out_img_shape = auto out_img_shape =
default_convertor.InitImageDimInfoWith(out_dim); // w, h default_convertor.InitImageDimInfoWith(out_dim); // w, h
auto out_img_w = out_img_shape[0]; auto out_img_w = out_img_shape[0];
auto out_img_h = out_img_shape[1]; auto out_img_h = out_img_shape[1];
elemul_out.mutable_data<float, cl::Image2D>(out_img_w, out_img_h); elemul_out.mutable_data<uint16_t, cl::Image2D>(out_img_w, out_img_h);
std::vector<float> out_img_v(out_img_w * out_img_h * 4); std::vector<uint16_t> out_img_v(out_img_w * out_img_h * 4);
fill_data<float>( fill_data<uint16_t>(
out_img_v.data(), out_img_v.size(), 0); // fill with zero value out_img_v.data(), out_img_v.size(), 0); // fill with zero value
std::vector<float> out_v(out_dim.production()); std::vector<float> out_v(out_dim.production());
...@@ -189,7 +192,7 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) { ...@@ -189,7 +192,7 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
auto elemul_img_kernels = auto elemul_img_kernels =
KernelRegistry::Global().Create("elementwise_mul", KernelRegistry::Global().Create("elementwise_mul",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(elemul_img_kernels.empty()); ASSERT_FALSE(elemul_img_kernels.empty());
...@@ -215,7 +218,7 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) { ...@@ -215,7 +218,7 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
const size_t cl_image2d_row_pitch{0}; const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0}; const size_t cl_image2d_slice_pitch{0};
TargetWrapperCL::ImgcpySync(out_img_v.data(), TargetWrapperCL::ImgcpySync(out_img_v.data(),
elemul_out.data<float, cl::Image2D>(), elemul_out.data<uint16_t, cl::Image2D>(),
out_img_w, out_img_w,
out_img_h, out_img_h,
cl_image2d_row_pitch, cl_image2d_row_pitch,
...@@ -266,4 +269,4 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) { ...@@ -266,4 +269,4 @@ TEST(elemul_image2d_fp32, compute_kernel_elemenwise_mul) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
USE_LITE_KERNEL(elementwise_mul, kOpenCL, kFloat, kImageDefault, def); USE_LITE_KERNEL(elementwise_mul, kOpenCL, kFP16, kImageDefault, def);
...@@ -66,8 +66,6 @@ void PrintData(std::string name, float* a, const int rows, const int cols) { ...@@ -66,8 +66,6 @@ void PrintData(std::string name, float* a, const int rows, const int cols) {
} }
} }
// buffer
#if 0 // fc_buffer
// #define PRINT_RESULT // #define PRINT_RESULT
#define LOOP_TEST #define LOOP_TEST
TEST(fc, compute) { TEST(fc, compute) {
...@@ -195,9 +193,8 @@ TEST(fc, compute) { ...@@ -195,9 +193,8 @@ TEST(fc, compute) {
} // m } // m
#endif #endif
} }
#endif // fc_buffer
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// USE_LITE_KERNEL(fc, kOpenCL, kFloat, kNCHW, def); USE_LITE_KERNEL(fc, kOpenCL, kFloat, kNCHW, def);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/elementwise_add_buffer_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Buffer-based fused elementwise_add + activation. Reuses the plain
// ElementwiseAddCompute and only switches the kernel build options so the
// OpenCL kernel applies relu in the same pass.
class FusionElementwiseAddActivationCompute : public ElementwiseAddCompute {
 public:
  using param_t = operators::FusionElementwiseActivationParam;

  // Validates the activation type, then compiles the buffer kernel with
  // -DRELU and derives the broadcast shape via UpdateParams().
  void PrepareForRun() override {
    ele_param_ = param_.get_mutable<param_t>();
    auto act_t = static_cast<param_t*>(ele_param_)->act_type;
    VLOG(4) << "act: " << act_t;
    // Fail fast *before* compiling the kernel (the original checked only
    // after AddKernel, wasting a kernel build on an unsupported fusion).
    if (act_t != "relu") {
      LOG(FATAL) << "Unsupported Activation type: " << act_t;
    }
    build_options_ += " -DRELU";
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_);
    UpdateParams();
  }
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
namespace ocl = paddle::lite::kernels::opencl;
// Register the buffer (kFloat/kNCHW) fused elementwise_add+activation
// kernel; X, Y and Out are plain OpenCL buffer tensors.
REGISTER_LITE_KERNEL(fusion_elementwise_add_activation,
                     kOpenCL,
                     kFloat,
                     kNCHW,
                     ocl::FusionElementwiseAddActivationCompute,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .Finalize();
...@@ -14,35 +14,13 @@ ...@@ -14,35 +14,13 @@
#include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/kernels/opencl/elementwise_add_compute.h" #include "lite/kernels/opencl/elementwise_add_image_compute.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
/* Buffer */
#if 0
class FusionElementwiseAddActivationCompute : public ElementwiseAddCompute {
public:
using param_t = operators::FusionElementwiseActivationParam;
void PrepareForRun() override {
build_options_ += " -DRELU";
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_);
ele_param_ = param_.get_mutable<param_t>();
UpdateParams();
auto act_t = static_cast<param_t*>(ele_param_)->act_type;
VLOG(4) << "act: " << act_t;
if (act_t != "relu") {
LOG(FATAL) << "Unsupported Activation type: " << act_t;
}
}
};
#endif
class FusionElementwiseAddActivationImageCompute class FusionElementwiseAddActivationImageCompute
: public ElementwiseAddImageCompute { : public ElementwiseAddImageCompute {
public: public:
...@@ -68,33 +46,23 @@ class FusionElementwiseAddActivationImageCompute ...@@ -68,33 +46,23 @@ class FusionElementwiseAddActivationImageCompute
} // namespace paddle } // namespace paddle
namespace ocl = paddle::lite::kernels::opencl; namespace ocl = paddle::lite::kernels::opencl;
// REGISTER_LITE_KERNEL(fusion_elementwise_add_activation,
// kOpenCL,
// kFloat,
// kNCHW,
// ocl::FusionElementwiseAddActivationCompute,
// def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindInput("Y", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
REGISTER_LITE_KERNEL(fusion_elementwise_add_activation, REGISTER_LITE_KERNEL(fusion_elementwise_add_activation,
kOpenCL, kOpenCL,
kFloat, kFP16,
kImageDefault, kImageDefault,
ocl::FusionElementwiseAddActivationImageCompute, ocl::FusionElementwiseAddActivationImageCompute,
def) def)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindInput("Y", .BindInput("Y",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
...@@ -47,7 +47,7 @@ class LayoutComputeBufferChwToImageDefault ...@@ -47,7 +47,7 @@ class LayoutComputeBufferChwToImageDefault
auto* x_data = param.x->data<float, cl::Buffer>(); auto* x_data = param.x->data<float, cl::Buffer>();
auto x_dims = param.x->dims(); auto x_dims = param.x->dims();
auto image_shape = InitImageDimInfoWith(x_dims); auto image_shape = InitImageDimInfoWith(x_dims);
auto* y_data = param.y->mutable_data<float, cl::Image2D>( auto* y_data = param.y->mutable_data<uint16_t, cl::Image2D>(
image_shape["width"], image_shape["height"]); image_shape["width"], image_shape["height"]);
auto y_dims = param.y->dims(); auto y_dims = param.y->dims();
...@@ -63,6 +63,8 @@ class LayoutComputeBufferChwToImageDefault ...@@ -63,6 +63,8 @@ class LayoutComputeBufferChwToImageDefault
const int Stride1 = out_H * out_W; const int Stride1 = out_H * out_W;
const int Stride0 = out_W; const int Stride0 = out_W;
VLOG(4) << "y image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " " VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3]; << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " " VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
...@@ -121,12 +123,12 @@ class LayoutComputeBufferChwToImageDefault ...@@ -121,12 +123,12 @@ class LayoutComputeBufferChwToImageDefault
std::string doc() const override { std::string doc() const override {
return "Trans Layout from cl::Buffer(NCHW) to " return "Trans Layout from cl::Buffer(NCHW) to "
"cl::Image2D(ImageDefault/RGBA)"; "cl::Image2D(ImageDefault/RGBA), Float ---> FP16";
} }
private: private:
std::string kernel_func_name_{"buffer_to_image2d"}; std::string kernel_func_name_{"buffer_to_image2d"};
std::string build_options_{"-DCL_DTYPE_float "}; std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
...@@ -144,16 +146,19 @@ class LayoutComputeImageDefaultToBufferChw ...@@ -144,16 +146,19 @@ class LayoutComputeImageDefaultToBufferChw
void Run() override { void Run() override {
auto& param = Param<param_t>(); auto& param = Param<param_t>();
auto* x_data = param.x->data<uint16_t, cl::Image2D>();
auto x_dims = param.x->dims();
auto* y_data = param.y->mutable_data<float, cl::Buffer>(TARGET(kOpenCL)); auto* y_data = param.y->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto y_dims = param.y->dims(); auto y_dims = param.y->dims();
auto* x_data = param.x->data<float, cl::Image2D>(); auto x_image_shape = InitImageDimInfoWith(x_dims);
auto x_dims = param.x->dims();
std::vector<size_t> new_dims = {1, 1, 1, 1}; std::vector<size_t> new_dims = {1, 1, 1, 1};
for (int j = 0; j < x_dims.size(); ++j) { for (int j = 0; j < x_dims.size(); ++j) {
new_dims[4 - x_dims.size() + j] = x_dims[j]; new_dims[4 - x_dims.size() + j] = x_dims[j];
} }
VLOG(4) << "x_image_shape(w,h):" << x_image_shape["width"] << " "
<< x_image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " " VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3]; << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " " VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
...@@ -212,7 +217,7 @@ class LayoutComputeImageDefaultToBufferChw ...@@ -212,7 +217,7 @@ class LayoutComputeImageDefaultToBufferChw
std::string doc() const override { std::string doc() const override {
return "Trans Layout from cl::Image2D(ImageDefault/RGBA) to " return "Trans Layout from cl::Image2D(ImageDefault/RGBA) to "
"cl::Buffer(NCHW)"; "cl::Buffer(NCHW), FP16 ---> Float";
} }
private: private:
...@@ -340,23 +345,6 @@ REGISTER_LITE_KERNEL( ...@@ -340,23 +345,6 @@ REGISTER_LITE_KERNEL(
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(
layout_once,
kOpenCL,
kAny,
kImageDefault,
paddle::lite::kernels::opencl::LayoutComputeBufferChwToImageDefault,
NCHW_to_ImageDefault)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault))})
.Finalize();
// [ImageDefault] -> [NCHW] // [ImageDefault] -> [NCHW]
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
layout, layout,
...@@ -374,38 +362,3 @@ REGISTER_LITE_KERNEL( ...@@ -374,38 +362,3 @@ REGISTER_LITE_KERNEL(
PRECISION(kAny), PRECISION(kAny),
DATALAYOUT(kNCHW))}) DATALAYOUT(kNCHW))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(
layout_once,
kOpenCL,
kAny,
kNCHW,
paddle::lite::kernels::opencl::LayoutComputeImageDefaultToBufferChw,
ImageDefault_to_NCHW)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kNCHW))})
.Finalize();
// [NCHW] -> [ImageNW]
REGISTER_LITE_KERNEL(
layout_once,
kOpenCL,
kFloat,
kImageNW,
paddle::lite::kernels::opencl::LayoutComputeBufferChwToImage2DNw,
NCHW_to_ImageNW)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageNW))})
.Finalize();
...@@ -29,15 +29,15 @@ TEST(layout_ImageDefault, compute) { ...@@ -29,15 +29,15 @@ TEST(layout_ImageDefault, compute) {
"-> device"; "-> device";
#ifdef LOOP_TEST #ifdef LOOP_TEST
for (int n = 1; n <= 100; n += 21) { for (int n = 1; n <= 2; n += 1) {
for (auto c : {1, 3}) { for (auto c : {1, 3}) {
for (int h = 1; h <= 100; h += 13) { for (int h = 1; h <= 10; h += 1) {
for (int w = 1; w <= 100; w += 17) { for (int w = 1; w <= 10; w += 1) {
#else #else
const int n = 2; const int n = 1;
const int c = 9; const int c = 2;
const int h = 20; const int h = 3;
const int w = 5; const int w = 4;
#endif // LOOP_TEST #endif // LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " " LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
...@@ -79,14 +79,14 @@ TEST(layout_ImageDefault, compute) { ...@@ -79,14 +79,14 @@ TEST(layout_ImageDefault, compute) {
auto* y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL)); auto* y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto image_shape = auto image_shape =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim); paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);
auto* y_image_data = y_image.mutable_data<float, cl::Image2D>( auto* y_image_data = y_image.mutable_data<uint16_t, cl::Image2D>(
image_shape["width"], image_shape["height"]); image_shape["width"], image_shape["height"]);
auto* mapped_x = static_cast<float*>(TargetWrapperCL::Map( auto* mapped_x = static_cast<float*>(TargetWrapperCL::Map(
x_data, 0, sizeof(float) * x_dim.production())); x_data, 0, sizeof(float) * x_dim.production()));
auto* mapped_y = static_cast<float*>(TargetWrapperCL::Map( auto* mapped_y = static_cast<float*>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * x_dim.production())); y_data, 0, sizeof(float) * x_dim.production()));
for (int i = 0; i < x_dim.production(); ++i) { for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<float>(i); mapped_x[i] = static_cast<float>(i) * 2;
} }
// set context and kernel args // set context and kernel args
...@@ -116,15 +116,16 @@ TEST(layout_ImageDefault, compute) { ...@@ -116,15 +116,16 @@ TEST(layout_ImageDefault, compute) {
#ifdef PRINT_RESULT #ifdef PRINT_RESULT
LOG(INFO) << "---- print result ----"; LOG(INFO) << "---- print result ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) { for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx] std::cout << mapped_x[eidx] << " -> "
<< std::endl; << static_cast<float>(mapped_y[eidx]) << std::endl;
} }
#endif // PRINT_RESULT #endif // PRINT_RESULT
// check result: compare input and output // check result: compare input and output
float MAX_PASS_DIFF = 1e-4;
for (int eidx = 0; eidx < x_dim.production(); eidx++) { for (int eidx = 0; eidx < x_dim.production(); eidx++) {
EXPECT_NEAR(mapped_x[eidx], mapped_y[eidx], 1e-6); EXPECT_NEAR(mapped_x[eidx], mapped_y[eidx], MAX_PASS_DIFF);
if (abs(mapped_x[eidx] - mapped_y[eidx]) > 1e-6) { if (abs(mapped_x[eidx] - mapped_y[eidx]) > MAX_PASS_DIFF) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", mapped_x[" << eidx << " / " << x_dim.production() << ", mapped_x[" << eidx
<< "]:" << mapped_x[eidx] << ", mapped_y[" << eidx << "]:" << mapped_x[eidx] << ", mapped_y[" << eidx
...@@ -147,6 +148,7 @@ TEST(layout_ImageDefault, compute) { ...@@ -147,6 +148,7 @@ TEST(layout_ImageDefault, compute) {
#endif #endif
} }
#if 0
TEST(layout_ImageNW, compute) { TEST(layout_ImageNW, compute) {
#ifdef LOOP_TEST #ifdef LOOP_TEST
for (int n = 1; n <= 100; n += 21) { for (int n = 1; n <= 100; n += 21) {
...@@ -282,9 +284,11 @@ TEST(layout_ImageNW, compute) { ...@@ -282,9 +284,11 @@ TEST(layout_ImageNW, compute) {
// nothing to do. // nothing to do.
#endif #endif
} }
#endif
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault); USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW); USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(layout_once, kOpenCL, kFloat, kImageNW, NCHW_to_ImageNW); // USE_LITE_KERNEL(layout_once, kOpenCL, kFloat, kImageNW, NCHW_to_ImageNW);
...@@ -102,7 +102,7 @@ class MulCompute ...@@ -102,7 +102,7 @@ class MulCompute
private: private:
int m_, n_, k_; int m_, n_, k_;
std::string kernel_func_name_{"mat_mul"}; std::string kernel_func_name_{"mat_mul"};
std::string build_options_{"-DCL_DTYPE=float"}; std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -24,90 +24,7 @@ namespace lite { ...@@ -24,90 +24,7 @@ namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
class NearestInterpComputeFloatImageDefault class NearestInterpComputeImageDefault
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::InterpolateParam;
std::string doc() const override {
return "NearestInterp using cl::Image2D(ImageDefault/RGBA), kFloat";
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/nearest_interp_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
auto* x_buf = param.X->data<float, cl::Image2D>();
auto* out_buf =
param.Out->mutable_data<float, cl::Image2D>(param.out_w, param.out_h);
const auto& y_dims = param.Out->dims(); // useless: check dim only
float scale_h = y_dims[2] / x_dims[2];
float scale_w = y_dims[3] / x_dims[3];
int in_dims_h = x_dims[2];
int out_dims_h = y_dims[2];
int in_dims_w = x_dims[3];
int out_dims_w = y_dims[3];
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const float>(scale_h));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const float>(scale_w));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims_h));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims_h));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims_w));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims_w));
CL_CHECK_FATAL(status);
paddle::lite::CLImageConverterDefault default_convertor;
auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dims); // w, h
auto y_img_width = y_img_shape[0];
LOG(INFO) << "y_img_width:" << y_img_width;
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(y_img_width / y_dims[3]),
static_cast<cl::size_type>(y_dims[3]),
static_cast<cl::size_type>(y_dims[0] * y_dims[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
}
private:
std::string kernel_func_name_{"nearest_interp"};
std::string build_options_{"-DCL_DTYPE_float "};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class NearestInterpComputeFP16ImageDefault
: public KernelLite<TARGET(kOpenCL), : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
...@@ -128,11 +45,11 @@ class NearestInterpComputeFP16ImageDefault ...@@ -128,11 +45,11 @@ class NearestInterpComputeFP16ImageDefault
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims(); const auto& x_dims = param.X->dims();
auto* x_buf = auto* x_buf =
param.X->data<int16_t, param.X->data<uint16_t,
cl::Image2D>(); // use int16_t represents half float cl::Image2D>(); // use uint16_t represents half float
auto image_shape = InitImageDimInfoWith(x_dims); auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf = auto* out_buf =
param.Out->mutable_data<int16_t, cl::Image2D>( // use int16_t param.Out->mutable_data<uint16_t, cl::Image2D>( // use uint16_t
// represents half float // represents half float
image_shape["width"], image_shape["width"],
image_shape["height"]); image_shape["height"]);
...@@ -204,29 +121,12 @@ class NearestInterpComputeFP16ImageDefault ...@@ -204,29 +121,12 @@ class NearestInterpComputeFP16ImageDefault
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_LITE_KERNEL(
nearest_interp,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::NearestInterpComputeFloatImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
nearest_interp, nearest_interp,
kOpenCL, kOpenCL,
kFP16, kFP16,
kImageDefault, kImageDefault,
paddle::lite::kernels::opencl::NearestInterpComputeFP16ImageDefault, paddle::lite::kernels::opencl::NearestInterpComputeImageDefault,
ImageDefault) ImageDefault)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
......
...@@ -60,7 +60,7 @@ void nearest_interp_compute_ref(const dtype *src, ...@@ -60,7 +60,7 @@ void nearest_interp_compute_ref(const dtype *src,
} }
// #define LOOP_TEST // #define LOOP_TEST
// #define PRINT_RESULT // #define PRINT_RESULT
TEST(nearest_interp_image2d_fp32, compute) { TEST(nearest_interp_image2d, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> " LOG(INFO) << "main steps of test: host -> layout(buf2img) -> "
"nearest_interp(img) -> " "nearest_interp(img) -> "
"layout(img2buf) " "layout(img2buf) "
...@@ -105,7 +105,7 @@ TEST(nearest_interp_image2d_fp32, compute) { ...@@ -105,7 +105,7 @@ TEST(nearest_interp_image2d_fp32, compute) {
auto nearest_interp_img_kernels = auto nearest_interp_img_kernels =
KernelRegistry::Global().Create("nearest_interp", KernelRegistry::Global().Create("nearest_interp",
TARGET(kOpenCL), TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty()); ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels.empty()); ASSERT_FALSE(buf_to_img_kernels.empty());
...@@ -166,12 +166,12 @@ TEST(nearest_interp_image2d_fp32, compute) { ...@@ -166,12 +166,12 @@ TEST(nearest_interp_image2d_fp32, compute) {
mapped_y[i] = static_cast<int>(0); mapped_y[i] = static_cast<int>(0);
} }
auto *nearest_interp_in_data = auto *nearest_interp_in_data =
nearest_interp_in.mutable_data<float, cl::Image2D>( nearest_interp_in.mutable_data<uint16_t, cl::Image2D>(
nearest_interp_image2d_shape["width"], nearest_interp_image2d_shape["width"],
nearest_interp_image2d_shape["height"]); nearest_interp_image2d_shape["height"]);
auto *nearest_interp_out_data = auto *nearest_interp_out_data =
nearest_interp_out.mutable_data<float, cl::Image2D>(y_dim[3], nearest_interp_out.mutable_data<uint16_t, cl::Image2D>(
y_dim[2]); y_dim[3], y_dim[2]);
// set context and kernel args // set context and kernel args
LOG(INFO) << "set context and kernel args"; LOG(INFO) << "set context and kernel args";
...@@ -273,13 +273,9 @@ TEST(nearest_interp_image2d_fp32, compute) { ...@@ -273,13 +273,9 @@ TEST(nearest_interp_image2d_fp32, compute) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// nearest_interp buffer
// USE_LITE_KERNEL(nearest_interp, kOpenCL, kFloat, kNCHW, def);
// nearest_interp image2d fp32 // nearest_interp image2d fp32
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault); USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW); USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(nearest_interp, kOpenCL, kFloat, kImageDefault, ImageDefault);
// nearest_interp image2d fp16 // nearest_interp image2d fp16
USE_LITE_KERNEL(nearest_interp, kOpenCL, kFP16, kImageDefault, ImageDefault); USE_LITE_KERNEL(nearest_interp, kOpenCL, kFP16, kImageDefault, ImageDefault);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Pooling (max/avg) OpenCL kernel operating on cl::Buffer tensors laid out
// as NCHW float32. The GPU kernel function is selected at PrepareForRun time
// by appending the pooling type ("max"/"avg") to the "pool_" prefix.
class PoolCompute
    : public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
 public:
  using param_t = operators::PoolParam;

  std::string doc() const override { return "Pool using cl::Buffer, kFloat"; }

  // Compiles/loads the OpenCL program once. kernel_func_name_ becomes e.g.
  // "pool_avg" or "pool_max" depending on the op attribute.
  void PrepareForRun() override {
    const auto& param = *param_.get_mutable<param_t>();
    kernel_func_name_ += param.pooling_type;
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "buffer/pool_kernel.cl", build_options_);
  }

  // Sets kernel arguments from PoolParam and enqueues one work-item per
  // output element. Completion is tracked via event_ in the cl_wait_list.
  void Run() override {
    const auto& param = *param_.get_mutable<param_t>();
    const auto& in_dims = param.x->dims();
    const auto& out_dims = param.output->dims();
    const std::string pooling_type = param.pooling_type;
    const bool global_pooling = param.global_pooling;
    std::vector<int> paddings = *param.paddings;
    std::vector<int> strides = param.strides;
    std::vector<int> ksize = param.ksize;
    // Global pooling covers the whole spatial extent: zero the paddings and
    // widen the kernel window to the full input H/W (dims 2 and 3 of NCHW).
    if (global_pooling) {
      for (size_t i = 0; i < ksize.size(); ++i) {
        paddings[2 * i] = 0;
        paddings[2 * i + 1] = 0;
        ksize[i] = static_cast<int>(in_dims[i + 2]);
      }
    }
    // The .cl kernel only receives one pad value per axis, so asymmetric
    // padding is unsupported here.
    bool pads_equal =
        (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]);
    if (!pads_equal) {
      LOG(FATAL)
          << "padding requires pad_left == pad_right, pad_top == pad_bottom";
    }
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    auto* input_buf = param.x->data<float, cl::Buffer>();
    auto* output_buf =
        param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
    cl_int status;
    auto numel = out_dims.production();
    // NOTE: the setArg order below must match the parameter order of the
    // pool_* function in buffer/pool_kernel.cl exactly — do not reorder.
    int arg_idx = 0;
    status = kernel.setArg(arg_idx, static_cast<const int>(numel));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *input_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims[1]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims[2]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims[3]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims[2]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims[3]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(ksize[0]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(ksize[1]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(strides[0]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(strides[1]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[0]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[2]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *output_buf);
    CL_CHECK_FATAL(status);
    // One work-item per output element; local size left to the runtime.
    auto global_work_size = cl::NDRange(static_cast<size_t>(numel));
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // Consumers of output_buf look the event up here and wait on it.
    context.cl_wait_list()->emplace(output_buf, event_);
  }

 private:
  std::string kernel_func_name_{"pool_"};  // completed in PrepareForRun
  std::string build_options_{"-DCL_DTYPE_float"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Register the buffer-based float32 pool2d kernel under the "def" alias so
// the framework can pick it for kOpenCL/kFloat/kNCHW tensors.
REGISTER_LITE_KERNEL(pool2d,
                     kOpenCL,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::opencl::PoolCompute,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <memory>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
// CPU reference implementation of average pooling over NCHW float data,
// used to validate the OpenCL kernel. Window cells that fall outside the
// input are excluded from the average (exclusive counting); an empty
// window produces 0.
void pool_avg(const int padding_height,
              const int padding_width,
              const int stride_height,
              const int stride_width,
              const int ksize_height,
              const int ksize_width,
              const float* input_data,
              const DDim& in_dim,
              float* output_data,
              const DDim& out_dim) {
  const int num = in_dim[0];
  const int in_h = in_dim[2];
  const int in_w = in_dim[3];
  const int out_c = out_dim[1];
  const int out_h = out_dim[2];
  const int out_w = out_dim[3];

  const size_t in_plane = in_h * in_w;
  const size_t out_plane = out_h * out_w;

  for (int n = 0; n < num; ++n) {
    for (int oc = 0; oc < out_c; ++oc) {
      // Flattened (batch, channel) index selects one spatial plane.
      const int plane_idx = n * out_c + oc;
      const float* src = input_data + plane_idx * in_plane;
      float* dst = output_data + plane_idx * out_plane;
      for (int oh = 0; oh < out_h; ++oh) {
        const int h_begin_raw = oh * stride_height - padding_height;
        const int h_end = std::min(h_begin_raw + ksize_height, in_h);
        const int h_begin = std::max(h_begin_raw, 0);
        for (int ow = 0; ow < out_w; ++ow) {
          const int w_begin_raw = ow * stride_width - padding_width;
          const int w_end = std::min(w_begin_raw + ksize_width, in_w);
          const int w_begin = std::max(w_begin_raw, 0);
          float sum = 0.f;
          int cnt = 0;
          for (int ih = h_begin; ih < h_end; ++ih) {
            for (int iw = w_begin; iw < w_end; ++iw) {
              sum += src[ih * in_w + iw];
              ++cnt;
            }
          }
          // Multiply by the reciprocal (matches the GPU kernel's math).
          dst[oh * out_w + ow] = (cnt > 0) ? sum * (1.f / cnt) : 0.f;
        }
      }
    }
  }
}
// End-to-end check of the buffer-based pool2d OpenCL kernel: runs a global
// 7x7 average pool on random data and compares every output element against
// the CPU reference pool_avg().
TEST(pool2d_buffer_fp32, compute) {
  LOG(INFO) << "to get kernel ...";
  auto kernels = KernelRegistry::Global().Create(
      "pool2d", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
  ASSERT_FALSE(kernels.empty());

  auto kernel = std::move(kernels.front());
  LOG(INFO) << "get kernel:" << kernel->doc();

  // Global average pooling: ksize 7x7 matches the 7x7 spatial input below.
  lite::Tensor x, out;
  operators::PoolParam param;
  param.x = &x;
  param.output = &out;
  param.global_pooling = true;
  param.pooling_type = "avg";
  std::vector<int> paddings = {0, 0, 0, 0};
  param.strides = std::vector<int>{1, 1};
  param.ksize = std::vector<int>{7, 7};
  param.paddings = std::make_shared<std::vector<int>>(paddings);

  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();

  kernel->SetParam(param);
  // The kernel gets its own context sharing the same OpenCL resources.
  std::unique_ptr<KernelContext> pool_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(pool_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(pool_context));

  const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 1024, 7, 7});
  const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 1024, 1, 1});
  x.Resize(in_dim);
  out.Resize(out_dim);

  // Fill the device input buffer with uniform random values via Map/Unmap.
  auto* x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-5, 5);
  auto* mapped_x = static_cast<float*>(
      TargetWrapperCL::Map(x_data, 0, sizeof(float) * in_dim.production()));
  for (int i = 0; i < in_dim.production(); i++) {
    mapped_x[i] = dist(engine);
  }

  kernel->Launch();

  // Block until the kernel's completion event (registered under the output
  // buffer pointer in the wait list) has fired.
  auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto* out_ptr = param.output->data<float, cl::Buffer>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
    auto& event = *(it->second);
    event.wait();
  } else {
    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
  }

  // CPU reference with the same effective attributes (no padding, stride 1).
  std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
  pool_avg(0, 0, 1, 1, 7, 7, mapped_x, in_dim, out_ref.get(), out_dim);
  TargetWrapperCL::Unmap(x_data, mapped_x);
  auto* out_data = out.mutable_data<float, cl::Buffer>();
  auto* mapped_out = static_cast<float*>(
      TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production()));
  for (int i = 0; i < out_dim.production(); i++) {
    EXPECT_NEAR(mapped_out[i], out_ref[i], 1e-6);
  }
  TargetWrapperCL::Unmap(out_data, mapped_out);
}
} // namespace lite
} // namespace paddle
// Pull in the registered pool2d buffer kernel so the linker keeps it.
USE_LITE_KERNEL(pool2d, kOpenCL, kFloat, kNCHW, def);
...@@ -26,107 +26,13 @@ namespace lite { ...@@ -26,107 +26,13 @@ namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
class PoolCompute
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
public:
using param_t = operators::PoolParam;
std::string doc() const override { return "Pool using cl::Buffer, kFloat"; }
void PrepareForRun() override {
const auto& param = *param_.get_mutable<param_t>();
kernel_func_name_ += param.pooling_type;
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/pool_kernel.cl", build_options_);
}
void Run() override {
const auto& param = *param_.get_mutable<param_t>();
const auto& in_dims = param.x->dims();
const auto& out_dims = param.output->dims();
const std::string pooling_type = param.pooling_type;
const bool global_pooling = param.global_pooling;
std::vector<int> paddings = *param.paddings;
std::vector<int> strides = param.strides;
std::vector<int> ksize = param.ksize;
if (global_pooling) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[2 * i] = 0;
paddings[2 * i + 1] = 0;
ksize[i] = static_cast<int>(in_dims[i + 2]);
}
}
bool pads_equal =
(paddings[0] == paddings[1]) && (paddings[2] == paddings[3]);
if (!pads_equal) {
LOG(FATAL)
<< "padding requires pad_left == pad_right, pad_top == pad_bottom";
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* input_buf = param.x->data<float, cl::Buffer>();
auto* output_buf =
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
cl_int status;
auto numel = out_dims.production();
int arg_idx = 0;
status = kernel.setArg(arg_idx, static_cast<const int>(numel));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims[1]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims[2]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims[3]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims[2]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims[3]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(ksize[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(ksize[1]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(strides[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(strides[1]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[0]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[2]));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *output_buf);
CL_CHECK_FATAL(status);
auto global_work_size = cl::NDRange(static_cast<size_t>(numel));
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(output_buf, event_);
}
private:
std::string kernel_func_name_{"pool_"};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL), class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::PoolParam; using param_t = operators::PoolParam;
std::string doc() const override { return "Pool using cl::Image2D, kFloat"; } std::string doc() const override { return "Pool using cl::Image2D, kFP16"; }
void PrepareForRun() override { void PrepareForRun() override {
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -161,13 +67,13 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -161,13 +67,13 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
auto* x_img = param.x->data<float, cl::Image2D>(); auto* x_img = param.x->data<uint16_t, cl::Image2D>();
LOG(INFO) << "x_image" << x_img; LOG(INFO) << "x_image" << x_img;
auto out_image_shape = InitImageDimInfoWith(out_dims); auto out_image_shape = InitImageDimInfoWith(out_dims);
LOG(INFO) << "out_image_shape = " << out_image_shape["width"] << " " LOG(INFO) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"]; << out_image_shape["height"];
auto* out_img = param.output->mutable_data<float, cl::Image2D>( auto* out_img = param.output->mutable_data<uint16_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]); out_image_shape["width"], out_image_shape["height"]);
LOG(INFO) << "out_image" << out_img; LOG(INFO) << "out_image" << out_img;
...@@ -220,7 +126,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -220,7 +126,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
private: private:
std::string kernel_func_name_{"pool_"}; std::string kernel_func_name_{"pool_"};
std::string build_options_{"-DCL_DTYPE_float"}; std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
...@@ -229,28 +135,18 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -229,28 +135,18 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// REGISTER_LITE_KERNEL(pool2d,
// kOpenCL,
// kFloat,
// kNCHW,
// paddle::lite::kernels::opencl::PoolCompute,
// def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
REGISTER_LITE_KERNEL(pool2d, REGISTER_LITE_KERNEL(pool2d,
kOpenCL, kOpenCL,
kFloat, kFP16,
kImageDefault, kImageDefault,
paddle::lite::kernels::opencl::PoolComputeImage2D, paddle::lite::kernels::opencl::PoolComputeImage2D,
image2d) image2d)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
...@@ -18,6 +18,9 @@ ...@@ -18,6 +18,9 @@
#include "lite/backends/opencl/target_wrapper.h" #include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -73,82 +76,10 @@ void pool_avg(const int padding_height, ...@@ -73,82 +76,10 @@ void pool_avg(const int padding_height,
} }
} }
// buffer TEST(pool2d_image2d, compute) {
#if 0 // pool_buffer
TEST(pool2d_buffer_fp32, compute) {
LOG(INFO) << "to get kernel ..."; LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create( auto kernels = KernelRegistry::Global().Create(
"pool2d", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)); "pool2d", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
LOG(INFO) << "get kernel:" << kernel->doc();
lite::Tensor x, out;
operators::PoolParam param;
param.x = &x;
param.output = &out;
param.global_pooling = true;
param.pooling_type = "avg";
std::vector<int> paddings = {0, 0, 0, 0};
param.strides = std::vector<int>{1, 1};
param.ksize = std::vector<int>{7, 7};
param.paddings = std::make_shared<std::vector<int>>(paddings);
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
kernel->SetParam(param);
std::unique_ptr<KernelContext> pool_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(pool_context->As<OpenCLContext>()));
kernel->SetContext(std::move(pool_context));
const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 1024, 7, 7});
const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 1024, 1, 1});
x.Resize(in_dim);
out.Resize(out_dim);
auto* x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-5, 5);
auto* mapped_x = static_cast<float*>(
TargetWrapperCL::Map(x_data, 0, sizeof(float) * in_dim.production()));
for (int i = 0; i < in_dim.production(); i++) {
mapped_x[i] = dist(engine);
}
kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Buffer>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
}
std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
pool_avg(0, 0, 1, 1, 7, 7, mapped_x, in_dim, out_ref.get(), out_dim);
TargetWrapperCL::Unmap(x_data, mapped_x);
auto* out_data = out.mutable_data<float, cl::Buffer>();
auto* mapped_out = static_cast<float*>(
TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production()));
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(mapped_out[i], out_ref[i], 1e-6);
}
TargetWrapperCL::Unmap(out_data, mapped_out);
}
#endif // pool_buffer
TEST(pool2d_image2d_fp32, compute) {
LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create(
"pool2d", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front()); auto kernel = std::move(kernels.front());
...@@ -192,22 +123,23 @@ TEST(pool2d_image2d_fp32, compute) { ...@@ -192,22 +123,23 @@ TEST(pool2d_image2d_fp32, compute) {
DDim x_image_shape = default_converter->InitImageDimInfoWith(in_dim); DDim x_image_shape = default_converter->InitImageDimInfoWith(in_dim);
LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " " LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " "
<< x_image_shape[1]; << x_image_shape[1];
std::vector<float> x_image_data(x_image_shape.production() * 4); // 4 : RGBA std::vector<uint16_t> x_image_data(x_image_shape.production() *
4); // 4 : RGBA
default_converter->NCHWToImage(input_v.data(), x_image_data.data(), in_dim); default_converter->NCHWToImage(input_v.data(), x_image_data.data(), in_dim);
auto* x_image = x.mutable_data<float, cl::Image2D>( auto* x_image = x.mutable_data<uint16_t, cl::Image2D>(
x_image_shape[0], x_image_shape[1], x_image_data.data()); x_image_shape[0], x_image_shape[1], x_image_data.data());
LOG(INFO) << "x_image:" << x_image; LOG(INFO) << "x_image:" << x_image;
DDim out_image_shape = default_converter->InitImageDimInfoWith(out_dim); DDim out_image_shape = default_converter->InitImageDimInfoWith(out_dim);
LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " " LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
<< out_image_shape[1]; << out_image_shape[1];
auto* out_image = out.mutable_data<float, cl::Image2D>(out_image_shape[0], auto* out_image = out.mutable_data<uint16_t, cl::Image2D>(out_image_shape[0],
out_image_shape[1]); out_image_shape[1]);
LOG(INFO) << "out_image:" << out_image; LOG(INFO) << "out_image:" << out_image;
kernel->Launch(); kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr); auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---"; VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
...@@ -222,7 +154,7 @@ TEST(pool2d_image2d_fp32, compute) { ...@@ -222,7 +154,7 @@ TEST(pool2d_image2d_fp32, compute) {
const size_t cl_image2d_row_pitch{0}; const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0}; const size_t cl_image2d_slice_pitch{0};
float* out_image_data = new float[out_image_shape.production() * 4]; uint16_t* out_image_data = new uint16_t[out_image_shape.production() * 4];
TargetWrapperCL::ImgcpySync(out_image_data, TargetWrapperCL::ImgcpySync(out_image_data,
out_image, out_image,
out_image_shape[0], out_image_shape[0],
...@@ -235,12 +167,22 @@ TEST(pool2d_image2d_fp32, compute) { ...@@ -235,12 +167,22 @@ TEST(pool2d_image2d_fp32, compute) {
out_image_data, out_data, out_image_shape, out_dim); out_image_data, out_data, out_image_shape, out_dim);
for (int i = 0; i < out_dim.production(); i++) { for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(out_data[i], out_ref[i], 1e-6); auto abs_diff = abs(out_data[i] - out_ref[i]);
auto relative_diff = COMPUTE_RELATIVE_DIFF(out_data[i], out_ref[i]);
EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
true);
if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << " out_data[" << i
<< "]:" << out_data[i] << " "
"out_ref["
<< i << "]:" << out_ref[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
}
} }
} }
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// USE_LITE_KERNEL(pool2d, kOpenCL, kFloat, kNCHW, def); USE_LITE_KERNEL(pool2d, kOpenCL, kFP16, kImageDefault, image2d);
USE_LITE_KERNEL(pool2d, kOpenCL, kFloat, kImageDefault, image2d);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// ReLU over a cl::Buffer holding fp32 data in NCHW layout.
class ReluCompute
    : public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override { return "Relu using cl::Buffer, kFloat"; }

  // Compile the buffer-based relu kernel once, ahead of the first Run().
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "buffer/relu_kernel.cl", build_options_);
  }

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.X->dims();
    size_t count = x_dims.production();  // total element count

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);

    auto* in_buf = param.X->data<float, cl::Buffer>();
    auto* out_buf = param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));

    // Look up the kernel compiled in PrepareForRun() by name + build options.
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    VLOG(4) << TargetToStr(param.X->target());
    VLOG(4) << TargetToStr(param.Out->target());

    // Argument order must match the .cl kernel signature: input, count, output.
    cl_int status = kernel.setArg(0, *in_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(1, (const int)count);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(2, *out_buf);
    CL_CHECK_FATAL(status);

    // One work-item per element.
    auto global_work_size = cl::NDRange{count};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // Publish the completion event so consumers of out_buf can wait on it.
    context.cl_wait_list()->emplace(out_buf, event_);
  }

 private:
  std::string kernel_func_name_{"relu"};
  std::string build_options_{"-DCL_DTYPE_float -DRELU"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
// ReLU over a cl::Image2D (ImageDefault/RGBA) holding fp32 data.
class ReluComputeFloatImageDefault
    : public KernelLite<TARGET(kOpenCL),
                        PRECISION(kFloat),
                        DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override {
    return "Relu using cl::Image2D(ImageDefault/RGBA), kFloat";
  }

  // Compile the image-based relu kernel once, ahead of the first Run().
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/relu_kernel.cl", build_options_);
  }

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.X->dims();
    auto* x_img = param.X->data<float, cl::Image2D>();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_img = param.Out->mutable_data<float, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.Out->dims();  // useless: check dim only

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    // Argument order must match the .cl kernel signature: input, output.
    cl_int status = kernel.setArg(0, *x_img);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(1, *out_img);
    CL_CHECK_FATAL(status);

    VLOG(4) << TargetToStr(param.X->target());
    VLOG(4) << TargetToStr(param.Out->target());
    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
            << image_shape["height"];
    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];

    // One work-item per image texel (width x height).
    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                    static_cast<cl::size_type>(image_shape["height"])};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`,
    // so block on the queue here instead of publishing the event:
    // context.cl_wait_list()->emplace(out_img, event_);
    context.cl_context()->GetCommandQueue().finish();
  }

 private:
  std::string kernel_func_name_{"relu"};
  std::string build_options_{"-DCL_DTYPE_float -DRELU"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
// ReLU over a cl::Image2D (ImageDefault/RGBA) holding fp16 data.
class ReluComputeFP16ImageDefault
    : public KernelLite<TARGET(kOpenCL),
                        PRECISION(kFP16),
                        DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override {
    return "Relu using cl::Image2D(ImageDefault/RGBA), kFP16";
  }

  // Compile the image-based relu kernel once, ahead of the first Run().
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/relu_kernel.cl", build_options_);
  }

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.X->dims();
    // int16_t is a 16-bit storage type standing in for half float.
    // NOTE(review): sibling fp16 kernels use uint16_t for the same payload;
    // presumably bit-wise interchangeable — confirm and unify.
    auto* x_img = param.X->data<int16_t, cl::Image2D>();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_img = param.Out->mutable_data<int16_t, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.Out->dims();  // useless: check dim only

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    // Argument order must match the .cl kernel signature: input, output.
    cl_int status = kernel.setArg(0, *x_img);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(1, *out_img);
    CL_CHECK_FATAL(status);

    VLOG(4) << TargetToStr(param.X->target());
    VLOG(4) << TargetToStr(param.Out->target());
    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
            << image_shape["height"];
    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];

    // One work-item per image texel (width x height).
    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                    static_cast<cl::size_type>(image_shape["height"])};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`,
    // so block on the queue here instead of publishing the event:
    // context.cl_wait_list()->emplace(out_img, event_);
    context.cl_context()->GetCommandQueue().finish();
  }

 private:
  std::string kernel_func_name_{"relu"};
  std::string build_options_{"-DCL_DTYPE_half -DRELU"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
// ReLU6 over a cl::Image2D (ImageDefault/RGBA) holding fp32 data.
// Output is clipped to [0, param.Relu_clipped_coef].
class Relu6ComputeFloatImageDefault
    : public KernelLite<TARGET(kOpenCL),
                        PRECISION(kFloat),
                        DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override {
    return "Relu6 using cl::Image2D(ImageDefault/RGBA), kFloat";
  }

  // Compile the image-based relu6 kernel once, ahead of the first Run().
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/relu6_kernel.cl", build_options_);
  }

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.X->dims();
    auto* x_img = param.X->data<float, cl::Image2D>();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_img = param.Out->mutable_data<float, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.Out->dims();  // useless: check dim only
    auto threshold = param.Relu_clipped_coef;  // upper clip bound

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    // Argument order must match the .cl kernel signature:
    // input, output, threshold.
    cl_int status = kernel.setArg(0, *x_img);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(1, *out_img);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(2, threshold);
    CL_CHECK_FATAL(status);

    VLOG(4) << TargetToStr(param.X->target());
    VLOG(4) << TargetToStr(param.Out->target());
    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
            << image_shape["height"];
    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
    VLOG(4) << "threshold:" << threshold;

    // One work-item per image texel (width x height).
    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                    static_cast<cl::size_type>(image_shape["height"])};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`,
    // so block on the queue here instead of publishing the event:
    // context.cl_wait_list()->emplace(out_img, event_);
    context.cl_context()->GetCommandQueue().finish();
  }

 private:
  std::string kernel_func_name_{"relu6"};
  std::string build_options_{"-DCL_DTYPE_float -DRELU6"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
// ReLU6 over a cl::Image2D (ImageDefault/RGBA) holding fp16 data.
// Output is clipped to [0, param.Relu_clipped_coef].
class Relu6ComputeFP16ImageDefault
    : public KernelLite<TARGET(kOpenCL),
                        PRECISION(kFP16),
                        DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override {
    return "Relu6 using cl::Image2D(ImageDefault/RGBA), kFP16";
  }

  // Compile the image-based relu6 kernel once, ahead of the first Run().
  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/relu6_kernel.cl", build_options_);
  }

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.X->dims();
    // int16_t is a 16-bit storage type standing in for half float.
    // NOTE(review): sibling fp16 kernels use uint16_t for the same payload;
    // presumably bit-wise interchangeable — confirm and unify.
    auto* x_img = param.X->data<int16_t, cl::Image2D>();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_img = param.Out->mutable_data<int16_t, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.Out->dims();  // useless: check dim only
    auto threshold = param.Relu_clipped_coef;  // upper clip bound

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    // Argument order must match the .cl kernel signature:
    // input, output, threshold.
    cl_int status = kernel.setArg(0, *x_img);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(1, *out_img);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(2, threshold);
    CL_CHECK_FATAL(status);

    VLOG(4) << TargetToStr(param.X->target());
    VLOG(4) << TargetToStr(param.Out->target());
    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
            << image_shape["height"];
    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
    VLOG(4) << "threshold:" << threshold;

    // One work-item per image texel (width x height).
    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                    static_cast<cl::size_type>(image_shape["height"])};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`,
    // so block on the queue here instead of publishing the event:
    // context.cl_wait_list()->emplace(out_img, event_);
    context.cl_context()->GetCommandQueue().finish();
  }

 private:
  std::string kernel_func_name_{"relu6"};
  std::string build_options_{"-DCL_DTYPE_half -DRELU6"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// NOTE(review): the buffer-based (kNCHW) relu registration is intentionally
// disabled; only the image2d kernels below are registered.
// REGISTER_LITE_KERNEL(relu,
//                      kOpenCL,
//                      kFloat,
//                      kNCHW,
//                      paddle::lite::kernels::opencl::ReluCompute,
//                      def)
//   .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
//   .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
//   .Finalize();

// Relu: fp32 input/output stored as an RGBA image2d.
REGISTER_LITE_KERNEL(
    relu,
    kOpenCL,
    kFloat,
    kImageDefault,
    paddle::lite::kernels::opencl::ReluComputeFloatImageDefault,
    ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();

// Relu: fp16 (half) input/output stored as an RGBA image2d.
REGISTER_LITE_KERNEL(relu,
                     kOpenCL,
                     kFP16,
                     kImageDefault,
                     paddle::lite::kernels::opencl::ReluComputeFP16ImageDefault,
                     ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();

// Relu6: fp32 variant, image2d layout.
REGISTER_LITE_KERNEL(
    relu6,
    kOpenCL,
    kFloat,
    kImageDefault,
    paddle::lite::kernels::opencl::Relu6ComputeFloatImageDefault,
    ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();

// Relu6: fp16 (half) variant, image2d layout.
REGISTER_LITE_KERNEL(
    relu6,
    kOpenCL,
    kFP16,
    kImageDefault,
    paddle::lite::kernels::opencl::Relu6ComputeFP16ImageDefault,
    ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
...@@ -27,7 +27,7 @@ namespace opencl { ...@@ -27,7 +27,7 @@ namespace opencl {
// reshape operator // reshape operator
class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL), class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ReshapeParam; using param_t = operators::ReshapeParam;
...@@ -51,7 +51,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL), ...@@ -51,7 +51,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
const int64_t& input_image_width = input_image_shape.at("width"); const int64_t& input_image_width = input_image_shape.at("width");
const int64_t& input_image_height = input_image_shape.at("height"); const int64_t& input_image_height = input_image_shape.at("height");
const cl::Image2D* const x_image = x->data<float, cl::Image2D>(); const cl::Image2D* const x_image = x->data<uint16_t, cl::Image2D>();
const std::vector<int>& shape_vct = param.shape_vct; const std::vector<int>& shape_vct = param.shape_vct;
Tensor* const output = param.output; Tensor* const output = param.output;
...@@ -60,7 +60,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL), ...@@ -60,7 +60,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
const std::map<std::string, size_t>& out_image_shape = const std::map<std::string, size_t>& out_image_shape =
InitImageDimInfoWith(out_dims); InitImageDimInfoWith(out_dims);
cl::Image2D* const out_image = output->mutable_data<float, cl::Image2D>( cl::Image2D* const out_image = output->mutable_data<uint16_t, cl::Image2D>(
out_image_shape.at("width"), out_image_shape.at("height")); out_image_shape.at("width"), out_image_shape.at("height"));
LOG(INFO) << "out_dims= " << out_dims; LOG(INFO) << "out_dims= " << out_dims;
...@@ -159,7 +159,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL), ...@@ -159,7 +159,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
private: private:
std::string kernel_func_name_{"reshape"}; std::string kernel_func_name_{"reshape"};
std::string build_options_{"-DCL_DTYPE_float "}; std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
...@@ -170,37 +170,37 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL), ...@@ -170,37 +170,37 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
REGISTER_LITE_KERNEL(reshape, REGISTER_LITE_KERNEL(reshape,
kOpenCL, kOpenCL,
kFloat, kFP16,
kImageDefault, kImageDefault,
paddle::lite::kernels::opencl::ReshapeComputeFloatImage, paddle::lite::kernels::opencl::ReshapeComputeFloatImage,
image2d) image2d)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(reshape2, REGISTER_LITE_KERNEL(reshape2,
kOpenCL, kOpenCL,
kFloat, kFP16,
kImageDefault, kImageDefault,
paddle::lite::kernels::opencl::ReshapeComputeFloatImage, paddle::lite::kernels::opencl::ReshapeComputeFloatImage,
image2d) image2d)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
...@@ -17,9 +17,12 @@ ...@@ -17,9 +17,12 @@
#include "lite/backends/opencl/target_wrapper.h" #include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/opencl/test_helper.h"
#include "lite/operators/reshape_op.h" #include "lite/operators/reshape_op.h"
#include "lite/utils/logging.h" #include "lite/utils/logging.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
...@@ -81,7 +84,7 @@ static DDim ValidateShape(const std::vector<int>& shape, ...@@ -81,7 +84,7 @@ static DDim ValidateShape(const std::vector<int>& shape,
TEST(reshape_opencl, compute) { TEST(reshape_opencl, compute) {
LOG(INFO) << "to get kernel ..."; LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create( auto kernels = KernelRegistry::Global().Create(
"reshape", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)); "reshape", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front()); auto kernel = std::move(kernels.front());
...@@ -149,13 +152,13 @@ TEST(reshape_opencl, compute) { ...@@ -149,13 +152,13 @@ TEST(reshape_opencl, compute) {
} }
paddle::lite::CLImageConverterDefault default_convertor; paddle::lite::CLImageConverterDefault default_convertor;
std::vector<float> x_image_data(input_image_width * input_image_height * std::vector<uint16_t> x_image_data(input_image_width * input_image_height *
4); // 4 : RGBA 4); // 4 : RGBA
LOG(INFO) << "set mapped input ..."; LOG(INFO) << "set mapped input ...";
default_convertor.NCHWToImage(input_v_data, x_image_data.data(), input_dim); default_convertor.NCHWToImage(input_v_data, x_image_data.data(), input_dim);
auto* input_image = input.mutable_data<float, cl::Image2D>( auto* input_image = input.mutable_data<uint16_t, cl::Image2D>(
input_image_width, input_image_height, x_image_data.data()); input_image_width, input_image_height, x_image_data.data());
LOG(INFO) << "prepare kernel ready"; LOG(INFO) << "prepare kernel ready";
...@@ -165,8 +168,8 @@ TEST(reshape_opencl, compute) { ...@@ -165,8 +168,8 @@ TEST(reshape_opencl, compute) {
DDim out_image_shape = default_converter.InitImageDimInfoWith(output_dim); DDim out_image_shape = default_converter.InitImageDimInfoWith(output_dim);
LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " " LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " "
<< out_image_shape[1]; << out_image_shape[1];
auto* out_image = output.mutable_data<float, cl::Image2D>(out_image_shape[0], auto* out_image = output.mutable_data<uint16_t, cl::Image2D>(
out_image_shape[1]); out_image_shape[0], out_image_shape[1]);
VLOG(4) << "out_dims= " << output_dim; VLOG(4) << "out_dims= " << output_dim;
LOG(INFO) << "kernel context ..."; LOG(INFO) << "kernel context ...";
...@@ -182,7 +185,7 @@ TEST(reshape_opencl, compute) { ...@@ -182,7 +185,7 @@ TEST(reshape_opencl, compute) {
kernel->Launch(); kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_image); auto it = wait_list->find(out_image);
if (it != wait_list->end()) { if (it != wait_list->end()) {
...@@ -193,9 +196,9 @@ TEST(reshape_opencl, compute) { ...@@ -193,9 +196,9 @@ TEST(reshape_opencl, compute) {
LOG(FATAL) << "Could not find the sync event for the target cl tensor."; LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
} }
float* out_image_data = new float[out_image_shape.production() * 4]; uint16_t* out_image_data = new uint16_t[out_image_shape.production() * 4];
TargetWrapperCL::ImgcpySync(out_image_data, TargetWrapperCL::ImgcpySync(out_image_data,
output.data<float, cl::Image2D>(), output.data<uint16_t, cl::Image2D>(),
out_image_shape[0], out_image_shape[0],
out_image_shape[1], out_image_shape[1],
cl_image2d_row_pitch, cl_image2d_row_pitch,
...@@ -211,9 +214,17 @@ TEST(reshape_opencl, compute) { ...@@ -211,9 +214,17 @@ TEST(reshape_opencl, compute) {
// check output data // check output data
for (int i = 0; i < output.numel(); i++) { for (int i = 0; i < output.numel(); i++) {
EXPECT_NEAR(out_data[i], input_v_data[i], 1e-3); auto abs_diff = abs(out_data[i] - input_v_data[i]);
if (abs(out_data[i] - input_v_data[i]) > 1e-3) { auto relative_diff = COMPUTE_RELATIVE_DIFF(out_data[i], input_v_data[i]);
LOG(INFO) << "error idx:" << i; EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
true);
if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << " out_data[" << i
<< "]:" << out_data[i] << " "
"input_v_data["
<< i << "]:" << input_v_data[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
} }
} }
} }
...@@ -223,5 +234,5 @@ TEST(reshape_opencl, compute) { ...@@ -223,5 +234,5 @@ TEST(reshape_opencl, compute) {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
USE_LITE_KERNEL(reshape, kOpenCL, kFloat, kImageDefault, image2d); USE_LITE_KERNEL(reshape, kOpenCL, kFP16, kImageDefault, image2d);
USE_LITE_KERNEL(reshape2, kOpenCL, kFloat, kImageDefault, image2d); USE_LITE_KERNEL(reshape2, kOpenCL, kFP16, kImageDefault, image2d);
...@@ -27,12 +27,12 @@ namespace kernels { ...@@ -27,12 +27,12 @@ namespace kernels {
namespace opencl { namespace opencl {
class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL), class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ScaleParam; using param_t = operators::ScaleParam;
std::string doc() const override { return "Scale using cl::Image2D, kFloat"; } std::string doc() const override { return "Scale using cl::Image2D, kFP16"; }
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
...@@ -43,7 +43,7 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -43,7 +43,7 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
void Run() override { void Run() override {
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
const auto& in_dims = param.x->dims(); const auto& in_dims = param.x->dims();
auto* x_img = param.x->data<float, cl::Image2D>(); auto* x_img = param.x->data<uint16_t, cl::Image2D>();
const float scale = param.scale; const float scale = param.scale;
const float bias = param.bias; const float bias = param.bias;
...@@ -51,7 +51,7 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -51,7 +51,7 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
auto out_image_shape = InitImageDimInfoWith(in_dims); auto out_image_shape = InitImageDimInfoWith(in_dims);
LOG(INFO) << "out_image_shape = " << out_image_shape["width"] << " " LOG(INFO) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"]; << out_image_shape["height"];
auto* out_img = param.output->mutable_data<float, cl::Image2D>( auto* out_img = param.output->mutable_data<uint16_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]); out_image_shape["width"], out_image_shape["height"]);
LOG(INFO) << "out_image" << out_img; LOG(INFO) << "out_image" << out_img;
...@@ -89,7 +89,7 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -89,7 +89,7 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
private: private:
std::string kernel_func_name_{"scale"}; std::string kernel_func_name_{"scale"};
std::string build_options_{"-DCL_DTYPE_float"}; std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
...@@ -100,16 +100,16 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -100,16 +100,16 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
REGISTER_LITE_KERNEL(scale, REGISTER_LITE_KERNEL(scale,
kOpenCL, kOpenCL,
kFloat, kFP16,
kImageDefault, kImageDefault,
paddle::lite::kernels::opencl::ScaleComputeImage2D, paddle::lite::kernels::opencl::ScaleComputeImage2D,
image2d) image2d)
.BindInput("X", .BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL), {LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat), PRECISION(kFP16),
DATALAYOUT(kImageDefault))}) DATALAYOUT(kImageDefault))})
.Finalize(); .Finalize();
...@@ -18,6 +18,9 @@ ...@@ -18,6 +18,9 @@
#include "lite/backends/opencl/target_wrapper.h" #include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/opencl/test_helper.h"
#define FP16_MAX_DIFF (5e-1)
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -35,7 +38,7 @@ void scale(const float* input_data, ...@@ -35,7 +38,7 @@ void scale(const float* input_data,
TEST(scale_image2d_fp32, compute) { TEST(scale_image2d_fp32, compute) {
LOG(INFO) << "to get kernel ..."; LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create( auto kernels = KernelRegistry::Global().Create(
"scale", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)); "scale", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front()); auto kernel = std::move(kernels.front());
...@@ -74,19 +77,19 @@ TEST(scale_image2d_fp32, compute) { ...@@ -74,19 +77,19 @@ TEST(scale_image2d_fp32, compute) {
CLImageConverterDefault* default_converter = new CLImageConverterDefault(); CLImageConverterDefault* default_converter = new CLImageConverterDefault();
DDim image_shape = default_converter->InitImageDimInfoWith(in_dim); DDim image_shape = default_converter->InitImageDimInfoWith(in_dim);
LOG(INFO) << "image_shape = " << image_shape[0] << " " << image_shape[1]; LOG(INFO) << "image_shape = " << image_shape[0] << " " << image_shape[1];
std::vector<float> x_image_data(image_shape.production() * 4); // 4 : RGBA std::vector<uint16_t> x_image_data(image_shape.production() * 4); // 4 : RGBA
default_converter->NCHWToImage(input_v.data(), x_image_data.data(), in_dim); default_converter->NCHWToImage(input_v.data(), x_image_data.data(), in_dim);
auto* x_image = x.mutable_data<float, cl::Image2D>( auto* x_image = x.mutable_data<uint16_t, cl::Image2D>(
image_shape[0], image_shape[1], x_image_data.data()); image_shape[0], image_shape[1], x_image_data.data());
LOG(INFO) << "x_image:" << x_image; LOG(INFO) << "x_image:" << x_image;
auto* out_image = auto* out_image =
out.mutable_data<float, cl::Image2D>(image_shape[0], image_shape[1]); out.mutable_data<uint16_t, cl::Image2D>(image_shape[0], image_shape[1]);
LOG(INFO) << "out_image:" << out_image; LOG(INFO) << "out_image:" << out_image;
kernel->Launch(); kernel->Launch();
auto* wait_list = context->As<OpenCLContext>().cl_wait_list(); auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>(); auto* out_ptr = param.output->data<uint16_t, cl::Image2D>();
auto it = wait_list->find(out_ptr); auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) { if (it != wait_list->end()) {
VLOG(4) << "--- Find the sync event for the target cl tensor. ---"; VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
...@@ -101,7 +104,7 @@ TEST(scale_image2d_fp32, compute) { ...@@ -101,7 +104,7 @@ TEST(scale_image2d_fp32, compute) {
const size_t cl_image2d_row_pitch{0}; const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0}; const size_t cl_image2d_slice_pitch{0};
float* out_image_data = new float[image_shape.production() * 4]; uint16_t* out_image_data = new uint16_t[image_shape.production() * 4];
TargetWrapperCL::ImgcpySync(out_image_data, TargetWrapperCL::ImgcpySync(out_image_data,
out_image, out_image,
image_shape[0], image_shape[0],
...@@ -114,11 +117,22 @@ TEST(scale_image2d_fp32, compute) { ...@@ -114,11 +117,22 @@ TEST(scale_image2d_fp32, compute) {
out_image_data, out_data, image_shape, out_dim); out_image_data, out_data, image_shape, out_dim);
for (int i = 0; i < out_dim.production(); i++) { for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(out_data[i], out_ref[i], 1e-6); auto abs_diff = abs(out_data[i] - out_ref[i]);
auto relative_diff = COMPUTE_RELATIVE_DIFF(out_data[i], out_ref[i]);
EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
true);
if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << " out_data[" << i
<< "]:" << out_data[i] << " "
"out_ref["
<< i << "]:" << out_ref[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
}
} }
} }
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
USE_LITE_KERNEL(scale, kOpenCL, kFloat, kImageDefault, image2d); USE_LITE_KERNEL(scale, kOpenCL, kFP16, kImageDefault, image2d);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>

#include <math.h>

#include <cstdint>
#include <memory>
#include <random>

#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
namespace paddle {
namespace lite {
// Reference CPU sigmoid: out[i] = 1 / (1 + e^(-x[i])) for every element of x.
// Used to validate the OpenCL kernel outputs.
//
// @param x_data   input buffer with x_dim.production() elements
// @param x_dim    shape of the input; only the total element count is used
// @param out_data output buffer, same length as x_data
template <typename dtype>
void sigmoid_compute_ref(const dtype *x_data,
                         const DDim &x_dim,
                         dtype *out_data) {
  for (int i = 0; i < x_dim.production(); ++i) {
    // Use the overloaded exp() (from <math.h> in C++) so the computation is
    // carried out in `dtype` precision; expf would force float evaluation
    // even when dtype is double.
    out_data[i] = 1 / (1 + exp(-x_data[i]));
  }
}
// buffer
// NOTE: the buffer-based sigmoid test below is compiled out with `#if 0`;
// the image2d tests further down are the active coverage for the OpenCL
// sigmoid kernel. Kept for reference.
#if 0  // sigmoid_buffer
TEST(opencl_sigmoid_buffer, compute) {
  // prepare data: random input in [-10, 10], mapped so the host can write it
  const DDim x_dim = DDim(std::vector<DDim::value_type>{3, 6, 10, 10});
  lite::Tensor x, out;
  x.Resize(x_dim);
  out.Resize(x_dim);

  auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-10, 10);
  auto *mapped_x = static_cast<float *>(
      TargetWrapperCL::Map(x_data, 0, sizeof(float) * x_dim.production()));
  for (int i = 0; i < x_dim.production(); i++) {
    mapped_x[i] = dist(engine);
  }

  // set param and kernel, then run
  operators::ActivationParam param;
  param.X = &x;
  param.Out = &out;

  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();

  auto kernels = KernelRegistry::Global().Create(
      "sigmoid", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
  ASSERT_FALSE(kernels.empty());
  auto kernel = std::move(kernels.front());
  kernel->SetParam(param);
  std::unique_ptr<KernelContext> sigmoid_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(sigmoid_context->As<OpenCLContext>()));
  kernel->SetContext(std::move(sigmoid_context));
  kernel->Launch();

  // block until the kernel writing the output buffer has finished
  auto *wait_list = context->As<OpenCLContext>().cl_wait_list();
  auto *out_ptr = param.Out->data<float, cl::Buffer>();
  auto it = wait_list->find(out_ptr);
  if (it != wait_list->end()) {
    VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
    auto &event = *(it->second);
    event.wait();
  } else {
    LOG(FATAL) << "Could not find the sync event for the target cl tensor.";
  }

  // run compute ref and check against the CPU reference implementation
  std::unique_ptr<float[]> out_ref(new float[x_dim.production()]);
  sigmoid_compute_ref<float>(mapped_x, x_dim, out_ref.get());

  auto *out_data = out.mutable_data<float, cl::Buffer>();
  auto *mapped_out = static_cast<float *>(
      TargetWrapperCL::Map(out_data, 0, sizeof(float) * x_dim.production()));
  for (int i = 0; i < x_dim.production(); i++) {
    EXPECT_NEAR(mapped_out[i], out_ref[i], 1e-6);
  }
  TargetWrapperCL::Unmap(out_data, mapped_out);
  TargetWrapperCL::Unmap(x_data, mapped_x);
}
#endif  // sigmoid_buffer
#define LOOP_TEST
// #define PRINT_RESULT

// End-to-end check of the fp32 sigmoid image2d kernel:
//   host buffer -> layout(buf2img) -> sigmoid(img) -> layout(img2buf) -> host
// With LOOP_TEST defined, a grid of [n,c,h,w] shapes is swept to cover
// different image2d sizes; otherwise a single fixed shape is tested.
TEST(sigmoid_image2d_fp32, compute) {
  LOG(INFO) << "main steps of test: host -> layout(buf2img) -> sigmoid(img) -> "
               "layout(img2buf) "
               "-> host";

#ifdef LOOP_TEST
  for (int n = 1; n <= 9; n += 3) {
    for (auto c : {1, 3, 9}) {
      for (int h = 12; h <= 100; h += 13) {
        for (int w = 12; w <= 100; w += 25) {
#else
  const int n = 3;
  const int c = 9;
  const int h = 51;
  const int w = 11;
#endif  // LOOP_TEST
          LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
                    << h << " " << w << " ========";
          // set layout kernels: buffer->image, image->buffer, and the
          // fp32 sigmoid image kernel under test
          auto buf_to_img_kernels =
              KernelRegistry::Global().Create("layout",
                                              TARGET(kOpenCL),
                                              PRECISION(kAny),
                                              DATALAYOUT(kImageDefault));
          auto img_to_buf_kernels = KernelRegistry::Global().Create(
              "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
          auto sigmoid_img_kernels =
              KernelRegistry::Global().Create("sigmoid",
                                              TARGET(kOpenCL),
                                              PRECISION(kFloat),
                                              DATALAYOUT(kImageDefault));
          ASSERT_FALSE(buf_to_img_kernels.empty());
          ASSERT_FALSE(buf_to_img_kernels.empty());
          ASSERT_FALSE(sigmoid_img_kernels.empty());

          auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
          auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
          auto sigmoid_img_kernel = std::move(sigmoid_img_kernels.front());
          LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
          LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
          LOG(INFO) << "get 3rd kernel: " << sigmoid_img_kernel->doc();

          // set tensors about op param
          LOG(INFO) << "set tensors about op param";
          // layout(buf->img): x -> sigmoid_in
          // sigmoid(img): sigmoid_in -> sigmoid_out
          // layout(img->buf): sigmoid_out -> y
          lite::Tensor x, y, sigmoid_in, sigmoid_out, y_ref;
          operators::LayoutParam BufferToImageParam;
          operators::LayoutParam ImageToBufferParam;
          BufferToImageParam.x = &x;
          BufferToImageParam.y = &sigmoid_in;
          ImageToBufferParam.x = &sigmoid_out;
          ImageToBufferParam.y = &y;
          operators::ActivationParam SigmoidParam;
          SigmoidParam.X = &sigmoid_in;
          SigmoidParam.Out = &sigmoid_out;

          const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
          x.Resize(x_dim);
          y.Resize(x_dim);
          sigmoid_in.Resize(x_dim);
          sigmoid_out.Resize(x_dim);
          y_ref.Resize(x_dim);
          auto sigmoid_image2d_shape =
              paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);

          // initialize tensors with random input in [-1, 1]
          LOG(INFO) << "initialize tensors";
          auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
          auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
          auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
          auto *mapped_x = static_cast<float *>(TargetWrapperCL::Map(
              x_data, 0, sizeof(float) * x_dim.production()));
          auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
              y_data, 0, sizeof(float) * x_dim.production()));
          std::default_random_engine engine;
          std::uniform_real_distribution<float> dist(-1, 1);
          for (int i = 0; i < x_dim.production(); ++i) {
            mapped_x[i] = static_cast<float>(dist(engine));
          }
          // allocate the intermediate fp32 image2d tensors
          auto *sigmoid_in_data = sigmoid_in.mutable_data<float, cl::Image2D>(
              sigmoid_image2d_shape["width"], sigmoid_image2d_shape["height"]);
          auto *sigmoid_out_data = sigmoid_out.mutable_data<float, cl::Image2D>(
              sigmoid_image2d_shape["width"], sigmoid_image2d_shape["height"]);

          // set context and kernel args; each kernel gets its own context
          // sharing the single OpenCL runtime
          LOG(INFO) << "set context and kernel args";
          std::unique_ptr<KernelContext> context(new KernelContext);
          context->As<OpenCLContext>().InitOnce();

          buf_to_img_kernel->SetParam(BufferToImageParam);
          std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
          context->As<OpenCLContext>().CopySharedTo(
              &(buf_to_img_context->As<OpenCLContext>()));
          buf_to_img_kernel->SetContext(std::move(buf_to_img_context));

          img_to_buf_kernel->SetParam(ImageToBufferParam);
          std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
          context->As<OpenCLContext>().CopySharedTo(
              &(img_to_buf_context->As<OpenCLContext>()));
          img_to_buf_kernel->SetContext(std::move(img_to_buf_context));

          sigmoid_img_kernel->SetParam(SigmoidParam);
          std::unique_ptr<KernelContext> sigmoid_img_context(new KernelContext);
          context->As<OpenCLContext>().CopySharedTo(
              &(sigmoid_img_context->As<OpenCLContext>()));
          sigmoid_img_kernel->SetContext(std::move(sigmoid_img_context));

          // run kernels
          LOG(INFO) << "run kernel: buf_to_img_kernel";
          buf_to_img_kernel->Launch();
          // Fixed log tag: this launches the sigmoid kernel (the old message
          // said "relu_img_kernel", copied from the relu test).
          LOG(INFO) << "run kernel: sigmoid_img_kernel";
          sigmoid_img_kernel->Launch();
          LOG(INFO) << "run kernel: img_to_buf_kernel";
          img_to_buf_kernel->Launch();

          // compute ref cpu
          sigmoid_compute_ref<float>(mapped_x, x_dim, y_data_ref);
// result
#ifdef PRINT_RESULT
          LOG(INFO) << "---- print kernel result (input -> output) ----";
          for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
            std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
                      << std::endl;
          }
#endif  // PRINT_RESULT

          // check result: compare kernel output and cpu output(y_data_ref);
          // only the first mismatch per shape is logged
          for (int eidx = 0; eidx < x_dim.production(); eidx++) {
            EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
            if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
              LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
                        << " / " << x_dim.production() << ", y_data_ref["
                        << eidx << "]:" << y_data_ref[eidx] << ", mapped_y["
                        << eidx << "]:" << mapped_y[eidx] << ", mapped_x["
                        << eidx << "]: " << mapped_x[eidx];
              break;
            }
          }

          // free
          LOG(INFO) << "free: unmap x, y";
          TargetWrapperCL::Unmap(x_data, mapped_x);
          TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef LOOP_TEST
        }  // w
      }    // h
    }      // c
  }        // n
#else
// nothing to do.
#endif
}
#define SIGMOID_FP16_LOOP_TEST
// #define SIGMOID_FP16_PRINT_RESULT

// End-to-end check of the fp16 sigmoid image2d kernel:
//   host buffer -> layout(buf2img) -> sigmoid(img, half) -> layout(img2buf)
//   -> host
// Half-precision image data is stored as uint16_t on the host side,
// consistent with the other fp16 OpenCL kernels/tests in this change
// (the previous int16_t here was the odd one out).
TEST(sigmoid_image2d_fp16, compute) {
  LOG(INFO) << "main steps of test: host -> layout(buf2img) -> sigmoid(img) -> "
               "layout(img2buf) "
               "-> host";

#ifdef SIGMOID_FP16_LOOP_TEST
  for (int n = 1; n <= 100; n += 33) {
    for (auto c : {1, 3}) {
      for (int h = 12; h <= 100; h += 13) {
        for (int w = 12; w <= 100; w += 25) {
#else
  const int n = 1;
  const int c = 2;
  const int h = 3;
  const int w = 4;
#endif  // SIGMOID_FP16_LOOP_TEST
          LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
                    << h << " " << w << " ========";
          // set layout kernels: buffer->image, image->buffer, and the
          // fp16 sigmoid image kernel under test
          auto buf_to_img_kernels =
              KernelRegistry::Global().Create("layout",
                                              TARGET(kOpenCL),
                                              PRECISION(kAny),
                                              DATALAYOUT(kImageDefault));
          auto img_to_buf_kernels = KernelRegistry::Global().Create(
              "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
          auto sigmoid_img_kernels =
              KernelRegistry::Global().Create("sigmoid",
                                              TARGET(kOpenCL),
                                              PRECISION(kFP16),
                                              DATALAYOUT(kImageDefault));
          ASSERT_FALSE(buf_to_img_kernels.empty());
          ASSERT_FALSE(buf_to_img_kernels.empty());
          ASSERT_FALSE(sigmoid_img_kernels.empty());

          auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
          auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
          auto sigmoid_img_kernel = std::move(sigmoid_img_kernels.front());
          LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
          LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
          LOG(INFO) << "get 3rd kernel: " << sigmoid_img_kernel->doc();

          // set tensors about op param
          LOG(INFO) << "set tensors about op param";
          // layout(buf->img): x -> sigmoid_in
          // sigmoid(img): sigmoid_in -> sigmoid_out
          // layout(img->buf): sigmoid_out -> y
          lite::Tensor x, y, sigmoid_in, sigmoid_out, y_ref;
          operators::LayoutParam BufferToImageParam;
          operators::LayoutParam ImageToBufferParam;
          BufferToImageParam.x = &x;
          BufferToImageParam.y = &sigmoid_in;
          ImageToBufferParam.x = &sigmoid_out;
          ImageToBufferParam.y = &y;
          operators::ActivationParam SigmoidParam;
          SigmoidParam.X = &sigmoid_in;
          SigmoidParam.Out = &sigmoid_out;

          const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
          x.Resize(x_dim);
          y.Resize(x_dim);
          sigmoid_in.Resize(x_dim);
          sigmoid_out.Resize(x_dim);
          y_ref.Resize(x_dim);
          auto sigmoid_image2d_shape =
              paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);

          // initialize tensors with random input in [-1, 1]; the host-side
          // buffers stay fp32, only the image2d path carries half data
          LOG(INFO) << "initialize tensors";
          auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
          auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
          auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
          auto *mapped_x = static_cast<float *>(TargetWrapperCL::Map(
              x_data, 0, sizeof(float) * x_dim.production()));
          auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
              y_data, 0, sizeof(float) * x_dim.production()));
          std::default_random_engine engine;
          std::uniform_real_distribution<float> dist(-1, 1);
          for (int i = 0; i < x_dim.production(); ++i) {
            mapped_x[i] = static_cast<float>(dist(engine));
          }
          // fp16 image2d tensors: half values are stored as uint16_t,
          // matching the scale/reshape fp16 kernels of this change
          auto *sigmoid_in_data = sigmoid_in.mutable_data<uint16_t, cl::Image2D>(
              sigmoid_image2d_shape["width"], sigmoid_image2d_shape["height"]);
          auto *sigmoid_out_data =
              sigmoid_out.mutable_data<uint16_t, cl::Image2D>(
                  sigmoid_image2d_shape["width"],
                  sigmoid_image2d_shape["height"]);

          // set context and kernel args; each kernel gets its own context
          // sharing the single OpenCL runtime
          LOG(INFO) << "set context and kernel args";
          std::unique_ptr<KernelContext> context(new KernelContext);
          context->As<OpenCLContext>().InitOnce();

          buf_to_img_kernel->SetParam(BufferToImageParam);
          std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
          context->As<OpenCLContext>().CopySharedTo(
              &(buf_to_img_context->As<OpenCLContext>()));
          buf_to_img_kernel->SetContext(std::move(buf_to_img_context));

          img_to_buf_kernel->SetParam(ImageToBufferParam);
          std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
          context->As<OpenCLContext>().CopySharedTo(
              &(img_to_buf_context->As<OpenCLContext>()));
          img_to_buf_kernel->SetContext(std::move(img_to_buf_context));

          sigmoid_img_kernel->SetParam(SigmoidParam);
          std::unique_ptr<KernelContext> sigmoid_img_context(new KernelContext);
          context->As<OpenCLContext>().CopySharedTo(
              &(sigmoid_img_context->As<OpenCLContext>()));
          sigmoid_img_kernel->SetContext(std::move(sigmoid_img_context));

          // run kernels
          LOG(INFO) << "run kernel: buf_to_img_kernel";
          buf_to_img_kernel->Launch();
          LOG(INFO) << "run kernel: sigmoid_img_kernel";
          sigmoid_img_kernel->Launch();
          LOG(INFO) << "run kernel: img_to_buf_kernel";
          img_to_buf_kernel->Launch();

          // compute ref cpu
          sigmoid_compute_ref<float>(mapped_x, x_dim, y_data_ref);
// result
#ifdef SIGMOID_FP16_PRINT_RESULT
          LOG(INFO) << "---- print kernel result (input -> output) ----";
          for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
            std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx]
                      << std::endl;
          }
#endif  // SIGMOID_FP16_PRINT_RESULT

          // check result: compare kernel output and cpu output(y_data_ref);
          // 1e-3 tolerance accounts for half-precision rounding
          for (int eidx = 0; eidx < x_dim.production(); eidx++) {
            EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-3);
            if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-3) {
              LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
                        << " / " << x_dim.production() << ", y_data_ref["
                        << eidx << "]: " << y_data_ref[eidx] << ", mapped_y["
                        << eidx << "]: " << mapped_y[eidx] << ", mapped_x["
                        << eidx << "]: " << mapped_x[eidx];
              break;
            }
          }

          // free
          LOG(INFO) << "free: unmap x, y";
          TargetWrapperCL::Unmap(x_data, mapped_x);
          TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef SIGMOID_FP16_LOOP_TEST
        }  // w
      }    // h
    }      // c
  }        // n
#else
// nothing to do.
#endif
}
}  // namespace lite
}  // namespace paddle

// sigmoid buffer kernel registration (disabled, matching the #if 0 test)
// USE_LITE_KERNEL(sigmoid, kOpenCL, kFloat, kNCHW, def);

// sigmoid image2d fp32: layout kernels convert NCHW buffer <-> image2d
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(sigmoid, kOpenCL, kFloat, kImageDefault, ImageDefault);

// sigmoid image2d fp16
USE_LITE_KERNEL(sigmoid, kOpenCL, kFP16, kImageDefault, ImageDefault);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License
#pragma once

#include <cmath>  // fabs

// Helper macros for comparing fp16/fp32 OpenCL kernel outputs against CPU
// references. Arguments are fully parenthesized so the macros stay correct
// when callers pass compound expressions (e.g. `a - b`), and fabs is used
// so float/double arguments are never truncated through the integral abs.
// NOTE: COMPTUE_ABS_DIFF keeps its historical misspelling because existing
// test files already reference that name.

// Absolute difference |res0 - res1|.
#define COMPTUE_ABS_DIFF(res0, res1) (fabs((res0) - (res1)))

// Relative difference |res0 - res1| / |res1 + 1e-5|; the 1e-5 term guards
// against division by zero when the reference value res1 is 0.
#define COMPUTE_RELATIVE_DIFF(res0, res1) \
  (fabs(((res0) - (res1)) / ((res1) + 1e-5)))

// True when either the absolute or the relative difference is below
// `threshold`.
#define IS_DIFF_PASSED(res0, res1, threshold)       \
  ((COMPTUE_ABS_DIFF(res0, res1) < (threshold)) ||  \
   (COMPUTE_RELATIVE_DIFF(res0, res1) < (threshold)))
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册