Commit 19bea13c authored by ZhenWang

fix elementwise_add acc bugs.

Parent commit: 1fbd77d4
......@@ -61,7 +61,7 @@ void TestModel(const std::vector<Place>& valid_places,
3.13812525e-05, 6.52209565e-05, 4.78087313e-05,
2.58822285e-04});
for (int i = 0; i < results.size(); ++i) {
EXPECT_NEAR(out->data<float>()[i], results[i], 1e-5);
EXPECT_NEAR(out->data<float>()[i], results[i], 1e-6);
}
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
......
......@@ -236,12 +236,15 @@ class Context<TargetType::kOpenCL> {
void CopySharedTo(const OpenClContext* ctx) {
ctx->cl_context_ = cl_context_;
ctx->cl_helper_ = cl_helper_;
}
private:
void PrepareKernels() {
cl_helper_->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
cl_helper_->AddKernel("channel_add", "channel_add_kernel.cl");
cl_helper_->AddKernel("pool_max", "pool_kernel.cl");
cl_helper_->AddKernel("pool_avg", "pool_kernel.cl");
}
};
#endif
......
......@@ -31,10 +31,10 @@ class ElementwiseAddCompute
void Run() override {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<OpenClContext>();
CHECK(context.cl_context());
CHECK(context.cl_helper() != nullptr);
elementwise_add(
context.cl_context(), static_cast<const float*>(param.X->raw_data()),
context.cl_helper(), static_cast<const float*>(param.X->raw_data()),
param.X->dims(), static_cast<const float*>(param.Y->raw_data()),
param.Y->dims(), param.Out->mutable_data<float>(), param.Out->dims());
}
......
......@@ -40,9 +40,9 @@ TEST(elementwise_add, init) {
kernel->SetParam(param);
kernel->SetContext(std::move(context));
X.Resize({1, 10});
Y.Resize({1, 10});
Out.Resize({1, 10});
X.Resize({1, 1, 1, 10});
Y.Resize({1, 1, 1, 10});
Out.Resize({1, 1, 1, 10});
auto* x_data = X.mutable_data<float>();
auto* y_data = Y.mutable_data<float>();
......@@ -56,7 +56,7 @@ TEST(elementwise_add, init) {
kernel->Launch();
for (int i = 0; i < 10; i++) {
EXPECT_NEAR(out_data[i], 3.4 * i, 1e-1);
EXPECT_NEAR(out_data[i], 3.4 * i, 1e-6);
}
}
......
......@@ -5,13 +5,11 @@ endif()
cc_library(cl_wrapper SRCS cl_wrapper.cc)
cc_library(cl_tool SRCS cl_tool.cc)
target_compile_options(cl_tool BEFORE PUBLIC -Wno-ignored-qualifiers)
cc_library(cl_half SRCS cl_half.cc)
target_compile_options(cl_half BEFORE PUBLIC -fno-strict-aliasing)
cc_library(cl_engine SRCS cl_engine.cc DEPS cl_tool)
cc_library(cl_context SRCS cl_context.cc DEPS cl_engine)
cc_library(cl_helper SRCS cl_helper.cc DEPS cl_context)
cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS cl_half lite_tensor)
cc_library(cl_image SRCS cl_image.cc DEPS cl_half lite_tensor cl_image_converter cl_engine)
cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS lite_tensor)
cc_library(cl_image SRCS cl_image.cc DEPS lite_tensor cl_image_converter cl_engine)
cc_library(cl_caller SRCS cl_caller.cc DEPS cl_helper cl_image)
lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_helper cl_image cl_caller cl_wrapper)
add_dependencies(cl_tool opencl_clhpp)
......@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/fluid/lite/opencl/cl_caller.h"
#include <string>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_context.h"
#include "paddle/fluid/lite/opencl/cl_engine.h"
#include "paddle/fluid/lite/opencl/cl_helper.h"
#include "paddle/fluid/lite/opencl/cl_image.h"
......@@ -23,16 +22,17 @@ limitations under the License. */
namespace paddle {
namespace lite {
static void CopyImageData(const CLImage& cl_image, float* out) {
static void CopyImageData(CLHelper* helper, const CLImage& cl_image,
float* out) {
int width = cl_image.image_dims()[0];
int height = cl_image.image_dims()[1];
half_t* image_data = new half_t[height * width * 4];
float* image_data = new float[height * width * 4];
cl::Image* image = cl_image.cl_image();
const std::array<size_t, 3> origin{0, 0, 0};
const std::array<size_t, 3> region{static_cast<size_t>(width),
static_cast<size_t>(height), 1};
cl_int err = CLEngine::Global()->command_queue().enqueueReadImage(
cl_int err = helper->OpenCLCommandQueue().enqueueReadImage(
*image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
CL_CHECK_ERRORS(err);
......@@ -49,22 +49,25 @@ bool InitOpenCLEngine(std::string cl_path) {
return engine->IsInitSuccess();
}
void elementwise_add(CLContext* context, const float* in, const DDim& in_dim,
void elementwise_add(CLHelper* helper, const float* in, const DDim& in_dim,
const float* bias, const DDim& bias_dim, float* out,
const DDim& out_dim) {
CLHelper helper(context);
helper.AddKernel("elementwise_add", "elementwise_add_kernel.cl");
auto kernel = helper.GetKernel(0);
if (!(bias_dim.size() == 1 || bias_dim.size() == 4)) {
LOG(FATAL) << "Error: bias dims is error";
return;
}
auto kernel = bias_dim.size() == 1 ? helper->GetKernel("channel_add")
: helper->GetKernel("elementwise_add");
CLImage in_image;
in_image.set_tensor_data(in, in_dim);
in_image.InitNormalCLImage(helper.OpenCLContext());
in_image.InitNormalCLImage(helper->OpenCLContext());
VLOG(3) << " --- Inpu image: " << in_image << " --- ";
CLImage bias_image;
bias_image.set_tensor_data(bias, bias_dim);
bias_image.InitNormalCLImage(helper.OpenCLContext());
bias_image.InitCLImage(helper->OpenCLContext());
VLOG(3) << " --- Bias image: " << bias_image << " --- ";
CLImage out_image;
out_image.InitEmptyImage(helper.OpenCLContext(), out_dim);
out_image.InitEmptyImage(helper->OpenCLContext(), out_dim);
cl_int status;
status = kernel.setArg(0, *in_image.cl_image());
CL_CHECK_ERRORS(status);
......@@ -72,16 +75,23 @@ void elementwise_add(CLContext* context, const float* in, const DDim& in_dim,
CL_CHECK_ERRORS(status);
status = kernel.setArg(2, *out_image.cl_image());
CL_CHECK_ERRORS(status);
if (bias_dim.size() == 1) {
int tensor_w = in_dim[3];
status = kernel.setArg(3, tensor_w);
CL_CHECK_ERRORS(status);
}
size_t width = in_image.ImageWidth();
size_t height = in_image.ImageHeight();
auto global_work_size = cl::NDRange{width, height};
status = helper.OpenCLCommandQueue().enqueueNDRangeKernel(
status = helper->OpenCLCommandQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
CL_CHECK_ERRORS(status);
status = helper->OpenCLCommandQueue().finish();
CL_CHECK_ERRORS(status);
VLOG(3) << " --- Out image: " << out_image << " --- ";
CopyImageData(out_image, out);
CopyImageData(helper, out_image, out);
}
} // namespace lite
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_context.h"
#include "paddle/fluid/lite/opencl/cl_helper.h"
namespace paddle {
namespace lite {
......@@ -27,7 +27,7 @@ bool InitOpenCLEngine(std::string cl_path);
/// black box so that the framework can remain simple.
/// NOTE Currently, these methods are quite expensive, we will optimize them
/// latter.
void elementwise_add(CLContext* context, const float* in, const DDim& in_dim,
void elementwise_add(CLHelper* helper, const float* in, const DDim& in_dim,
const float* bias, const DDim& bias_dim, float* out,
const DDim& out_dim);
......
......@@ -156,8 +156,7 @@ bool CLEngine::InitializeDevice() {
if (ext_data.find("cl_khr_fp16") != std::string::npos) {
LOG(INFO) << "The chosen device supports the half data type.";
} else {
LOG(ERROR) << "The chosen device doesn't support the half data type!";
return false;
LOG(INFO) << "The chosen device doesn't support the half data type!";
}
auto max_units = device_->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
LOG(INFO) << "The chosen device has " << max_units << " compute units.";
......
此差异已折叠。
......@@ -16,7 +16,6 @@ limitations under the License. */
#include <glog/logging.h>
#include <array>
#include "paddle/fluid/lite/opencl/cl_engine.h"
#include "paddle/fluid/lite/opencl/cl_half.h"
#include "paddle/fluid/lite/opencl/cl_tool.h"
namespace paddle {
......@@ -26,7 +25,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
int width = cl_image.image_dims_[0];
int height = cl_image.image_dims_[1];
half_t* image_data = new half_t[height * width * 4];
float* image_data = new float[height * width * 4];
cl::Image* image = cl_image.cl_image();
const std::array<size_t, 3> origin{0, 0, 0};
const std::array<size_t, 3> region{static_cast<size_t>(width),
......@@ -131,9 +130,9 @@ void CLImage::InitCLImage(const cl::Context& context,
image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
half_t* image_data = new half_t[image_dims_.product() * 4];
float* image_data = new float[image_dims_.product() * 4];
#else
half_t* image_data = new half_t[image_dims_.production() * 4];
float* image_data = new float[image_dims_.production() * 4];
#endif
VLOG(3) << " convert to image ";
......@@ -151,7 +150,7 @@ void CLImage::InitCLImage(const cl::Context& context,
void CLImage::InitCLImage(const cl::Context& context, int width, int height,
void* data) {
cl::ImageFormat img_format(CL_RGBA, CL_HALF_FLOAT);
cl::ImageFormat img_format(CL_RGBA, CL_FLOAT);
cl_int err;
cl_image_.reset(new cl::Image2D(
context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0),
......
......@@ -36,7 +36,7 @@ DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) {
static_cast<DDim::value_type>(height)}));
}
void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
void CLImageConverterDefault::NCHWToImage(float *nchw, float *image,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) {
......@@ -68,7 +68,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
if (c < C) {
// size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4);
image[i2] = Float2Half(*p);
image[i2] = *p;
i2 += 4;
p++;
} else {
......@@ -83,7 +83,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
}
}
void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor,
void CLImageConverterDefault::ImageToNCHW(float *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
......@@ -107,7 +107,7 @@ void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor,
for (size_t h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (size_t w = 0; w < W; w++) {
*p = Half2Float(image[i2]);
*p = image[i2];
i2 += 4;
p++;
}
......@@ -161,7 +161,7 @@ DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) {
}
}
void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image,
void CLImageConverterFolder::NCHWToImage(float *tensor, float *image,
const DDim &tensor_dim) {
CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
<< " Tensor dim is not support!";
......@@ -184,14 +184,13 @@ void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image,
for (size_t h = 0; h < tdim[0]; h++) {
for (size_t w = 0; w < tdim[1]; w++) {
image[(h * width + w / 4) * 4 + (w % 4)] =
Float2Half(tensor[h * tdim[1] + w]);
image[(h * width + w / 4) * 4 + (w % 4)] = tensor[h * tdim[1] + w];
}
}
}
}
void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor,
void CLImageConverterFolder::ImageToNCHW(float *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
if (tensor_dim.size() > 2) {
......@@ -213,7 +212,7 @@ void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor,
for (size_t h = 0; h < H; h++) {
for (size_t w = 0; w < W; w++) {
p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]);
p[h * W + w] = image[(h * width + w / 4) * 4 + (w % 4)];
}
}
}
......@@ -233,7 +232,7 @@ DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
static_cast<DDim::value_type>(height)}));
}
void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
void CLImageConverterNWBlock::NCHWToImage(float *tensor, float *image,
const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
auto image_dim = InitImageDimInfoWith(tensor_dim);
......@@ -253,7 +252,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4;
if (n < N) {
image[index] = Float2Half(*p);
image[index] = *p;
p++;
} else {
image[index] = 0.0;
......@@ -268,7 +267,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
VLOG(3) << " init done";
}
void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
void CLImageConverterNWBlock::ImageToNCHW(float *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
......@@ -286,7 +285,7 @@ void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
for (size_t w = 0; w < W; ++w) {
size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4;
*p = Half2Float(image[index]);
*p = image[index];
p++;
if (index >= (width * height * 4)) {
LOG(INFO) << " index out of range ";
......@@ -312,7 +311,7 @@ DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
static_cast<DDim::value_type>(height)}));
}
void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
void CLImageConverterDWBlock::NCHWToImage(float *tensor, float *image,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (size_t j = 0; j < tensor_dim.size(); ++j) {
......@@ -344,7 +343,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
if (c < C) {
// size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4);
image[i2] = Float2Half(*p);
image[i2] = *p;
i2 += 4;
p++;
} else {
......@@ -359,7 +358,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
}
}
void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
void CLImageConverterDWBlock::ImageToNCHW(float *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
......@@ -377,7 +376,7 @@ void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
for (size_t h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (size_t w = 0; w < W; w++) {
*p = Half2Float(image[i2]);
*p = image[i2];
i2 += 4;
p++;
}
......@@ -410,7 +409,7 @@ DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) {
static_cast<DDim::value_type>(height)}));
}
void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image,
void CLImageConverterNormal::NCHWToImage(float *tensor, float *image,
const DDim &tensor_dim) {
CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
<< " Tensor dim is not support!";
......@@ -419,7 +418,7 @@ void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image,
default_converter.NCHWToImage(tensor, image, tensor_dim);
}
void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor,
void CLImageConverterNormal::ImageToNCHW(float *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
CLImageConverterDefault default_converter;
......@@ -439,10 +438,10 @@ DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith(
static_cast<DDim::value_type>(height)}));
}
void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, half_t *image,
void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, float *image,
const DDim &tensor_dim) {}
void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image, float *tensor,
void CLImageConverterWinoTransWeight::ImageToNCHW(float *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {}
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/opencl/cl_half.h"
namespace paddle {
namespace lite {
......@@ -24,10 +23,10 @@ class CLImageConverterBase {
public:
virtual ~CLImageConverterBase() {}
virtual void NCHWToImage(float *nchw, half_t *image,
virtual void NCHWToImage(float *nchw, float *image,
const DDim &tensor_dim) = 0;
virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim,
virtual void ImageToNCHW(float *image, float *nchw, const DDim &image_dim,
const DDim &tensor_dim) = 0;
virtual DDim InitImageDimInfoWith(const DDim &tensor_dim) = 0;
};
......@@ -35,16 +34,16 @@ class CLImageConverterBase {
class CLImageConverterDefault : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
void NCHWToImage(float *nchw, float *image, const DDim &tensor_dim);
void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterFolder : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
/*
......@@ -68,8 +67,8 @@ class CLImageConverterFolder : public CLImageConverterBase {
class CLImageConverterNormal : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
/*
......@@ -92,22 +91,22 @@ class CLImageConverterNormal : public CLImageConverterBase {
class CLImageConverterNWBlock : public CLImageConverterBase {
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterDWBlock : public CLImageConverterBase {
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterWinoTransWeight : public CLImageConverterBase {
public:
DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
......
......@@ -12,21 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
namespace paddle {
namespace lite {
typedef uint16_t half_t;
half_t Float2Half(float f);
float Half2Float(half_t h);
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
} // namespace lite
} // namespace paddle
// Adds a per-channel bias to the input: out[x, y] = in[x, y] + bias[channel].
//
// Data is packed RGBA-float, i.e. each texel holds 4 consecutive channels.
// `w` is the tensor width in pixels, so x / w selects the channel-block
// column in the bias image (assumed to be a single row — coords_bias.y = 0).
__kernel void channel_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage, __private const int w) {
  int x = get_global_id(0);
  int y = get_global_id(1);

  // BUGFIX: read_imagef with integer coordinates requires a sampler with
  // CLK_NORMALIZED_COORDS_FALSE (plus CLK_FILTER_NEAREST and a CLAMP/NONE
  // addressing mode); with CLK_NORMALIZED_COORDS_TRUE the values returned
  // are undefined per the OpenCL C specification.
  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  int2 coords;
  coords.x = x;
  coords.y = y;

  // Bias lookup: one texel per 4-channel block, single-row image.
  int2 coords_bias;
  coords_bias.x = x / w;
  coords_bias.y = 0;

  float4 in = read_imagef(input, sampler, coords);
  float4 biase = read_imagef(bias, sampler, coords_bias);
  float4 output = in + biase;
  write_imagef(outputImage, coords, output);
}
......@@ -14,21 +14,19 @@ limitations under the License. */
#pragma once
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
inline half4 activation(half4 in
inline float4 activation(float4 in
#ifdef PRELU
,
half4 prelu_alpha
,
float4 prelu_alpha
#endif
) {
half4 output;
) {
float4 output;
#ifdef PRELU
output = select(prelu_alpha * in, in, in >= (half4)0.0);
output = select(prelu_alpha * in, in, in >= (float4)0.0);
#endif
#ifdef RELU
output = fmax(in, (half4)(0.0f));
output = fmax(in, (float4)(0.0f));
#endif
return output;
}
......@@ -12,16 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void elementwise_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) {
__kernel void elementwise_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
half4 in = read_imageh(input, sampler, coords);
half4 biase = read_imageh(bias, sampler, coords);
half4 output = in + biase;
write_imageh(outputImage,coords,output);
float4 in = read_imagef(input, sampler, coords);
float4 biase = read_imagef(bias, sampler, coords);
float4 output = in + biase;
write_imagef(outputImage,coords,output);
}
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define MIN_VALUE -FLT_MAX
__kernel void pool_max(
......@@ -41,16 +40,16 @@ __kernel void pool_max(
const int pos_in_x = out_c * in_width;
const int pos_in_y = out_n * in_height;
half4 max_value = (half4)(MIN_VALUE);
float4 max_value = (float4)(MIN_VALUE);
for (int y = start_h; y < end_h; ++y) {
for (int x = start_w; x < end_w; ++x) {
half4 tmp = read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
float4 tmp = read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
max_value = max(max_value, tmp);
}
}
const int pos_out_x = mad24(out_c, out_width, out_w);
write_imageh(output, (int2)(pos_out_x, out_nh), max_value);
write_imagef(output, (int2)(pos_out_x, out_nh), max_value);
}
__kernel void pool_avg(
......@@ -77,15 +76,15 @@ __kernel void pool_avg(
const int pos_in_x = out_c * in_width;
const int pos_in_y = out_n * in_height;
half4 sum = (half4)(0.0f);
float4 sum = (float4)(0.0f);
int num = 0;
for (int y = start_h; y < end_h; ++y) {
for (int x = start_w; x < end_w; ++x) {
sum += read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
sum += read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
num++;
}
}
half4 avg = sum / num;
float4 avg = sum / num;
const int pos_out_x = mad24(out_c, out_width, out_w);
write_imageh(output, (int2)(pos_out_x, out_nh), avg);
write_imagef(output, (int2)(pos_out_x, out_nh), avg);
}
......@@ -67,28 +67,28 @@ TEST(cl_test, kernel_test) {
helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
auto kernel = helper->GetKernel(2);
std::unique_ptr<float[]> in_data(new float[1024 * 512]);
for (int i = 0; i < 1024 * 512; i++) {
std::unique_ptr<float[]> in_data(new float[4 * 3 * 256 * 512]);
for (int i = 0; i < 4 * 3 * 256 * 512; i++) {
in_data[i] = 1.f;
}
const DDim in_dim = DDim(std::vector<DDim::value_type>{1024, 512});
const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 3, 256, 512});
CLImage in_image;
in_image.set_tensor_data(in_data.get(), in_dim);
in_image.InitNormalCLImage(helper->OpenCLContext());
LOG(INFO) << in_image;
std::unique_ptr<float[]> bias_data(new float[1024 * 512]);
for (int i = 0; i < 1024 * 512; i++) {
std::unique_ptr<float[]> bias_data(new float[4 * 3 * 256 * 512]);
for (int i = 0; i < 4 * 3 * 256 * 512; i++) {
bias_data[i] = 2.f;
}
const DDim bias_dim = DDim(std::vector<DDim::value_type>{1024, 512});
const DDim bias_dim = DDim(std::vector<DDim::value_type>{4, 3, 256, 512});
CLImage bias_image;
bias_image.set_tensor_data(bias_data.get(), bias_dim);
bias_image.InitNormalCLImage(helper->OpenCLContext());
LOG(INFO) << bias_image;
CLImage out_image;
const DDim out_dim = DDim(std::vector<DDim::value_type>{1024, 512});
const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 3, 256, 512});
out_image.InitEmptyImage(helper->OpenCLContext(), out_dim);
LOG(INFO) << out_image;
......@@ -108,7 +108,8 @@ TEST(cl_test, kernel_test) {
status = helper->OpenCLCommandQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event);
CL_CHECK_ERRORS(status);
status = helper->OpenCLCommandQueue().finish();
CL_CHECK_ERRORS(status);
double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
......@@ -116,37 +117,99 @@ TEST(cl_test, kernel_test) {
LOG(INFO) << out_image;
}
TEST(cl_test, elementwise_add_test) {
TEST(cl_test, channel_add_test) {
std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-5, 5);
const DDim in_dim = DDim(std::vector<DDim::value_type>{1024, 512});
std::unique_ptr<float[]> in_data(new float[1024 * 512]);
for (int i = 0; i < 1024 * 512; i++) {
const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> in_data(new float[4 * 16 * 256 * 512]);
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
in_data[i] = dist(engine);
}
const DDim bias_dim = DDim(std::vector<DDim::value_type>{1024, 512});
std::unique_ptr<float[]> bias_data(new float[1024 * 512]);
for (int i = 0; i < 1024 * 512; i++) {
const DDim bias_dim = DDim(std::vector<DDim::value_type>{16});
std::unique_ptr<float[]> bias_data(new float[16]);
for (int i = 0; i < 16; i++) {
bias_data[i] = dist(engine);
}
const DDim out_dim = DDim(std::vector<DDim::value_type>{1024, 512});
std::unique_ptr<float[]> out(new float[1024 * 512]);
std::unique_ptr<float[]> out_ref(new float[4 * 16 * 256 * 512]);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 16; j++) {
float b = bias_data[j];
for (int k = 0; k < 256 * 512; k++) {
int index = (i * 16 + j) * 256 * 512 + k;
out_ref[index] = in_data[index] + b;
}
}
}
const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
std::unique_ptr<float[]> out(new float[4 * 16 * 256 * 512]);
bool status = InitOpenCLEngine(FLAGS_cl_path);
CHECK(status) << "Fail to initialize OpenCL engine.";
CLContext context;
std::unique_ptr<CLContext> context(new CLContext);
std::unique_ptr<CLHelper> helper(new CLHelper(context.get()));
helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
helper->AddKernel("channel_add", "channel_add_kernel.cl");
elementwise_add(helper.get(), in_data.get(), in_dim, bias_data.get(),
bias_dim, out.get(), out_dim);
int stride = 4 * 16 * 256 * 512 / 20;
for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) {
std::cout << out[i] << " ";
}
elementwise_add(&context, in_data.get(), in_dim, bias_data.get(), bias_dim,
out.get(), out_dim);
for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
EXPECT_NEAR(out[i], out_ref[i], 1e-6);
}
int stride = 1024 * 512 / 20;
for (int i = 0; i < 1024 * 512; i += stride) {
std::cout << std::endl;
}
// Element-wise addition where the bias tensor has the same 4-D shape as the
// input; the OpenCL result is validated against a CPU-computed reference.
TEST(cl_test, elementwise_add_test) {
  const size_t kNumel = 4 * 16 * 256 * 512;  // total element count
  std::default_random_engine engine;
  std::uniform_real_distribution<float> dist(-5, 5);

  const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
  std::vector<float> in_data(kNumel);
  for (auto &v : in_data) {
    v = dist(engine);
  }

  const DDim bias_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
  std::vector<float> bias_data(kNumel);
  for (auto &v : bias_data) {
    v = dist(engine);
  }

  // CPU reference: plain element-wise sum.
  std::vector<float> out_ref(kNumel);
  for (size_t i = 0; i < kNumel; ++i) {
    out_ref[i] = in_data[i] + bias_data[i];
  }

  const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
  std::vector<float> out(kNumel);

  bool status = InitOpenCLEngine(FLAGS_cl_path);
  CHECK(status) << "Fail to initialize OpenCL engine.";
  std::unique_ptr<CLContext> context(new CLContext);
  std::unique_ptr<CLHelper> helper(new CLHelper(context.get()));
  helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
  helper->AddKernel("channel_add", "channel_add_kernel.cl");

  elementwise_add(helper.get(), in_data.data(), in_dim, bias_data.data(),
                  bias_dim, out.data(), out_dim);

  // Spot-print ~20 evenly spaced outputs for manual inspection.
  const size_t stride = kNumel / 20;
  for (size_t i = 0; i < kNumel; i += stride) {
    std::cout << out[i] << " ";
  }

  for (size_t i = 0; i < kNumel; ++i) {
    EXPECT_NEAR(out[i], out_ref[i], 1e-6);
  }
  std::cout << std::endl;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register or sign in.