diff --git a/paddle/fluid/lite/opencl/CMakeLists.txt b/paddle/fluid/lite/opencl/CMakeLists.txt
index 00c0e35b07430fe6fb0a3e83a6cb4af1f4c6a97b..06072a1067d69d8376c9de2fd31dd452fb79efde 100644
--- a/paddle/fluid/lite/opencl/CMakeLists.txt
+++ b/paddle/fluid/lite/opencl/CMakeLists.txt
@@ -7,7 +7,8 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
     set_target_properties(opencl-lib
         PROPERTIES
         IMPORTED_LOCATION
-        ${CMAKE_SOURCE_DIR}/opencl-lib/armeabi-v7a/libOpenCL.so)
+        # Was .../libOpenCL.so. NOTE(review): libGLES_mali.so is presumably Mali-only.
+        ${CMAKE_SOURCE_DIR}/opencl-lib/armeabi-v7a/libGLES_mali.so)
 
     cc_library(cl_tool SRCS cl_tool.cc)
     target_compile_options(cl_tool BEFORE PUBLIC -Wno-ignored-qualifiers)
@@ -18,7 +19,8 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
     cc_library(cl_helper SRCS cl_helper.cc DEPS cl_context)
     cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS cl_half lite_tensor)
     cc_library(cl_image SRCS cl_image.cc DEPS cl_half lite_tensor cl_image_converter cl_engine)
-    lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_engine cl_context)
+    cc_library(cl_caller SRCS cl_caller.cc  DEPS cl_helper cl_image)
+    lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_helper cl_image cl_caller)
     target_link_libraries(test_cl_runtime opencl-lib)
     add_dependencies(cl_tool opencl_clhpp)
 endif()
diff --git a/paddle/fluid/lite/opencl/cl_caller.cc b/paddle/fluid/lite/opencl/cl_caller.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fbb970dea2dee5290f6b5c9f8b9c5b410bd6c38d
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_caller.cc
@@ -0,0 +1,89 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/lite/opencl/cl_caller.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/lite/core/compatible_tensor.h"
+#include "paddle/fluid/lite/opencl/cl_context.h"
+#include "paddle/fluid/lite/opencl/cl_engine.h"
+#include "paddle/fluid/lite/opencl/cl_helper.h"
+#include "paddle/fluid/lite/opencl/cl_image.h"
+#include "paddle/fluid/lite/opencl/cl_tool.h"
+
+namespace paddle {
+namespace lite {
+// Blocking read of `cl_image` from the device, converted into `out` by the
+// image's own converter. `out` is assumed large enough (not checked here).
+static void CopyImageData(const CLImage& cl_image, float* out) {
+  const size_t width = static_cast<size_t>(cl_image.image_dims()[0]);
+  const size_t height = static_cast<size_t>(cl_image.image_dims()[1]);
+
+  // 4 half_t values per pixel; the vector releases itself on every exit path.
+  std::vector<half_t> image_data(height * width * 4);
+  const std::array<size_t, 3> origin{0, 0, 0};
+  const std::array<size_t, 3> region{width, height, 1};
+  cl_int err = CLEngine::Global()->command_queue().enqueueReadImage(
+      *cl_image.cl_image(), CL_TRUE, origin, region, 0, 0, image_data.data(),
+      nullptr, nullptr);
+  CL_CHECK_ERRORS(err);
+
+  auto* converter = cl_image.image_converter();
+  converter->ImageToNCHW(image_data.data(), out, cl_image.image_dims(),
+                         cl_image.tensor_dims());
+}
+
+bool InitOpenCLEngine(std::string cl_path) {  // set kernel dir, report init
+  CLEngine& global_engine = *CLEngine::Global();
+  global_engine.set_cl_path(cl_path);
+  return global_engine.IsInitSuccess();
+}
+
+// Runs the "elementwise_add" kernel on `in` and `bias` (uploaded as normal CL
+// images) and copies the result image back into `out`. The three dims are
+// not cross-checked here. NOTE(review): presumably they must agree - confirm.
+void elementwise_add(CLContext* context, float* in, const DDim& in_dim,
+                     float* bias, const DDim& bias_dim, float* out,
+                     const DDim& out_dim) {
+  CLHelper helper(context);
+  helper.AddKernel("elementwise_add", "elementwise_add_kernel.cl");
+  auto& kernel = helper.KernelAt(0);
+  CLImage in_image;
+  in_image.set_tensor_data(in, in_dim);
+  in_image.InitNormalCLImage(helper.OpenCLContext());
+  VLOG(3) << " --- Input image: " << in_image << " --- ";
+  CLImage bias_image;
+  bias_image.set_tensor_data(bias, bias_dim);
+  bias_image.InitNormalCLImage(helper.OpenCLContext());
+  VLOG(3) << " --- Bias image: " << bias_image << " --- ";
+  CLImage out_image;
+  out_image.InitEmptyImage(helper.OpenCLContext(), out_dim);
+  cl_int status = kernel.setArg(0, *in_image.cl_image());
+  CL_CHECK_ERRORS(status);
+  status = kernel.setArg(1, *bias_image.cl_image());
+  CL_CHECK_ERRORS(status);
+  status = kernel.setArg(2, *out_image.cl_image());
+  CL_CHECK_ERRORS(status);
+  size_t width = in_image.ImageWidth();
+  size_t height = in_image.ImageHeight();
+  auto global_work_size = cl::NDRange{width, height};
+  status = helper.OpenCLCommandQueue().enqueueNDRangeKernel(
+      kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
+  CL_CHECK_ERRORS(status);
+  VLOG(3) << " --- Out image: " << out_image << " --- ";
+  CopyImageData(out_image, out);
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/opencl/cl_caller.h b/paddle/fluid/lite/opencl/cl_caller.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed9b879fae2ffaece1a8e28b729b578ff19fdb44
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_caller.h
@@ -0,0 +1,30 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/lite/core/compatible_tensor.h"
+#include "paddle/fluid/lite/opencl/cl_context.h"
+
+namespace paddle {
+namespace lite {
+// Sets the kernel path on the global CLEngine; returns its init status.
+bool InitOpenCLEngine(std::string cl_path);
+// Runs the elementwise_add OpenCL kernel on in/bias; result is copied to out.
+void elementwise_add(CLContext* context, float* in, const DDim& in_dim,
+                     float* bias, const DDim& bias_dim, float* out,
+                     const DDim& out_dim);
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/opencl/cl_engine.cc b/paddle/fluid/lite/opencl/cl_engine.cc
index 97301ecba281077a23653e8ceac1d6e6e3907aa3..be82ba23cbb47f9597deec89c42714ec2d27025e 100644
--- a/paddle/fluid/lite/opencl/cl_engine.cc
+++ b/paddle/fluid/lite/opencl/cl_engine.cc
@@ -133,10 +133,10 @@ bool CLEngine::InitializePlatform() {
 
 bool CLEngine::InitializeDevice() {
   std::vector<cl::Device> all_devices;
-  status_ = platform_->getDevices(CL_DEVICE_TYPE_DEFAULT, &all_devices);
+  status_ = platform_->getDevices(CL_DEVICE_TYPE_GPU, &all_devices);
   CL_CHECK_ERRORS(status_);
   if (all_devices.empty()) {
-    LOG(ERROR) << "No OpenCL device found!";
+    LOG(ERROR) << "No OpenCL GPU device found!";
     return false;
   }
   device_ = std::make_shared<cl::Device>();
diff --git a/paddle/fluid/lite/opencl/cl_helper.cc b/paddle/fluid/lite/opencl/cl_helper.cc
index 2a886d0b06e92becc30de09c4847b0cca59d4d9b..116828c153da8a3e94d1d6020137d4ff3cee95ef 100644
--- a/paddle/fluid/lite/opencl/cl_helper.cc
+++ b/paddle/fluid/lite/opencl/cl_helper.cc
@@ -21,9 +21,12 @@ limitations under the License. */
 namespace paddle {
 namespace lite {
 
+// Stores `context` (pointer kept as-is) for AddKernel/OpenCL* accessors to use.
+void CLHelper::set_context(CLContext *context) { context_ = context; }
 void CLHelper::AddKernel(const std::string &kernel_name,
                          const std::string &file_name,
                          const std::string &options) {
+  CHECK(context_ != nullptr) << "Please use set_context first!";
   VLOG(3) << " --- begin to add kernel ---";
   auto kernel = context_->GetKernel(kernel_name, file_name, options);
   kernels.emplace_back(std::move(kernel));
@@ -32,16 +35,24 @@ void CLHelper::AddKernel(const std::string &kernel_name,
 
 cl::Kernel &CLHelper::KernelAt(const int index) {
   VLOG(3) << " --- kernel count: " << kernels.size() << " --- ";
+  CHECK(static_cast<size_t>(index) < kernels.size())  // rejects index < 0 too
+      << "The index must be less than the size of kernels.";
+  CHECK(kernels[index] != nullptr)
+      << "The target kernel pointer cannot be null.";
   return *(kernels[index]);
 }
 
 cl::CommandQueue &CLHelper::OpenCLCommandQueue() {
+  CHECK(context_ != nullptr) << "Please use set_context first!";
   return context_->GetCommandQueue();
 }
 
-cl::Context &CLHelper::OpenCLContext() { return context_->GetContext(); }
+cl::Context &CLHelper::OpenCLContext() {
+  CHECK(context_ != nullptr) << "Please use set_context first!";
+  return context_->GetContext();
+}
 
-std::vector<size_t> CLHelper::DefaultWorkSize(const CLImage &image) {
+cl::NDRange CLHelper::DefaultWorkSize(const CLImage &image) {
   // n c h w
   auto image_dim = image.tensor_dims();
   if (image_dim.size() == 4) {
@@ -52,23 +63,26 @@ std::vector<size_t> CLHelper::DefaultWorkSize(const CLImage &image) {
     auto work_size_0 = image_width / w;
     auto work_size_1 = w;
     auto work_size_2 = n * h;
-    return {static_cast<size_t>(work_size_0), static_cast<size_t>(work_size_1),
-            static_cast<size_t>(work_size_2)};
+    return cl::NDRange{static_cast<size_t>(work_size_0),
+                       static_cast<size_t>(work_size_1),
+                       static_cast<size_t>(work_size_2)};
   } else if (image_dim.size() == 2) {
-    return {static_cast<size_t>(1), static_cast<size_t>(image.ImageWidth()),
-            static_cast<size_t>(image.ImageHeight())};
+    return cl::NDRange{static_cast<size_t>(1),
+                       static_cast<size_t>(image.ImageWidth()),
+                       static_cast<size_t>(image.ImageHeight())};
   } else if (image_dim.size() == 1) {
-    return {static_cast<size_t>(1), static_cast<size_t>(image.ImageWidth()),
-            static_cast<size_t>(1)};
+    return cl::NDRange{static_cast<size_t>(1),
+                       static_cast<size_t>(image.ImageWidth()),
+                       static_cast<size_t>(1)};
   } else if (image_dim.size() == 3) {
     auto c = image_dim[0];
     auto h = image_dim[1];
     auto w = image_dim[2];
-    return {static_cast<size_t>((c + 3) / 4), static_cast<size_t>(w),
-            static_cast<size_t>(h)};
+    return cl::NDRange{static_cast<size_t>((c + 3) / 4), static_cast<size_t>(w),
+                       static_cast<size_t>(h)};
   } else {
     LOG(FATAL) << "Not support this dimension, need to be implemented!";
-    return {};
+    return cl::NDRange{};
   }
 }
 
diff --git a/paddle/fluid/lite/opencl/cl_helper.h b/paddle/fluid/lite/opencl/cl_helper.h
index fce1929db5996528f2070df26e38af48daa434ea..f6f89fb6fdac15f05cc61f61ead6ba8fadee4b74 100644
--- a/paddle/fluid/lite/opencl/cl_helper.h
+++ b/paddle/fluid/lite/opencl/cl_helper.h
@@ -30,6 +30,8 @@ class CLHelper {
 
   explicit CLHelper(CLContext *context) : context_(context) {}
 
+  // Sets the CLContext used by this helper; AddKernel CHECKs it is non-null.
+  void set_context(CLContext *context);
   void AddKernel(const std::string &kernel_name, const std::string &file_name,
                  const std::string &options = "");
 
@@ -39,10 +41,10 @@ class CLHelper {
 
   cl::Context &OpenCLContext();
 
-  std::vector<size_t> DefaultWorkSize(const CLImage &image);
+  cl::NDRange DefaultWorkSize(const CLImage &image);
 
  private:
-  CLContext *context_;
+  CLContext *context_{nullptr};
   std::vector<std::unique_ptr<cl::Kernel>> kernels;
 };
 
diff --git a/paddle/fluid/lite/opencl/cl_image.cc b/paddle/fluid/lite/opencl/cl_image.cc
index 8edcf000b034edd60f0571f5582cdf93a8e687a0..2c551d6cd3342b23c34427fe3532374617619b3d 100644
--- a/paddle/fluid/lite/opencl/cl_image.cc
+++ b/paddle/fluid/lite/opencl/cl_image.cc
@@ -27,12 +27,12 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
   int height = cl_image.image_dims_[1];
 
   half_t* image_data = new half_t[height * width * 4];
-  cl::Image2D& image = cl_image.cl_image();
+  cl::Image* image = cl_image.cl_image();
   const std::array<size_t, 3> origin{0, 0, 0};
   const std::array<size_t, 3> region{static_cast<size_t>(width),
                                      static_cast<size_t>(height), 1};
   cl_int err = CLEngine::Global()->command_queue().enqueueReadImage(
-      image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
+      *image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
   CL_CHECK_ERRORS(err);
 
   float* tensor_data = new float[cl_image.numel()];
@@ -53,7 +53,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
   return os;
 }
 
-void CLImage::SetTensorData(float* tensor_data, const DDim& dim) {
+void CLImage::set_tensor_data(float* tensor_data, const DDim& dim) {
 #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
   auto numel = dim.product();
 #else
@@ -65,28 +65,30 @@ void CLImage::SetTensorData(float* tensor_data, const DDim& dim) {
 }
 
 void CLImage::InitCLImage(const cl::Context& context) {
-  CHECK(tensor_data_ != nullptr) << " Please call SetTensorData first!";
+  // Folder layout is used here; per cl_image.h this is meant for one- or
+  // two-dimensional tensors. Tensor data must already be set.
+  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
   image_converter_.reset(new CLImageConverterFolder);
   InitCLImage(context, image_converter_.get());
 }
 
 void CLImage::InitNormalCLImage(const cl::Context& context) {
-  CHECK(tensor_data_ != nullptr) << " Please call SetTensorData first!";
+  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
   image_converter_.reset(new CLImageConverterNormal);
   InitCLImage(context, image_converter_.get());
 }
 
 void CLImage::InitNImage(const cl::Context& context) {
-  CHECK(tensor_data_ != nullptr) << " Please call SetTensorData first!";
+  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
   CHECK(tensor_dims_.size() == 4) << " Tensor dim is not 4.";
-  image_converter_.reset(new CLImageConverterNWBlock());
+  image_converter_.reset(new CLImageConverterNWBlock);
   InitCLImage(context, image_converter_.get());
 }
 
 void CLImage::InitDWImage(const cl::Context& context) {
-  CHECK(tensor_data_ != nullptr) << " Please call SetTensorData first!";
+  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
   CHECK(tensor_dims_.size() == 4) << " Tensor dim is not 4.";
-  image_converter_.reset(new CLImageConverterDWBlock());
+  image_converter_.reset(new CLImageConverterDWBlock);
   InitCLImage(context, image_converter_.get());
 }
 
@@ -95,7 +97,7 @@ void CLImage::InitEmptyImage(const cl::Context& context, const DDim& dim) {
       << " Empty image tensor data shouldn't have value";
 
   tensor_dims_ = dim;
-  image_converter_.reset(new CLImageConverterNormal());
+  image_converter_.reset(new CLImageConverterNormal);
 
   VLOG(3) << " to get image dims ";
   image_dims_ = image_converter_->InitImageDimInfoWith(tensor_dims_);
@@ -123,7 +125,7 @@ void CLImage::InitEmptyWithImageDim(const cl::Context& context,
 
 void CLImage::InitCLImage(const cl::Context& context,
                           CLImageConverterBase* converter) {
-  CHECK(tensor_data_ != nullptr) << " Please call SetTensorData first!";
+  CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!";
 
   VLOG(3) << " begin init cl image ";
   image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
diff --git a/paddle/fluid/lite/opencl/cl_image.h b/paddle/fluid/lite/opencl/cl_image.h
index 9b827d9818ab16cfa419e149ec6b4fd83fe2130d..627e503168e4ed12ea30137f2c3155bd07f9e062 100644
--- a/paddle/fluid/lite/opencl/cl_image.h
+++ b/paddle/fluid/lite/opencl/cl_image.h
@@ -33,11 +33,11 @@ class CLImage {
   /*
    * Will not hold input tensor data, memcpy in this method.
    * */
-  void SetTensorData(float* tensor_data, const DDim& dim);
+  void set_tensor_data(float* tensor_data, const DDim& dim);
 
   bool IsInit() { return initialized_; }
   /*
-   * Need call SetTensorData first.
+   * Need call set_tensor_data first.
    * Folder when one dim or two dim.
    * */
   void InitCLImage(const cl::Context& context);
@@ -53,7 +53,7 @@ class CLImage {
   void InitEmptyWithImageDim(const cl::Context& context,
                              const DDim& image_dims);
 
-  cl::Image2D& cl_image() const { return *cl_image_; }
+  cl::Image* cl_image() const { return cl_image_.get(); }
 
   const DDim& image_dims() const { return image_dims_; }
 
@@ -63,7 +63,7 @@ class CLImage {
 
   const DDim& tensor_dims() const { return tensor_dims_; }
 
-  /*
+  /*
    * Resize original tensor dim.
    * */
   inline CLImage& Resize(const DDim& dims) {
diff --git a/paddle/fluid/lite/opencl/cl_test.cc b/paddle/fluid/lite/opencl/cl_test.cc
index 0d3d9d7a389fe9885ec4d90e2447aa105425641d..4a4ac965c1191d9fe4407635911d8feef9bf726a 100644
--- a/paddle/fluid/lite/opencl/cl_test.cc
+++ b/paddle/fluid/lite/opencl/cl_test.cc
@@ -12,10 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include <memory>
+#include <random>
+#include <vector>
+#include "paddle/fluid/lite/core/compatible_tensor.h"
+#include "paddle/fluid/lite/opencl/cl_caller.h"
 #include "paddle/fluid/lite/opencl/cl_context.h"
 #include "paddle/fluid/lite/opencl/cl_engine.h"
+#include "paddle/fluid/lite/opencl/cl_helper.h"
+#include "paddle/fluid/lite/opencl/cl_image.h"
+
+DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path.");
 
 namespace paddle {
 namespace lite {
@@ -23,7 +33,7 @@ namespace lite {
 TEST(cl_test, engine_test) {
   auto* engine = CLEngine::Global();
   CHECK(engine->IsInitSuccess());
-  engine->set_cl_path("/data/local/tmp/opencl");
+  engine->set_cl_path(FLAGS_cl_path);
   engine->platform();
   engine->device();
   engine->command_queue();
@@ -37,11 +47,108 @@ TEST(cl_test, engine_test) {
 TEST(cl_test, context_test) {
   auto* engine = CLEngine::Global();
   CHECK(engine->IsInitSuccess());
-  engine->set_cl_path("/data/local/tmp/opencl");
+  engine->set_cl_path(FLAGS_cl_path);
   CLContext context;
   context.GetKernel("pool_max", "pool_kernel.cl", "");
   context.GetKernel("elementwise_add", "elementwise_add_kernel.cl", "");
   context.GetKernel("elementwise_add", "elementwise_add_kernel.cl", "");
 }
+
+// Runs the elementwise_add kernel (KernelAt(2)) on two constant-filled
+// {1024, 512} tensors and logs its profiled run time; nothing is asserted.
+TEST(cl_test, kernel_test) {
+  auto* engine = CLEngine::Global();
+  CHECK(engine->IsInitSuccess());
+  engine->set_cl_path(FLAGS_cl_path);
+  std::unique_ptr<CLContext> context(new CLContext);
+  std::unique_ptr<CLHelper> helper(new CLHelper);
+  helper->set_context(context.get());
+  helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
+  helper->AddKernel("pool_max", "pool_kernel.cl");
+  helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
+  auto& kernel = helper->KernelAt(2);
+
+  std::unique_ptr<float[]> in_data(new float[1024 * 512]);
+  for (int i = 0; i < 1024 * 512; i++) {
+    in_data[i] = 1.f;
+  }
+  const DDim in_dim = DDim(std::vector<DDim::value_type>{1024, 512});
+  CLImage in_image;
+  in_image.set_tensor_data(in_data.get(), in_dim);
+  in_image.InitNormalCLImage(helper->OpenCLContext());
+  LOG(INFO) << in_image;
+
+  std::unique_ptr<float[]> bias_data(new float[1024 * 512]);
+  for (int i = 0; i < 1024 * 512; i++) {
+    bias_data[i] = 2.f;
+  }
+  const DDim bias_dim = DDim(std::vector<DDim::value_type>{1024, 512});
+  CLImage bias_image;
+  bias_image.set_tensor_data(bias_data.get(), bias_dim);
+  bias_image.InitNormalCLImage(helper->OpenCLContext());
+  LOG(INFO) << bias_image;
+
+  CLImage out_image;
+  const DDim out_dim = DDim(std::vector<DDim::value_type>{1024, 512});
+  out_image.InitEmptyImage(helper->OpenCLContext(), out_dim);
+  LOG(INFO) << out_image;
+
+  cl_int status = kernel.setArg(0, *in_image.cl_image());
+  CL_CHECK_ERRORS(status);
+  status = kernel.setArg(1, *bias_image.cl_image());
+  CL_CHECK_ERRORS(status);
+  status = kernel.setArg(2, *out_image.cl_image());
+  CL_CHECK_ERRORS(status);
+  size_t width = in_image.ImageWidth();
+  size_t height = in_image.ImageHeight();
+  auto global_work_size = cl::NDRange{width, height};
+  cl::Event event;
+  status = helper->OpenCLCommandQueue().enqueueNDRangeKernel(
+      kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event);
+  CL_CHECK_ERRORS(status);
+  // Profiling counters are only readable once the kernel has completed.
+  status = event.wait();
+  CL_CHECK_ERRORS(status);
+  double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+  double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+  double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
+  LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us.";
+  LOG(INFO) << out_image;
+}
+
+// Feeds uniform random data in [-5, 5) through elementwise_add() and prints a
+// sample of the output. (Nothing is asserted on the values.)
+TEST(cl_test, elementwise_add_test) {
+  constexpr int kNumel = 1024 * 512;
+  const DDim in_dim(std::vector<DDim::value_type>{1024, 512});
+  const DDim bias_dim(std::vector<DDim::value_type>{1024, 512});
+  const DDim out_dim(std::vector<DDim::value_type>{1024, 512});
+
+  std::default_random_engine rng;
+  std::uniform_real_distribution<float> uniform(-5, 5);
+  // Draw order matters for reproducibility: all of `in` first, then `bias`.
+  std::unique_ptr<float[]> in_data(new float[kNumel]);
+  for (int idx = 0; idx != kNumel; ++idx) {
+    in_data[idx] = uniform(rng);
+  }
+  std::unique_ptr<float[]> bias_data(new float[kNumel]);
+  for (int idx = 0; idx != kNumel; ++idx) {
+    bias_data[idx] = uniform(rng);
+  }
+  std::unique_ptr<float[]> out(new float[kNumel]);
+  const bool engine_ready = InitOpenCLEngine(FLAGS_cl_path);
+  CHECK(engine_ready) << "Fail to initialize OpenCL engine.";
+  CLContext context;
+  elementwise_add(&context, in_data.get(), in_dim, bias_data.get(), bias_dim,
+                  out.get(), out_dim);
+
+  const int stride = kNumel / 20;
+  for (int idx = 0; idx < kNumel; idx += stride) {
+    std::cout << out[idx] << " ";
+  }
+
+  std::cout << std::endl;
+}
+
 }  // namespace lite
 }  // namespace paddle