Merge pull request #1070 from codeWorm2015/opencl

commit cl code

Merge pull request #1070 from codeWorm2015/opencl
commit cl code
b054affa · Ray Liu · GitHub · 84f34a23 · c2062894 · b054affa
20 changed file
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #pragma once
 #include <chrono>

+namespace paddle_mobile {
+
 using Time = decltype(std::chrono::high_resolution_clock::now());

 inline Time time() { return std::chrono::high_resolution_clock::now(); }
@@ -25,3 +27,5 @@ inline double time_diff(Time t1, Time t2) {
  ms counter = std::chrono::duration_cast<ms>(diff);
  return counter.count() / 1000.0;
 }
+
+}
--- a/src/framework/cl/cl_engine.h
+++ b/src/framework/cl/cl_engine.h
@@ -18,8 +18,8 @@ limitations under the License. */
 #include <string>

 #include "CL/cl.h"
-#include "common/log.h"
 #include "common/enforce.h"
+#include "common/log.h"
 #include "framework/cl/cl_deleter.h"
 #include "framework/cl/cl_tool.h"


--- a/src/framework/cl/cl_image.cpp
+++ b/src/framework/cl/cl_image.cpp
@@ -14,20 +14,20 @@ limitations under the License. */

 #include "cl_image.h"
 namespace paddle_mobile {
-    namespace framework {
-        void CLImageToTensor(CLImage *cl_image, Tensor *tensor,cl_command_queue commandQueue){
-
+namespace framework {
+void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
+                     cl_command_queue commandQueue) {
  DDim ddim = cl_image->dims();
-            size_t N,C,H,W;
-            if(ddim.size()==4){
+  size_t N, C, H, W;
+  if (ddim.size() == 4) {
    N = ddim[0];
-                if(N<0){
+    if (N < 0) {
      N = 1;
    }
    C = ddim[1];
    H = ddim[2];
    W = ddim[3];
-            }else if(ddim.size()==1){
+  } else if (ddim.size() == 1) {
    N = 1;
    C = ddim[0];
    H = 1;
@@ -41,15 +41,16 @@ namespace paddle_mobile {
  half imageData[width * height * 4];
  cl_int err;
  cl_mem image = cl_image->GetCLImage();
-            size_t origin[3] = {0,0,0};
-            size_t region[3] = {width,height,1};
-            err = clEnqueueReadImage(commandQueue,image,CL_TRUE,origin,region,0,0,imageData,0,NULL,NULL);
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {width, height, 1};
+  err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0,
+                           imageData, 0, NULL, NULL);
  size_t i0 = 0;
  for (int n = 0; n < N; n++) {
    for (int c = 0; c < C; c++) {
      size_t i1 = i0;
      for (int h = 0; h < H; h++) {
-                            size_t i2 = (i1<<2) + c % 4;
+        size_t i2 = (i1 << 2) + c % 4;
        for (int w = 0; w < W; w++) {
          *p = half2float(imageData[i2]);
          i2 += 4;
@@ -61,25 +62,23 @@ namespace paddle_mobile {
    i0 += width * H;
  }

-
  if (err != CL_SUCCESS) {
    // TODO: error handling
  }
-
-        }
-        void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,cl_command_queue commandQueue){
-
+}
+void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
+                     cl_command_queue commandQueue) {
  DDim ddim = cl_image->dims();
-            size_t N,C,H,W;
-            if(ddim.size()==4){
+  size_t N, C, H, W;
+  if (ddim.size() == 4) {
    N = ddim[0];
-                if(N<0){
+    if (N < 0) {
      N = 1;
    }
    C = ddim[1];
    H = ddim[2];
    W = ddim[3];
-            }else if(ddim.size()==1){
+  } else if (ddim.size() == 1) {
    N = 1;
    C = ddim[0];
    H = 1;
@@ -92,10 +91,11 @@ namespace paddle_mobile {
  const float *p = tensor->data<float>();
  half imageData[width * height * 4];
  cl_mem image = cl_image->GetCLImage();
-            size_t origin[3] = {0,0,0};
-            size_t region[3] = {width,height,1};
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {width, height, 1};
  cl_int err;
-            err = clEnqueueReadImage(commandQueue,image,CL_TRUE,origin,region,0,0,imageData,0,NULL,NULL);
+  err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0,
+                           imageData, 0, NULL, NULL);
  if (err != CL_SUCCESS) {
    // TODO: error handling
  }
@@ -104,7 +104,7 @@ namespace paddle_mobile {
    for (int c = 0; c < C; c++) {
      size_t i1 = i0;
      for (int h = 0; h < H; h++) {
-                            size_t i2 = (i1<<2) + c % 4;
+        size_t i2 = (i1 << 2) + c % 4;
        for (int w = 0; w < W; w++) {
          imageData[i2] = float2half(*p);
          i2 += 4;
@@ -115,9 +115,6 @@ namespace paddle_mobile {
    }
    i0 += width * H;
  }
-
-
-        }
-    }
 }
-
+}  // namespace framework
+}  // namespace paddle_mobile
--- a/src/framework/cl/cl_image.h
+++ b/src/framework/cl/cl_image.h
@@ -28,8 +28,93 @@ class CLImage {
 public:
  CLImage() = default;

-  void Init(cl_context context, float *tensorInput, DDim ddim) {
-    tensor_dims_ = ddim;
+  /*
+   * will not hold input tensor data, memcpy in this method
+   * */
+  void SetTensorData(float *tensorData, const DDim &dim) {
+    int numel = product(dim);
+    if (tensor_data_ != nullptr) {
+      delete[](tensor_data_);
+    }
+    tensor_data_ = new float[numel];
+    memcpy(tensor_data_, tensorData, numel);
+    tensor_dims_ = dim;
+  }
+
+  /*
+   * need call SetTensorData first
+   * */
+  void InitCLImage(cl_context context) {
+    if (tensor_data_ == nullptr) {
+      PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
+    }
+    InitCLImage(context, tensor_data_, tensor_dims_);
+    delete[](tensor_data_);
+    tensor_data_ = nullptr;
+    initialized_ = true;
+  }
+
+  void InitEmptyImage(cl_context context, const DDim &dim) {
+    if (tensor_data_ != nullptr) {
+      PADDLE_MOBILE_THROW_EXCEPTION(
+          " empty image tensor data shouldn't have value");
+    }
+    InitCLImage(context, nullptr, dim);
+    initialized_ = true;
+  }
+
+  cl_mem GetCLImage() const { return cl_image_; }
+
+  const DDim &ImageDims() { return image_dims_; }
+
+  inline size_t ImageWidth() const { return image_width_; }
+
+  inline size_t ImageHeight() const { return image_height_; }
+
+  /*
+   * block of channels, 4 channel one block
+   * */
+  inline size_t CBlock() const { return c_block_; }
+
+  /*
+   *  width of original tensor
+   * */
+  inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
+
+  /*
+   *  height of original tensor
+   * */
+  inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
+
+  /*
+   *  resize original tensor dim
+   * */
+  inline CLImage &Resize(const DDim &dims) {
+    tensor_dims_ = dims;
+    return *this;
+  }
+
+  template <typename T>
+  T *data() const {
+    if (initialized_) {
+      PADDLE_MOBILE_THROW_EXCEPTION(
+          " cl image has initialized, tensor data has been deleted ");
+    }
+    return reinterpret_cast<T *>(tensor_data_);
+  }
+
+  /*
+   *  numel of tensor dim
+   * */
+  inline int64_t numel() const { return product(tensor_dims_); }
+
+  /*
+   *  original tensor dim
+   * */
+  const DDim &dims() const { return tensor_dims_; }
+
+ private:
+  void InitCLImage(cl_context context, float *tensor_data, const DDim &dim) {
    cl_image_format cf = {.image_channel_order = CL_RGBA,
                          .image_channel_data_type = CL_HALF_FLOAT};
    // NCHW -> [W * (C+3)/4, H * N]
@@ -62,12 +147,13 @@ class CLImage {

    image_width_ = width;
    image_height_ = height;
+    image_dims_ = make_ddim({image_width_, image_height_});

    std::unique_ptr<half_t[]> imageData{};
    int count = 0;
-    if (tensorInput != nullptr) {
+    if (tensor_data != nullptr) {
      imageData.reset(new half_t[width * height * 4]);
-      float *p = tensorInput;
+      float *p = tensor_data;
      size_t i0 = 0;
      for (int n = 0; n < N; n++) {
        for (int c = 0; c < C; c++) {
@@ -108,39 +194,8 @@ class CLImage {
      // TODO(HaiPeng): error handling
      PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error ");
    }
-
-    initialized_ = true;
-  }
-
-  void Init(cl_context context, DDim ddim) { Init(context, nullptr, ddim); }
-
-  inline CLImage &Resize(const DDim &dims) {
-    tensor_dims_ = dims;
-    return *this;
-  }
-
-  const DDim &dims() const { return tensor_dims_; }
-
-  cl_mem GetCLImage() const { return cl_image_; }
-
-  template <typename T>
-  T *data() const {
-    return reinterpret_cast<T *>(tensor_input_);
  }

-  inline int64_t numel() const { return product(tensor_dims_); }
-
-  inline size_t ImageWidth() const { return image_width_; }
-
-  inline size_t ImageHeight() const { return image_height_; }
-
-  inline size_t CBlock() const { return c_block_; }
-
-  inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
-
-  inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
-
- private:
  bool initialized_ = false;
  cl_mem cl_image_;
  size_t image_width_;
@@ -149,7 +204,8 @@ class CLImage {
  size_t image_height_;
  size_t c_block_;
  DDim tensor_dims_;
-  float *tensor_input_;
+  DDim image_dims_;
+  float *tensor_data_;
  cl_context context_;
 };


--- a/src/framework/cl/cl_scope.h
+++ b/src/framework/cl/cl_scope.h
@@ -56,7 +56,8 @@ class CLScope {
    auto program = CLEngine::Instance()->CreateProgramWith(
        context_.get(), "./cl_kernel/" + file_name);

-    status_ = clBuildProgram(program.get(), 0, 0, "-cl-fast-relaxed-math", 0, 0);
+    status_ =
+        clBuildProgram(program.get(), 0, 0, "-cl-fast-relaxed-math", 0, 0);
    CL_CHECK_ERRORS(status_);

    programs_[file_name] = std::move(program);

--- a/src/framework/cl/cl_tool.h
+++ b/src/framework/cl/cl_tool.h
@@ -26,7 +26,8 @@ const char* opencl_error_to_str(cl_int error);
    printf(                                                           \
        "OpenCL error with code %s happened in file %s at line %d. "  \
        "Exiting.\n",                                                 \
-        opencl_error_to_str(ERR), __FILE__, __LINE__);               \
+        paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \
+        __LINE__);                                                    \
  }

 }  // namespace framework

--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -928,7 +928,8 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {

        framework::DDim ddim = framework::make_ddim(desc.Dims());

-        cl_image->Init(context, tensorInput, ddim);
+        // has not init
+        cl_image->SetTensorData(tensorInput, ddim);

        delete origin_data;
        paddle_mobile::memory::Free(tensorInput);
@@ -941,7 +942,7 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
          //          framework::DDim ddim = framework::make_ddim(desc.Dims());
          framework::DDim ddim = cl_image->dims();
          DLOG << var_desc->Name();
-          cl_image->Init(context, ddim);
+          cl_image->InitEmptyImage(context, ddim);
        }
      }
    }
@@ -982,7 +983,10 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &origin_data);
-        cl_image->Init(context, tensorInput, ddim);
+
+        // has not init
+        cl_image->SetTensorData(tensorInput, ddim);
+
        paddle_mobile::memory::Free(tensorInput);
      } else {
        auto cl_image = var->template GetMutable<framework::CLImage>();
@@ -991,8 +995,7 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
        const framework::TensorDesc &desc = var_desc->Tensor_desc();
        framework::DDim ddim = cl_image->dims();
        //        framework::DDim ddim = framework::make_ddim(desc.Dims());
-
-        cl_image->Init(context, ddim);
+        cl_image->InitEmptyImage(context, ddim);
      }
    }
  }

--- a/src/operators/feed_op.h
+++ b/src/operators/feed_op.h
--- a/src/operators/kernel/cl/cl_kernel/cl_common.h
+++ b/src/operators/kernel/cl/cl_kernel/cl_common.h
@@ -18,9 +18,10 @@ limitations under the License. */

 inline hafl4 activation(half4 in
 #ifdef PRELU
-                ,half4 prelu_alpha
+                        ,
+                        half4 prelu_alpha
 #endif
-                ) {
+) {
  half4 output;
 #ifdef PRELU
  output = select(prelu_alpha * in, in, in >= (half4)0.0);
@@ -31,4 +32,3 @@ inline hafl4 activation(half4 in
 #endif
  return output;
 }
-
--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -16,6 +16,7 @@ limitations under the License. */

 #include "operators/kernel/conv_add_bn_relu_kernel.h"
 #include "framework/cl/cl_image.h"
+#include "framework/cl/cl_tool.h"

 namespace paddle_mobile {
 namespace operators {
@@ -56,15 +57,15 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(

  framework::CLImage *new_scale = new framework::CLImage();

-  new_scale->Init(this->cl_helper_.CLContext(), new_scale_ptr,
-                  variance->dims());
+  new_scale->SetTensorData(new_scale_ptr, variance->dims());
+  new_scale->InitCLImage(this->cl_helper_.CLContext());

  framework::CLImage *new_bias = new framework::CLImage();

-  new_bias->Init(this->cl_helper_.CLContext(), new_bias_ptr, variance->dims());
+  new_bias->SetTensorData(new_bias_ptr, variance->dims());
+  new_bias->InitCLImage(this->cl_helper_.CLContext());

  param->SetNewScale(new_scale);
-
  param->SetNewBias(new_bias);

  PADDLE_MOBILE_ENFORCE(
@@ -115,26 +116,32 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
  int output_width = param.Output()->WidthOfOneBlock();
  int output_height = param.Output()->HeightOfOneBlock();

-  clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-  clSetKernelArg(kernel, 1, sizeof(int), &w);
-  clSetKernelArg(kernel, 2, sizeof(int), &nh);
-  clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-  clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-  clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
-  clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
-  clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
-  clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
-  clSetKernelArg(kernel, 9, sizeof(int), &stride);
-  clSetKernelArg(kernel, 10, sizeof(int), &offset);
-  clSetKernelArg(kernel, 11, sizeof(int), &input_c);
-  clSetKernelArg(kernel, 12, sizeof(int), &dilation);
-  clSetKernelArg(kernel, 13, sizeof(int), &input_width);
-  clSetKernelArg(kernel, 14, sizeof(int), &input_height);
-  clSetKernelArg(kernel, 15, sizeof(int), &output_width);
-  clSetKernelArg(kernel, 16, sizeof(int), &output_height);
-
+  cl_int status;
+
+  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
+  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
+  status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
+  status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
+  status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
+  status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
+  status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
+  status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
+  status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
+  status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
+  status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
+  status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
+
+  CL_CHECK_ERRORS(status);
+
+  status =
      clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                             default_work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
 }

 template class ConvAddBNReluKernel<GPU_CL, float>;

--- a/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -65,24 +65,31 @@ void ConvAddKernel<GPU_CL, float>::Compute(
  int output_width = param.Output()->WidthOfOneBlock();
  int output_height = param.Output()->HeightOfOneBlock();

-  clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-  clSetKernelArg(kernel, 1, sizeof(int), &w);
-  clSetKernelArg(kernel, 2, sizeof(int), &nh);
-  clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-  clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-  clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
-  clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
-  clSetKernelArg(kernel, 7, sizeof(int), &stride);
-  clSetKernelArg(kernel, 8, sizeof(int), &offset);
-  clSetKernelArg(kernel, 9, sizeof(int), &input_c);
-  clSetKernelArg(kernel, 10, sizeof(int), &dilation);
-  clSetKernelArg(kernel, 11, sizeof(int), &input_width);
-  clSetKernelArg(kernel, 12, sizeof(int), &input_height);
-  clSetKernelArg(kernel, 13, sizeof(int), &output_width);
-  clSetKernelArg(kernel, 14, sizeof(int), &output_height);
+  cl_int status;

+  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
+  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
+  status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
+  status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
+  status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
+  status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
+  status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
+  status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
+  status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
+  status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
+
+  CL_CHECK_ERRORS(status);
+
+  status =
      clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                             default_work_size.data(), NULL, 0, NULL, NULL);
+
+  CL_CHECK_ERRORS(status);
 }

 template class ConvAddKernel<GPU_CL, float>;

--- a/src/operators/kernel/cl/conv_kernel.cpp
+++ b/src/operators/kernel/cl/conv_kernel.cpp
@@ -21,63 +21,69 @@ namespace operators {

 template <>
 bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
-  //  PADDLE_MOBILE_ENFORCE(
-  //      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
-  //          param->Paddings()[0] == param->Paddings()[1],
-  //      "need equal");
-  //  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
-  //               static_cast<int>(param->Paddings()[1]);
-  //  param->SetOffset(offset);
-  //
-  //  if (param->Filter()->WidthOfOneBlock() == 1 &&
-  //      param->Filter()->HeightOfOneBlock() == 1) {
-  //    this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
-  //  } else if (param->Filter()->dims()[1] == 1) {
-  //    this->cl_helper_.AddKernel("depth_conv_3x3",
-  //    "conv_add_bn_relu_kernel.cl");
-  //  } else if (param->Filter()->WidthOfOneBlock() == 3 &&
-  //             param->Filter()->HeightOfOneBlock() == 3) {
-  //    this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
-  //  } else {
-  //    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
-  //  }
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+
+  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
+               static_cast<int>(param->Paddings()[1]);
+  param->SetOffset(offset);
+
+  if (param->Filter()->WidthOfOneBlock() == 1 &&
+      param->Filter()->HeightOfOneBlock() == 1) {
+    this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
+  } else if (param->Filter()->dims()[1] == 1) {
+    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
+  } else if (param->Filter()->WidthOfOneBlock() == 3 &&
+             param->Filter()->HeightOfOneBlock() == 3) {
+    this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  }

  return true;
 }

 template <>
 void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
-  //  auto kernel = this->cl_helper_.KernelAt(0);
-  //  auto default_work_size =
-  //  this->cl_helper_.DefaultWorkSize(*param.Output()); int c_block =
-  //  default_work_size[0]; int w = default_work_size[1]; int nh =
-  //  default_work_size[2]; auto input = param.Input()->GetCLImage(); auto
-  //  filter = param.Filter()->GetCLImage(); auto output = param.Output(); int
-  //  stride = param.Strides()[0]; int offset = param.Offset(); int input_c =
-  //  param.Input()->CBlock(); int dilation = param.Dilations()[0]; int
-  //  input_width = param.Input()->WidthOfOneBlock(); int input_height =
-  //  param.Input()->HeightOfOneBlock();
-  //
-  //  clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-  //  clSetKernelArg(kernel, 1, sizeof(int), &w);
-  //  clSetKernelArg(kernel, 2, sizeof(int), &nh);
-  //  clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-  //  clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-  //  clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
-  //  clSetKernelArg(kernel, 6, sizeof(int), &stride);
-  //  clSetKernelArg(kernel, 7, sizeof(int), &offset);
-  //  clSetKernelArg(kernel, 8, sizeof(int), &input_c);
-  //  clSetKernelArg(kernel, 9, sizeof(int), &dilation);
-  //  clSetKernelArg(kernel, 10, sizeof(int), &input_width);
-  //  clSetKernelArg(kernel, 11, sizeof(int), &input_height);
-  //
-  //  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
-  //                         default_work_size.data(), NULL, 0, NULL, NULL);
-
-  //  auto kernel = this->cl_helper_.KernelAt(0);
-  //  size_t global_work_size[3] = {1, 2, 3};
-  //  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
-  //  global_work_size, NULL, 0, NULL, NULL);
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+  auto output = param.Output();
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = param.Input()->CBlock();
+  int dilation = param.Dilations()[0];
+  int input_width = param.Input()->WidthOfOneBlock();
+  int input_height = param.Input()->HeightOfOneBlock();
+
+  cl_int status;
+
+  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
+  status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
+  status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
+  status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
+  status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
+  status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
+  status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
+
+  CL_CHECK_ERRORS(status);
+
+  status =
+      clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
+                             default_work_size.data(), NULL, 0, NULL, NULL);
+
+  CL_CHECK_ERRORS(status);
 }

 template class ConvKernel<GPU_CL, float>;

--- a/src/operators/kernel/cl/depthwise_conv_kernel.cpp
+++ b/src/operators/kernel/cl/depthwise_conv_kernel.cpp
@@ -36,7 +36,8 @@ bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
 }

 template <>
-void DepthwiseConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
+void DepthwiseConvKernel<GPU_CL, float>::Compute(
+    const ConvParam<GPU_CL> &param) {
  auto kernel = this->cl_helper_.KernelAt(0);
  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
  int c_block = default_work_size[0];
@@ -54,23 +55,30 @@ void DepthwiseConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param)
  int output_width = param.Output()->WidthOfOneBlock();
  int output_height = param.Output()->HeightOfOneBlock();

-  clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-  clSetKernelArg(kernel, 1, sizeof(int), &w);
-  clSetKernelArg(kernel, 2, sizeof(int), &nh);
-  clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-  clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-  clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
-  clSetKernelArg(kernel, 6, sizeof(int), &stride);
-  clSetKernelArg(kernel, 7, sizeof(int), &offset);
-  clSetKernelArg(kernel, 8, sizeof(int), &input_c);
-  clSetKernelArg(kernel, 9, sizeof(int), &dilation);
-  clSetKernelArg(kernel, 10, sizeof(int), &input_width);
-  clSetKernelArg(kernel, 11, sizeof(int), &input_height);
-  clSetKernelArg(kernel, 12, sizeof(int), &output_width);
-  clSetKernelArg(kernel, 13, sizeof(int), &output_height);
+  cl_int status;

+  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
+  status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
+  status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
+  status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
+  status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
+  status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
+  status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
+  status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
+  status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
+
+  CL_CHECK_ERRORS(status);
+
+  status =
      clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                             default_work_size.data(), NULL, 0, NULL, NULL);
+
+  CL_CHECK_ERRORS(status);
 }

 template class DepthwiseConvKernel<GPU_CL, float>;

--- a/src/operators/kernel/cl/feed_kernel.cpp
+++ b/src/operators/kernel/cl/feed_kernel.cpp
@@ -12,42 +12,43 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "common/log.h"
 #include "operators/kernel/feed_kernel.h"
+#include "common/log.h"

 namespace paddle_mobile {
-    namespace operators {
+namespace operators {

-        template <>
-        bool FeedKernel<GPU_CL, float>::Init(FeedParam<GPU_CL> *param) {
-         DLOG<<"Init feed";
+template <>
+bool FeedKernel<GPU_CL, float>::Init(FeedParam<GPU_CL> *param) {
+  DLOG << "Init feed";
  this->cl_helper_.AddKernel("feed", "feed_kernel.cl");
  return true;
-        }
-
-        template <>
-        void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
+}

-         DLOG<<"feed_kernel";
+template <>
+void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
+  DLOG << "feed_kernel";
  auto kernel = this->cl_helper_.KernelAt(0);
  cl_int status;
  auto output = param.Out();
  auto input = param.InputX();
+  DLOG << " input: " << input;
+
  const float *input_data = input->data<float>();
  cl_mem cl_image = output->GetCLImage();
  int height = output->dims()[2];
  int width = output->dims()[3];
-         status = clSetKernelArg(kernel,0, sizeof(cl_mem),&input_data);
-         status = clSetKernelArg(kernel,0, sizeof(cl_mem),&cl_image);
-         status = clSetKernelArg(kernel,0, sizeof(cl_mem),&width);
-         status = clSetKernelArg(kernel,0, sizeof(cl_mem),&height);
+  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_data);
+  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_image);
+  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &width);
+  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &height);

-         size_t global_work_size[2] = {height,width};
-         clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
-        }
+  size_t global_work_size[2] = {height, width};
+  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
+                         global_work_size, NULL, 0, NULL, NULL);
+}

-        template class FeedKernel<GPU_CL, float>;
+template class FeedKernel<GPU_CL, float>;

-    }  // namespace operators
+}  // namespace operators
 }  // namespace paddle_mobile
-
--- a/src/operators/kernel/cl/relu_kernel.cpp
+++ b/src/operators/kernel/cl/relu_kernel.cpp
@@ -19,13 +19,13 @@ namespace paddle_mobile {
 namespace operators {

 template <>
-bool ReluKernel<GPU_CL, float>::Init(ReluParam<GPU_CL> *param) {
+bool ReluKernel<GPU_CL, float>::Init(ReluParam<GPU_CL>* param) {
  this->cl_helper_.AddKernel("relu", "relu.cl");
  return true;
 }

 template <>
-void ReluKernel<GPU_CL, float>::Compute(const ReluParam<GPU_CL> &param) {
+void ReluKernel<GPU_CL, float>::Compute(const ReluParam<GPU_CL>& param) {
  auto kernel = this->cl_helper_.KernelAt(0);
  const auto* input = param.InputX();
  auto* output = param.Out();
@@ -34,7 +34,7 @@ void ReluKernel<GPU_CL, float>::Compute(const ReluParam<GPU_CL> &param) {
  auto outputImage = output->GetCLImage();
  clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
  clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
-  const size_t work_size[2] = { input->ImageWidth(), input->ImageHeight() };
+  const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()};
  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                         work_size, NULL, 0, NULL, NULL);
 }

--- a/src/operators/kernel/cl/reshape_kernel.cpp
+++ b/src/operators/kernel/cl/reshape_kernel.cpp
@@ -25,30 +25,29 @@ bool ReshapeKernel<GPU_CL, float>::Init(ReshapeParam<GPU_CL> *param) {

 template <>
 void ReshapeKernel<GPU_CL, float>::Compute(const ReshapeParam<GPU_CL> &param) {
-auto kernel = this->cl_helper_.KernelAt(0);
-  const auto * input = param.InputX();
-  auto * output = param.Out();
+  auto kernel = this->cl_helper_.KernelAt(0);
+  const auto *input = param.InputX();
+  auto *output = param.Out();
  auto inputImage = input->GetCLImage();
  auto outputImage = output->GetCLImage();
  clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
  clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
-  const auto & inputDim = input->dims();
-  const auto & outputDim = output->dims();
+  const auto &inputDim = input->dims();
+  const auto &outputDim = output->dims();
  int dims[4] = {inputDim[0], inputDim[1], inputDim[2], inputDim[3]};
  int odims[4] = {outputDim[0], outputDim[1], outputDim[2], outputDim[3]};
  clSetKernelArg(kernel, 2, sizeof(int), dims);
-  clSetKernelArg(kernel, 3, sizeof(int), dims+1);
-  clSetKernelArg(kernel, 4, sizeof(int), dims+2);
-  clSetKernelArg(kernel, 5, sizeof(int), dims+3);
+  clSetKernelArg(kernel, 3, sizeof(int), dims + 1);
+  clSetKernelArg(kernel, 4, sizeof(int), dims + 2);
+  clSetKernelArg(kernel, 5, sizeof(int), dims + 3);
  clSetKernelArg(kernel, 6, sizeof(int), odims);
-  clSetKernelArg(kernel, 7, sizeof(int), odims+1);
-  clSetKernelArg(kernel, 8, sizeof(int), odims+2);
-  clSetKernelArg(kernel, 9, sizeof(int), odims+3);
-  const size_t work_size[2] = { output->ImageWidth(), output->ImageHeight() };
+  clSetKernelArg(kernel, 7, sizeof(int), odims + 1);
+  clSetKernelArg(kernel, 8, sizeof(int), odims + 2);
+  clSetKernelArg(kernel, 9, sizeof(int), odims + 3);
+  const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()};

  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
                         work_size, NULL, 0, NULL, NULL);
-
 }

 template class ReshapeKernel<GPU_CL, float>;

--- a/src/operators/kernel/cl/softmax_kernel.cpp
+++ b/src/operators/kernel/cl/softmax_kernel.cpp
@@ -29,18 +29,18 @@ template <>
 void SoftmaxKernel<GPU_CL, float>::Compute(const SoftmaxParam<GPU_CL> &param) {
  auto kernel = this->cl_helper_.KernelAt(0);
  auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out()));
-  const auto * input = param.InputX();
-  auto * output = param.Out();
+  const auto *input = param.InputX();
+  auto *output = param.Out();
  auto inputImage = input->GetCLImage();
  auto outputImage = output->GetCLImage();
  clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
  clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
-  const auto & inputDim = input->dims();
+  const auto &inputDim = input->dims();
  int dims[4] = {inputDim[0], inputDim[1], inputDim[2], inputDim[3]};
  clSetKernelArg(kernel, 2, sizeof(int), dims);
-  clSetKernelArg(kernel, 3, sizeof(int), dims+1);
-  clSetKernelArg(kernel, 4, sizeof(int), dims+2);
-  clSetKernelArg(kernel, 5, sizeof(int), dims+3);
+  clSetKernelArg(kernel, 3, sizeof(int), dims + 1);
+  clSetKernelArg(kernel, 4, sizeof(int), dims + 2);
+  clSetKernelArg(kernel, 5, sizeof(int), dims + 3);

  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                         default_work_size.data(), NULL, 0, NULL, NULL);

--- a/src/operators/kernel/feed_kernel.h
+++ b/src/operators/kernel/feed_kernel.h
@@ -18,15 +18,15 @@ limitations under the License. */
 #include "operators/op_param.h"

 namespace paddle_mobile {
-    namespace operators {
-        using namespace framework;
-        template <typename DeviceType, typename T>
-        class FeedKernel
-                : public framework::OpKernelBase<DeviceType, FeedParam<DeviceType>>{
+namespace operators {
+using namespace framework;
+template <typename DeviceType, typename T>
+class FeedKernel
+    : public framework::OpKernelBase<DeviceType, FeedParam<DeviceType>> {
 public:
  void Compute(const FeedParam<DeviceType> &param);
  bool Init(FeedParam<DeviceType> *param);
-        };
+};

-    }  // namespace operators
+}  // namespace operators
 }  // namespace paddle_mobile
--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -29,8 +29,8 @@ int main() {
  bool optimize = true;
  auto time1 = time();
  if (paddle_mobile.Load(g_googlenet, optimize)) {
-    auto time2 = time();
-    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
+    auto time2 = paddle_mobile::time();
+    std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" << std::endl;
    std::vector<float> input;
    std::vector<int64_t> dims{1, 3, 224, 224};
    GetInput<float>(g_test_image_1x3x224x224, &input, dims);

--- a/test/net/test_mobilenet_GPU.cpp
+++ b/test/net/test_mobilenet_GPU.cpp
@@ -19,14 +19,14 @@ limitations under the License. */
 int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile;
  //    paddle_mobile.SetThreadNum(4);
-  auto time1 = time();
+  auto time1 = paddle_mobile::time();
  //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
  //                     std::string(g_mobilenet_detect) + "/params", true);

  auto isok = paddle_mobile.Load(g_mobilenet, false);
  if (isok) {
-    auto time2 = time();
-    std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
+    auto time2 = paddle_mobile::time();
+    std::cout << "load cost :" << paddle_mobile::time_diff(time1, time1) << "ms" << std::endl;

    std::vector<float> input;
    std::vector<int64_t> dims{1, 3, 224, 224};
@@ -42,13 +42,13 @@ int main() {
    for (int i = 0; i < 10; ++i) {
      auto vec_result = paddle_mobile.Predict(input, dims);
    }
-    auto time3 = time();
+    auto time3 = paddle_mobile::time();
    for (int i = 0; i < 10; ++i) {
      auto vec_result = paddle_mobile.Predict(input, dims);
    }
    DLOG << vec_result;
-    auto time4 = time();
-    std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
+    auto time4 = paddle_mobile::time();
+    std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) / 10 << "ms"
              << std::endl;
  }