Unverified commit a63d9e9d authored by Yanzhan Yang, committed by GitHub

1.add slice, exp, leakyrelu, nearest op placeholder. 2.complete slice… (#1773)

* 1.add slice, exp, leakyrelu, nearest op placeholder. 2.complete slice and exp op. 3.fix sigmoid op.

* fix style
Parent 237cf93b
......@@ -85,10 +85,18 @@ REGISTER_OPERATOR_FPGA(tanh, ops::TanhOp);
#endif
#endif // TANH_OP
#ifdef PADDLE_MOBILE_CPU
#ifdef LOG_OP
REGISTER_OPERATOR_CPU(log, ops::LogOp);
#endif // LOG_OP
#endif
#ifdef LEAKY_RELU_OP
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(leaky_relu, ops::LeakyReluOp);
#endif // PADDLE_MOBILE_CPU
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(leaky_relu, ops::LeakyReluOp);
#endif
#endif
......@@ -32,7 +32,9 @@ void ConcatOp<Dtype, T>::InferShape() const {
inputs_dims.push_back(inputs[i]->dims());
}
auto axis = static_cast<size_t>(this->param_.Axis());
auto axis = static_cast<size_t>(this->param_.Axis()) -
(this->param_.original_output_dims_size_ -
this->param_.Out()->dims().size());
if (n == 1) {
DLOG << "Warning: concat op have only one input, "
......
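Note on the adjustment above: an axis attribute computed against the original N-D output must drop by however many leading dims were squeezed away. A minimal sketch with hypothetical values:

// Hypothetical helper, not part of this commit.
inline int adjusted_axis(int axis_attr, int original_rank, int current_rank) {
  return axis_attr - (original_rank - current_rank);
}
// e.g. adjusted_axis(2, 5, 4) == 1: axis 2 of a 5-D shape becomes axis 1
// once the leading dim is dropped.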
......@@ -30,7 +30,7 @@ REGISTER_OPERATOR_CPU(exp, ops::EXPOp);
#endif
#ifdef PADDLE_MOBILE_CL
// REGISTER_OPERATOR_CL(exp, ops::EXPOp);
REGISTER_OPERATOR_CL(exp, ops::EXPOp);
#endif
#endif
......@@ -14,116 +14,55 @@ limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void concatByC0(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_W) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
int2 input_pos ;
input_pos.x = in_c * out_W + in_w;
input_pos.y = in_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 input;
input = read_imageh(input_image, sampler,input_pos);
write_imageh(output_image, input_pos, input);
__kernel void concatByCWith2Inputs(__read_only image2d_t input_image_0,
__read_only image2d_t input_image_1,
__private const int C_0,
__private const int C_1,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_W) {
// const int in_c = get_global_id(0);
// const int in_w = get_global_id(1);
// const int in_nh = get_global_id(2);
//
// int2 input_pos ;
// input_pos.x = in_c * out_W + in_w;
// input_pos.y = in_nh;
// const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
// CLK_ADDRESS_CLAMP |
// CLK_FILTER_NEAREST;
// half4 input;
// input = read_imageh(input_image, sampler,input_pos);
//
// write_imageh(output_image, input_pos, input);
}
__kernel void concatByC(__read_only image2d_t input_image1,
__read_only image2d_t input_image2,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H,
__private const int out_W,
__private const int out_C_Start,
__private const int in_W,
__private const int in_H,
__private const int in_C1,
__private const int in_C2) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
int out_c1 = (out_C_Start + 3)/4 -1 + in_c;
int out_c2 = out_c1 + 1;
int2 output_pos1;
int2 output_pos2;
output_pos1.x = out_c1 * out_W + in_w;
output_pos1.y = in_nh;
output_pos2.x = out_c2 * out_W + in_w;
output_pos2.y = in_nh;
int2 input_pos1;
if(in_c==0){
input_pos1.x = ((in_C1 + 3)/4-1) * in_W + in_w;
}else{
input_pos1.x = (in_c - 1) * in_W + in_w;
}
input_pos1.y = in_nh;
int2 input_pos2;
input_pos2.x = in_c * in_W + in_w;
input_pos2.y = in_nh;
half4 output1;
half4 output2;
half4 input1;
half4 input2;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
if(in_c==0){
input1 = read_imageh(input_image1, sampler,input_pos1);
}else {
input1 = read_imageh(input_image2, sampler,input_pos1);
}
input2 = read_imageh(input_image2, sampler,input_pos2);
output1 = input1;
if(out_C_Start%4==0){
output2 = input2;
}else if(out_C_Start%4==1){
output1.y = input2.x;
output1.z = input2.y;
output1.w = input2.z;
output2.x = input2.w;
output2.y = 0.0f;
output2.z = 0.0f;
output2.w = 0.0f;
}else if(out_C_Start%4==2){
output1.z = input2.x;
output1.w = input2.y;
output2.x = input2.z;
output2.y = input2.w;
output2.z = 0.0f;
output2.w = 0.0f;
}else if(out_C_Start%4==3){
output1.w = input2.x;
output2.x = input2.y;
output2.y = input2.z;
output2.z = input2.w;
output2.w = 0.0f;
}
write_imageh(output_image, output_pos1, output1);
write_imageh(output_image, output_pos2, output2);
__kernel void concatByCWith3Inputs(__read_only image2d_t input_image_0,
__read_only image2d_t input_image_1,
__read_only image2d_t input_image_2,
__private const int C_0,
__private const int C_1,
__private const int C_2,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_W) {
// const int in_c = get_global_id(0);
// const int in_w = get_global_id(1);
// const int in_nh = get_global_id(2);
//
// int2 input_pos ;
// input_pos.x = in_c * out_W + in_w;
// input_pos.y = in_nh;
// const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
// CLK_ADDRESS_CLAMP |
// CLK_FILTER_NEAREST;
// half4 input;
// input = read_imageh(input_image, sampler,input_pos);
//
// write_imageh(output_image, input_pos, input);
}
__kernel void concatByH(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_W,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
__kernel void exp_impl(__read_only image2d_t input, __write_only image2d_t output) {
const int x = get_global_id(0);
const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 in = read_imageh(input, sampler, (int2)(x, y));
half4 out;
out.x = pow(2.71828182f, (float)(in.x));
out.y = pow(2.71828182f, (float)(in.y));
out.z = pow(2.71828182f, (float)(in.z));
out.w = pow(2.71828182f, (float)(in.w));
write_imageh(output, (int2)(x, y), out);
}
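The per-lane math above raises a truncated Euler's constant to the input. A scalar reference of the same computation, for illustration only:

#include <cmath>
// pow(2.71828182f, x) is exp(x) up to the truncation of the constant.
inline float exp_ref(float x) { return std::exp(x); }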
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void leakyrelu(__read_only image2d_t input,
__write_only image2d_t output, __private const float alpha, __private const int dims_w) {
const int c = get_global_id(0);
const int w = get_global_id(1);
const int nh = get_global_id(2);
int2 input_pos;
input_pos.x = c * dims_w + w;
input_pos.y = nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 in = read_imageh(input, sampler, (int2)(input_pos.x, input_pos.y));
half4 output_data;
output_data.x = max((float)(in.x), (float)(alpha * (in.x)));
output_data.y = max((float)(in.y), (float)(alpha * (in.y)));
output_data.z = max((float)(in.z), (float)(alpha * (in.z)));
output_data.w = max((float)(in.w), (float)(alpha * (in.w)));
write_imageh(output, (int2)(input_pos.x, input_pos.y), output_data);
}
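A scalar reference of the per-lane computation above, a sketch rather than part of this commit:

#include <algorithm>
// max(x, alpha * x) equals x for x >= 0 and alpha * x otherwise,
// assuming the usual 0 < alpha < 1.
inline float leaky_relu_ref(float x, float alpha) {
  return std::max(x, alpha * x);
}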
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t output,
__private const float scale_h, __private const float scale_w,
__private const int dims_w){
const int c = get_global_id(0);
const int w = get_global_id(1);
const int nh = get_global_id(2);
int2 output_pos;
output_pos.x = c * dims_w + w;
output_pos.y = nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
// uint x = (uint)(output_pos.x / scale_w);
// uint y = (uint)(output_pos.y / scale_h);
// half4 input_data = read_imageh(input, sampler, (int2)(x, y));
// write_imageh(output, (int2)(output_pos.x , output_pos.y ), input_data);
}
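The kernel body is left commented out as a placeholder. A host-side sketch of the mapping the commented lines suggest, with nearest_src_pos a hypothetical helper:

// Each output position (x, y) samples the input at (x / scale_w, y / scale_h),
// truncated toward zero, i.e. a nearest-neighbor lookup.
inline void nearest_src_pos(int out_x, int out_y, float scale_w, float scale_h,
                            int *in_x, int *in_y) {
  *in_x = (int)(out_x / scale_w);
  *in_y = (int)(out_y / scale_h);
}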
......@@ -17,14 +17,18 @@ limitations under the License. */
__kernel void sigmoid(__read_only image2d_t input,
__write_only image2d_t output){
const int x = get_global_id(0);
const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 in = read_imageh(input, sampler, (int2)(x, y));
in = 1.0f / (1 + exp(-in));
write_imageh(output, (int2)(x, y), in);
const int x = get_global_id(0);
const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 in = read_imageh(input, sampler, (int2)(x, y));
half4 out;
out.x = 1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.x)));
out.y = 1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.y)));
out.z = 1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.z)));
out.w = 1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.w)));
write_imageh(output, (int2)(x, y), out);
}
\ No newline at end of file
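A scalar reference of the per-lane math above, for illustration:

#include <cmath>
// 1 / (1 + e^(-x)) is the logistic sigmoid; the kernel's pow(2.71828182f, -x)
// stands in for exp(-x).
inline float sigmoid_ref(float x) { return 1.0f / (1.0f + std::exp(-x)); }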
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void slice(__read_only image2d_t input, __write_only image2d_t output,
__private const int start, __private const int end,
__private const int dims_w){
const int c = get_global_id(0);
const int w = get_global_id(1);
const int nh = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos;
output_pos.x = c * dims_w + w;
output_pos.y = nh;
int2 input_pos;
half4 input_data;
half4 output_data;
if (start % 4 == 0) {
input_pos.x = (4 * c + start) / 4 * dims_w + w;
input_pos.y = nh;
input_data = read_imageh(input, sampler,input_pos);
output_data = input_data;
} else if (start % 4 == 1) {
input_pos.x = (4 * c + start) / 4 * dims_w + w;
input_pos.y = nh;
input_data = read_imageh(input, sampler,input_pos);
output_data.x = input_data.y;
output_data.y = input_data.z;
output_data.z = input_data.w;
input_pos.x = input_pos.x + dims_w;
input_pos.y = nh;
input_data = read_imageh(input, sampler,input_pos);
output_data.w = input_data.x;
} else if (start % 4 == 2) {
input_pos.x = (4 * c + start) / 4 * dims_w + w;
input_pos.y = nh;
input_data = read_imageh(input, sampler,input_pos);
output_data.x = input_data.z;
output_data.y = input_data.w;
input_pos.x = input_pos.x + dims_w;
input_pos.y = nh;
input_data = read_imageh(input, sampler,input_pos);
output_data.z = input_data.x;
output_data.w = input_data.y;
} else if (start % 4 == 3) {
input_pos.x = (4 * c + start) / 4 * dims_w + w;
input_pos.y = nh;
input_data = read_imageh(input, sampler,input_pos);
output_data.x = input_data.w;
input_pos.x = input_pos.x + dims_w;
input_pos.y = nh;
input_data = read_imageh(input, sampler,input_pos);
output_data.y = input_data.x;
output_data.z = input_data.y;
output_data.w = input_data.z;
}
write_imageh(output, output_pos, output_data);
}
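The start % 4 branches exist because channels are packed four per texel along x: when the slice does not begin on a texel boundary, each output texel merges lanes from two adjacent input texels. A sketch of the index math, with slice_src a hypothetical helper:

// Output texel out_texel, lane out_lane (0..3) reads input channel
// 4 * out_texel + start + out_lane; split that back into texel and lane.
inline void slice_src(int out_texel, int out_lane, int start,
                      int *src_texel, int *src_lane) {
  int ch = 4 * out_texel + start + out_lane;
  *src_texel = ch / 4;
  *src_lane = ch % 4;
}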
......@@ -23,16 +23,20 @@ template <>
bool ConcatKernel<GPU_CL, float>::Init(ConcatParam<GPU_CL> *param) {
if (param->Out()->dims().size() < 4) {
this->cl_helper_.AddKernel("concatByH", "concat_kernel.cl");
} else if (param->Out()->dims().size() == 4) {
this->cl_helper_.AddKernel("concatByC0", "concat_kernel.cl");
this->cl_helper_.AddKernel("concatByC", "concat_kernel.cl");
} else if (param->Out()->dims().size() >= 4) {
if (param->Inputs().size() == 2) {
this->cl_helper_.AddKernel("concatByCWith2Inputs", "concat_kernel.cl");
} else if (param->Inputs().size() == 3) {
this->cl_helper_.AddKernel("concatByCWith3Inputs", "concat_kernel.cl");
} else {
return false;
}
}
return true;
}
template <>
void ConcatKernel<GPU_CL, float>::Compute(const ConcatParam<GPU_CL> &param) {
DLOG << "yangfei50";
DLOG << param.Out()->dims();
if (param.Out()->dims().size() < 4) {
auto kernel = this->cl_helper_.KernelAt(0);
......@@ -69,74 +73,60 @@ void ConcatKernel<GPU_CL, float>::Compute(const ConcatParam<GPU_CL> &param) {
}
} else {
auto kernel0 = this->cl_helper_.KernelAt(0);
auto kernel1 = this->cl_helper_.KernelAt(1);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
auto inputs = param.Inputs();
auto *output_image = param.Out()->GetCLImage();
int out_C_Start = 0;
auto input_image = inputs[0]->GetCLImage();
auto default_work_size = this->cl_helper_.DefaultWorkSize(*inputs[0]);
int out_W = param.Out()->dims()[3];
int arg_offset;
cl_int status;
status = clSetKernelArg(kernel0, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel0, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel0, 2, sizeof(int), &out_W);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel0, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
out_C_Start += inputs[0]->dims()[1];
for (int i = 1; i < inputs.size(); i++) {
auto input_image1 = inputs[i - 1]->GetCLImage();
auto input_image2 = inputs[i]->GetCLImage();
default_work_size = this->cl_helper_.DefaultWorkSize(*inputs[i]);
int out_C = param.Out()->dims()[1];
int out_H = param.Out()->dims()[2];
int in_W = inputs[i]->dims()[3];
int in_H = inputs[i]->dims()[2];
int in_C1 = inputs[i - 1]->dims()[1];
int in_C2 = inputs[i]->dims()[1];
DLOG << "第" << i << "个";
DLOG << "out_C=" << out_C;
DLOG << "out_H=" << out_H;
DLOG << "in_W=" << in_W;
DLOG << "in_H=" << in_H;
DLOG << "in_C1=" << in_C1;
DLOG << "in_C2=" << in_C2;
DLOG << "out_C_Start = " << out_C_Start;
status = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &input_image1);
if (inputs.size() == 2) {
auto input_image_0 = inputs[0]->GetCLImage();
status = clSetKernelArg(kernel0, 0, sizeof(cl_mem), &input_image_0);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 1, sizeof(cl_mem), &input_image2);
auto input_image_1 = inputs[1]->GetCLImage();
status = clSetKernelArg(kernel0, 1, sizeof(cl_mem), &input_image_1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 2, sizeof(cl_mem), &output_image);
int C_0 = inputs[0]->dims()[1];
status = clSetKernelArg(kernel0, 2, sizeof(int), &C_0);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 3, sizeof(int), &out_C);
int C_1 = inputs[1]->dims()[1];
status = clSetKernelArg(kernel0, 3, sizeof(int), &C_1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 4, sizeof(int), &out_H);
arg_offset = 4;
} else if (inputs.size() == 3) {
auto input_image_0 = inputs[0]->GetCLImage();
status = clSetKernelArg(kernel0, 0, sizeof(cl_mem), &input_image_0);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 5, sizeof(int), &out_W);
auto input_image_1 = inputs[1]->GetCLImage();
status = clSetKernelArg(kernel0, 1, sizeof(cl_mem), &input_image_1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 6, sizeof(int), &out_C_Start);
auto input_image_2 = inputs[2]->GetCLImage();
status = clSetKernelArg(kernel0, 2, sizeof(cl_mem), &input_image_2);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 7, sizeof(int), &in_W);
int C_0 = inputs[0]->dims()[1];
status = clSetKernelArg(kernel0, 3, sizeof(int), &C_0);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 8, sizeof(int), &in_H);
int C_1 = inputs[1]->dims()[1];
status = clSetKernelArg(kernel0, 4, sizeof(int), &C_1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 9, sizeof(int), &in_C1);
int C_2 = inputs[2]->dims()[1];
status = clSetKernelArg(kernel0, 5, sizeof(int), &C_2);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 10, sizeof(int), &in_C2);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel1, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
out_C_Start += inputs[i]->dims()[1];
arg_offset = 6;
}
auto *output_image = param.Out()->GetCLImage();
status =
clSetKernelArg(kernel0, arg_offset + 0, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
int out_C = param.Out()->dims()[1];
status = clSetKernelArg(kernel0, arg_offset + 1, sizeof(int), &out_C);
CL_CHECK_ERRORS(status);
int out_W = param.Out()->dims()[3];
status = clSetKernelArg(kernel0, arg_offset + 2, sizeof(int), &out_W);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel0, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
}
......
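concatByCWith2Inputs and concatByCWith3Inputs share the same trailing arguments (output image, out_C, out_W); arg_offset records where that tail begins. A sketch of the convention, with concat_tail_offset a hypothetical helper:

// Each input contributes two leading arguments (its image and its channel
// count), so the tail begins at 2 * num_inputs: 4 for two inputs, 6 for three.
inline int concat_tail_offset(int num_inputs) { return 2 * num_inputs; }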
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef EXP_OP
#include <framework/cl/cl_tensor.h>
#include <operators/kernel/exp_kernel.h>
namespace paddle_mobile {
namespace operators {
template <>
bool EXPKernel<GPU_CL, float>::Init(
paddle_mobile::operators::EXPParam<paddle_mobile::GPU_CL>* param) {
this->cl_helper_.AddKernel("exp_impl", "exp_kernel.cl");
return true;
}
template <>
void EXPKernel<GPU_CL, float>::Compute(
const paddle_mobile::operators::EXPParam<paddle_mobile::GPU_CL>& param) {
auto kernel = this->cl_helper_.KernelAt(0);
const auto* input = param.InputX();
auto* output = param.Out();
auto default_work_size = this->cl_helper_.DefaultWorkSize(*output);
auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage();
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
CL_CHECK_ERRORS(status);
const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()};
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class EXPKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef LEAKY_RELU_OP
#include <operators/kernel/activation_kernel.h>
namespace paddle_mobile {
namespace operators {
template <>
bool LeakyReluKernel<GPU_CL, float>::Init(
paddle_mobile::operators::LeakyReluParam<paddle_mobile::GPU_CL> *param) {
this->cl_helper_.AddKernel("leakyrelu", "leakyrelu_kernel.cl");
return true;
}
template <>
void LeakyReluKernel<GPU_CL, float>::Compute(
const paddle_mobile::operators::LeakyReluParam<paddle_mobile::GPU_CL>
&param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out()));
auto input = param.InputX();
cl_mem input_image = input->GetCLImage();
auto output = param.Out();
cl_mem out_image = output->GetCLImage();
float alpha = param.Alpha();
int out_dims_w = output->dims()[3];
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(float), &alpha);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(int), &out_dims_w);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class LeakyReluKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef NEAREST_INTERP_OP
#include <operators/kernel/nearest_interp_kernel.h>
namespace paddle_mobile {
namespace operators {
template <>
bool NearestInterpolationKernel<GPU_CL, float>::Init(
paddle_mobile::operators::NearestInterpolationParam<paddle_mobile::GPU_CL>
*param) {
this->cl_helper_.AddKernel("nearest_interp", "nearest_interp_kernel.cl");
return true;
}
template <>
void NearestInterpolationKernel<GPU_CL, float>::Compute(
const paddle_mobile::operators::NearestInterpolationParam<
paddle_mobile::GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out()));
auto input = param.InputX();
cl_mem input_image = input->GetCLImage();
auto output = param.Out();
cl_mem output_image = output->GetCLImage();
// Cast before dividing: dims() holds int64_t, so the plain quotient would
// truncate to an integer scale.
float scale_h = static_cast<float>(output->dims()[2]) / input->dims()[2];
float scale_w = static_cast<float>(output->dims()[3]) / input->dims()[3];
int out_dims_w = output->dims()[3];
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(float), &scale_h);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(float), &scale_w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(int), &out_dims_w);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class NearestInterpolationKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -32,11 +32,15 @@ void SigmoidKernel<GPU_CL, float>::Compute(const SigmoidParam<GPU_CL>& param) {
auto default_work_size = this->cl_helper_.DefaultWorkSize(*output);
auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage();
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
CL_CHECK_ERRORS(status);
const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()};
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
work_size, NULL, 0, NULL, NULL);
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class SigmoidKernel<GPU_CL, float>;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SLICE_OP
#include <framework/cl/cl_tensor.h>
#include <operators/kernel/slice_kernel.h>
namespace paddle_mobile {
namespace operators {
template <>
bool SliceKernel<GPU_CL, float>::Init(
paddle_mobile::operators::SliceParam<paddle_mobile::GPU_CL> *param) {
this->cl_helper_.AddKernel("slice", "slice_kernel.cl");
return true;
}
template <>
void SliceKernel<GPU_CL, float>::Compute(
const paddle_mobile::operators::SliceParam<paddle_mobile::GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.output_);
auto input = param.input_;
cl_mem input_image = input->GetCLImage();
auto output = param.output_;
cl_mem output_image = output->GetCLImage();
int starts_0 = param.starts_[0];
int ends_0 = param.ends_[0];
int axes_0 = param.axes_[0] - (param.original_output_dims_size_ -
param.output_->dims().size());
int dims_w = input->dims()[axes_0 + 2];
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &starts_0);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(int), &ends_0);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(int), &dims_w);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class SliceKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -49,4 +49,8 @@ namespace ops = paddle_mobile::operators;
REGISTER_OPERATOR_CPU(nearest_interp, ops::NearestInterpolationOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(nearest_interp, ops::NearestInterpolationOp);
#endif
#endif
......@@ -686,6 +686,7 @@ class ConcatParam : public OpParam {
inputs_ = InputMultiFrom<GType>(inputs, *scope);
out_ = OutFrom<GType>(outputs, *scope);
axis_ = GetAttr<int>("axis", attrs);
original_output_dims_size_ = out_->dims().size();
}
vector<GType *> Inputs() const { return inputs_; }
......@@ -694,10 +695,11 @@ class ConcatParam : public OpParam {
const int &Axis() const { return axis_; }
private:
public:
vector<GType *> inputs_;
GType *out_;
int axis_;
int original_output_dims_size_;
#ifdef PADDLE_MOBILE_FPGA
private:
......@@ -1590,6 +1592,8 @@ class SliceParam : public OpParam {
axes_ = GetAttr<std::vector<int>>("axes", attrs);
starts_ = GetAttr<std::vector<int>>("starts", attrs);
ends_ = GetAttr<std::vector<int>>("ends", attrs);
original_output_dims_size_ = output_->dims().size();
}
public:
......@@ -1598,6 +1602,7 @@ class SliceParam : public OpParam {
std::vector<int> axes_;
std::vector<int> starts_;
std::vector<int> ends_;
int original_output_dims_size_;
};
#endif
......
......@@ -24,11 +24,42 @@ void SliceOp<Dtype, T>::InferShape() const {
auto axes = this->param_.axes_;
auto input = this->param_.input_;
auto output = this->param_.output_;
#ifdef PADDLE_MOBILE_CL
auto output_dims = output->dims();
auto output_dims_size = output_dims.size();
bool should_resize = true;
if (output_dims_size > 4) {
for (int i = 0; i < output_dims_size - 4; ++i) {
if (output_dims[i] != 0 && output_dims[i] != 1) {
should_resize = false;
break;
}
}
if (should_resize) {
std::vector<int64_t> temp_output_dims;
temp_output_dims.reserve(static_cast<size_t>(4));
for (int i = output_dims_size - 4; i < output_dims_size; ++i) {
temp_output_dims.push_back(output_dims[i]);
}
framework::DDim temp_ddim = framework::make_ddim(temp_output_dims);
this->param_.output_->Resize(temp_ddim);
}
}
#endif
PADDLE_MOBILE_ENFORCE(axes.size() == 1, "axes size should equal 1");
PADDLE_MOBILE_ENFORCE(input->dims().size() == output->dims().size(),
"input dim size should equal output dim size");
#ifdef PADDLE_MOBILE_CL
PADDLE_MOBILE_ENFORCE(
input->dims().size() -
(axes[0] - (this->param_.original_output_dims_size_ -
this->param_.output_->dims().size())) ==
3,
"op only support slice channel now");
#else
PADDLE_MOBILE_ENFORCE(input->dims().size() - axes[0] == 3,
"op only support slice channel now");
#endif
}
} // namespace operators
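A sketch of the GPU_CL squeeze above, assuming rank > 4 and every leading dim beyond the last four being 0 or 1, with squeeze_to_4d a hypothetical helper:

#include <cstdint>
#include <vector>
// Keep only the trailing four dims, mirroring the Resize() call above.
inline std::vector<int64_t> squeeze_to_4d(const std::vector<int64_t> &dims) {
  return std::vector<int64_t>(dims.end() - 4, dims.end());
}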
......@@ -41,4 +72,7 @@ REGISTER_OPERATOR_CPU(slice, ops::SliceOp);
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(slice, ops::SliceOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(slice, ops::SliceOp);
#endif
#endif // SLICE_OP
......@@ -48,9 +48,11 @@ void test(int argc, char *argv[]) {
// config.load_when_predict = true;
paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile(config);
paddle_mobile.SetCLPath("/data/local/tmp/bin");
std::cout << "testing opencl yyz " << std::endl;
#else
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile(config);
paddle_mobile.SetThreadNum(1);
std::cout << "testing cpu yyz " << std::endl;
#endif
int dim_count = std::stoi(argv[arg_index]);
......