Merge pull request #1579 from smilejames/develop

adjust gpu code structure

Merge pull request #1579 from smilejames/develop
adjust gpu code structure
8a281e35 · Jiaying Zhao · GitHub · 17b2a7d8 · 5a2334af · 8a281e35
11 changed file
--- a/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
+++ b/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "operators/kernel/cl/cl-kernel-func/conv_func.h"
+#include "framework/cl/cl_image_converter.h"
+#include "framework/cl/cl_tensor.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+void winograd_transform_weight<4, 3>(framework::CLHelper &cl_helper,
+                                     framework::CLImage &weight){};
+
+template <>
+void WinogradConv3x3<4, 3>(framework::CLHelper &cl_helper,
+                           const ConvParam<GPU_CL> &param) {}
+
+void ConvAddBnRelu(framework::CLHelper &cl_helper,
+                   const ConvParam<GPU_CL> &param, bool ifRelu,
+                   const CLImage *biase, const CLImage *new_scale,
+                   const CLImage *new_bias) {
+  auto kernel = cl_helper.KernelAt(0);
+  auto default_work_size = cl_helper.DefaultWorkSize(*param.Output());
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+
+  auto output = param.Output()->GetCLImage();
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
+                    param.Input()->Converter())
+                    ->GetCBlock();
+  int dilation = param.Dilations()[0];
+  int input_width = param.Input()->dims()[3];
+  int input_height = param.Input()->dims()[2];
+  int output_width = param.Output()->dims()[3];
+  int output_height = param.Output()->dims()[2];
+
+  //  DLOG << " c block " << c_block;
+  //  DLOG << " w " << w;
+  //  DLOG << " nh " << nh;
+  //  DLOG << " stride " << stride;
+  //  DLOG << " offset " << offset;
+  //  DLOG << " input_c " << input_c;
+  //  DLOG << " dilation " << dilation;
+  //  DLOG << " input width " << input_width;
+  //  DLOG << " input height " << input_height;
+  //  DLOG << " output width " << output_width;
+  //  DLOG << " output height " << output_height;
+  //  DLOG << " input dim " << param.Input()->dims();
+  //  DLOG << " output dim " << param.Output()->dims();
+  //  DLOG << " filter dim " << param.Filter()->dims();
+
+  cl_int status;
+  int index = 0;
+
+  if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) {
+    status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
+    CL_CHECK_ERRORS(status);
+
+    int maped_w = maptofactor(w, 4);
+    status = clSetKernelArg(kernel, index++, sizeof(int), &maped_w);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &nh);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter);
+    CL_CHECK_ERRORS(status);
+
+    if (biase) {
+      auto bias_mem = biase->GetCLImage();
+      status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem);
+      CL_CHECK_ERRORS(status);
+    }
+
+    if (new_scale && new_bias) {
+      auto new_scale_mem = new_scale->GetCLImage();
+      status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem);
+      CL_CHECK_ERRORS(status);
+
+      auto new_bias_mem = new_bias->GetCLImage();
+      status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem);
+      CL_CHECK_ERRORS(status);
+    }
+
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &stride);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &offset);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &input_c);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &dilation);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &input_width);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &input_height);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &output_width);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &w);
+    CL_CHECK_ERRORS(status);
+
+    const size_t work_size[3] = {
+        static_cast<const uint32_t>(default_work_size.data()[0]),
+        static_cast<const uint32_t>(maped_w),
+        static_cast<const uint32_t>(default_work_size.data()[2])};
+
+    status = clEnqueueNDRangeKernel(cl_helper.CLCommandQueue(), kernel,
+                                    default_work_size.size(), NULL, work_size,
+                                    NULL, 0, NULL, NULL);
+    CL_CHECK_ERRORS(status);
+  } else {
+    status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &w);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &nh);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter);
+    CL_CHECK_ERRORS(status);
+
+    if (biase) {
+      auto bias_mem = biase->GetCLImage();
+      status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem);
+      CL_CHECK_ERRORS(status);
+    }
+
+    if (new_scale && new_bias) {
+      auto new_scale_mem = new_scale->GetCLImage();
+      status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem);
+      CL_CHECK_ERRORS(status);
+
+      auto new_bias_mem = new_bias->GetCLImage();
+      status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem);
+      CL_CHECK_ERRORS(status);
+    }
+
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &stride);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &offset);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &input_c);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &dilation);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &input_width);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &input_height);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &output_width);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
+    CL_CHECK_ERRORS(status);
+
+    status = clEnqueueNDRangeKernel(
+        cl_helper.CLCommandQueue(), kernel, default_work_size.size(), NULL,
+        default_work_size.data(), NULL, 0, NULL, NULL);
+    CL_CHECK_ERRORS(status);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/cl/cl-kernel-func/conv_func.h
+++ b/src/operators/kernel/cl/cl-kernel-func/conv_func.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef CONV_OP
+
+#pragma once
+
+#include "framework/cl/cl_helper.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using namespace framework;
+
+inline int maptofactor(int i, int factor) { return (i + factor - 1) / factor; }
+
+template <int tile, int kernel>
+void winograd_transform_weight(framework::CLHelper &cl_helper,
+                               framework::CLImage &weight);
+
+template <int tile, int kernel>
+void WinogradConv3x3(framework::CLHelper &cl_helper,
+                     const ConvParam<GPU_CL> &param);
+
+void ConvAddBnRelu(framework::CLHelper &cl_helper,
+                   const ConvParam<GPU_CL> &param, bool ifRelu = false,
+                   const CLImage *biase = nullptr,
+                   const CLImage *new_scale = nullptr,
+                   const CLImage *new_bias = nullptr);
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -2157,6 +2157,176 @@ __kernel void convBNAdd_1x1(__private const int global_size_dim0,
  write_imageh(output_image, output_pos, output);
 }

+__kernel void convBNAdd_1x1_spl(
+        __private const int global_size_dim0, __private const int global_size_dim1,
+        __private const int global_size_dim2, __read_only image2d_t input_image,
+        __read_only image2d_t filter,
+#ifdef BIASE
+        __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+        __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+        __write_only image2d_t output_image, __private const int stride,
+        __private const int offset, __private const int input_c,
+        __private const int dilation,
+        __private const int input_width,  /* of one block */
+        __private const int input_height, /* of one block */
+        __private const int output_width,
+        __private const int output_height,
+        __private const int old_w
+) {
+
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  int out_w0 = out_w;
+  int out_w1 = out_w + global_size_dim1;
+  int out_w2 = out_w + global_size_dim1 * 2;
+  int out_w3 = out_w + global_size_dim1 * 3;
+
+//  int out_w1 = out_w + global_size_dim1;
+//  int out_w2 = out_w + global_size_dim1 * 2;
+//  int out_w3 = out_w + global_size_dim1 * 3;
+
+  const sampler_t sampler =
+          CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  int2 stride_xy = (int2)(stride, stride);
+
+  int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
+  int2 in_pos_in_one_block0 =
+          ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
+
+  int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
+  int2 in_pos_in_one_block1 =
+          ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
+
+  int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
+  int2 in_pos_in_one_block2 =
+          ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
+
+  int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
+  int2 in_pos_in_one_block3 =
+          ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
+
+  
+  half4 output0 = 0.0f;
+  half4 output1 = 0.0f;
+  half4 output2 = 0.0f;
+  half4 output3 = 0.0f;
+
+  for (int i = 0; i < input_c; ++i) {
+    // ------------0---------------
+    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
+    half4 input0 = read_imageh(input_image, sampler, pos_in);
+
+    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+    half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+    half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+    half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
+
+    output0 = mad(input0.x, weight0, output0);
+    output0 = mad(input0.y, weight1, output0);
+    output0 = mad(input0.z, weight2, output0);
+    output0 = mad(input0.w, weight3, output0);
+
+    // -------------1--------------
+    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
+    half4 input1 = read_imageh(input_image, sampler, pos_in);
+    //
+    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+    //    * 4 + 3));
+
+    output1 = mad(input1.x, weight0, output1);
+    output1 = mad(input1.y, weight1, output1);
+    output1 = mad(input1.z, weight2, output1);
+    output1 = mad(input1.w, weight3, output1);
+
+    // -------------2--------------
+    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
+    half4 input2 = read_imageh(input_image, sampler, pos_in);
+
+    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+    //    * 4 + 3));
+
+    output2 = mad(input2.x, weight0, output2);
+    output2 = mad(input2.y, weight1, output2);
+    output2 = mad(input2.z, weight2, output2);
+    output2 = mad(input2.w, weight3, output2);
+
+    // -------------3--------------
+    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
+    half4 input3 = read_imageh(input_image, sampler, pos_in);
+
+    //    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
+    //    0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4
+    //    + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i *
+    //    4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i
+    //    * 4 + 3));
+
+    output3 = mad(input3.x, weight0, output3);
+    output3 = mad(input3.y, weight1, output3);
+    output3 = mad(input3.z, weight2, output3);
+    output3 = mad(input3.w, weight3, output3);
+  }
+
+#ifdef BATCH_NORM
+  output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+
+    output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+
+    output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+
+    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+#endif
+
+#ifdef BIASE
+  output0= read_imageh(bias, sampler, (int2)(out_c, 0));
+  output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
+  output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
+#endif
+      
+#ifdef RELU
+  output0 = activation(output0);
+  output1 = activation(output1);
+  output2 = activation(output2);
+  output3 = activation(output3);
+#endif
+  int outpos_main = mul24(out_c , old_w);
+  int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
+
+  if (out_w0 < old_w) {
+    write_imageh(output_image, output_pos0, output0);
+  }
+  int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
+  if (out_w1 < old_w){
+    write_imageh(output_image, output_pos1, output1);
+  }
+
+  int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
+  if (out_w2 < old_w){
+    write_imageh(output_image, output_pos2, output2);
+  }
+
+  int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
+  if (out_w3 < old_w){
+    write_imageh(output_image, output_pos3, output3);
+  }
+}
+




--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -18,10 +18,10 @@ limitations under the License. */
 #include <cmath>
 #include "framework/cl/cl_image.h"
 #include "framework/cl/cl_tool.h"
+#include "operators/kernel/cl/cl-kernel-func/conv_func.h"

 namespace paddle_mobile {
 namespace operators {
-bool optimise = true;
 template <>
 bool ConvAddBNReluKernel<GPU_CL, float>::Init(
    FusionConvAddBNReluParam<GPU_CL> *param) {
@@ -139,11 +139,7 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
    param->Filter()->InitNImage(cl_helper_.CLContext(),
                                cl_helper_.CLCommandQueue());
-    if (optimise) {
    this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_bn_relu_kernel.cl");
-    } else {
-      this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
-    }

    DLOG << " conv add bn relu conv 1x1";
  } else if (param->Filter()->dims()[1] == 1 &&
@@ -171,225 +167,8 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
 template <>
 void ConvAddBNReluKernel<GPU_CL, float>::Compute(
    const FusionConvAddBNReluParam<GPU_CL> &param) {
-  auto kernel = this->cl_helper_.KernelAt(0);
-  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
-  int c_block = default_work_size[0];
-  int w = default_work_size[1];
-  int nh = default_work_size[2];
-  auto input = param.Input()->GetCLImage();
-  auto filter = param.Filter()->GetCLImage();
-  auto biase = param.Bias()->GetCLImage();
-  auto new_scale = param.NewScale()->GetCLImage();
-  auto new_bias = param.NewBias()->GetCLImage();
-  auto output = param.Output()->GetCLImage();
-  int stride = param.Strides()[0];
-  int offset = param.Offset();
-  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
-                    param.Input()->Converter())
-                    ->GetCBlock();
-  int dilation = param.Dilations()[0];
-  int input_width = param.Input()->dims()[3];
-  int input_height = param.Input()->dims()[2];
-  int output_width = param.Output()->dims()[3];
-  int output_height = param.Output()->dims()[2];
-
-  //  DLOG << " c block " << c_block;
-  //  DLOG << " w " << w;
-  //  DLOG << " nh " << nh;
-  //  DLOG << " stride " << stride;
-  //  DLOG << " offset " << offset;
-  //  DLOG << " input_c " << input_c;
-  //  DLOG << " dilation " << dilation;
-  //  DLOG << " input width " << input_width;
-  //  DLOG << " input height " << input_height;
-  //  DLOG << " output width " << output_width;
-  //  DLOG << " output height " << output_height;
-  //  DLOG << " input dim " << param.Input()->dims();
-  //  DLOG << " output dim " << param.Output()->dims();
-  //  DLOG << " filter dim " << param.Filter()->dims();
-
-  cl_int status;
-
-  if (optimise) {
-    if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) {
-      status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-      CL_CHECK_ERRORS(status);
-
-      int maped_w = maptofactor(w, 4);
-      status = clSetKernelArg(kernel, 1, sizeof(int), &maped_w);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 17, sizeof(int), &w);
-      CL_CHECK_ERRORS(status);
-
-      const size_t work_size[3] = {
-          static_cast<const uint32_t>(default_work_size.data()[0]),
-          static_cast<const uint32_t>(maped_w),
-          static_cast<const uint32_t>(default_work_size.data()[2])};
-
-      status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel,
-                                      default_work_size.size(), NULL, work_size,
-                                      NULL, 0, NULL, NULL);
-      CL_CHECK_ERRORS(status);
-    } else {
-      status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
-      CL_CHECK_ERRORS(status);
-
-      status = clEnqueueNDRangeKernel(
-          this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
-          NULL, default_work_size.data(), NULL, 0, NULL, NULL);
-      CL_CHECK_ERRORS(status);
-    }
-
-  } else {
-    status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
-    CL_CHECK_ERRORS(status);
-    status = clEnqueueNDRangeKernel(
-        this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
-        NULL, default_work_size.data(), NULL, 0, NULL, NULL);
-    CL_CHECK_ERRORS(status);
-  }
+  ConvAddBnRelu(this->cl_helper_, param, true, param.Bias(), param.NewScale(),
+                param.NewBias());
 }

 template class ConvAddBNReluKernel<GPU_CL, float>;

--- a/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -15,10 +15,10 @@ limitations under the License. */
 #ifdef FUSION_CONVADD_OP

 #include "operators/kernel/conv_add_kernel.h"
+#include "operators/kernel/cl/cl-kernel-func/conv_func.h"

 namespace paddle_mobile {
 namespace operators {
-bool optimise_convadd = true;

 template <>
 bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
@@ -36,11 +36,7 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
    param->Filter()->InitNImage(cl_helper_.CLContext(),
                                cl_helper_.CLCommandQueue());
-    if (optimise_convadd) {
    this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_kernel.cl");
-    } else {
-      this->cl_helper_.AddKernel("conv_1x1", "conv_add_kernel.cl");
-    }
  } else if (param->Filter()->dims()[1] == 1 &&
             param->Input()->dims()[1] == param->Output()->dims()[1] &&
             param->Filter()->dims()[2] == 3) {
@@ -73,143 +69,7 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
 template <>
 void ConvAddKernel<GPU_CL, float>::Compute(
    const FusionConvAddParam<GPU_CL> &param) {
-  auto kernel = this->cl_helper_.KernelAt(0);
-  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
-  int c_block = default_work_size[0];
-  int w = default_work_size[1];
-  int nh = default_work_size[2];
-  auto input = param.Input()->GetCLImage();
-  auto filter = param.Filter()->GetCLImage();
-  auto biase = param.Bias()->GetCLImage();
-  param.Output()->InitEmptyImage(cl_helper_.CLContext(),
-                                 cl_helper_.CLCommandQueue(),
-                                 param.Output()->dims());
-  auto output = param.Output()->GetCLImage();
-  int stride = param.Strides()[0];
-  int offset = param.Offset();
-  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
-                    param.Input()->Converter())
-                    ->GetCBlock();
-  int dilation = param.Dilations()[0];
-
-  int input_width = param.Input()->dims()[3];
-  int input_height = param.Input()->dims()[2];
-  int output_width = param.Output()->dims()[3];
-  int output_height = param.Output()->dims()[2];
-
-  cl_int status;
-
-  if (optimise_convadd && param.Filter()->dims()[2] == 1 &&
-      param.Filter()->dims()[3] == 1) {
-    status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-    CL_CHECK_ERRORS(status);
-
-    int maped_w = maptofactor(w, 4);
-    status = clSetKernelArg(kernel, 1, sizeof(int), &maped_w);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 15, sizeof(int), &w);
-    CL_CHECK_ERRORS(status);
-
-    const size_t work_size[3] = {
-        static_cast<const uint32_t>(default_work_size.data()[0]),
-        static_cast<const uint32_t>(maped_w),
-        static_cast<const uint32_t>(default_work_size.data()[2])};
-
-    status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel,
-                                    default_work_size.size(), NULL, work_size,
-                                    NULL, 0, NULL, NULL);
-    CL_CHECK_ERRORS(status);
-  } else {
-    status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
-    CL_CHECK_ERRORS(status);
-
-    status = clEnqueueNDRangeKernel(
-        this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
-        NULL, default_work_size.data(), NULL, 0, NULL, NULL);
-    CL_CHECK_ERRORS(status);
-  }
+  ConvAddBnRelu(this->cl_helper_, param, false, param.Bias());
 }

 template class ConvAddKernel<GPU_CL, float>;

--- a/src/operators/kernel/cl/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_relu_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDRELU_OP

 #include "operators/kernel/conv_add_relu_kernel.h"
+#include "operators/kernel/cl/cl-kernel-func/conv_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -37,7 +38,7 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
    param->Filter()->InitNImage(cl_helper_.CLContext(),
                                cl_helper_.CLCommandQueue());

-    this->cl_helper_.AddKernel("conv_1x1", "conv_add_relu_kernel.cl");
+    this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_relu_kernel.cl");
  } else if (param->Filter()->dims()[1] == 1 &&
             param->Input()->dims()[1] == param->Output()->dims()[1] &&
             param->Filter()->dims()[2] == 3) {
@@ -72,84 +73,7 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
 template <>
 void ConvAddReluKernel<GPU_CL, float>::Compute(
    const FusionConvAddReluParam<GPU_CL> &param) {
-  auto kernel = this->cl_helper_.KernelAt(0);
-  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
-  int c_block = default_work_size[0];
-  int w = default_work_size[1];
-  int nh = default_work_size[2];
-  auto input = param.Input()->GetCLImage();
-  auto filter = param.Filter()->GetCLImage();
-  DLOG << "---yangfei30---";
-  DLOG << *param.Filter();
-  DLOG << param.Paddings();
-  auto biase = param.Bias()->GetCLImage();
-  auto output = param.Output()->GetCLImage();
-  int stride = param.Strides()[0];
-  int offset = param.Offset();
-  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
-                    param.Input()->Converter())
-                    ->GetCBlock();
-  int dilation = param.Dilations()[0];
-
-  int input_width = param.Input()->dims()[3];
-  int input_height = param.Input()->dims()[2];
-  int output_width = param.Output()->dims()[3];
-  int output_height = param.Output()->dims()[2];
-
-  cl_int status;
-
-  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
-  CL_CHECK_ERRORS(status);
-
-  //  cl_event out_event = param.Output()->GetClEvent();
-  //  cl_event wait_event = param.Input()->GetClEvent();
-
-  status = clEnqueueNDRangeKernel(
-      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-      default_work_size.data(), NULL, 0, NULL, NULL);
-  CL_CHECK_ERRORS(status);
+  ConvAddBnRelu(this->cl_helper_, param, true, param.Bias());
 }

 template class ConvAddReluKernel<GPU_CL, float>;

--- a/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp
@@ -16,6 +16,7 @@ limitations under the License. */

 #include "operators/kernel/conv_bn_add_relu_kernel.h"
 #include <cmath>
+#include "operators/kernel/cl/cl-kernel-func/conv_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -102,7 +103,8 @@ bool ConvBNAddReluKernel<GPU_CL, float>::Init(
  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
    param->Filter()->InitNImage(cl_helper_.CLContext(),
                                cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("convBNAdd_1x1", "conv_bn_add_relu_kernel.cl");
+    this->cl_helper_.AddKernel("convBNAdd_1x1_spl",
+                               "conv_bn_add_relu_kernel.cl");
    DLOG << " conv bn add relu conv 1x1";
  } else if (param->Filter()->dims()[1] == 1 &&
             param->Input()->dims()[1] == param->Output()->dims()[1] &&
@@ -130,101 +132,8 @@ bool ConvBNAddReluKernel<GPU_CL, float>::Init(
 template <>
 void ConvBNAddReluKernel<GPU_CL, float>::Compute(
    const FusionConvBNAddReluParam<GPU_CL> &param) {
-  auto kernel = this->cl_helper_.KernelAt(0);
-  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
-  int c_block = default_work_size[0];
-  int w = default_work_size[1];
-  int nh = default_work_size[2];
-  auto input = param.Input()->GetCLImage();
-  auto filter = param.Filter()->GetCLImage();
-  auto biase = param.Bias()->GetCLImage();
-  auto new_scale = param.NewScale()->GetCLImage();
-  auto new_bias = param.NewBias()->GetCLImage();
-  auto output = param.Output()->GetCLImage();
-  int stride = param.Strides()[0];
-  int offset = param.Offset();
-  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
-                    param.Input()->Converter())
-                    ->GetCBlock();
-  int dilation = param.Dilations()[0];
-  int input_width = param.Input()->dims()[3];
-  int input_height = param.Input()->dims()[2];
-  int output_width = param.Output()->dims()[3];
-  int output_height = param.Output()->dims()[2];
-
-  //              DLOG << " c block " << c_block;
-  //              DLOG << " w " << w;
-  //              DLOG << " nh " << nh;
-  //              DLOG << " stride " << stride;
-  //              DLOG << " offset " << offset;
-  //              DLOG << " input_c " << input_c;
-  //              DLOG << " dilation " << dilation;
-  //              DLOG << " input width " << input_width;
-  //              DLOG << " input height " << input_height;
-  //              DLOG << " output width " << output_width;
-  //              DLOG << " output height " << output_height;
-  //              DLOG << " input dim " << *param.Input();
-  //              DLOG << " output dim " <<* param.Output();
-  //              DLOG << " filter dim " << *param.Filter();
-  //              DLOG<<*param.Bias();
-
-  cl_int status;
-
-  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
-  CL_CHECK_ERRORS(status);
-
-  status = clEnqueueNDRangeKernel(
-      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-      default_work_size.data(), NULL, 0, NULL, NULL);
-  CL_CHECK_ERRORS(status);
+  ConvAddBnRelu(this->cl_helper_, param, true, param.Bias(), param.NewScale(),
+                param.NewBias());
 }
 template class ConvBNAddReluKernel<GPU_CL, float>;


--- a/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
@@ -16,6 +16,7 @@ limitations under the License. */

 #include "operators/kernel/conv_bn_relu_kernel.h"
 #include <cmath>
+#include "operators/kernel/cl/cl-kernel-func/conv_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -100,7 +101,7 @@ bool ConvBNReluKernel<GPU_CL, float>::Init(
  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
    param->Filter()->InitNImage(cl_helper_.CLContext(),
                                cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_1x1", "conv_bn_relu_kernel.cl");
+    this->cl_helper_.AddKernel("conv_1x1_spl", "conv_bn_relu_kernel.cl");
    DLOG << " conv bn relu conv 1x1";
  } else if (param->Filter()->dims()[1] == 1 &&
             param->Input()->dims()[1] == param->Output()->dims()[1] &&
@@ -126,81 +127,8 @@ bool ConvBNReluKernel<GPU_CL, float>::Init(
 template <>
 void ConvBNReluKernel<GPU_CL, float>::Compute(
    const FusionConvBNReluParam<GPU_CL> &param) {
-  auto kernel = this->cl_helper_.KernelAt(0);
-  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
-  int c_block = default_work_size[0];
-  int w = default_work_size[1];
-  int nh = default_work_size[2];
-  auto input = param.Input()->GetCLImage();
-  auto filter = param.Filter()->GetCLImage();
-  auto new_scale = param.NewScale()->GetCLImage();
-  auto new_bias = param.NewBias()->GetCLImage();
-  auto output = param.Output()->GetCLImage();
-  int stride = param.Strides()[0];
-  int offset = param.Offset();
-  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
-                    param.Input()->Converter())
-                    ->GetCBlock();
-  int dilation = param.Dilations()[0];
-  int input_width = param.Input()->dims()[3];
-  int input_height = param.Input()->dims()[2];
-  int output_width = param.Output()->dims()[3];
-  int output_height = param.Output()->dims()[2];
-
-  cl_int status;
-
-  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &new_scale);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_bias);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &output);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 8, sizeof(int), &stride);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 9, sizeof(int), &offset);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 10, sizeof(int), &input_c);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 11, sizeof(int), &dilation);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 12, sizeof(int), &input_width);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 13, sizeof(int), &input_height);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 14, sizeof(int), &output_width);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 15, sizeof(int), &output_height);
-  CL_CHECK_ERRORS(status);
-
-  status = clEnqueueNDRangeKernel(
-      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-      default_work_size.data(), NULL, 0, NULL, NULL);
-  CL_CHECK_ERRORS(status);
+  ConvAddBnRelu(this->cl_helper_, param, true, nullptr, param.NewScale(),
+                param.NewBias());
 }
 template class ConvBNReluKernel<GPU_CL, float>;


--- a/src/operators/kernel/cl/conv_kernel.cpp
+++ b/src/operators/kernel/cl/conv_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef CONV_OP

 #include "operators/kernel/conv_kernel.h"
+#include "operators/kernel/cl/cl-kernel-func/conv_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -39,7 +40,7 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
    param->Filter()->InitNImage(cl_helper_.CLContext(),
                                cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_1x1", "conv_kernel.cl");
+    this->cl_helper_.AddKernel("conv_1x1_spl", "conv_kernel.cl");
    DLOG << "conv 1x1";

  } else if (param->Filter()->dims()[1] == 1 &&
@@ -66,64 +67,7 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {

 template <>
 void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
-  auto kernel = this->cl_helper_.KernelAt(0);
-  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
-  int c_block = default_work_size[0];
-  int w = default_work_size[1];
-  int nh = default_work_size[2];
-  auto input = param.Input()->GetCLImage();
-  auto filter = param.Filter()->GetCLImage();
-  auto output = param.Output()->GetCLImage();
-
-  int stride = param.Strides()[0];
-  int offset = param.Offset();
-  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
-                    param.Input()->Converter())
-                    ->GetCBlock();
-  int dilation = param.Dilations()[0];
-
-  int input_width = param.Input()->dims()[3];
-  int input_height = param.Input()->dims()[2];
-  int output_width = param.Output()->dims()[3];
-  int output_height = param.Output()->dims()[2];
-
-  cl_int status;
-
-  DLOG << " begin set kernel arg ";
-  DLOG << " c block " << c_block;
-  DLOG << " w " << w;
-  DLOG << " nh " << nh;
-  DLOG << " stride " << stride;
-  DLOG << " offset " << offset;
-  DLOG << " input_c " << input_c;
-  DLOG << " dilation " << dilation;
-  DLOG << " input width " << input_width;
-  DLOG << " input height " << input_height;
-  DLOG << " output width " << output_width;
-  DLOG << " output height " << output_height;
-
-  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
-  status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
-  status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
-  status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
-  status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
-  status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
-  status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
-  status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
-  status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
-
-  //  cl_event out_event = param.Output()->GetClEvent();
-  //  cl_event wait_event = param.Input()->GetClEvent();
-
-  status = clEnqueueNDRangeKernel(
-      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-      default_work_size.data(), NULL, 0, NULL, NULL);
-  CL_CHECK_ERRORS(status);
+  ConvAddBnRelu(this->cl_helper_, param);
 }

 template class ConvKernel<GPU_CL, float>;

--- a/src/operators/kernel/conv_add_bn_relu_kernel.h
+++ b/src/operators/kernel/conv_add_bn_relu_kernel.h
@@ -36,9 +36,6 @@ class ConvAddBNReluKernel
 public:
  void Compute(const FusionConvAddBNReluParam<DeviceType> &param);
  bool Init(FusionConvAddBNReluParam<DeviceType> *param);
-  inline int maptofactor(int i, int factor) {
-    return (i + factor - 1) / factor;
-  }
 };

 }  // namespace operators

--- a/src/operators/kernel/conv_add_kernel.h
+++ b/src/operators/kernel/conv_add_kernel.h
@@ -41,9 +41,6 @@ class ConvAddKernel
 public:
  void Compute(const FusionConvAddParam<DeviceType> &param);
  bool Init(FusionConvAddParam<DeviceType> *param);
-  inline int maptofactor(int i, int factor) {
-    return (i + factor - 1) / factor;
-  }
 };

 }  // namespace operators