adjust gpu code structure

44809658 · zhaojiaying01 · 93c3df7b · 44809658 · 44809658 · 44809658
10 changed file
--- a/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
+++ b/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "operators/kernel/cl/cl-kernel-func/conv_func.h"
+#include "framework/cl/cl_image_converter.h"
+#include "framework/cl/cl_tensor.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+void winograd_transform_weight<4, 3>(framework::CLHelper &cl_helper,
+                                     framework::CLImage &weight){};
+
+template <>
+void WinogradConv3x3<4, 3>(framework::CLHelper &cl_helper,
+                           const ConvParam<GPU_CL> &param) {}
+
+void ConvAddBnRelu(framework::CLHelper &cl_helper,
+                   const ConvParam<GPU_CL> &param, bool ifRelu,
+                   const CLImage *biase, const CLImage *new_scale,
+                   const CLImage *new_bias) {
+  auto kernel = cl_helper.KernelAt(0);
+  auto default_work_size = cl_helper.DefaultWorkSize(*param.Output());
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+
+  auto output = param.Output()->GetCLImage();
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
+                    param.Input()->Converter())
+                    ->GetCBlock();
+  int dilation = param.Dilations()[0];
+  int input_width = param.Input()->dims()[3];
+  int input_height = param.Input()->dims()[2];
+  int output_width = param.Output()->dims()[3];
+  int output_height = param.Output()->dims()[2];
+
+  //  DLOG << " c block " << c_block;
+  //  DLOG << " w " << w;
+  //  DLOG << " nh " << nh;
+  //  DLOG << " stride " << stride;
+  //  DLOG << " offset " << offset;
+  //  DLOG << " input_c " << input_c;
+  //  DLOG << " dilation " << dilation;
+  //  DLOG << " input width " << input_width;
+  //  DLOG << " input height " << input_height;
+  //  DLOG << " output width " << output_width;
+  //  DLOG << " output height " << output_height;
+  //  DLOG << " input dim " << param.Input()->dims();
+  //  DLOG << " output dim " << param.Output()->dims();
+  //  DLOG << " filter dim " << param.Filter()->dims();
+
+  cl_int status;
+  int index = 0;
+
+  if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) {
+    status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
+    CL_CHECK_ERRORS(status);
+
+    int maped_w = maptofactor(w, 4);
+    status = clSetKernelArg(kernel, index++, sizeof(int), &maped_w);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &nh);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter);
+    CL_CHECK_ERRORS(status);
+
+    if (biase) {
+      auto bias_mem = biase->GetCLImage();
+      status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem);
+      CL_CHECK_ERRORS(status);
+    }
+
+    if (new_scale && new_bias) {
+      auto new_scale_mem = new_scale->GetCLImage();
+      status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem);
+      CL_CHECK_ERRORS(status);
+
+      auto new_bias_mem = new_bias->GetCLImage();
+      status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem);
+      CL_CHECK_ERRORS(status);
+    }
+
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &stride);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &offset);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &input_c);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &dilation);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &input_width);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &input_height);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &output_width);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &w);
+    CL_CHECK_ERRORS(status);
+
+    const size_t work_size[3] = {
+        static_cast<const uint32_t>(default_work_size.data()[0]),
+        static_cast<const uint32_t>(maped_w),
+        static_cast<const uint32_t>(default_work_size.data()[2])};
+
+    status = clEnqueueNDRangeKernel(cl_helper.CLCommandQueue(), kernel,
+                                    default_work_size.size(), NULL, work_size,
+                                    NULL, 0, NULL, NULL);
+    CL_CHECK_ERRORS(status);
+  } else {
+    status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &w);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &nh);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter);
+    CL_CHECK_ERRORS(status);
+
+    if (biase) {
+      auto bias_mem = biase->GetCLImage();
+      status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem);
+      CL_CHECK_ERRORS(status);
+    }
+
+    if (new_scale && new_bias) {
+      auto new_scale_mem = new_scale->GetCLImage();
+      status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem);
+      CL_CHECK_ERRORS(status);
+
+      auto new_bias_mem = new_bias->GetCLImage();
+      status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem);
+      CL_CHECK_ERRORS(status);
+    }
+
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &stride);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &offset);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &input_c);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &dilation);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &input_width);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &input_height);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &output_width);
+    CL_CHECK_ERRORS(status);
+
+    status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
+    CL_CHECK_ERRORS(status);
+
+    status = clEnqueueNDRangeKernel(
+        cl_helper.CLCommandQueue(), kernel, default_work_size.size(), NULL,
+        default_work_size.data(), NULL, 0, NULL, NULL);
+    CL_CHECK_ERRORS(status);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
\ No newline at end of file
--- a/src/operators/kernel/cl/cl-kernel-func/conv_func.h
+++ b/src/operators/kernel/cl/cl-kernel-func/conv_func.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef CONV_OP
+
+#pragma once
+
+#include "framework/cl/cl_helper.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using namespace framework;
+
+inline int maptofactor(int i, int factor) { return (i + factor - 1) / factor; }
+
+template <int tile, int kernel>
+void winograd_transform_weight(framework::CLHelper &cl_helper,
+                               framework::CLImage &weight);
+
+template <int tile, int kernel>
+void WinogradConv3x3(framework::CLHelper &cl_helper,
+                     const ConvParam<GPU_CL> &param);
+
+void ConvAddBnRelu(framework::CLHelper &cl_helper,
+                   const ConvParam<GPU_CL> &param,
+                   bool ifRelu = false,
+                   const CLImage *biase = nullptr,
+                   const CLImage *new_scale = nullptr,
+                   const CLImage *new_bias = nullptr);
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cmath>
 #include "framework/cl/cl_image.h"
 #include "framework/cl/cl_tool.h"
+#include "operators/kernel/cl/cl-kernel-func/conv_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -171,225 +172,8 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
 template <>
 void ConvAddBNReluKernel<GPU_CL, float>::Compute(
    const FusionConvAddBNReluParam<GPU_CL> &param) {
-  auto kernel = this->cl_helper_.KernelAt(0);
-  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
-  int c_block = default_work_size[0];
-  int w = default_work_size[1];
-  int nh = default_work_size[2];
-  auto input = param.Input()->GetCLImage();
-  auto filter = param.Filter()->GetCLImage();
-  auto biase = param.Bias()->GetCLImage();
-  auto new_scale = param.NewScale()->GetCLImage();
-  auto new_bias = param.NewBias()->GetCLImage();
-  auto output = param.Output()->GetCLImage();
-  int stride = param.Strides()[0];
-  int offset = param.Offset();
-  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
-                    param.Input()->Converter())
-                    ->GetCBlock();
-  int dilation = param.Dilations()[0];
-  int input_width = param.Input()->dims()[3];
-  int input_height = param.Input()->dims()[2];
-  int output_width = param.Output()->dims()[3];
-  int output_height = param.Output()->dims()[2];
-
-  //  DLOG << " c block " << c_block;
-  //  DLOG << " w " << w;
-  //  DLOG << " nh " << nh;
-  //  DLOG << " stride " << stride;
-  //  DLOG << " offset " << offset;
-  //  DLOG << " input_c " << input_c;
-  //  DLOG << " dilation " << dilation;
-  //  DLOG << " input width " << input_width;
-  //  DLOG << " input height " << input_height;
-  //  DLOG << " output width " << output_width;
-  //  DLOG << " output height " << output_height;
-  //  DLOG << " input dim " << param.Input()->dims();
-  //  DLOG << " output dim " << param.Output()->dims();
-  //  DLOG << " filter dim " << param.Filter()->dims();
-
-  cl_int status;
-
-  if (optimise) {
-    if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) {
-      status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-      CL_CHECK_ERRORS(status);
-
-      int maped_w = maptofactor(w, 4);
-      status = clSetKernelArg(kernel, 1, sizeof(int), &maped_w);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 17, sizeof(int), &w);
-      CL_CHECK_ERRORS(status);
-
-      const size_t work_size[3] = {
-          static_cast<const uint32_t>(default_work_size.data()[0]),
-          static_cast<const uint32_t>(maped_w),
-          static_cast<const uint32_t>(default_work_size.data()[2])};
-
-      status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel,
-                                      default_work_size.size(), NULL, work_size,
-                                      NULL, 0, NULL, NULL);
-      CL_CHECK_ERRORS(status);
-    } else {
-      status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
-      CL_CHECK_ERRORS(status);
-
-      status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
-      CL_CHECK_ERRORS(status);
-
-      status = clEnqueueNDRangeKernel(
-          this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
-          NULL, default_work_size.data(), NULL, 0, NULL, NULL);
-      CL_CHECK_ERRORS(status);
-    }
-
-  } else {
-    status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
-    CL_CHECK_ERRORS(status);
-    status = clEnqueueNDRangeKernel(
-        this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
-        NULL, default_work_size.data(), NULL, 0, NULL, NULL);
-    CL_CHECK_ERRORS(status);
-  }
+  ConvAddBnRelu(this->cl_helper_, param, true, param.Bias(), param.NewScale(),
+                param.NewBias());
 }

 template class ConvAddBNReluKernel<GPU_CL, float>;

--- a/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADD_OP

 #include "operators/kernel/conv_add_kernel.h"
+#include "operators/kernel/cl/cl-kernel-func/conv_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -73,143 +74,7 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
 template <>
 void ConvAddKernel<GPU_CL, float>::Compute(
    const FusionConvAddParam<GPU_CL> &param) {
-  auto kernel = this->cl_helper_.KernelAt(0);
-  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
-  int c_block = default_work_size[0];
-  int w = default_work_size[1];
-  int nh = default_work_size[2];
-  auto input = param.Input()->GetCLImage();
-  auto filter = param.Filter()->GetCLImage();
-  auto biase = param.Bias()->GetCLImage();
-  param.Output()->InitEmptyImage(cl_helper_.CLContext(),
-                                 cl_helper_.CLCommandQueue(),
-                                 param.Output()->dims());
-  auto output = param.Output()->GetCLImage();
-  int stride = param.Strides()[0];
-  int offset = param.Offset();
-  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
-                    param.Input()->Converter())
-                    ->GetCBlock();
-  int dilation = param.Dilations()[0];
-
-  int input_width = param.Input()->dims()[3];
-  int input_height = param.Input()->dims()[2];
-  int output_width = param.Output()->dims()[3];
-  int output_height = param.Output()->dims()[2];
-
-  cl_int status;
-
-  if (optimise_convadd && param.Filter()->dims()[2] == 1 &&
-      param.Filter()->dims()[3] == 1) {
-    status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-    CL_CHECK_ERRORS(status);
-
-    int maped_w = maptofactor(w, 4);
-    status = clSetKernelArg(kernel, 1, sizeof(int), &maped_w);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 15, sizeof(int), &w);
-    CL_CHECK_ERRORS(status);
-
-    const size_t work_size[3] = {
-        static_cast<const uint32_t>(default_work_size.data()[0]),
-        static_cast<const uint32_t>(maped_w),
-        static_cast<const uint32_t>(default_work_size.data()[2])};
-
-    status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel,
-                                    default_work_size.size(), NULL, work_size,
-                                    NULL, 0, NULL, NULL);
-    CL_CHECK_ERRORS(status);
-  } else {
-    status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
-    CL_CHECK_ERRORS(status);
-
-    status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
-    CL_CHECK_ERRORS(status);
-
-    status = clEnqueueNDRangeKernel(
-        this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
-        NULL, default_work_size.data(), NULL, 0, NULL, NULL);
-    CL_CHECK_ERRORS(status);
-  }
+	ConvAddBnRelu(this->cl_helper_, param, false, param.Bias());
 }

 template class ConvAddKernel<GPU_CL, float>;

--- a/src/operators/kernel/cl/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_relu_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDRELU_OP

 #include "operators/kernel/conv_add_relu_kernel.h"
+#include "operators/kernel/cl/cl-kernel-func/conv_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -72,84 +73,7 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
 template <>
 void ConvAddReluKernel<GPU_CL, float>::Compute(
    const FusionConvAddReluParam<GPU_CL> &param) {
-  auto kernel = this->cl_helper_.KernelAt(0);
-  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
-  int c_block = default_work_size[0];
-  int w = default_work_size[1];
-  int nh = default_work_size[2];
-  auto input = param.Input()->GetCLImage();
-  auto filter = param.Filter()->GetCLImage();
-  DLOG << "---yangfei30---";
-  DLOG << *param.Filter();
-  DLOG << param.Paddings();
-  auto biase = param.Bias()->GetCLImage();
-  auto output = param.Output()->GetCLImage();
-  int stride = param.Strides()[0];
-  int offset = param.Offset();
-  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
-                    param.Input()->Converter())
-                    ->GetCBlock();
-  int dilation = param.Dilations()[0];
-
-  int input_width = param.Input()->dims()[3];
-  int input_height = param.Input()->dims()[2];
-  int output_width = param.Output()->dims()[3];
-  int output_height = param.Output()->dims()[2];
-
-  cl_int status;
-
-  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
-  CL_CHECK_ERRORS(status);
-
-  //  cl_event out_event = param.Output()->GetClEvent();
-  //  cl_event wait_event = param.Input()->GetClEvent();
-
-  status = clEnqueueNDRangeKernel(
-      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-      default_work_size.data(), NULL, 0, NULL, NULL);
-  CL_CHECK_ERRORS(status);
+  ConvAddBnRelu(this->cl_helper_, param, true, param.Bias());
 }

 template class ConvAddReluKernel<GPU_CL, float>;

--- a/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp
@@ -16,6 +16,7 @@ limitations under the License. */

 #include "operators/kernel/conv_bn_add_relu_kernel.h"
 #include <cmath>
+#include "operators/kernel/cl/cl-kernel-func/conv_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -130,101 +131,7 @@ bool ConvBNAddReluKernel<GPU_CL, float>::Init(
 template <>
 void ConvBNAddReluKernel<GPU_CL, float>::Compute(
    const FusionConvBNAddReluParam<GPU_CL> &param) {
-  auto kernel = this->cl_helper_.KernelAt(0);
-  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
-  int c_block = default_work_size[0];
-  int w = default_work_size[1];
-  int nh = default_work_size[2];
-  auto input = param.Input()->GetCLImage();
-  auto filter = param.Filter()->GetCLImage();
-  auto biase = param.Bias()->GetCLImage();
-  auto new_scale = param.NewScale()->GetCLImage();
-  auto new_bias = param.NewBias()->GetCLImage();
-  auto output = param.Output()->GetCLImage();
-  int stride = param.Strides()[0];
-  int offset = param.Offset();
-  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
-                    param.Input()->Converter())
-                    ->GetCBlock();
-  int dilation = param.Dilations()[0];
-  int input_width = param.Input()->dims()[3];
-  int input_height = param.Input()->dims()[2];
-  int output_width = param.Output()->dims()[3];
-  int output_height = param.Output()->dims()[2];
-
-  //              DLOG << " c block " << c_block;
-  //              DLOG << " w " << w;
-  //              DLOG << " nh " << nh;
-  //              DLOG << " stride " << stride;
-  //              DLOG << " offset " << offset;
-  //              DLOG << " input_c " << input_c;
-  //              DLOG << " dilation " << dilation;
-  //              DLOG << " input width " << input_width;
-  //              DLOG << " input height " << input_height;
-  //              DLOG << " output width " << output_width;
-  //              DLOG << " output height " << output_height;
-  //              DLOG << " input dim " << *param.Input();
-  //              DLOG << " output dim " <<* param.Output();
-  //              DLOG << " filter dim " << *param.Filter();
-  //              DLOG<<*param.Bias();
-
-  cl_int status;
-
-  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
-  CL_CHECK_ERRORS(status);
-
-  status = clEnqueueNDRangeKernel(
-      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-      default_work_size.data(), NULL, 0, NULL, NULL);
-  CL_CHECK_ERRORS(status);
+	ConvAddBnRelu(this->cl_helper_, param, true, param.Bias(), param.NewScale(), param.NewBias());
 }
 template class ConvBNAddReluKernel<GPU_CL, float>;


--- a/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
@@ -16,6 +16,7 @@ limitations under the License. */

 #include "operators/kernel/conv_bn_relu_kernel.h"
 #include <cmath>
+#include "operators/kernel/cl/cl-kernel-func/conv_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -126,81 +127,7 @@ bool ConvBNReluKernel<GPU_CL, float>::Init(
 template <>
 void ConvBNReluKernel<GPU_CL, float>::Compute(
    const FusionConvBNReluParam<GPU_CL> &param) {
-  auto kernel = this->cl_helper_.KernelAt(0);
-  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
-  int c_block = default_work_size[0];
-  int w = default_work_size[1];
-  int nh = default_work_size[2];
-  auto input = param.Input()->GetCLImage();
-  auto filter = param.Filter()->GetCLImage();
-  auto new_scale = param.NewScale()->GetCLImage();
-  auto new_bias = param.NewBias()->GetCLImage();
-  auto output = param.Output()->GetCLImage();
-  int stride = param.Strides()[0];
-  int offset = param.Offset();
-  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
-                    param.Input()->Converter())
-                    ->GetCBlock();
-  int dilation = param.Dilations()[0];
-  int input_width = param.Input()->dims()[3];
-  int input_height = param.Input()->dims()[2];
-  int output_width = param.Output()->dims()[3];
-  int output_height = param.Output()->dims()[2];
-
-  cl_int status;
-
-  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &new_scale);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_bias);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &output);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 8, sizeof(int), &stride);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 9, sizeof(int), &offset);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 10, sizeof(int), &input_c);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 11, sizeof(int), &dilation);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 12, sizeof(int), &input_width);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 13, sizeof(int), &input_height);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 14, sizeof(int), &output_width);
-  CL_CHECK_ERRORS(status);
-
-  status = clSetKernelArg(kernel, 15, sizeof(int), &output_height);
-  CL_CHECK_ERRORS(status);
-
-  status = clEnqueueNDRangeKernel(
-      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-      default_work_size.data(), NULL, 0, NULL, NULL);
-  CL_CHECK_ERRORS(status);
+	ConvAddBnRelu(this->cl_helper_, param, true, nullptr, param.NewScale(), param.NewBias());
 }
 template class ConvBNReluKernel<GPU_CL, float>;


--- a/src/operators/kernel/cl/conv_kernel.cpp
+++ b/src/operators/kernel/cl/conv_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef CONV_OP

 #include "operators/kernel/conv_kernel.h"
+#include "operators/kernel/cl/cl-kernel-func/conv_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -66,64 +67,7 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {

 template <>
 void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
-  auto kernel = this->cl_helper_.KernelAt(0);
-  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
-  int c_block = default_work_size[0];
-  int w = default_work_size[1];
-  int nh = default_work_size[2];
-  auto input = param.Input()->GetCLImage();
-  auto filter = param.Filter()->GetCLImage();
-  auto output = param.Output()->GetCLImage();
-
-  int stride = param.Strides()[0];
-  int offset = param.Offset();
-  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
-                    param.Input()->Converter())
-                    ->GetCBlock();
-  int dilation = param.Dilations()[0];
-
-  int input_width = param.Input()->dims()[3];
-  int input_height = param.Input()->dims()[2];
-  int output_width = param.Output()->dims()[3];
-  int output_height = param.Output()->dims()[2];
-
-  cl_int status;
-
-  DLOG << " begin set kernel arg ";
-  DLOG << " c block " << c_block;
-  DLOG << " w " << w;
-  DLOG << " nh " << nh;
-  DLOG << " stride " << stride;
-  DLOG << " offset " << offset;
-  DLOG << " input_c " << input_c;
-  DLOG << " dilation " << dilation;
-  DLOG << " input width " << input_width;
-  DLOG << " input height " << input_height;
-  DLOG << " output width " << output_width;
-  DLOG << " output height " << output_height;
-
-  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
-  status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
-  status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
-  status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
-  status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
-  status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
-  status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
-  status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
-  status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
-
-  //  cl_event out_event = param.Output()->GetClEvent();
-  //  cl_event wait_event = param.Input()->GetClEvent();
-
-  status = clEnqueueNDRangeKernel(
-      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
-      default_work_size.data(), NULL, 0, NULL, NULL);
-  CL_CHECK_ERRORS(status);
+  ConvAddBnRelu(this->cl_helper_, param);
 }

 template class ConvKernel<GPU_CL, float>;

--- a/src/operators/kernel/conv_add_bn_relu_kernel.h
+++ b/src/operators/kernel/conv_add_bn_relu_kernel.h
@@ -36,9 +36,6 @@ class ConvAddBNReluKernel
 public:
  void Compute(const FusionConvAddBNReluParam<DeviceType> &param);
  bool Init(FusionConvAddBNReluParam<DeviceType> *param);
-  inline int maptofactor(int i, int factor) {
-    return (i + factor - 1) / factor;
-  }
 };

 }  // namespace operators

--- a/src/operators/kernel/conv_add_kernel.h
+++ b/src/operators/kernel/conv_add_kernel.h
@@ -41,9 +41,6 @@ class ConvAddKernel
 public:
  void Compute(const FusionConvAddParam<DeviceType> &param);
  bool Init(FusionConvAddParam<DeviceType> *param);
-  inline int maptofactor(int i, int factor) {
-    return (i + factor - 1) / factor;
-  }
 };

 }  // namespace operators