imp conv_add_relu op kernel for gpu

c5c2d2d2 · yangfei · a2bd1692 · c5c2d2d2 · c5c2d2d2 · c5c2d2d2
5 changed file
--- a/src/operators/fusion_conv_add_relu_op.cpp
+++ b/src/operators/fusion_conv_add_relu_op.cpp
@@ -56,5 +56,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp);
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp);
 #endif
-
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(fusion_conv_add_relu, ops::FusionConvAddReluOp);
+#endif
 #endif
--- a/src/operators/fusion_conv_add_relu_op.h
+++ b/src/operators/fusion_conv_add_relu_op.h
@@ -29,9 +29,8 @@ namespace operators {
 class FusionConvAddReluOpMatcher : public framework::FusionOpMatcher {
 public:
  FusionConvAddReluOpMatcher() {
-    node_ = framework::Node(G_OP_TYPE_CONV);
-    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
-        std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+    node_ = framework::Node(G_OP_TYPE_FUSION_CONV_ADD);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
  }

  void FolderNodes(
@@ -81,6 +80,15 @@ static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(
    new FusionConvAddReluOpMatcher());
 #endif

+#endif
+#ifdef PADDLE_MOBILE_CL
+
+#ifndef CONV_ADD_RELU_REGISTER
+#define CONV_ADD_RELU_REGISTER
+static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(
+    new FusionConvAddReluOpMatcher());
+#endif
+
 #endif

 }  // namespace operators
@@ -95,4 +103,8 @@ USE_OP_CPU(fusion_conv_add_relu);
 USE_OP_FPGA(fusion_conv_add_relu);
 #endif

+#ifdef PADDLE_MOBILE_CL
+USE_OP_CL(fusion_conv_add_relu);
+#endif
+
 #endif
--- a/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define BIASE
+#define RELU
+#include "conv_kernel.inc.cl"
--- a/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -37,10 +37,11 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
                                cl_helper_.CLCommandQueue());

    this->cl_helper_.AddKernel("conv_1x1", "conv_add_kernel.cl");
-  } else if (param->Filter()->dims()[1] == 1) {
-    param->Filter()->InitCLImage(cl_helper_.CLContext(),
+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] == 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
-
    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_kernel.cl");

  } else if (param->Filter()->dims()[2] == 3 &&
@@ -67,6 +68,9 @@ void ConvAddKernel<GPU_CL, float>::Compute(
  int nh = default_work_size[2];
  auto input = param.Input()->GetCLImage();
  auto filter = param.Filter()->GetCLImage();
+  DLOG << "---yangfei30---";
+  DLOG << *param.Filter();
+  DLOG << param.Paddings();
  auto biase = param.Bias()->GetCLImage();
  auto output = param.Output()->GetCLImage();
  int stride = param.Strides()[0];

--- a/src/operators/kernel/cl/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDRELU_OP
+
+#include "operators/kernel/conv_add_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvAddReluKernel<GPU_CL, float>::Init(
+    FusionConvAddReluParam<GPU_CL> *param) {
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+  param->Bias()->InitCLImage(cl_helper_.CLContext(),
+                             this->cl_helper_.CLCommandQueue());
+
+  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
+               static_cast<int>(param->Paddings()[1]);
+  param->SetOffset(offset);
+
+  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
+    param->Filter()->InitNImage(cl_helper_.CLContext(),
+                                cl_helper_.CLCommandQueue());
+
+    this->cl_helper_.AddKernel("conv_1x1", "conv_add_relu_kernel.cl");
+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] == 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_relu_kernel.cl");
+
+  } else if (param->Filter()->dims()[2] == 3 &&
+             param->Filter()->dims()[3] == 3) {
+    param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+
+    this->cl_helper_.AddKernel("conv_3x3", "conv_add_relu_kernel.cl");
+
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  }
+
+  return true;
+}
+
+template <>
+void ConvAddReluKernel<GPU_CL, float>::Compute(
+    const FusionConvAddReluParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+  DLOG << "---yangfei30---";
+  DLOG << *param.Filter();
+  DLOG << param.Paddings();
+  auto biase = param.Bias()->GetCLImage();
+  auto output = param.Output()->GetCLImage();
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
+                    param.Input()->Converter())
+                    ->GetCBlock();
+  int dilation = param.Dilations()[0];
+
+  int input_width = param.Input()->dims()[3];
+  int input_height = param.Input()->dims()[2];
+  int output_width = param.Output()->dims()[3];
+  int output_height = param.Output()->dims()[2];
+
+  cl_int status;
+
+  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
+  CL_CHECK_ERRORS(status);
+
+  //  cl_event out_event = param.Output()->GetClEvent();
+  //  cl_event wait_event = param.Input()->GetClEvent();
+
+  status = clEnqueueNDRangeKernel(
+      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
+      default_work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+
+template class ConvAddReluKernel<GPU_CL, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif