From c5c2d2d24046aeda770ee3d2d307d78d6db4efaa Mon Sep 17 00:00:00 2001
From: yangfei
Date: Fri, 26 Oct 2018 10:23:05 +0800
Subject: [PATCH] implement conv_add_relu op kernel for GPU

---
 src/operators/fusion_conv_add_relu_op.cpp            |   4 +-
 src/operators/fusion_conv_add_relu_op.h              |  18 ++-
 .../kernel/cl/cl_kernel/conv_add_relu_kernel.cl      |  17 ++
 src/operators/kernel/cl/conv_add_kernel.cpp          |  10 +-
 src/operators/kernel/cl/conv_add_relu_kernel.cpp     | 150 ++++++++++++++++++
 5 files changed, 192 insertions(+), 7 deletions(-)
 create mode 100644 src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl
 create mode 100644 src/operators/kernel/cl/conv_add_relu_kernel.cpp

diff --git a/src/operators/fusion_conv_add_relu_op.cpp b/src/operators/fusion_conv_add_relu_op.cpp
index 99b770a6c5..6718bee051 100644
--- a/src/operators/fusion_conv_add_relu_op.cpp
+++ b/src/operators/fusion_conv_add_relu_op.cpp
@@ -56,5 +56,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp);
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp);
 #endif
-
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(fusion_conv_add_relu, ops::FusionConvAddReluOp);
+#endif
 #endif
diff --git a/src/operators/fusion_conv_add_relu_op.h b/src/operators/fusion_conv_add_relu_op.h
index 85d7d21637..ed0088b4ab 100644
--- a/src/operators/fusion_conv_add_relu_op.h
+++ b/src/operators/fusion_conv_add_relu_op.h
@@ -29,9 +29,8 @@ namespace operators {
 class FusionConvAddReluOpMatcher : public framework::FusionOpMatcher {
  public:
   FusionConvAddReluOpMatcher() {
-    node_ = framework::Node(G_OP_TYPE_CONV);
-    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
-        std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+    node_ = framework::Node(G_OP_TYPE_FUSION_CONV_ADD);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
   }
 
   void FolderNodes(
@@ -81,6 +80,15 @@ static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(
     new FusionConvAddReluOpMatcher());
 #endif
 
+#endif
+#ifdef PADDLE_MOBILE_CL
+
+#ifndef CONV_ADD_RELU_REGISTER
+#define CONV_ADD_RELU_REGISTER
+static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(
+    new FusionConvAddReluOpMatcher());
+#endif
+
 #endif
 
 }  // namespace operators
@@ -95,4 +103,8 @@ USE_OP_CPU(fusion_conv_add_relu);
 USE_OP_FPGA(fusion_conv_add_relu);
 #endif
 
+#ifdef PADDLE_MOBILE_CL
+USE_OP_CL(fusion_conv_add_relu);
+#endif
+
 #endif
diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl
new file mode 100644
index 0000000000..8d686c20df
--- /dev/null
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl
@@ -0,0 +1,17 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define BIASE
+#define RELU
+#include "conv_kernel.inc.cl"
diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp
index d8064fd50e..7e30c6d31d 100644
--- a/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -37,10 +37,11 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
                                 cl_helper_.CLCommandQueue());
 
     this->cl_helper_.AddKernel("conv_1x1", "conv_add_kernel.cl");
-  } else if (param->Filter()->dims()[1] == 1) {
-    param->Filter()->InitCLImage(cl_helper_.CLContext(),
+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] == 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
-
     this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_kernel.cl");
 
   } else if (param->Filter()->dims()[2] == 3 &&
@@ -67,6 +68,9 @@ void ConvAddKernel<GPU_CL, float>::Compute(
   int nh = default_work_size[2];
   auto input = param.Input()->GetCLImage();
   auto filter = param.Filter()->GetCLImage();
+  DLOG << "---yangfei30---";
+  DLOG << *param.Filter();
+  DLOG << param.Paddings();
   auto biase = param.Bias()->GetCLImage();
   auto output = param.Output()->GetCLImage();
   int stride = param.Strides()[0];
diff --git a/src/operators/kernel/cl/conv_add_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_relu_kernel.cpp
new file mode 100644
index 0000000000..814cff634c
--- /dev/null
+++ b/src/operators/kernel/cl/conv_add_relu_kernel.cpp
@@ -0,0 +1,150 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDRELU_OP
+
+#include "operators/kernel/conv_add_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvAddReluKernel<GPU_CL, float>::Init(
+    FusionConvAddReluParam<GPU_CL> *param) {
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+  param->Bias()->InitCLImage(cl_helper_.CLContext(),
+                             this->cl_helper_.CLCommandQueue());
+
+  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
+               static_cast<int>(param->Paddings()[1]);
+  param->SetOffset(offset);
+
+  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
+    param->Filter()->InitNImage(cl_helper_.CLContext(),
+                                cl_helper_.CLCommandQueue());
+
+    this->cl_helper_.AddKernel("conv_1x1", "conv_add_relu_kernel.cl");
+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] == 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_relu_kernel.cl");
+
+  } else if (param->Filter()->dims()[2] == 3 &&
+             param->Filter()->dims()[3] == 3) {
+    param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+
+    this->cl_helper_.AddKernel("conv_3x3", "conv_add_relu_kernel.cl");
+
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  }
+
+  return true;
+}
+
+template <>
+void ConvAddReluKernel<GPU_CL, float>::Compute(
+    const FusionConvAddReluParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+  DLOG << "---yangfei30---";
+  DLOG << *param.Filter();
+  DLOG << param.Paddings();
+  auto biase = param.Bias()->GetCLImage();
+  auto output = param.Output()->GetCLImage();
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
+                    param.Input()->Converter())
+                    ->GetCBlock();
+  int dilation = param.Dilations()[0];
+
+  int input_width = param.Input()->dims()[3];
+  int input_height = param.Input()->dims()[2];
+  int output_width = param.Output()->dims()[3];
+  int output_height = param.Output()->dims()[2];
+
+  cl_int status;
+
+  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
+  CL_CHECK_ERRORS(status);
+
+  //  cl_event out_event = param.Output()->GetClEvent();
+  //  cl_event wait_event = param.Input()->GetClEvent();
+
+  status = clEnqueueNDRangeKernel(
+      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
+      NULL, default_work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+
+template class ConvAddReluKernel<GPU_CL, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
-- 
GitLab
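
The notable trick in this patch is that the new conv_add_relu_kernel.cl contains no convolution code of its own: it defines the BIASE and RELU macros and then includes the shared conv_kernel.inc.cl, so the bias-add and activation epilogue of the convolution kernels is compiled in conditionally. That include file is not part of this patch, so the following is only a minimal sketch of what such a macro-gated epilogue typically looks like in OpenCL C; the kernel name, image arguments, and the stand-in accumulation line are illustrative assumptions, not the actual contents of conv_kernel.inc.cl.

// Hypothetical sketch of a macro-gated epilogue, in the spirit of
// conv_kernel.inc.cl; all names below are illustrative only.
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

__kernel void conv_sketch(__read_only image2d_t input,
                          __read_only image2d_t biase,
                          __write_only image2d_t output) {
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  int2 pos = (int2)(get_global_id(0), get_global_id(1));

  // Stand-in for the real convolution accumulation loop.
  half4 acc = read_imageh(input, sampler, pos);

#ifdef BIASE
  // Compiled in when the including file defines BIASE: fused
  // elementwise add, with the bias image supplying the addend.
  acc += read_imageh(biase, sampler, pos);
#endif

#ifdef RELU
  // Compiled in when the including file defines RELU: fused
  // activation, clamping negative values to zero.
  acc = max(acc, (half4)(0.0h));
#endif

  write_imageh(output, pos, acc);
}

Under that reading, the pre-existing conv_add_kernel.cl would presumably define only BIASE before the same include, letting a single kernel source serve the conv_add and conv_add_relu fusions without duplication.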