From c5c2d2d24046aeda770ee3d2d307d78d6db4efaa Mon Sep 17 00:00:00 2001
From: yangfei
Date: Fri, 26 Oct 2018 10:23:05 +0800
Subject: [PATCH] implement conv_add_relu op kernel for GPU

---
 src/operators/fusion_conv_add_relu_op.cpp            |   4 +-
 src/operators/fusion_conv_add_relu_op.h              |  18 ++-
 .../kernel/cl/cl_kernel/conv_add_relu_kernel.cl      |  17 ++
 src/operators/kernel/cl/conv_add_kernel.cpp          |  10 +-
 src/operators/kernel/cl/conv_add_relu_kernel.cpp     | 150 ++++++++++++++++++
 5 files changed, 192 insertions(+), 7 deletions(-)
 create mode 100644 src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl
 create mode 100644 src/operators/kernel/cl/conv_add_relu_kernel.cpp

diff --git a/src/operators/fusion_conv_add_relu_op.cpp b/src/operators/fusion_conv_add_relu_op.cpp
index 99b770a6c5..6718bee051 100644
--- a/src/operators/fusion_conv_add_relu_op.cpp
+++ b/src/operators/fusion_conv_add_relu_op.cpp
@@ -56,5 +56,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp);
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp);
 #endif
-
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(fusion_conv_add_relu, ops::FusionConvAddReluOp);
+#endif
 #endif
diff --git a/src/operators/fusion_conv_add_relu_op.h b/src/operators/fusion_conv_add_relu_op.h
index 85d7d21637..ed0088b4ab 100644
--- a/src/operators/fusion_conv_add_relu_op.h
+++ b/src/operators/fusion_conv_add_relu_op.h
@@ -29,9 +29,8 @@ namespace operators {
 class FusionConvAddReluOpMatcher : public framework::FusionOpMatcher {
  public:
   FusionConvAddReluOpMatcher() {
-    node_ = framework::Node(G_OP_TYPE_CONV);
-    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
-        std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+    node_ = framework::Node(G_OP_TYPE_FUSION_CONV_ADD);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
   }
 
   void FolderNodes(
@@ -81,6 +80,15 @@ static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(
     new FusionConvAddReluOpMatcher());
 #endif
 
+#endif
+#ifdef PADDLE_MOBILE_CL
+
+#ifndef CONV_ADD_RELU_REGISTER
+#define CONV_ADD_RELU_REGISTER
+static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(
+    new FusionConvAddReluOpMatcher());
+#endif
+
 #endif
 
 }  // namespace operators
@@ -95,4 +103,8 @@ USE_OP_CPU(fusion_conv_add_relu);
 USE_OP_FPGA(fusion_conv_add_relu);
 #endif
 
+#ifdef PADDLE_MOBILE_CL
+USE_OP_CL(fusion_conv_add_relu);
+#endif
+
 #endif
diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl
new file mode 100644
index 0000000000..8d686c20df
--- /dev/null
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl
@@ -0,0 +1,17 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define BIASE
+#define RELU
+#include "conv_kernel.inc.cl"
diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp
index d8064fd50e..7e30c6d31d 100644
--- a/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -37,10 +37,11 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
                                 cl_helper_.CLCommandQueue());
 
     this->cl_helper_.AddKernel("conv_1x1", "conv_add_kernel.cl");
-  } else if (param->Filter()->dims()[1] == 1) {
-    param->Filter()->InitCLImage(cl_helper_.CLContext(),
+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] == 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
                                  cl_helper_.CLCommandQueue());
-
     this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_kernel.cl");
 
   } else if (param->Filter()->dims()[2] == 3 &&
@@ -67,6 +68,9 @@ void ConvAddKernel<GPU_CL, float>::Compute(
   int nh = default_work_size[2];
   auto input = param.Input()->GetCLImage();
   auto filter = param.Filter()->GetCLImage();
+  DLOG << "---yangfei30---";
+  DLOG << *param.Filter();
+  DLOG << param.Paddings();
   auto biase = param.Bias()->GetCLImage();
   auto output = param.Output()->GetCLImage();
   int stride = param.Strides()[0];
diff --git a/src/operators/kernel/cl/conv_add_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_relu_kernel.cpp
new file mode 100644
index 0000000000..814cff634c
--- /dev/null
+++ b/src/operators/kernel/cl/conv_add_relu_kernel.cpp
@@ -0,0 +1,150 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDRELU_OP
+
+#include "operators/kernel/conv_add_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvAddReluKernel<GPU_CL, float>::Init(
+    FusionConvAddReluParam<GPU_CL> *param) {
+  PADDLE_MOBILE_ENFORCE(
+      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+          param->Paddings()[0] == param->Paddings()[1],
+      "need equal");
+  param->Bias()->InitCLImage(cl_helper_.CLContext(),
+                             this->cl_helper_.CLCommandQueue());
+
+  int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
+               static_cast<int>(param->Paddings()[1]);
+  param->SetOffset(offset);
+
+  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
+    param->Filter()->InitNImage(cl_helper_.CLContext(),
+                                cl_helper_.CLCommandQueue());
+
+    this->cl_helper_.AddKernel("conv_1x1", "conv_add_relu_kernel.cl");
+  } else if (param->Filter()->dims()[1] == 1 &&
+             param->Input()->dims()[1] == param->Output()->dims()[1] &&
+             param->Filter()->dims()[2] == 3) {
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_relu_kernel.cl");
+
+  } else if (param->Filter()->dims()[2] == 3 &&
+             param->Filter()->dims()[3] == 3) {
+    param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+
+    this->cl_helper_.AddKernel("conv_3x3", "conv_add_relu_kernel.cl");
+
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  }
+
+  return true;
+}
+
+template <>
+void ConvAddReluKernel<GPU_CL, float>::Compute(
+    const FusionConvAddReluParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+  DLOG << "---yangfei30---";
+  DLOG << *param.Filter();
+  DLOG << param.Paddings();
+  auto biase = param.Bias()->GetCLImage();
+  auto output = param.Output()->GetCLImage();
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
+                    param.Input()->Converter())
+                    ->GetCBlock();
+  int dilation = param.Dilations()[0];
+
+  int input_width = param.Input()->dims()[3];
+  int input_height = param.Input()->dims()[2];
+  int output_width = param.Output()->dims()[3];
+  int output_height = param.Output()->dims()[2];
+
+  cl_int status;
+
+  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
+  CL_CHECK_ERRORS(status);
+
+  //  cl_event out_event = param.Output()->GetClEvent();
+  //  cl_event wait_event = param.Input()->GetClEvent();
+
+  status = clEnqueueNDRangeKernel(
+      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
+      NULL, default_work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+
+template class ConvAddReluKernel<GPU_CL, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
-- 
GitLab
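
The notable trick in this patch is that the new conv_add_relu_kernel.cl contains no convolution code of its own: it defines the BIASE and RELU macros and then includes the shared conv_kernel.inc.cl, so the bias-add and activation epilogue of the convolution kernels is compiled in conditionally. That include file is not part of this patch, so the following is only a minimal sketch of what such a macro-gated epilogue typically looks like in OpenCL C; the kernel name, image arguments, and the stand-in accumulation line are illustrative assumptions, not the actual contents of conv_kernel.inc.cl.

// Hypothetical sketch of a macro-gated epilogue, in the spirit of
// conv_kernel.inc.cl; all names below are illustrative only.
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

__kernel void conv_sketch(__read_only image2d_t input,
                          __read_only image2d_t biase,
                          __write_only image2d_t output) {
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  int2 pos = (int2)(get_global_id(0), get_global_id(1));

  // Stand-in for the real convolution accumulation loop.
  half4 acc = read_imageh(input, sampler, pos);

#ifdef BIASE
  // Compiled in when the including file defines BIASE: fused
  // elementwise add, with the bias image supplying the addend.
  acc += read_imageh(biase, sampler, pos);
#endif

#ifdef RELU
  // Compiled in when the including file defines RELU: fused
  // activation, clamping negative values to zero.
  acc = max(acc, (half4)(0.0h));
#endif

  write_imageh(output, pos, acc);
}

Under that reading, the pre-existing conv_add_kernel.cl would presumably define only BIASE before the same include, letting a single kernel source serve the conv_add and conv_add_relu fusions without duplication.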