From 3643a46ca91697c34f776b1dc86cd1ef0c0e381a Mon Sep 17 00:00:00 2001 From: liuruilong Date: Thu, 11 Oct 2018 21:17:35 +0800 Subject: [PATCH] add depth wise conv kernel --- .../kernel/cl/cl_kernel/conv_kernel.inc.cl | 9 ++- .../kernel/cl/conv_add_bn_relu_kernel.cpp | 10 ++- src/operators/kernel/cl/conv_add_kernel.cpp | 18 +++-- src/operators/kernel/cl/conv_kernel.cpp | 9 +-- .../kernel/cl/depthwise_conv_kernel.cpp | 81 +++++++++++++++++++ 5 files changed, 112 insertions(+), 15 deletions(-) create mode 100644 src/operators/kernel/cl/depthwise_conv_kernel.cpp diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl index c0264fefdc..85a3d0b409 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -41,8 +41,11 @@ __kernel void conv_1x1(__private const int global_size_dim0, __private const int stride, __private const int offset, __private const int input_c, + __private const int dilation, __private const int input_width,/* of one block */ - __private const int input_height/* of one block */) { + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -112,7 +115,9 @@ __kernel void conv_3x3(__private const int global_size_dim0, __private const int input_c, __private const int dilation, __private const int input_width,/* of one block */ - __private const int input_height/* of one block */) { + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { int2 stride_xy = int2(stride, stride); int2 ouput_pos_in_one_block = int2(out_w, out_nh); int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + int2(offset, offset); diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp index e62041d3f4..fd846be802 100644 --- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -109,8 +109,11 @@ void ConvAddBNReluKernel::Compute( int stride = param.Strides()[0]; int offset = param.Offset(); int input_c = param.Input()->CBlock(); + int dilation = param.Dilations()[0]; int input_width = param.Input()->WidthOfOneBlock(); int input_height = param.Input()->HeightOfOneBlock(); + int output_width = param.Output()->WidthOfOneBlock(); + int output_height = param.Output()->HeightOfOneBlock(); clSetKernelArg(kernel, 0, sizeof(int), &c_block); clSetKernelArg(kernel, 1, sizeof(int), &w); @@ -124,8 +127,11 @@ void ConvAddBNReluKernel::Compute( clSetKernelArg(kernel, 9, sizeof(int), &stride); clSetKernelArg(kernel, 10, sizeof(int), &offset); clSetKernelArg(kernel, 11, sizeof(int), &input_c); - clSetKernelArg(kernel, 12, sizeof(int), &input_width); - clSetKernelArg(kernel, 13, sizeof(int), &input_height); + clSetKernelArg(kernel, 12, sizeof(int), &dilation); + clSetKernelArg(kernel, 13, sizeof(int), &input_width); + clSetKernelArg(kernel, 14, sizeof(int), &input_height); + clSetKernelArg(kernel, 15, sizeof(int), &output_width); + clSetKernelArg(kernel, 16, sizeof(int), &output_height); clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, default_work_size.data(), NULL, 0, NULL, NULL); diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp index 74de92e4c2..32466dc17d 100644 --- a/src/operators/kernel/cl/conv_add_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_kernel.cpp @@ -59,8 +59,11 @@ void ConvAddKernel::Compute( int stride = param.Strides()[0]; int offset = param.Offset(); int input_c = param.Input()->CBlock(); + int dilation = param.Dilations()[0]; int input_width = param.Input()->WidthOfOneBlock(); int input_height = param.Input()->HeightOfOneBlock(); + int output_width = param.Output()->WidthOfOneBlock(); + int output_height = param.Output()->HeightOfOneBlock(); clSetKernelArg(kernel, 0, sizeof(int), &c_block); clSetKernelArg(kernel, 1, sizeof(int), &w); @@ -68,12 +71,15 @@ void ConvAddKernel::Compute( clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); - clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); - clSetKernelArg(kernel, 9, sizeof(int), &stride); - clSetKernelArg(kernel, 10, sizeof(int), &offset); - clSetKernelArg(kernel, 11, sizeof(int), &input_c); - clSetKernelArg(kernel, 12, sizeof(int), &input_width); - clSetKernelArg(kernel, 13, sizeof(int), &input_height); + clSetKernelArg(kernel, 6, sizeof(cl_mem), &output); + clSetKernelArg(kernel, 7, sizeof(int), &stride); + clSetKernelArg(kernel, 8, sizeof(int), &offset); + clSetKernelArg(kernel, 9, sizeof(int), &input_c); + clSetKernelArg(kernel, 10, sizeof(int), &dilation); + clSetKernelArg(kernel, 11, sizeof(int), &input_width); + clSetKernelArg(kernel, 12, sizeof(int), &input_height); + clSetKernelArg(kernel, 13, sizeof(int), &output_width); + clSetKernelArg(kernel, 14, sizeof(int), &output_height); clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, default_work_size.data(), NULL, 0, NULL, NULL); diff --git a/src/operators/kernel/cl/conv_kernel.cpp b/src/operators/kernel/cl/conv_kernel.cpp index ec265b7992..6d62515b8a 100644 --- a/src/operators/kernel/cl/conv_kernel.cpp +++ b/src/operators/kernel/cl/conv_kernel.cpp @@ -60,6 +60,8 @@ void ConvKernel::Compute(const ConvParam ¶m) { int dilation = param.Dilations()[0]; int input_width = param.Input()->WidthOfOneBlock(); int input_height = param.Input()->HeightOfOneBlock(); + int output_width = param.Output()->WidthOfOneBlock(); + int output_height = param.Output()->HeightOfOneBlock(); clSetKernelArg(kernel, 0, sizeof(int), &c_block); clSetKernelArg(kernel, 1, sizeof(int), &w); @@ -73,14 +75,11 @@ void ConvKernel::Compute(const ConvParam ¶m) { clSetKernelArg(kernel, 9, sizeof(int), &dilation); clSetKernelArg(kernel, 10, sizeof(int), &input_width); clSetKernelArg(kernel, 11, sizeof(int), &input_height); + clSetKernelArg(kernel, 12, sizeof(int), &output_width); + clSetKernelArg(kernel, 13, sizeof(int), &output_height); clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, default_work_size.data(), NULL, 0, NULL, NULL); - - // auto kernel = this->cl_helper_.KernelAt(0); - // size_t global_work_size[3] = {1, 2, 3}; - // clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, - // global_work_size, NULL, 0, NULL, NULL); } template class ConvKernel; diff --git a/src/operators/kernel/cl/depthwise_conv_kernel.cpp b/src/operators/kernel/cl/depthwise_conv_kernel.cpp new file mode 100644 index 0000000000..73ab8d7e1e --- /dev/null +++ b/src/operators/kernel/cl/depthwise_conv_kernel.cpp @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef DEPTHWISECONV_OP + +#include "operators/kernel/depthwise_conv_kernel.h" +#include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DepthwiseConvKernel::Init(ConvParam *param) { + DLOG << " depthwise conv kernel init begin "; + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + param->SetOffset(offset); + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); + DLOG << " depthwise conv kernel init end "; + return true; +} + +template <> +void DepthwiseConvKernel::Compute(const ConvParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + auto output = param.Output(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = param.Input()->CBlock(); + int dilation = param.Dilations()[0]; + int input_width = param.Input()->WidthOfOneBlock(); + int input_height = param.Input()->HeightOfOneBlock(); + int output_width = param.Output()->WidthOfOneBlock(); + int output_height = param.Output()->HeightOfOneBlock(); + + clSetKernelArg(kernel, 0, sizeof(int), &c_block); + clSetKernelArg(kernel, 1, sizeof(int), &w); + clSetKernelArg(kernel, 2, sizeof(int), &nh); + clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); + clSetKernelArg(kernel, 6, sizeof(int), &stride); + clSetKernelArg(kernel, 7, sizeof(int), &offset); + clSetKernelArg(kernel, 8, sizeof(int), &input_c); + clSetKernelArg(kernel, 9, sizeof(int), &dilation); + clSetKernelArg(kernel, 10, sizeof(int), &input_width); + clSetKernelArg(kernel, 11, sizeof(int), &input_height); + clSetKernelArg(kernel, 12, sizeof(int), &output_width); + clSetKernelArg(kernel, 13, sizeof(int), &output_height); + + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); +} + +template class DepthwiseConvKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif \ No newline at end of file -- GitLab