diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl index c0264fefdc0cff0a449d62b5ad8dc581fe1d4600..85a3d0b4093d1803bf5afb25be1bbaee264bd9e3 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -41,8 +41,11 @@ __kernel void conv_1x1(__private const int global_size_dim0, __private const int stride, __private const int offset, __private const int input_c, + __private const int dilation, __private const int input_width,/* of one block */ - __private const int input_height/* of one block */) { + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -112,7 +115,9 @@ __kernel void conv_3x3(__private const int global_size_dim0, __private const int input_c, __private const int dilation, __private const int input_width,/* of one block */ - __private const int input_height/* of one block */) { + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { int2 stride_xy = int2(stride, stride); int2 ouput_pos_in_one_block = int2(out_w, out_nh); int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + int2(offset, offset); diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp index e62041d3f47aae8dbc9078d49beb84d45c2d9423..fd846be8024bd2742f6825f08993f17dfcd3509a 100644 --- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -109,8 +109,11 @@ void ConvAddBNReluKernel::Compute( int stride = param.Strides()[0]; int offset = param.Offset(); int input_c = param.Input()->CBlock(); + int dilation = param.Dilations()[0]; int input_width = param.Input()->WidthOfOneBlock(); int input_height = param.Input()->HeightOfOneBlock(); + int output_width = param.Output()->WidthOfOneBlock(); + int output_height = param.Output()->HeightOfOneBlock(); clSetKernelArg(kernel, 0, sizeof(int), &c_block); clSetKernelArg(kernel, 1, sizeof(int), &w); @@ -124,8 +127,11 @@ void ConvAddBNReluKernel::Compute( clSetKernelArg(kernel, 9, sizeof(int), &stride); clSetKernelArg(kernel, 10, sizeof(int), &offset); clSetKernelArg(kernel, 11, sizeof(int), &input_c); - clSetKernelArg(kernel, 12, sizeof(int), &input_width); - clSetKernelArg(kernel, 13, sizeof(int), &input_height); + clSetKernelArg(kernel, 12, sizeof(int), &dilation); + clSetKernelArg(kernel, 13, sizeof(int), &input_width); + clSetKernelArg(kernel, 14, sizeof(int), &input_height); + clSetKernelArg(kernel, 15, sizeof(int), &output_width); + clSetKernelArg(kernel, 16, sizeof(int), &output_height); clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, default_work_size.data(), NULL, 0, NULL, NULL); diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp index 74de92e4c28709a5fdffa99402b1214982475511..32466dc17d1a4720071f796d17ed2a08790aea8e 100644 --- a/src/operators/kernel/cl/conv_add_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_kernel.cpp @@ -59,8 +59,11 @@ void ConvAddKernel::Compute( int stride = param.Strides()[0]; int offset = param.Offset(); int input_c = param.Input()->CBlock(); + int dilation = param.Dilations()[0]; int input_width = param.Input()->WidthOfOneBlock(); int input_height = param.Input()->HeightOfOneBlock(); + int output_width = param.Output()->WidthOfOneBlock(); + int output_height = param.Output()->HeightOfOneBlock(); clSetKernelArg(kernel, 0, sizeof(int), &c_block); clSetKernelArg(kernel, 1, sizeof(int), &w); @@ -68,12 +71,15 @@ void ConvAddKernel::Compute( clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); - clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); - clSetKernelArg(kernel, 9, sizeof(int), &stride); - clSetKernelArg(kernel, 10, sizeof(int), &offset); - clSetKernelArg(kernel, 11, sizeof(int), &input_c); - clSetKernelArg(kernel, 12, sizeof(int), &input_width); - clSetKernelArg(kernel, 13, sizeof(int), &input_height); + clSetKernelArg(kernel, 6, sizeof(cl_mem), &output); + clSetKernelArg(kernel, 7, sizeof(int), &stride); + clSetKernelArg(kernel, 8, sizeof(int), &offset); + clSetKernelArg(kernel, 9, sizeof(int), &input_c); + clSetKernelArg(kernel, 10, sizeof(int), &dilation); + clSetKernelArg(kernel, 11, sizeof(int), &input_width); + clSetKernelArg(kernel, 12, sizeof(int), &input_height); + clSetKernelArg(kernel, 13, sizeof(int), &output_width); + clSetKernelArg(kernel, 14, sizeof(int), &output_height); clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, default_work_size.data(), NULL, 0, NULL, NULL); diff --git a/src/operators/kernel/cl/depthwise_conv_kernel.cpp b/src/operators/kernel/cl/depthwise_conv_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..73ab8d7e1e328f7577a32199d7a56fc1216d0d83 --- /dev/null +++ b/src/operators/kernel/cl/depthwise_conv_kernel.cpp @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef DEPTHWISECONV_OP + +#include "operators/kernel/depthwise_conv_kernel.h" +#include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DepthwiseConvKernel::Init(ConvParam *param) { + DLOG << " depthwise conv kernel init begin "; + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + param->SetOffset(offset); + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); + DLOG << " depthwise conv kernel init end "; + return true; +} + +template <> +void DepthwiseConvKernel::Compute(const ConvParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + auto output = param.Output(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = param.Input()->CBlock(); + int dilation = param.Dilations()[0]; + int input_width = param.Input()->WidthOfOneBlock(); + int input_height = param.Input()->HeightOfOneBlock(); + int output_width = param.Output()->WidthOfOneBlock(); + int output_height = param.Output()->HeightOfOneBlock(); + + clSetKernelArg(kernel, 0, sizeof(int), &c_block); + clSetKernelArg(kernel, 1, sizeof(int), &w); + clSetKernelArg(kernel, 2, sizeof(int), &nh); + clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); + clSetKernelArg(kernel, 6, sizeof(int), &stride); + clSetKernelArg(kernel, 7, sizeof(int), &offset); + clSetKernelArg(kernel, 8, sizeof(int), &input_c); + clSetKernelArg(kernel, 9, sizeof(int), &dilation); + clSetKernelArg(kernel, 10, sizeof(int), &input_width); + clSetKernelArg(kernel, 11, sizeof(int), &input_height); + clSetKernelArg(kernel, 12, sizeof(int), &output_width); + clSetKernelArg(kernel, 13, sizeof(int), &output_height); + + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); +} + +template class DepthwiseConvKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif \ No newline at end of file