diff --git a/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp b/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fff4693a39f348c7e76859624e735aac274d6e0b --- /dev/null +++ b/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp @@ -0,0 +1,211 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/cl/cl-kernel-func/conv_func.h" +#include "framework/cl/cl_image_converter.h" +#include "framework/cl/cl_tensor.h" + +namespace paddle_mobile { +namespace operators { + +template <> +void winograd_transform_weight<4, 3>(framework::CLHelper &cl_helper, + framework::CLImage &weight){}; + +template <> +void WinogradConv3x3<4, 3>(framework::CLHelper &cl_helper, + const ConvParam ¶m) {} + +void ConvAddBnRelu(framework::CLHelper &cl_helper, + const ConvParam ¶m, bool ifRelu, + const CLImage *biase, const CLImage *new_scale, + const CLImage *new_bias) { + auto kernel = cl_helper.KernelAt(0); + auto default_work_size = cl_helper.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + + auto output = param.Output()->GetCLImage(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = reinterpret_cast( + param.Input()->Converter()) + ->GetCBlock(); + int dilation = param.Dilations()[0]; + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; + + // DLOG << " c block " << c_block; + // DLOG << " w " << w; + // DLOG << " nh " << nh; + // DLOG << " stride " << stride; + // DLOG << " offset " << offset; + // DLOG << " input_c " << input_c; + // DLOG << " dilation " << dilation; + // DLOG << " input width " << input_width; + // DLOG << " input height " << input_height; + // DLOG << " output width " << output_width; + // DLOG << " output height " << output_height; + // DLOG << " input dim " << param.Input()->dims(); + // DLOG << " output dim " << param.Output()->dims(); + // DLOG << " filter dim " << param.Filter()->dims(); + + cl_int status; + int index = 0; + + if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) { + status = clSetKernelArg(kernel, index++, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + + int maped_w = maptofactor(w, 4); + status = clSetKernelArg(kernel, index++, sizeof(int), &maped_w); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + + if (biase) { + auto bias_mem = biase->GetCLImage(); + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem); + CL_CHECK_ERRORS(status); + } + + if (new_scale && new_bias) { + auto new_scale_mem = new_scale->GetCLImage(); + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem); + CL_CHECK_ERRORS(status); + + auto new_bias_mem = new_bias->GetCLImage(); + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem); + CL_CHECK_ERRORS(status); + } + + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &w); + CL_CHECK_ERRORS(status); + + const size_t work_size[3] = { + static_cast(default_work_size.data()[0]), + static_cast(maped_w), + static_cast(default_work_size.data()[2])}; + + status = clEnqueueNDRangeKernel(cl_helper.CLCommandQueue(), kernel, + default_work_size.size(), NULL, work_size, + NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else { + status = clSetKernelArg(kernel, index++, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &w); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + + if (biase) { + auto bias_mem = biase->GetCLImage(); + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem); + CL_CHECK_ERRORS(status); + } + + if (new_scale && new_bias) { + auto new_scale_mem = new_scale->GetCLImage(); + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem); + CL_CHECK_ERRORS(status); + + auto new_bias_mem = new_bias->GetCLImage(); + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem); + CL_CHECK_ERRORS(status); + } + + status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + + status = clEnqueueNDRangeKernel( + cl_helper.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } +} + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/cl/cl-kernel-func/conv_func.h b/src/operators/kernel/cl/cl-kernel-func/conv_func.h new file mode 100644 index 0000000000000000000000000000000000000000..2cecf353fba9ee3d2668719d6365e62be75786e1 --- /dev/null +++ b/src/operators/kernel/cl/cl-kernel-func/conv_func.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef CONV_OP + +#pragma once + +#include "framework/cl/cl_helper.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using namespace framework; + +inline int maptofactor(int i, int factor) { return (i + factor - 1) / factor; } + +template +void winograd_transform_weight(framework::CLHelper &cl_helper, + framework::CLImage &weight); + +template +void WinogradConv3x3(framework::CLHelper &cl_helper, + const ConvParam ¶m); + +void ConvAddBnRelu(framework::CLHelper &cl_helper, + const ConvParam ¶m, bool ifRelu = false, + const CLImage *biase = nullptr, + const CLImage *new_scale = nullptr, + const CLImage *new_bias = nullptr); + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl index a89c8abee70af2fa50808fe1c1a7af124097ed16..50925b4ed6b242e455cdafac973c8d511560fe53 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -2157,6 +2157,176 @@ __kernel void convBNAdd_1x1(__private const int global_size_dim0, write_imageh(output_image, output_pos, output); } +__kernel void convBNAdd_1x1_spl( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height, + __private const int old_w +) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int out_w0 = out_w; + int out_w1 = out_w + global_size_dim1; + int out_w2 = out_w + global_size_dim1 * 2; + int out_w3 = out_w + global_size_dim1 * 3; + +// int out_w1 = out_w + global_size_dim1; +// int out_w2 = out_w + global_size_dim1 * 2; +// int out_w3 = out_w + global_size_dim1 * 3; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 stride_xy = (int2)(stride, stride); + + int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh); + int2 in_pos_in_one_block0 = + ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh); + int2 in_pos_in_one_block1 = + ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh); + int2 in_pos_in_one_block2 = + ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh); + int2 in_pos_in_one_block3 = + ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); + + + half4 output0 = 0.0f; + half4 output1 = 0.0f; + half4 output2 = 0.0f; + half4 output3 = 0.0f; + + for (int i = 0; i < input_c; ++i) { + // ------------0--------------- + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); + half4 input0 = read_imageh(input_image, sampler, pos_in); + + half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); + half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); + half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); + half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); + + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(input0.w, weight3, output0); + + // -------------1-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); + half4 input1 = read_imageh(input_image, sampler, pos_in); + // + // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + + // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * + // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i + // * 4 + 3)); + + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(input1.w, weight3, output1); + + // -------------2-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); + half4 input2 = read_imageh(input_image, sampler, pos_in); + + // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + + // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * + // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i + // * 4 + 3)); + + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(input2.w, weight3, output2); + + // -------------3-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); + half4 input3 = read_imageh(input_image, sampler, pos_in); + + // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + + // 0)); half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + // + 1)); half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * + // 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i + // * 4 + 3)); + + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(input3.w, weight3, output3); + } + +#ifdef BATCH_NORM + output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + + output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + + output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + + output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef BIASE + output0= read_imageh(bias, sampler, (int2)(out_c, 0)); + output1 = read_imageh(bias, sampler, (int2)(out_c, 0)); + output2 = read_imageh(bias, sampler, (int2)(out_c, 0)); + output3 = read_imageh(bias, sampler, (int2)(out_c, 0)); +#endif + +#ifdef RELU + output0 = activation(output0); + output1 = activation(output1); + output2 = activation(output2); + output3 = activation(output3); +#endif + int outpos_main = mul24(out_c , old_w); + int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); + + if (out_w0 < old_w) { + write_imageh(output_image, output_pos0, output0); + } + int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); + if (out_w1 < old_w){ + write_imageh(output_image, output_pos1, output1); + } + + int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); + if (out_w2 < old_w){ + write_imageh(output_image, output_pos2, output2); + } + + int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh); + if (out_w3 < old_w){ + write_imageh(output_image, output_pos3, output3); + } +} + diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp index 122df2496c15d152b88168f09113daf7c5ae1d1f..1ca1cb20556f4728a14270c6dc23e3d14544b647 100644 --- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -18,10 +18,10 @@ limitations under the License. */ #include #include "framework/cl/cl_image.h" #include "framework/cl/cl_tool.h" +#include "operators/kernel/cl/cl-kernel-func/conv_func.h" namespace paddle_mobile { namespace operators { -bool optimise = true; template <> bool ConvAddBNReluKernel::Init( FusionConvAddBNReluParam *param) { @@ -139,11 +139,7 @@ bool ConvAddBNReluKernel::Init( if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { param->Filter()->InitNImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); - if (optimise) { - this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_bn_relu_kernel.cl"); - } else { - this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl"); - } + this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_bn_relu_kernel.cl"); DLOG << " conv add bn relu conv 1x1"; } else if (param->Filter()->dims()[1] == 1 && @@ -171,225 +167,8 @@ bool ConvAddBNReluKernel::Init( template <> void ConvAddBNReluKernel::Compute( const FusionConvAddBNReluParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - auto input = param.Input()->GetCLImage(); - auto filter = param.Filter()->GetCLImage(); - auto biase = param.Bias()->GetCLImage(); - auto new_scale = param.NewScale()->GetCLImage(); - auto new_bias = param.NewBias()->GetCLImage(); - auto output = param.Output()->GetCLImage(); - int stride = param.Strides()[0]; - int offset = param.Offset(); - int input_c = reinterpret_cast( - param.Input()->Converter()) - ->GetCBlock(); - int dilation = param.Dilations()[0]; - int input_width = param.Input()->dims()[3]; - int input_height = param.Input()->dims()[2]; - int output_width = param.Output()->dims()[3]; - int output_height = param.Output()->dims()[2]; - - // DLOG << " c block " << c_block; - // DLOG << " w " << w; - // DLOG << " nh " << nh; - // DLOG << " stride " << stride; - // DLOG << " offset " << offset; - // DLOG << " input_c " << input_c; - // DLOG << " dilation " << dilation; - // DLOG << " input width " << input_width; - // DLOG << " input height " << input_height; - // DLOG << " output width " << output_width; - // DLOG << " output height " << output_height; - // DLOG << " input dim " << param.Input()->dims(); - // DLOG << " output dim " << param.Output()->dims(); - // DLOG << " filter dim " << param.Filter()->dims(); - - cl_int status; - - if (optimise) { - if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) { - status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - - int maped_w = maptofactor(w, 4); - status = clSetKernelArg(kernel, 1, sizeof(int), &maped_w); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 2, sizeof(int), &nh); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 9, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 10, sizeof(int), &offset); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 11, sizeof(int), &input_c); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 12, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 13, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 14, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 15, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 17, sizeof(int), &w); - CL_CHECK_ERRORS(status); - - const size_t work_size[3] = { - static_cast(default_work_size.data()[0]), - static_cast(maped_w), - static_cast(default_work_size.data()[2])}; - - status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, - default_work_size.size(), NULL, work_size, - NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } else { - status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 1, sizeof(int), &w); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 2, sizeof(int), &nh); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 9, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 10, sizeof(int), &offset); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 11, sizeof(int), &input_c); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 12, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 13, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 14, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 15, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), - NULL, default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } - - } else { - status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 1, sizeof(int), &w); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 2, sizeof(int), &nh); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 9, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 10, sizeof(int), &offset); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 11, sizeof(int), &input_c); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 12, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 13, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 14, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 15, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), - NULL, default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } + ConvAddBnRelu(this->cl_helper_, param, true, param.Bias(), param.NewScale(), + param.NewBias()); } template class ConvAddBNReluKernel; diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp index 7286a22799d34325ccdb3919ac7f335f22371a46..975b8a9a4f7cada7cb2ad5e0af1720d775a8e380 100644 --- a/src/operators/kernel/cl/conv_add_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_kernel.cpp @@ -15,10 +15,10 @@ limitations under the License. */ #ifdef FUSION_CONVADD_OP #include "operators/kernel/conv_add_kernel.h" +#include "operators/kernel/cl/cl-kernel-func/conv_func.h" namespace paddle_mobile { namespace operators { -bool optimise_convadd = true; template <> bool ConvAddKernel::Init(FusionConvAddParam *param) { @@ -36,11 +36,7 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { param->Filter()->InitNImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); - if (optimise_convadd) { - this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_kernel.cl"); - } else { - this->cl_helper_.AddKernel("conv_1x1", "conv_add_kernel.cl"); - } + this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_kernel.cl"); } else if (param->Filter()->dims()[1] == 1 && param->Input()->dims()[1] == param->Output()->dims()[1] && param->Filter()->dims()[2] == 3) { @@ -73,143 +69,7 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { template <> void ConvAddKernel::Compute( const FusionConvAddParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - auto input = param.Input()->GetCLImage(); - auto filter = param.Filter()->GetCLImage(); - auto biase = param.Bias()->GetCLImage(); - param.Output()->InitEmptyImage(cl_helper_.CLContext(), - cl_helper_.CLCommandQueue(), - param.Output()->dims()); - auto output = param.Output()->GetCLImage(); - int stride = param.Strides()[0]; - int offset = param.Offset(); - int input_c = reinterpret_cast( - param.Input()->Converter()) - ->GetCBlock(); - int dilation = param.Dilations()[0]; - - int input_width = param.Input()->dims()[3]; - int input_height = param.Input()->dims()[2]; - int output_width = param.Output()->dims()[3]; - int output_height = param.Output()->dims()[2]; - - cl_int status; - - if (optimise_convadd && param.Filter()->dims()[2] == 1 && - param.Filter()->dims()[3] == 1) { - status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - - int maped_w = maptofactor(w, 4); - status = clSetKernelArg(kernel, 1, sizeof(int), &maped_w); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 2, sizeof(int), &nh); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 7, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 8, sizeof(int), &offset); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 9, sizeof(int), &input_c); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 10, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 11, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 12, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 13, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 15, sizeof(int), &w); - CL_CHECK_ERRORS(status); - - const size_t work_size[3] = { - static_cast(default_work_size.data()[0]), - static_cast(maped_w), - static_cast(default_work_size.data()[2])}; - - status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, - default_work_size.size(), NULL, work_size, - NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } else { - status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 1, sizeof(int), &w); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 2, sizeof(int), &nh); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 7, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 8, sizeof(int), &offset); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 9, sizeof(int), &input_c); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 10, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 11, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 12, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 13, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), - NULL, default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } + ConvAddBnRelu(this->cl_helper_, param, false, param.Bias()); } template class ConvAddKernel; diff --git a/src/operators/kernel/cl/conv_add_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_relu_kernel.cpp index 88de4ae2e308f2b55020c314d18551ebe8ae1ea7..6752ddac98e5e5e4d207e481708fc8131712e87d 100644 --- a/src/operators/kernel/cl/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_relu_kernel.cpp @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef FUSION_CONVADDRELU_OP #include "operators/kernel/conv_add_relu_kernel.h" +#include "operators/kernel/cl/cl-kernel-func/conv_func.h" namespace paddle_mobile { namespace operators { @@ -37,7 +38,7 @@ bool ConvAddReluKernel::Init( param->Filter()->InitNImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("conv_1x1", "conv_add_relu_kernel.cl"); + this->cl_helper_.AddKernel("conv_1x1_spl", "conv_add_relu_kernel.cl"); } else if (param->Filter()->dims()[1] == 1 && param->Input()->dims()[1] == param->Output()->dims()[1] && param->Filter()->dims()[2] == 3) { @@ -72,84 +73,7 @@ bool ConvAddReluKernel::Init( template <> void ConvAddReluKernel::Compute( const FusionConvAddReluParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - auto input = param.Input()->GetCLImage(); - auto filter = param.Filter()->GetCLImage(); - DLOG << "---yangfei30---"; - DLOG << *param.Filter(); - DLOG << param.Paddings(); - auto biase = param.Bias()->GetCLImage(); - auto output = param.Output()->GetCLImage(); - int stride = param.Strides()[0]; - int offset = param.Offset(); - int input_c = reinterpret_cast( - param.Input()->Converter()) - ->GetCBlock(); - int dilation = param.Dilations()[0]; - - int input_width = param.Input()->dims()[3]; - int input_height = param.Input()->dims()[2]; - int output_width = param.Output()->dims()[3]; - int output_height = param.Output()->dims()[2]; - - cl_int status; - - status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 1, sizeof(int), &w); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 2, sizeof(int), &nh); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 7, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 8, sizeof(int), &offset); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 9, sizeof(int), &input_c); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 10, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 11, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 12, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 13, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - - // cl_event out_event = param.Output()->GetClEvent(); - // cl_event wait_event = param.Input()->GetClEvent(); - - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); + ConvAddBnRelu(this->cl_helper_, param, true, param.Bias()); } template class ConvAddReluKernel; diff --git a/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp b/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp index b0a7f36ffb612d42b081ffd10c22f2ebc730db7c..93cd183a53eb34ca46fb8e49151db3c41e86384b 100644 --- a/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp +++ b/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp @@ -16,6 +16,7 @@ limitations under the License. */ #include "operators/kernel/conv_bn_add_relu_kernel.h" #include +#include "operators/kernel/cl/cl-kernel-func/conv_func.h" namespace paddle_mobile { namespace operators { @@ -102,7 +103,8 @@ bool ConvBNAddReluKernel::Init( if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { param->Filter()->InitNImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("convBNAdd_1x1", "conv_bn_add_relu_kernel.cl"); + this->cl_helper_.AddKernel("convBNAdd_1x1_spl", + "conv_bn_add_relu_kernel.cl"); DLOG << " conv bn add relu conv 1x1"; } else if (param->Filter()->dims()[1] == 1 && param->Input()->dims()[1] == param->Output()->dims()[1] && @@ -130,101 +132,8 @@ bool ConvBNAddReluKernel::Init( template <> void ConvBNAddReluKernel::Compute( const FusionConvBNAddReluParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - auto input = param.Input()->GetCLImage(); - auto filter = param.Filter()->GetCLImage(); - auto biase = param.Bias()->GetCLImage(); - auto new_scale = param.NewScale()->GetCLImage(); - auto new_bias = param.NewBias()->GetCLImage(); - auto output = param.Output()->GetCLImage(); - int stride = param.Strides()[0]; - int offset = param.Offset(); - int input_c = reinterpret_cast( - param.Input()->Converter()) - ->GetCBlock(); - int dilation = param.Dilations()[0]; - int input_width = param.Input()->dims()[3]; - int input_height = param.Input()->dims()[2]; - int output_width = param.Output()->dims()[3]; - int output_height = param.Output()->dims()[2]; - - // DLOG << " c block " << c_block; - // DLOG << " w " << w; - // DLOG << " nh " << nh; - // DLOG << " stride " << stride; - // DLOG << " offset " << offset; - // DLOG << " input_c " << input_c; - // DLOG << " dilation " << dilation; - // DLOG << " input width " << input_width; - // DLOG << " input height " << input_height; - // DLOG << " output width " << output_width; - // DLOG << " output height " << output_height; - // DLOG << " input dim " << *param.Input(); - // DLOG << " output dim " <<* param.Output(); - // DLOG << " filter dim " << *param.Filter(); - // DLOG<<*param.Bias(); - - cl_int status; - - status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 1, sizeof(int), &w); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 2, sizeof(int), &nh); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 9, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 10, sizeof(int), &offset); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 11, sizeof(int), &input_c); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 12, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 13, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 14, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 15, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); + ConvAddBnRelu(this->cl_helper_, param, true, param.Bias(), param.NewScale(), + param.NewBias()); } template class ConvBNAddReluKernel; diff --git a/src/operators/kernel/cl/conv_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_bn_relu_kernel.cpp index 12a01e28a7c5dce9150104576a62200e01c4267f..fefdb6ad037a2c99d3943a7eb1a86857ed893be3 100644 --- a/src/operators/kernel/cl/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/cl/conv_bn_relu_kernel.cpp @@ -16,6 +16,7 @@ limitations under the License. */ #include "operators/kernel/conv_bn_relu_kernel.h" #include +#include "operators/kernel/cl/cl-kernel-func/conv_func.h" namespace paddle_mobile { namespace operators { @@ -100,7 +101,7 @@ bool ConvBNReluKernel::Init( if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { param->Filter()->InitNImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("conv_1x1", "conv_bn_relu_kernel.cl"); + this->cl_helper_.AddKernel("conv_1x1_spl", "conv_bn_relu_kernel.cl"); DLOG << " conv bn relu conv 1x1"; } else if (param->Filter()->dims()[1] == 1 && param->Input()->dims()[1] == param->Output()->dims()[1] && @@ -126,81 +127,8 @@ bool ConvBNReluKernel::Init( template <> void ConvBNReluKernel::Compute( const FusionConvBNReluParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - auto input = param.Input()->GetCLImage(); - auto filter = param.Filter()->GetCLImage(); - auto new_scale = param.NewScale()->GetCLImage(); - auto new_bias = param.NewBias()->GetCLImage(); - auto output = param.Output()->GetCLImage(); - int stride = param.Strides()[0]; - int offset = param.Offset(); - int input_c = reinterpret_cast( - param.Input()->Converter()) - ->GetCBlock(); - int dilation = param.Dilations()[0]; - int input_width = param.Input()->dims()[3]; - int input_height = param.Input()->dims()[2]; - int output_width = param.Output()->dims()[3]; - int output_height = param.Output()->dims()[2]; - - cl_int status; - - status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 1, sizeof(int), &w); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 2, sizeof(int), &nh); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &new_scale); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_bias); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &output); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 8, sizeof(int), &stride); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 9, sizeof(int), &offset); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 10, sizeof(int), &input_c); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 11, sizeof(int), &dilation); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 12, sizeof(int), &input_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 13, sizeof(int), &input_height); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 14, sizeof(int), &output_width); - CL_CHECK_ERRORS(status); - - status = clSetKernelArg(kernel, 15, sizeof(int), &output_height); - CL_CHECK_ERRORS(status); - - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); + ConvAddBnRelu(this->cl_helper_, param, true, nullptr, param.NewScale(), + param.NewBias()); } template class ConvBNReluKernel; diff --git a/src/operators/kernel/cl/conv_kernel.cpp b/src/operators/kernel/cl/conv_kernel.cpp index 3ea6c8dd4633c3c006777c0de7ce8e2d6c2be94e..116c020039a81212bbd175dbdf9c51a593c0cbbb 100644 --- a/src/operators/kernel/cl/conv_kernel.cpp +++ b/src/operators/kernel/cl/conv_kernel.cpp @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef CONV_OP #include "operators/kernel/conv_kernel.h" +#include "operators/kernel/cl/cl-kernel-func/conv_func.h" namespace paddle_mobile { namespace operators { @@ -39,7 +40,7 @@ bool ConvKernel::Init(ConvParam *param) { if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { param->Filter()->InitNImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); - this->cl_helper_.AddKernel("conv_1x1", "conv_kernel.cl"); + this->cl_helper_.AddKernel("conv_1x1_spl", "conv_kernel.cl"); DLOG << "conv 1x1"; } else if (param->Filter()->dims()[1] == 1 && @@ -66,64 +67,7 @@ bool ConvKernel::Init(ConvParam *param) { template <> void ConvKernel::Compute(const ConvParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - auto input = param.Input()->GetCLImage(); - auto filter = param.Filter()->GetCLImage(); - auto output = param.Output()->GetCLImage(); - - int stride = param.Strides()[0]; - int offset = param.Offset(); - int input_c = reinterpret_cast( - param.Input()->Converter()) - ->GetCBlock(); - int dilation = param.Dilations()[0]; - - int input_width = param.Input()->dims()[3]; - int input_height = param.Input()->dims()[2]; - int output_width = param.Output()->dims()[3]; - int output_height = param.Output()->dims()[2]; - - cl_int status; - - DLOG << " begin set kernel arg "; - DLOG << " c block " << c_block; - DLOG << " w " << w; - DLOG << " nh " << nh; - DLOG << " stride " << stride; - DLOG << " offset " << offset; - DLOG << " input_c " << input_c; - DLOG << " dilation " << dilation; - DLOG << " input width " << input_width; - DLOG << " input height " << input_height; - DLOG << " output width " << output_width; - DLOG << " output height " << output_height; - - status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); - status = clSetKernelArg(kernel, 1, sizeof(int), &w); - status = clSetKernelArg(kernel, 2, sizeof(int), &nh); - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); - status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); - status = clSetKernelArg(kernel, 6, sizeof(int), &stride); - status = clSetKernelArg(kernel, 7, sizeof(int), &offset); - status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); - status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); - status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); - status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); - status = clSetKernelArg(kernel, 12, sizeof(int), &output_width); - status = clSetKernelArg(kernel, 13, sizeof(int), &output_height); - - // cl_event out_event = param.Output()->GetClEvent(); - // cl_event wait_event = param.Input()->GetClEvent(); - - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); + ConvAddBnRelu(this->cl_helper_, param); } template class ConvKernel; diff --git a/src/operators/kernel/conv_add_bn_relu_kernel.h b/src/operators/kernel/conv_add_bn_relu_kernel.h index 267ec4889b07fc105f250ecc6fe12ae9599bdb38..919c66106eda1159f14c40e768325f1f5dcf5ff6 100644 --- a/src/operators/kernel/conv_add_bn_relu_kernel.h +++ b/src/operators/kernel/conv_add_bn_relu_kernel.h @@ -36,9 +36,6 @@ class ConvAddBNReluKernel public: void Compute(const FusionConvAddBNReluParam ¶m); bool Init(FusionConvAddBNReluParam *param); - inline int maptofactor(int i, int factor) { - return (i + factor - 1) / factor; - } }; } // namespace operators diff --git a/src/operators/kernel/conv_add_kernel.h b/src/operators/kernel/conv_add_kernel.h index 3388c58585c994b3e7b71b5cd54b39e27dba27e0..fd3f279a7829a5803da6e08c0280435443425ad0 100644 --- a/src/operators/kernel/conv_add_kernel.h +++ b/src/operators/kernel/conv_add_kernel.h @@ -41,9 +41,6 @@ class ConvAddKernel public: void Compute(const FusionConvAddParam ¶m); bool Init(FusionConvAddParam *param); - inline int maptofactor(int i, int factor) { - return (i + factor - 1) / factor; - } }; } // namespace operators