diff --git a/src/operators/activation_op.cpp b/src/operators/activation_op.cpp
index 65a16dd5dc6f0636eb912380f2d0e3a08ebf5736..1eb7bb3121505693e2551adbf835cf0671e4df6b 100644
--- a/src/operators/activation_op.cpp
+++ b/src/operators/activation_op.cpp
@@ -85,10 +85,18 @@ REGISTER_OPERATOR_FPGA(tanh, ops::TanhOp);
 #endif
 #endif  // TANH_OP
 
+#ifdef PADDLE_MOBILE_CPU
 #ifdef LOG_OP
 REGISTER_OPERATOR_CPU(log, ops::LogOp);
 #endif  // LOG_OP
+#endif
 
 #ifdef LEAKY_RELU_OP
+#ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(leaky_relu, ops::LeakyReluOp);
 #endif  // LEAKY_RELU_OP
+
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(leaky_relu, ops::LeakyReluOp);
+#endif
+#endif
diff --git a/src/operators/concat_op.cpp b/src/operators/concat_op.cpp
index 10ea7cb1c214892d54ff9eab605008fdc7eca686..622234c952f4b5046f3c8b9b3691a657cc0e51c0 100644
--- a/src/operators/concat_op.cpp
+++ b/src/operators/concat_op.cpp
@@ -32,7 +32,9 @@ void ConcatOp<Dtype, T>::InferShape() const {
     inputs_dims.push_back(inputs[i]->dims());
   }
 
-  auto axis = static_cast<int>(this->param_.Axis());
+  auto axis = static_cast<int>(this->param_.Axis()) -
+              (this->param_.original_output_dims_size_ -
+               this->param_.Out()->dims().size());
 
   if (n == 1) {
     DLOG << "Warning: concat op have only one input, "
diff --git a/src/operators/exp_op.cpp b/src/operators/exp_op.cpp
index 72ec69ce52eea85bc0ebc9ad53b9ad240f0f9da1..549108d72e6d5ab65b803870b6994b43b5a2f1db 100644
--- a/src/operators/exp_op.cpp
+++ b/src/operators/exp_op.cpp
@@ -30,7 +30,7 @@ REGISTER_OPERATOR_CPU(exp, ops::EXPOp);
 #endif
 
 #ifdef PADDLE_MOBILE_CL
-// REGISTER_OPERATOR_CL(exp, ops::EXPOp);
+REGISTER_OPERATOR_CL(exp, ops::EXPOp);
 #endif
 
 #endif
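The concat_op.cpp change above remaps the concat axis whenever the GPU path has folded leading unit dimensions out of the output: the axis attribute still refers to the original rank, so it is shifted left by original_output_dims_size_ minus the current rank. A minimal, self-contained illustration of that arithmetic follows; remap_concat_axis is a hypothetical helper written only for this note, not code from the repository.

#include <cassert>

// Hypothetical helper: not part of paddle-mobile, just the axis arithmetic
// used in ConcatOp<Dtype, T>::InferShape() above.
int remap_concat_axis(int axis_attr, int original_rank, int current_rank) {
  // Every leading dimension dropped when trimming to 4-D shifts the axis left.
  return axis_attr - (original_rank - current_rank);
}

int main() {
  // e.g. a 5-D output [1, 1, C, H, W] trimmed to [1, C, H, W]:
  // the stored axis attribute 2 now addresses dimension 1.
  assert(remap_concat_axis(2, 5, 4) == 1);
  return 0;
}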
diff --git a/src/operators/kernel/cl/cl_kernel/concat_kernel.cl b/src/operators/kernel/cl/cl_kernel/concat_kernel.cl
index 20cf7b4c48db4191a2bc95b0d952fbaf0ea1dc18..3117770398b515b124f9e0265d13b7d117c3e114 100644
--- a/src/operators/kernel/cl/cl_kernel/concat_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/concat_kernel.cl
@@ -14,116 +14,55 @@ limitations under the License. */
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-__kernel void concatByC0(__read_only image2d_t input_image,
-                         __write_only image2d_t output_image,
-                         __private const int out_W) {
-
-  const int in_c = get_global_id(0);
-  const int in_w = get_global_id(1);
-  const int in_nh = get_global_id(2);
-
-  int2 input_pos ;
-  input_pos.x = in_c * out_W + in_w;
-  input_pos.y = in_nh;
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                            CLK_ADDRESS_CLAMP |
-                            CLK_FILTER_NEAREST;
-  half4 input;
-  input = read_imageh(input_image, sampler,input_pos);
-
-  write_imageh(output_image, input_pos, input);
+__kernel void concatByCWith2Inputs(__read_only image2d_t input_image_0,
+                                   __read_only image2d_t input_image_1,
+                                   __private const int C_0,
+                                   __private const int C_1,
+                                   __write_only image2d_t output_image,
+                                   __private const int out_C,
+                                   __private const int out_W) {
+//  const int in_c = get_global_id(0);
+//  const int in_w = get_global_id(1);
+//  const int in_nh = get_global_id(2);
+//
+//  int2 input_pos ;
+//  input_pos.x = in_c * out_W + in_w;
+//  input_pos.y = in_nh;
+//  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+//                            CLK_ADDRESS_CLAMP |
+//                            CLK_FILTER_NEAREST;
+//  half4 input;
+//  input = read_imageh(input_image, sampler,input_pos);
+//
+//  write_imageh(output_image, input_pos, input);
 }
 
-__kernel void concatByC(__read_only image2d_t input_image1,
-                        __read_only image2d_t input_image2,
-                        __write_only image2d_t output_image,
-                        __private const int out_C,
-                        __private const int out_H,
-                        __private const int out_W,
-                        __private const int out_C_Start,
-                        __private const int in_W,
-                        __private const int in_H,
-                        __private const int in_C1,
-                        __private const int in_C2) {
-
-  const int in_c = get_global_id(0);
-  const int in_w = get_global_id(1);
-  const int in_nh = get_global_id(2);
-  int out_c1 = (out_C_Start + 3)/4 -1 + in_c;
-
-  int out_c2 = out_c1 + 1;
-
-  int2 output_pos1;
-  int2 output_pos2;
-
-  output_pos1.x = out_c1 * out_W + in_w;
-  output_pos1.y = in_nh;
-
-  output_pos2.x = out_c2 * out_W + in_w;
-  output_pos2.y = in_nh;
-
-  int2 input_pos1;
-  if(in_c==0){
-    input_pos1.x = ((in_C1 + 3)/4-1) * in_W + in_w;
-  }else{
-    input_pos1.x = (in_c - 1) * in_W + in_w;
-  }
-
-  input_pos1.y = in_nh;
-
-  int2 input_pos2;
-  input_pos2.x = in_c * in_W + in_w;
-  input_pos2.y = in_nh;
-
-  half4 output1;
-  half4 output2;
-  half4 input1;
-  half4 input2;
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                            CLK_ADDRESS_CLAMP |
-                            CLK_FILTER_NEAREST;
-  if(in_c==0){
-    input1 = read_imageh(input_image1, sampler,input_pos1);
-
-  }else {
-    input1 = read_imageh(input_image2, sampler,input_pos1);
-  }
-  input2 = read_imageh(input_image2, sampler,input_pos2);
-  output1 = input1;
-
-  if(out_C_Start%4==0){
-    output2 = input2;
-
-  }else if(out_C_Start%4==1){
-    output1.y = input2.x;
-    output1.z = input2.y;
-    output1.w = input2.z;
-    output2.x = input2.w;
-    output2.y = 0.0f;
-    output2.z = 0.0f;
-    output2.w = 0.0f;
-
-  }else if(out_C_Start%4==2){
-    output1.z = input2.x;
-    output1.w = input2.y;
-    output2.x = input2.z;
-    output2.y = input2.w;
-    output2.z = 0.0f;
-    output2.w = 0.0f;
-
-  }else if(out_C_Start%4==3){
-    output1.w = input2.x;
-    output2.x = input2.y;
-    output2.y = input2.z;
-    output2.z = input2.w;
-    output2.w = 0.0f;
-  }
-  write_imageh(output_image, output_pos1, output1);
-  write_imageh(output_image, output_pos2, output2);
+__kernel void concatByCWith3Inputs(__read_only image2d_t input_image_0,
+                                   __read_only image2d_t input_image_1,
+                                   __read_only image2d_t input_image_2,
+                                   __private const int C_0,
+                                   __private const int C_1,
+                                   __private const int C_2,
+                                   __write_only image2d_t output_image,
+                                   __private const int out_C,
+                                   __private const int out_W) {
+//  const int in_c = get_global_id(0);
+//  const int in_w = get_global_id(1);
+//  const int in_nh = get_global_id(2);
+//
+//  int2 input_pos ;
+//  input_pos.x = in_c * out_W + in_w;
+//  input_pos.y = in_nh;
+//  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+//                            CLK_ADDRESS_CLAMP |
+//                            CLK_FILTER_NEAREST;
+//  half4 input;
+//  input = read_imageh(input_image, sampler,input_pos);
+//
+//  write_imageh(output_image, input_pos, input);
 }
-
 __kernel void concatByH(__read_only image2d_t input_image,
                         __write_only image2d_t output_image,
                         __private const int out_W,
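The two new concatByCWith*Inputs kernels land with their bodies commented out, so as written they never write the output image. A possible body for the two-input case is sketched below, under the simplifying assumption that C_0 is a multiple of 4, so a half4 channel block never straddles both inputs; the kernel name and that assumption are the sketch's own, not the patch's. The general case (C_0 % 4 != 0) would need the per-component shuffling that the removed concatByC kernel performed.

// Sketch only: one possible body for a two-input concat along channels,
// assuming C_0 % 4 == 0 so each half4 block comes wholly from one input.
__kernel void concat_by_c_with_2_inputs_sketch(__read_only image2d_t input_image_0,
                                               __read_only image2d_t input_image_1,
                                               __private const int C_0,
                                               __private const int C_1,
                                               __write_only image2d_t output_image,
                                               __private const int out_C,
                                               __private const int out_W) {
  const int out_c_block = get_global_id(0);  // which half4 channel block
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  int2 output_pos = (int2)(out_c_block * out_W + out_w, out_nh);

  half4 pixel;
  if (out_c_block * 4 < C_0) {
    // Block comes entirely from the first input (same W, so same x layout).
    int2 input_pos = (int2)(out_c_block * out_W + out_w, out_nh);
    pixel = read_imageh(input_image_0, sampler, input_pos);
  } else {
    // Block comes entirely from the second input, shifted by C_0 / 4 blocks.
    int in_c_block = out_c_block - C_0 / 4;
    int2 input_pos = (int2)(in_c_block * out_W + out_w, out_nh);
    pixel = read_imageh(input_image_1, sampler, input_pos);
  }
  write_imageh(output_image, output_pos, pixel);
}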
diff --git a/src/operators/kernel/cl/cl_kernel/exp_kernel.cl b/src/operators/kernel/cl/cl_kernel/exp_kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..2227aaab47f3acef171d8a92a9b994f401d497a3
--- /dev/null
+++ b/src/operators/kernel/cl/cl_kernel/exp_kernel.cl
@@ -0,0 +1,34 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
+
+__kernel void exp_impl(__read_only image2d_t input, __write_only image2d_t output) {
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+
+  half4 in = read_imageh(input, sampler, (int2)(x, y));
+  half4 out;
+  out.x = pow(2.71828182, (float)(in.x));
+  out.y = pow(2.71828182, (float)(in.y));
+  out.z = pow(2.71828182, (float)(in.z));
+  out.w = pow(2.71828182, (float)(in.w));
+  write_imageh(output, (int2)(x, y), out);
+}
diff --git a/src/operators/kernel/cl/cl_kernel/leakyrelu_kernel.cl b/src/operators/kernel/cl/cl_kernel/leakyrelu_kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..5198158f2581a735b75f65175a5afa7c47b18fc0
--- /dev/null
+++ b/src/operators/kernel/cl/cl_kernel/leakyrelu_kernel.cl
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+__kernel void leakyrelu(__read_only image2d_t input,
+                        __write_only image2d_t output, __private const float alpha, __private const int dims_w) {
+  const int c = get_global_id(0);
+  const int w = get_global_id(1);
+  const int nh = get_global_id(2);
+  int2 input_pos;
+  input_pos.x = c * dims_w + w;
+  input_pos.y = nh;
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+
+  half4 in = read_imageh(input, sampler, (int2)(input_pos.x, input_pos.y));
+
+  half4 output_data;
+  output_data.x = max((float)(in.x), (float)(alpha * (in.x)));
+  output_data.y = max((float)(in.y), (float)(alpha * (in.y)));
+  output_data.z = max((float)(in.z), (float)(alpha * (in.z)));
+  output_data.w = max((float)(in.w), (float)(alpha * (in.w)));
+
+  write_imageh(output, (int2)(input_pos.x, input_pos.y), output_data);
+}
diff --git a/src/operators/kernel/cl/cl_kernel/nearest_interp_kernel.cl b/src/operators/kernel/cl/cl_kernel/nearest_interp_kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..c8283e24cbd6b68054bda0590f7e09f084c05148
--- /dev/null
+++ b/src/operators/kernel/cl/cl_kernel/nearest_interp_kernel.cl
@@ -0,0 +1,33 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+__kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t output,
+                             __private const float scale_h, __private const float scale_w,
+                             __private const int dims_w){
+  const int c = get_global_id(0);
+  const int w = get_global_id(1);
+  const int nh = get_global_id(2);
+  int2 output_pos;
+  output_pos.x = c * dims_w + w;
+  output_pos.y = nh;
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+  // uint x = (uint)(output_pos.x / scale_w);
+  // uint y = (uint)(output_pos.y / scale_h);
+  // half4 input_data = read_imageh(input, sampler, (int2)(x, y));
+  // write_imageh(output, (int2)(output_pos.x , output_pos.y ), input_data);
+}
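Like the concat kernels, nearest_interp ships with its body commented out, so it currently writes nothing. The commented code also scales the packed x coordinate (c * dims_w + w) as a whole, which mixes the channel-block index into the width scaling. The sketch below is one way the body could be completed; it is illustrative only, assumes a batch of 1 for the packed nh coordinate, and adds a hypothetical in_dims_w argument (input width per channel block) that the patch does not currently pass.

// Sketch only, not the patch's kernel: unpack (channel block, width) before
// applying the nearest-neighbour scaling, then repack for the image reads.
__kernel void nearest_interp_sketch(__read_only image2d_t input,
                                    __write_only image2d_t output,
                                    __private const float scale_h,
                                    __private const float scale_w,
                                    __private const int out_dims_w,
                                    __private const int in_dims_w) {
  const int c = get_global_id(0);
  const int w = get_global_id(1);
  const int nh = get_global_id(2);   // batch folded with height; treated as batch 1 here

  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  int in_w = (int)((float)w / scale_w);    // nearest source column
  int in_nh = (int)((float)nh / scale_h);  // nearest source row

  int2 input_pos = (int2)(c * in_dims_w + in_w, in_nh);
  int2 output_pos = (int2)(c * out_dims_w + w, nh);

  half4 input_data = read_imageh(input, sampler, input_pos);
  write_imageh(output, output_pos, input_data);
}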
diff --git a/src/operators/kernel/cl/cl_kernel/sigmoid.cl b/src/operators/kernel/cl/cl_kernel/sigmoid.cl
index d6dd9cfae27a3b8d010ba9449dd28e4a7e5c8335..0a1995d42caad5fcaa2ddde340e575cdb5074a39 100644
--- a/src/operators/kernel/cl/cl_kernel/sigmoid.cl
+++ b/src/operators/kernel/cl/cl_kernel/sigmoid.cl
@@ -17,14 +17,18 @@ limitations under the License. */
 
 __kernel void sigmoid(__read_only image2d_t input,
                       __write_only image2d_t output){
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
-                              CLK_ADDRESS_CLAMP |
-                              CLK_FILTER_NEAREST;
-
-    half4 in = read_imageh(input, sampler, (int2)(x, y));
-    in = 1.0f / (1 + exp(-in));
-    write_imageh(output, (int2)(x, y), in);
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+
+  half4 in = read_imageh(input, sampler, (int2)(x, y));
+  half4 out;
+  out.x = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.x)));
+  out.y = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.y)));
+  out.z = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.z)));
+  out.w = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.w)));
+  write_imageh(output, (int2)(x, y), out);
 }
\ No newline at end of file
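Both exp_kernel.cl and the reworked sigmoid.cl compute e^x as pow(2.71828182, x) on scalar components. OpenCL also provides a component-wise exp() built-in, which the old sigmoid body used on the half4 directly; the patch may be avoiding it deliberately (for example because of a driver issue). For reference only, an equivalent kernel using the built-in could look like the sketch below, assuming cl_khr_fp16 is enabled as in the other kernels in this patch; this is an alternative, not the patch's code.

// Alternative sketch: built-in, per-component exp() in float precision.
__kernel void sigmoid_builtin_exp(__read_only image2d_t input,
                                  __write_only image2d_t output) {
  const int x = get_global_id(0);
  const int y = get_global_id(1);

  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  // Work in float4 for the transcendental, then convert back for the write.
  float4 in = convert_float4(read_imageh(input, sampler, (int2)(x, y)));
  float4 out = 1.0f / (1.0f + exp(-in));  // exp() is applied per component
  write_imageh(output, (int2)(x, y), convert_half4(out));
}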
diff --git a/src/operators/kernel/cl/cl_kernel/slice_kernel.cl b/src/operators/kernel/cl/cl_kernel/slice_kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..aab8357d824c48ac3cab748b7f7159ed5260f1d2
--- /dev/null
+++ b/src/operators/kernel/cl/cl_kernel/slice_kernel.cl
@@ -0,0 +1,77 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+__kernel void slice(__read_only image2d_t input, __write_only image2d_t output,
+                    __private const int start, __private const int end,
+                    __private const int dims_w){
+
+  const int c = get_global_id(0);
+  const int w = get_global_id(1);
+  const int nh = get_global_id(2);
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+
+  int2 output_pos;
+  output_pos.x = c * dims_w + w;
+  output_pos.y = nh;
+
+  int2 input_pos;
+  half4 input_data;
+  half4 output_data;
+
+  if (start % 4 == 0) {
+    input_pos.x = (4 * c + start) / 4 * dims_w + w;
+    input_pos.y = nh;
+    input_data = read_imageh(input, sampler, input_pos);
+    output_data = input_data;
+  } else if (start % 4 == 1) {
+    input_pos.x = (4 * c + start) / 4 * dims_w + w;
+    input_pos.y = nh;
+    input_data = read_imageh(input, sampler, input_pos);
+    output_data.x = input_data.y;
+    output_data.y = input_data.z;
+    output_data.z = input_data.w;
+    input_pos.x = input_pos.x + dims_w;
+    input_pos.y = nh;
+    input_data = read_imageh(input, sampler, input_pos);
+    output_data.w = input_data.x;
+  } else if (start % 4 == 2) {
+    input_pos.x = (4 * c + start) / 4 * dims_w + w;
+    input_pos.y = nh;
+    input_data = read_imageh(input, sampler, input_pos);
+    output_data.x = input_data.z;
+    output_data.y = input_data.w;
+    input_pos.x = input_pos.x + dims_w;
+    input_pos.y = nh;
+    input_data = read_imageh(input, sampler, input_pos);
+    output_data.z = input_data.x;
+    output_data.w = input_data.y;
+  } else if (start % 4 == 3) {
+    input_pos.x = (4 * c + start) / 4 * dims_w + w;
+    input_pos.y = nh;
+    input_data = read_imageh(input, sampler, input_pos);
+    output_data.x = input_data.w;
+    input_pos.x = input_pos.x + dims_w;
+    input_pos.y = nh;
+    input_data = read_imageh(input, sampler, input_pos);
+    output_data.y = input_data.x;
+    output_data.z = input_data.y;
+    output_data.w = input_data.z;
+  }
+  write_imageh(output, output_pos, output_data);
+
+}
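The slice kernel's four start % 4 branches all implement the same mapping: output channel-block c, component k holds input channel start + 4*c + k, which lives in input block (start + 4*c + k) / 4 at component (start + 4*c + k) % 4; when start is not a multiple of 4 the four components straddle two input blocks, hence the second read. The tiny host-side program below just prints that mapping for a worked case (start = 2); it is illustrative only and not part of the repository.

#include <cstdio>

int main() {
  const int start = 2;  // e.g. slicing channels [2, 6) out of an 8-channel image
  for (int c = 0; c < 1; ++c) {        // first output channel block
    for (int k = 0; k < 4; ++k) {      // its four half4 components
      int src_channel = start + 4 * c + k;
      std::printf("out block %d comp %d <- in block %d comp %d\n",
                  c, k, src_channel / 4, src_channel % 4);
    }
  }
  return 0;
}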
diff --git a/src/operators/kernel/cl/concat_kernel.cpp b/src/operators/kernel/cl/concat_kernel.cpp
index c8ff448b3be79c1acfac7e8cd4e32ea4e3c2b3f5..9db04e74dc54a0bb66e779c2092cbd900238f169 100644
--- a/src/operators/kernel/cl/concat_kernel.cpp
+++ b/src/operators/kernel/cl/concat_kernel.cpp
@@ -23,16 +23,20 @@ template <>
 bool ConcatKernel<GPU_CL, float>::Init(ConcatParam<GPU_CL> *param) {
   if (param->Out()->dims().size() < 4) {
     this->cl_helper_.AddKernel("concatByH", "concat_kernel.cl");
-  } else if (param->Out()->dims().size() == 4) {
-    this->cl_helper_.AddKernel("concatByC0", "concat_kernel.cl");
-    this->cl_helper_.AddKernel("concatByC", "concat_kernel.cl");
+  } else if (param->Out()->dims().size() >= 4) {
+    if (param->Inputs().size() == 2) {
+      this->cl_helper_.AddKernel("concatByCWith2Inputs", "concat_kernel.cl");
+    } else if (param->Inputs().size() == 3) {
+      this->cl_helper_.AddKernel("concatByCWith3Inputs", "concat_kernel.cl");
+    } else {
+      return false;
+    }
   }
   return true;
 }
 
 template <>
 void ConcatKernel<GPU_CL, float>::Compute(const ConcatParam<GPU_CL> &param) {
-  DLOG << "yangfei50";
   DLOG << param.Out()->dims();
   if (param.Out()->dims().size() < 4) {
     auto kernel = this->cl_helper_.KernelAt(0);
@@ -69,74 +73,60 @@ void ConcatKernel<GPU_CL, float>::Compute(const ConcatParam<GPU_CL> &param) {
     }
   } else {
     auto kernel0 = this->cl_helper_.KernelAt(0);
-    auto kernel1 = this->cl_helper_.KernelAt(1);
+    auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
     auto inputs = param.Inputs();
-    auto *output_image = param.Out()->GetCLImage();
-
-    int out_C_Start = 0;
-    auto input_image = inputs[0]->GetCLImage();
-    auto default_work_size = this->cl_helper_.DefaultWorkSize(*inputs[0]);
-    int out_W = param.Out()->dims()[3];
+    int arg_offset;
     cl_int status;
-    status = clSetKernelArg(kernel0, 0, sizeof(cl_mem), &input_image);
-    CL_CHECK_ERRORS(status);
-    status = clSetKernelArg(kernel0, 1, sizeof(cl_mem), &output_image);
-    CL_CHECK_ERRORS(status);
-    status = clSetKernelArg(kernel0, 2, sizeof(int), &out_W);
-    CL_CHECK_ERRORS(status);
-    status = clEnqueueNDRangeKernel(
-        this->cl_helper_.CLCommandQueue(), kernel0, default_work_size.size(),
-        NULL, default_work_size.data(), NULL, 0, NULL, NULL);
-    CL_CHECK_ERRORS(status);
-    out_C_Start += inputs[0]->dims()[1];
-    for (int i = 1; i < inputs.size(); i++) {
-      auto input_image1 = inputs[i - 1]->GetCLImage();
-      auto input_image2 = inputs[i]->GetCLImage();
-      default_work_size = this->cl_helper_.DefaultWorkSize(*inputs[i]);
-      int out_C = param.Out()->dims()[1];
-      int out_H = param.Out()->dims()[2];
-      int in_W = inputs[i]->dims()[3];
-      int in_H = inputs[i]->dims()[2];
-      int in_C1 = inputs[i - 1]->dims()[1];
-      int in_C2 = inputs[i]->dims()[1];
-      DLOG << "第" << i << "个";
-      DLOG << "out_C=" << out_C;
-      DLOG << "out_H=" << out_H;
-      DLOG << "in_W=" << in_W;
-      DLOG << "in_H=" << in_H;
-      DLOG << "in_C1=" << in_C1;
-      DLOG << "in_C2=" << in_C2;
-      DLOG << "out_C_Start = " << out_C_Start;
-      status = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &input_image1);
+    if (inputs.size() == 2) {
+      auto input_image_0 = inputs[0]->GetCLImage();
+      status = clSetKernelArg(kernel0, 0, sizeof(cl_mem), &input_image_0);
       CL_CHECK_ERRORS(status);
-      status = clSetKernelArg(kernel1, 1, sizeof(cl_mem), &input_image2);
+      auto input_image_1 = inputs[1]->GetCLImage();
+      status = clSetKernelArg(kernel0, 1, sizeof(cl_mem), &input_image_1);
       CL_CHECK_ERRORS(status);
-      status = clSetKernelArg(kernel1, 2, sizeof(cl_mem), &output_image);
+      int C_0 = inputs[0]->dims()[1];
+      status = clSetKernelArg(kernel0, 2, sizeof(int), &C_0);
       CL_CHECK_ERRORS(status);
-      status = clSetKernelArg(kernel1, 3, sizeof(int), &out_C);
+      int C_1 = inputs[1]->dims()[1];
+      status = clSetKernelArg(kernel0, 3, sizeof(int), &C_1);
       CL_CHECK_ERRORS(status);
-      status = clSetKernelArg(kernel1, 4, sizeof(int), &out_H);
+      arg_offset = 4;
+    } else if (inputs.size() == 3) {
+      auto input_image_0 = inputs[0]->GetCLImage();
+      status = clSetKernelArg(kernel0, 0, sizeof(cl_mem), &input_image_0);
       CL_CHECK_ERRORS(status);
-      status = clSetKernelArg(kernel1, 5, sizeof(int), &out_W);
+      auto input_image_1 = inputs[1]->GetCLImage();
+      status = clSetKernelArg(kernel0, 1, sizeof(cl_mem), &input_image_1);
       CL_CHECK_ERRORS(status);
-      status = clSetKernelArg(kernel1, 6, sizeof(int), &out_C_Start);
+      auto input_image_2 = inputs[2]->GetCLImage();
+      status = clSetKernelArg(kernel0, 2, sizeof(cl_mem), &input_image_2);
       CL_CHECK_ERRORS(status);
-      status = clSetKernelArg(kernel1, 7, sizeof(int), &in_W);
+      int C_0 = inputs[0]->dims()[1];
+      status = clSetKernelArg(kernel0, 3, sizeof(int), &C_0);
       CL_CHECK_ERRORS(status);
-      status = clSetKernelArg(kernel1, 8, sizeof(int), &in_H);
+      int C_1 = inputs[1]->dims()[1];
+      status = clSetKernelArg(kernel0, 4, sizeof(int), &C_1);
       CL_CHECK_ERRORS(status);
-      status = clSetKernelArg(kernel1, 9, sizeof(int), &in_C1);
+      int C_2 = inputs[2]->dims()[1];
+      status = clSetKernelArg(kernel0, 5, sizeof(int), &C_2);
       CL_CHECK_ERRORS(status);
-      status = clSetKernelArg(kernel1, 10, sizeof(int), &in_C2);
-      CL_CHECK_ERRORS(status);
-
-      status = clEnqueueNDRangeKernel(
-          this->cl_helper_.CLCommandQueue(), kernel1, default_work_size.size(),
-          NULL, default_work_size.data(), NULL, 0, NULL, NULL);
-      CL_CHECK_ERRORS(status);
-
-      out_C_Start += inputs[i]->dims()[1];
+      arg_offset = 6;
     }
+    auto *output_image = param.Out()->GetCLImage();
+    status =
+        clSetKernelArg(kernel0, arg_offset + 0, sizeof(cl_mem), &output_image);
+    CL_CHECK_ERRORS(status);
+    int out_C = param.Out()->dims()[1];
+    status = clSetKernelArg(kernel0, arg_offset + 1, sizeof(int), &out_C);
+    CL_CHECK_ERRORS(status);
+    int out_W = param.Out()->dims()[3];
+    status = clSetKernelArg(kernel0, arg_offset + 2, sizeof(int), &out_W);
+    CL_CHECK_ERRORS(status);
+
+    status = clEnqueueNDRangeKernel(
+        this->cl_helper_.CLCommandQueue(), kernel0, default_work_size.size(),
+        NULL, default_work_size.data(), NULL, 0, NULL, NULL);
+    CL_CHECK_ERRORS(status);
   }
 }
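The rewritten Compute handles exactly two or three inputs, and the two branches differ only in how many (image, channel-count) pairs they bind before the shared output arguments at arg_offset. For reference, a generic binding helper could cover any input count with the same argument layout the concatByCWith{2,3}Inputs kernels expect; this is a hypothetical sketch, not code from the repository, and the matching kernels for other input counts would still have to exist.

#include <vector>
#include <CL/cl.h>

// Sketch of a generic binder: N input images, then N channel counts,
// starting at argument 0; returns the next free argument index so the
// caller can continue with the output image, out_C and out_W.
int BindConcatInputs(cl_kernel kernel, const std::vector<cl_mem> &images,
                     const std::vector<int> &channels) {
  int arg_index = 0;
  for (size_t i = 0; i < images.size(); ++i) {
    clSetKernelArg(kernel, arg_index++, sizeof(cl_mem), &images[i]);
  }
  for (size_t i = 0; i < channels.size(); ++i) {
    clSetKernelArg(kernel, arg_index++, sizeof(int), &channels[i]);
  }
  return arg_index;
}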
diff --git a/src/operators/kernel/cl/exp_kernel.cpp b/src/operators/kernel/cl/exp_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..76cbae1efddf5dedbc787777f68e5470438b8f1b
--- /dev/null
+++ b/src/operators/kernel/cl/exp_kernel.cpp
@@ -0,0 +1,52 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef EXP_OP
+
+#include
+#include
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool EXPKernel<GPU_CL, float>::Init(
+    paddle_mobile::operators::EXPParam<paddle_mobile::GPU_CL>* param) {
+  this->cl_helper_.AddKernel("exp_impl", "exp_kernel.cl");
+  return true;
+}
+
+template <>
+void EXPKernel<GPU_CL, float>::Compute(
+    const paddle_mobile::operators::EXPParam<paddle_mobile::GPU_CL>& param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  const auto* input = param.InputX();
+  auto* output = param.Out();
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*output);
+  auto inputImage = input->GetCLImage();
+  auto outputImage = output->GetCLImage();
+  cl_int status;
+  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
+  CL_CHECK_ERRORS(status);
+  const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()};
+  status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+                                  NULL, work_size, NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+
+template class EXPKernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
diff --git a/src/operators/kernel/cl/leakyrelu_kernel.cpp b/src/operators/kernel/cl/leakyrelu_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9487d57b2c996fff3170535df58dfeb6e6d66203
--- /dev/null
+++ b/src/operators/kernel/cl/leakyrelu_kernel.cpp
@@ -0,0 +1,59 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef LEAKY_RELU_OP
+
+#include
+
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool LeakyReluKernel<GPU_CL, float>::Init(
+    paddle_mobile::operators::LeakyReluParam<paddle_mobile::GPU_CL> *param) {
+  this->cl_helper_.AddKernel("leakyrelu", "leakyrelu_kernel.cl");
+  return true;
+}
+
+template <>
+void LeakyReluKernel<GPU_CL, float>::Compute(
+    const paddle_mobile::operators::LeakyReluParam<paddle_mobile::GPU_CL>
+        &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out()));
+  auto input = param.InputX();
+  cl_mem input_image = input->GetCLImage();
+  auto output = param.Out();
+  cl_mem out_image = output->GetCLImage();
+  float alpha = param.Alpha();
+  int out_dims_w = output->dims()[3];
+
+  cl_int status;
+  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 2, sizeof(float), &alpha);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 3, sizeof(int), &out_dims_w);
+  CL_CHECK_ERRORS(status);
+  status = clEnqueueNDRangeKernel(
+      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
+      default_work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+template class LeakyReluKernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/cl/nearest_interp_kernel.cpp b/src/operators/kernel/cl/nearest_interp_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cdd299104441b0f10e09580133ee79b668068757
--- /dev/null
+++ b/src/operators/kernel/cl/nearest_interp_kernel.cpp
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef NEAREST_INTERP_OP
+
+#include
+
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool NearestInterpolationKernel<GPU_CL, float>::Init(
+    paddle_mobile::operators::NearestInterpolationParam<paddle_mobile::GPU_CL>
+        *param) {
+  this->cl_helper_.AddKernel("nearest_interp", "nearest_interp_kernel.cl");
+  return true;
+}
+
+template <>
+void NearestInterpolationKernel<GPU_CL, float>::Compute(
+    const paddle_mobile::operators::NearestInterpolationParam<
+        paddle_mobile::GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out()));
+  auto input = param.InputX();
+  cl_mem input_image = input->GetCLImage();
+  auto output = param.Out();
+  cl_mem output_image = output->GetCLImage();
+  float scale_h = output->dims()[2] / input->dims()[2];
+  float scale_w = output->dims()[3] / input->dims()[3];
+  int in_dims_w = output->dims()[3];
+
+  cl_int status;
+
+  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
+  CL_CHECK_ERRORS(status)
+  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
+  CL_CHECK_ERRORS(status)
+  status = clSetKernelArg(kernel, 2, sizeof(float), &scale_h);
+  CL_CHECK_ERRORS(status)
+  status = clSetKernelArg(kernel, 3, sizeof(float), &scale_w);
+  CL_CHECK_ERRORS(status)
+  status = clSetKernelArg(kernel, 4, sizeof(int), &in_dims_w);
+  CL_CHECK_ERRORS(status)
+  status = clEnqueueNDRangeKernel(
+      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
+      default_work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status)
+}
+template class NearestInterpolationKernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
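One detail worth noting in the host code above: scale_h and scale_w are computed by dividing dims() entries, which are integer-typed, so the quotient is truncated before it is ever stored in a float. For exact integer scale factors this is harmless, but a fractional scale such as 1.5x collapses to 1. A cast on one operand avoids that; the snippet below only demonstrates the arithmetic and is not code from the repository.

#include <cstdint>
#include <cassert>

int main() {
  int64_t in_h = 4, out_h = 6;                     // a 1.5x upscale
  float truncated = out_h / in_h;                  // integer division: 6 / 4 == 1
  float exact = static_cast<float>(out_h) / in_h;  // 1.5f
  assert(truncated == 1.0f && exact == 1.5f);
  return 0;
}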
diff --git a/src/operators/kernel/cl/sigmoid_kernel.cpp b/src/operators/kernel/cl/sigmoid_kernel.cpp
index 4ac8b54ba998cdc581a89b0f18e1d279f8481bb3..33ce051f4ad60dc3bb8dc9871089d9221406f03d 100644
--- a/src/operators/kernel/cl/sigmoid_kernel.cpp
+++ b/src/operators/kernel/cl/sigmoid_kernel.cpp
@@ -32,11 +32,15 @@ void SigmoidKernel<GPU_CL, float>::Compute(const SigmoidParam<GPU_CL>& param) {
   auto default_work_size = this->cl_helper_.DefaultWorkSize(*output);
   auto inputImage = input->GetCLImage();
   auto outputImage = output->GetCLImage();
-  clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
-  clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
+  cl_int status;
+  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
+  CL_CHECK_ERRORS(status);
   const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()};
-  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
-                         work_size, NULL, 0, NULL, NULL);
+  status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
+                                  NULL, work_size, NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
 }
 
 template class SigmoidKernel<GPU_CL, float>;
diff --git a/src/operators/kernel/cl/slice_kernel.cpp b/src/operators/kernel/cl/slice_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..446d003219d553a224cd5f144ad72d6392237a65
--- /dev/null
+++ b/src/operators/kernel/cl/slice_kernel.cpp
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SLICE_OP
+
+#include
+#include
+
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool SliceKernel<GPU_CL, float>::Init(
+    paddle_mobile::operators::SliceParam<paddle_mobile::GPU_CL> *param) {
+  this->cl_helper_.AddKernel("slice", "slice_kernel.cl");
+  return true;
+}
+
+template <>
+void SliceKernel<GPU_CL, float>::Compute(
+    const paddle_mobile::operators::SliceParam<paddle_mobile::GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.output_);
+  auto input = param.input_;
+  cl_mem input_image = input->GetCLImage();
+  auto output = param.output_;
+  cl_mem output_image = output->GetCLImage();
+  int starts_0 = param.starts_[0];
+  int ends_0 = param.ends_[0];
+  int axes_0 = param.axes_[0] - (param.original_output_dims_size_ -
+                                 param.output_->dims().size());
+  int dims_w = input->dims()[axes_0 + 2];
+
+  cl_int status;
+  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 2, sizeof(int), &starts_0);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 3, sizeof(int), &ends_0);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 4, sizeof(int), &dims_w);
+  CL_CHECK_ERRORS(status);
+  status = clEnqueueNDRangeKernel(
+      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
+      default_work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+template class SliceKernel<GPU_CL, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/nearest_interp_op.cpp b/src/operators/nearest_interp_op.cpp
index e9935ba9f0130ec80c8f9531565cdf0873fff3c8..14e71b78f123befd26125f9daa18e2e510844cdb 100644
--- a/src/operators/nearest_interp_op.cpp
+++ b/src/operators/nearest_interp_op.cpp
@@ -49,4 +49,8 @@ namespace ops = paddle_mobile::operators;
 REGISTER_OPERATOR_CPU(nearest_interp, ops::NearestInterpolationOp);
 #endif
 
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(nearest_interp, ops::NearestInterpolationOp);
+#endif
+
 #endif
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index b8068b8d0d8e69fb65012beff3e98f0525ae6cae..5b0b6a6238d8e23bdd5bb4c4bde00256aeb72799 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -686,6 +686,7 @@ class ConcatParam : public OpParam {
     inputs_ = InputMultiFrom<GType>(inputs, *scope);
     out_ = OutFrom<GType>(outputs, *scope);
     axis_ = GetAttr<int>("axis", attrs);
+    original_output_dims_size_ = out_->dims().size();
   }
 
   vector<GType *> Inputs() const { return inputs_; }
@@ -694,10 +695,11 @@ class ConcatParam : public OpParam {
   const int &Axis() const { return axis_; }
 
- private:
+ public:
  vector<GType *> inputs_;
  GType *out_;
  int axis_;
+  int original_output_dims_size_;
 
 #ifdef PADDLE_MOBILE_FPGA
 
 private:
@@ -1590,6 +1592,8 @@ class SliceParam : public OpParam {
     axes_ = GetAttr<std::vector<int>>("axes", attrs);
     starts_ = GetAttr<std::vector<int>>("starts", attrs);
    ends_ = GetAttr<std::vector<int>>("ends", attrs);
+
+    original_output_dims_size_ = output_->dims().size();
   }
 
  public:
@@ -1598,6 +1602,7 @@ class SliceParam : public OpParam {
   std::vector<int> axes_;
   std::vector<int> starts_;
   std::vector<int> ends_;
+  int original_output_dims_size_;
 };
 #endif
diff --git a/src/operators/slice_op.cpp b/src/operators/slice_op.cpp
index 14316d37eb3ea69557e47f47f3d8563523184490..d1794b1fcef091e4b7675fc3a92325ed2d82a8a9 100644
--- a/src/operators/slice_op.cpp
+++ b/src/operators/slice_op.cpp
@@ -24,11 +24,42 @@ void SliceOp<Dtype, T>::InferShape() const {
   auto axes = this->param_.axes_;
   auto input = this->param_.input_;
   auto output = this->param_.output_;
+#ifdef PADDLE_MOBILE_CL
+  auto output_dims = output->dims();
+  auto output_dims_size = output_dims.size();
+  bool should_resize = true;
+  if (output_dims_size > 4) {
+    for (int i = 0; i < output_dims_size - 4; ++i) {
+      if (output_dims[i] != 0 && output_dims[i] != 1) {
+        should_resize = false;
+        break;
+      }
+    }
+    if (should_resize) {
+      std::vector<int64_t> temp_output_dims;
+      temp_output_dims.reserve(static_cast<size_t>(4));
+      for (int i = output_dims_size - 4; i < output_dims_size; ++i) {
+        temp_output_dims.push_back(output_dims[i]);
+      }
+      framework::DDim temp_ddim = framework::make_ddim(temp_output_dims);
+      this->param_.output_->Resize(temp_ddim);
+    }
+  }
+#endif
   PADDLE_MOBILE_ENFORCE(axes.size() == 1, "axes size should equals 1");
   PADDLE_MOBILE_ENFORCE(input->dims().size() == output->dims().size(),
                         "input dim size should equals output dim size");
+#ifdef PADDLE_MOBILE_CL
+  PADDLE_MOBILE_ENFORCE(
+      input->dims().size() -
+              (axes[0] - (this->param_.original_output_dims_size_ -
+                          this->param_.output_->dims().size())) ==
+          3,
+      "op only support slice channel now");
+#else
   PADDLE_MOBILE_ENFORCE(input->dims().size() - axes[0] == 3,
                         "op only support slice channel now");
+#endif
 }
 
 }  // namespace operators
@@ -41,4 +72,7 @@ REGISTER_OPERATOR_CPU(slice, ops::SliceOp);
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(slice, ops::SliceOp);
 #endif
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(slice, ops::SliceOp);
 #endif
+#endif  // SLICE_OP
diff --git a/test/net/test_net.cpp b/test/net/test_net.cpp
index b952e4a8271e1102c87a89521c4a4126ffe4241d..dba2edc730726f3fbf568df1ae476f19030c53c0 100644
--- a/test/net/test_net.cpp
+++ b/test/net/test_net.cpp
@@ -48,9 +48,11 @@ void test(int argc, char *argv[]) {
   //  config.load_when_predict = true;
   paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile(config);
   paddle_mobile.SetCLPath("/data/local/tmp/bin");
+  std::cout << "testing opencl yyz " << std::endl;
 #else
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile(config);
   paddle_mobile.SetThreadNum(1);
+  std::cout << "testing cpu yyz " << std::endl;
 #endif
 
   int dim_count = std::stoi(argv[arg_index]);