You need to sign in or sign up before continuing.
未验证 提交 a63d9e9d 编写于 作者: Y Yanzhan Yang 提交者: GitHub

1.add slice, exp, leakyrelu, nearest op placeholder. 2.complete slice… (#1773)

* 1.add slice, exp, leakyrelu, nearest op placeholder. 2.complete slice and exp op. 3.fix sigmoid op.

* fix style
上级 237cf93b
...@@ -85,10 +85,18 @@ REGISTER_OPERATOR_FPGA(tanh, ops::TanhOp); ...@@ -85,10 +85,18 @@ REGISTER_OPERATOR_FPGA(tanh, ops::TanhOp);
#endif #endif
#endif // TANH_OP #endif // TANH_OP
#ifdef PADDLE_MOBILE_CPU
#ifdef LOG_OP #ifdef LOG_OP
REGISTER_OPERATOR_CPU(log, ops::LogOp); REGISTER_OPERATOR_CPU(log, ops::LogOp);
#endif // LOG_OP #endif // LOG_OP
#endif
#ifdef LEAKY_RELU_OP #ifdef LEAKY_RELU_OP
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(leaky_relu, ops::LeakyReluOp); REGISTER_OPERATOR_CPU(leaky_relu, ops::LeakyReluOp);
#endif // LEAKY_RELU_OP #endif // LEAKY_RELU_OP
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(leaky_relu, ops::LeakyReluOp);
#endif
#endif
...@@ -32,7 +32,9 @@ void ConcatOp<Dtype, T>::InferShape() const { ...@@ -32,7 +32,9 @@ void ConcatOp<Dtype, T>::InferShape() const {
inputs_dims.push_back(inputs[i]->dims()); inputs_dims.push_back(inputs[i]->dims());
} }
auto axis = static_cast<size_t>(this->param_.Axis()); auto axis = static_cast<size_t>(this->param_.Axis()) -
(this->param_.original_output_dims_size_ -
this->param_.Out()->dims().size());
if (n == 1) { if (n == 1) {
DLOG << "Warning: concat op have only one input, " DLOG << "Warning: concat op have only one input, "
......
...@@ -30,7 +30,7 @@ REGISTER_OPERATOR_CPU(exp, ops::EXPOp); ...@@ -30,7 +30,7 @@ REGISTER_OPERATOR_CPU(exp, ops::EXPOp);
#endif #endif
#ifdef PADDLE_MOBILE_CL #ifdef PADDLE_MOBILE_CL
// REGISTER_OPERATOR_CL(exp, ops::EXPOp); REGISTER_OPERATOR_CL(exp, ops::EXPOp);
#endif #endif
#endif #endif
...@@ -14,116 +14,55 @@ limitations under the License. */ ...@@ -14,116 +14,55 @@ limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void concatByC0(__read_only image2d_t input_image,
__kernel void concatByCWith2Inputs(__read_only image2d_t input_image_0,
__read_only image2d_t input_image_1,
__private const int C_0,
__private const int C_1,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_C,
__private const int out_W) { __private const int out_W) {
// const int in_c = get_global_id(0);
const int in_c = get_global_id(0); // const int in_w = get_global_id(1);
const int in_w = get_global_id(1); // const int in_nh = get_global_id(2);
const int in_nh = get_global_id(2); //
// int2 input_pos ;
int2 input_pos ; // input_pos.x = in_c * out_W + in_w;
input_pos.x = in_c * out_W + in_w; // input_pos.y = in_nh;
input_pos.y = in_nh; // const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | // CLK_ADDRESS_CLAMP |
CLK_ADDRESS_CLAMP | // CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST; // half4 input;
half4 input; // input = read_imageh(input_image, sampler,input_pos);
input = read_imageh(input_image, sampler,input_pos); //
// write_imageh(output_image, input_pos, input);
write_imageh(output_image, input_pos, input);
} }
__kernel void concatByC(__read_only image2d_t input_image1, __kernel void concatByCWith3Inputs(__read_only image2d_t input_image_0,
__read_only image2d_t input_image2, __read_only image2d_t input_image_1,
__read_only image2d_t input_image_2,
__private const int C_0,
__private const int C_1,
__private const int C_2,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_C, __private const int out_C,
__private const int out_H, __private const int out_W) {
__private const int out_W, // const int in_c = get_global_id(0);
__private const int out_C_Start, // const int in_w = get_global_id(1);
__private const int in_W, // const int in_nh = get_global_id(2);
__private const int in_H, //
__private const int in_C1, // int2 input_pos ;
__private const int in_C2) { // input_pos.x = in_c * out_W + in_w;
// input_pos.y = in_nh;
const int in_c = get_global_id(0); // const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
const int in_w = get_global_id(1); // CLK_ADDRESS_CLAMP |
const int in_nh = get_global_id(2); // CLK_FILTER_NEAREST;
int out_c1 = (out_C_Start + 3)/4 -1 + in_c; // half4 input;
// input = read_imageh(input_image, sampler,input_pos);
int out_c2 = out_c1 + 1; //
// write_imageh(output_image, input_pos, input);
int2 output_pos1;
int2 output_pos2;
output_pos1.x = out_c1 * out_W + in_w;
output_pos1.y = in_nh;
output_pos2.x = out_c2 * out_W + in_w;
output_pos2.y = in_nh;
int2 input_pos1;
if(in_c==0){
input_pos1.x = ((in_C1 + 3)/4-1) * in_W + in_w;
}else{
input_pos1.x = (in_c - 1) * in_W + in_w;
}
input_pos1.y = in_nh;
int2 input_pos2;
input_pos2.x = in_c * in_W + in_w;
input_pos2.y = in_nh;
half4 output1;
half4 output2;
half4 input1;
half4 input2;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
if(in_c==0){
input1 = read_imageh(input_image1, sampler,input_pos1);
}else {
input1 = read_imageh(input_image2, sampler,input_pos1);
}
input2 = read_imageh(input_image2, sampler,input_pos2);
output1 = input1;
if(out_C_Start%4==0){
output2 = input2;
}else if(out_C_Start%4==1){
output1.y = input2.x;
output1.z = input2.y;
output1.w = input2.z;
output2.x = input2.w;
output2.y = 0.0f;
output2.z = 0.0f;
output2.w = 0.0f;
}else if(out_C_Start%4==2){
output1.z = input2.x;
output1.w = input2.y;
output2.x = input2.z;
output2.y = input2.w;
output2.z = 0.0f;
output2.w = 0.0f;
}else if(out_C_Start%4==3){
output1.w = input2.x;
output2.x = input2.y;
output2.y = input2.z;
output2.z = input2.w;
output2.w = 0.0f;
}
write_imageh(output_image, output_pos1, output1);
write_imageh(output_image, output_pos2, output2);
} }
__kernel void concatByH(__read_only image2d_t input_image, __kernel void concatByH(__read_only image2d_t input_image,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_W, __private const int out_W,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
__kernel void exp_impl(__read_only image2d_t input, __write_only image2d_t output) {
const int x = get_global_id(0);
const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 in = read_imageh(input, sampler, (int2)(x, y));
half4 out;
out.x = pow(2.71828182, (float)(in.x));
out.y = pow(2.71828182, (float)(in.y));
out.z = pow(2.71828182, (float)(in.z));
out.w = pow(2.71828182, (float)(in.w));
write_imageh(output, (int2)(x, y), out);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void leakyrelu(__read_only image2d_t input,
__write_only image2d_t output, __private const float alpha, __private const int dims_w) {
const int c = get_global_id(0);
const int w = get_global_id(1);
const int nh = get_global_id(2);
int2 input_pos;
input_pos.x = c * dims_w + w;
input_pos.y = nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 in = read_imageh(input, sampler, (int2)(input_pos.x, input_pos.y));
half4 output_data;
output_data.x = max((float)(in.x), (float)(alpha * (in.x)));
output_data.y = max((float)(in.x), (float)(alpha * (in.y)));
output_data.z = max((float)(in.x), (float)(alpha * (in.z)));
output_data.w = max((float)(in.x), (float)(alpha * (in.w)));
write_imageh(output, (int2)(input_pos.x, input_pos.y), output_data);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t output,
__private const float scale_h, __private const float scale_w,
__private const int dims_w){
const int c = get_global_id(0);
const int w = get_global_id(1);
const int nh = get_global_id(2);
int2 output_pos;
output_pos.x = c * dims_w + w;
output_pos.y = nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
// uint x = (uint)(output_pos.x / scale_w);
// uint y = (uint)(output_pos.y / scale_h);
// half4 input_data = read_imageh(input, sampler, (int2)(x, y));
// write_imageh(output, (int2)(output_pos.x , output_pos.y ), input_data);
}
...@@ -25,6 +25,10 @@ __kernel void sigmoid(__read_only image2d_t input, ...@@ -25,6 +25,10 @@ __kernel void sigmoid(__read_only image2d_t input,
CLK_FILTER_NEAREST; CLK_FILTER_NEAREST;
half4 in = read_imageh(input, sampler, (int2)(x, y)); half4 in = read_imageh(input, sampler, (int2)(x, y));
in = 1.0f / (1 + exp(-in)); half4 out;
write_imageh(output, (int2)(x, y), in); out.x = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.x)));
out.y = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.y)));
out.z = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.z)));
out.w = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.w)));
write_imageh(output, (int2)(x, y), out);
} }
\ No newline at end of file
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void slice(__read_only image2d_t input, __write_only image2d_t output,
__private const int start, __private const int end,
__private const int dims_w){
const int c = get_global_id(0);
const int w = get_global_id(1);
const int nh = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos;
output_pos.x = c * dims_w + w;
output_pos.y = nh;
int2 input_pos;
half4 input_data;
half4 output_data;
if (start % 4 == 0) {
input_pos.x = (4 * c + start) / 4 * dims_w + w;
input_pos.y = nh;
input_data = read_imageh(input, sampler,input_pos);
output_data = input_data;
} else if (start % 4 == 1) {
input_pos.x = (4 * c + start) / 4 * dims_w + w;
input_pos.y = nh;
input_data = read_imageh(input, sampler,input_pos);
output_data.x = input_data.y;
output_data.y = input_data.z;
output_data.z = input_data.w;
input_pos.x = input_pos.x + dims_w;
input_pos.y = nh;
input_data = read_imageh(input, sampler,input_pos);
output_data.w = input_data.x;
} else if (start % 4 == 2) {
input_pos.x = (4 * c + start) / 4 * dims_w + w;
input_pos.y = nh;
input_data = read_imageh(input, sampler,input_pos);
output_data.x = input_data.z;
output_data.y = input_data.w;
input_pos.x = input_pos.x + dims_w;
input_pos.y = nh;
input_data = read_imageh(input, sampler,input_pos);
output_data.z = input_data.x;
output_data.w = input_data.y;
} else if (start % 4 == 3) {
input_pos.x = (4 * c + start) / 4 * dims_w + w;
input_pos.y = nh;
input_data = read_imageh(input, sampler,input_pos);
output_data.x = input_data.w;
input_pos.x = input_pos.x + dims_w;
input_pos.y = nh;
input_data = read_imageh(input, sampler,input_pos);
output_data.y = input_data.x;
output_data.z = input_data.y;
output_data.w = input_data.z;
}
write_imageh(output, output_pos, output_data);
}
...@@ -23,16 +23,20 @@ template <> ...@@ -23,16 +23,20 @@ template <>
bool ConcatKernel<GPU_CL, float>::Init(ConcatParam<GPU_CL> *param) { bool ConcatKernel<GPU_CL, float>::Init(ConcatParam<GPU_CL> *param) {
if (param->Out()->dims().size() < 4) { if (param->Out()->dims().size() < 4) {
this->cl_helper_.AddKernel("concatByH", "concat_kernel.cl"); this->cl_helper_.AddKernel("concatByH", "concat_kernel.cl");
} else if (param->Out()->dims().size() == 4) { } else if (param->Out()->dims().size() >= 4) {
this->cl_helper_.AddKernel("concatByC0", "concat_kernel.cl"); if (param->Inputs().size() == 2) {
this->cl_helper_.AddKernel("concatByC", "concat_kernel.cl"); this->cl_helper_.AddKernel("concatByCWith2Inputs", "concat_kernel.cl");
} else if (param->Inputs().size() == 3) {
this->cl_helper_.AddKernel("concatByCWith3Inputs", "concat_kernel.cl");
} else {
return false;
}
} }
return true; return true;
} }
template <> template <>
void ConcatKernel<GPU_CL, float>::Compute(const ConcatParam<GPU_CL> &param) { void ConcatKernel<GPU_CL, float>::Compute(const ConcatParam<GPU_CL> &param) {
DLOG << "yangfei50";
DLOG << param.Out()->dims(); DLOG << param.Out()->dims();
if (param.Out()->dims().size() < 4) { if (param.Out()->dims().size() < 4) {
auto kernel = this->cl_helper_.KernelAt(0); auto kernel = this->cl_helper_.KernelAt(0);
...@@ -69,74 +73,60 @@ void ConcatKernel<GPU_CL, float>::Compute(const ConcatParam<GPU_CL> &param) { ...@@ -69,74 +73,60 @@ void ConcatKernel<GPU_CL, float>::Compute(const ConcatParam<GPU_CL> &param) {
} }
} else { } else {
auto kernel0 = this->cl_helper_.KernelAt(0); auto kernel0 = this->cl_helper_.KernelAt(0);
auto kernel1 = this->cl_helper_.KernelAt(1); auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
auto inputs = param.Inputs(); auto inputs = param.Inputs();
auto *output_image = param.Out()->GetCLImage(); int arg_offset;
int out_C_Start = 0;
auto input_image = inputs[0]->GetCLImage();
auto default_work_size = this->cl_helper_.DefaultWorkSize(*inputs[0]);
int out_W = param.Out()->dims()[3];
cl_int status; cl_int status;
status = clSetKernelArg(kernel0, 0, sizeof(cl_mem), &input_image); if (inputs.size() == 2) {
auto input_image_0 = inputs[0]->GetCLImage();
status = clSetKernelArg(kernel0, 0, sizeof(cl_mem), &input_image_0);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel0, 1, sizeof(cl_mem), &output_image); auto input_image_1 = inputs[1]->GetCLImage();
CL_CHECK_ERRORS(status); status = clSetKernelArg(kernel0, 1, sizeof(cl_mem), &input_image_1);
status = clSetKernelArg(kernel0, 2, sizeof(int), &out_W);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel0, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
out_C_Start += inputs[0]->dims()[1]; int C_0 = inputs[0]->dims()[1];
for (int i = 1; i < inputs.size(); i++) { status = clSetKernelArg(kernel0, 2, sizeof(int), &C_0);
auto input_image1 = inputs[i - 1]->GetCLImage();
auto input_image2 = inputs[i]->GetCLImage();
default_work_size = this->cl_helper_.DefaultWorkSize(*inputs[i]);
int out_C = param.Out()->dims()[1];
int out_H = param.Out()->dims()[2];
int in_W = inputs[i]->dims()[3];
int in_H = inputs[i]->dims()[2];
int in_C1 = inputs[i - 1]->dims()[1];
int in_C2 = inputs[i]->dims()[1];
DLOG << "第" << i << "个";
DLOG << "out_C=" << out_C;
DLOG << "out_H=" << out_H;
DLOG << "in_W=" << in_W;
DLOG << "in_H=" << in_H;
DLOG << "in_C1=" << in_C1;
DLOG << "in_C2=" << in_C2;
DLOG << "out_C_Start = " << out_C_Start;
status = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &input_image1);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 1, sizeof(cl_mem), &input_image2); int C_1 = inputs[1]->dims()[1];
status = clSetKernelArg(kernel0, 3, sizeof(int), &C_1);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 2, sizeof(cl_mem), &output_image); arg_offset = 4;
} else if (inputs.size() == 3) {
auto input_image_0 = inputs[0]->GetCLImage();
status = clSetKernelArg(kernel0, 0, sizeof(cl_mem), &input_image_0);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 3, sizeof(int), &out_C); auto input_image_1 = inputs[1]->GetCLImage();
status = clSetKernelArg(kernel0, 1, sizeof(cl_mem), &input_image_1);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 4, sizeof(int), &out_H); auto input_image_2 = inputs[2]->GetCLImage();
status = clSetKernelArg(kernel0, 2, sizeof(cl_mem), &input_image_2);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 5, sizeof(int), &out_W); int C_0 = inputs[0]->dims()[1];
status = clSetKernelArg(kernel0, 3, sizeof(int), &C_0);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 6, sizeof(int), &out_C_Start); int C_1 = inputs[1]->dims()[1];
status = clSetKernelArg(kernel0, 4, sizeof(int), &C_1);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 7, sizeof(int), &in_W); int C_2 = inputs[2]->dims()[1];
status = clSetKernelArg(kernel0, 5, sizeof(int), &C_2);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 8, sizeof(int), &in_H); arg_offset = 6;
}
auto *output_image = param.Out()->GetCLImage();
status =
clSetKernelArg(kernel0, arg_offset + 0, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 9, sizeof(int), &in_C1); int out_C = param.Out()->dims()[1];
status = clSetKernelArg(kernel0, arg_offset + 1, sizeof(int), &out_C);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 10, sizeof(int), &in_C2); int out_W = param.Out()->dims()[3];
status = clSetKernelArg(kernel0, arg_offset + 2, sizeof(int), &out_W);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel( status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel1, default_work_size.size(), this->cl_helper_.CLCommandQueue(), kernel0, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL); NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
out_C_Start += inputs[i]->dims()[1];
}
} }
} }
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef EXP_OP
#include <framework/cl/cl_tensor.h>
#include <operators/kernel/exp_kernel.h>
namespace paddle_mobile {
namespace operators {
template <>
bool EXPKernel<GPU_CL, float>::Init(
paddle_mobile::operators::EXPParam<paddle_mobile::GPU_CL>* param) {
this->cl_helper_.AddKernel("exp_impl", "exp_kernel.cl");
return true;
}
template <>
void EXPKernel<GPU_CL, float>::Compute(
const paddle_mobile::operators::EXPParam<paddle_mobile::GPU_CL>& param) {
auto kernel = this->cl_helper_.KernelAt(0);
const auto* input = param.InputX();
auto* output = param.Out();
auto default_work_size = this->cl_helper_.DefaultWorkSize(*output);
auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage();
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
CL_CHECK_ERRORS(status);
const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()};
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class EXPKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef LEAKY_RELU_OP
#include <operators/kernel/activation_kernel.h>
namespace paddle_mobile {
namespace operators {
template <>
bool LeakyReluKernel<GPU_CL, float>::Init(
paddle_mobile::operators::LeakyReluParam<paddle_mobile::GPU_CL> *param) {
this->cl_helper_.AddKernel("leakyrelu", "leakyrelu_kernel.cl");
return true;
}
template <>
void LeakyReluKernel<GPU_CL, float>::Compute(
const paddle_mobile::operators::LeakyReluParam<paddle_mobile::GPU_CL>
&param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out()));
auto input = param.InputX();
cl_mem input_image = input->GetCLImage();
auto output = param.Out();
cl_mem out_image = output->GetCLImage();
float alpha = param.Alpha();
int out_dims_w = output->dims()[3];
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(float), &alpha);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(int), &out_dims_w);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class LeakyReluKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef NEAREST_INTERP_OP
#include <operators/kernel/nearest_interp_kernel.h>
namespace paddle_mobile {
namespace operators {
template <>
bool NearestInterpolationKernel<GPU_CL, float>::Init(
paddle_mobile::operators::NearestInterpolationParam<paddle_mobile::GPU_CL>
*param) {
this->cl_helper_.AddKernel("nearest_interp", "nearest_interp_kernel.cl");
return true;
}
template <>
void NearestInterpolationKernel<GPU_CL, float>::Compute(
const paddle_mobile::operators::NearestInterpolationParam<
paddle_mobile::GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out()));
auto input = param.InputX();
cl_mem input_image = input->GetCLImage();
auto output = param.Out();
cl_mem output_image = output->GetCLImage();
float scale_h = output->dims()[2] / input->dims()[2];
float scale_w = output->dims()[3] / input->dims()[3];
int in_dims_w = output->dims()[3];
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status)
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status)
status = clSetKernelArg(kernel, 2, sizeof(float), &scale_h);
CL_CHECK_ERRORS(status)
status = clSetKernelArg(kernel, 3, sizeof(float), &scale_w);
CL_CHECK_ERRORS(status)
status = clSetKernelArg(kernel, 4, sizeof(int), &in_dims_w);
CL_CHECK_ERRORS(status)
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status)
}
template class NearestInterpolationKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -32,11 +32,15 @@ void SigmoidKernel<GPU_CL, float>::Compute(const SigmoidParam<GPU_CL>& param) { ...@@ -32,11 +32,15 @@ void SigmoidKernel<GPU_CL, float>::Compute(const SigmoidParam<GPU_CL>& param) {
auto default_work_size = this->cl_helper_.DefaultWorkSize(*output); auto default_work_size = this->cl_helper_.DefaultWorkSize(*output);
auto inputImage = input->GetCLImage(); auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage(); auto outputImage = output->GetCLImage();
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); cl_int status;
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
CL_CHECK_ERRORS(status);
const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()}; const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()};
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
work_size, NULL, 0, NULL, NULL); NULL, work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} }
template class SigmoidKernel<GPU_CL, float>; template class SigmoidKernel<GPU_CL, float>;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SLICE_OP
#include <framework/cl/cl_tensor.h>
#include <operators/kernel/slice_kernel.h>
namespace paddle_mobile {
namespace operators {
template <>
bool SliceKernel<GPU_CL, float>::Init(
paddle_mobile::operators::SliceParam<paddle_mobile::GPU_CL> *param) {
this->cl_helper_.AddKernel("slice", "slice_kernel.cl");
return true;
}
template <>
void SliceKernel<GPU_CL, float>::Compute(
const paddle_mobile::operators::SliceParam<paddle_mobile::GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.output_);
auto input = param.input_;
cl_mem input_image = input->GetCLImage();
auto output = param.output_;
cl_mem output_image = output->GetCLImage();
int starts_0 = param.starts_[0];
int ends_0 = param.ends_[0];
int axes_0 = param.axes_[0] - (param.original_output_dims_size_ -
param.output_->dims().size());
int dims_w = input->dims()[axes_0 + 2];
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &starts_0);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(int), &ends_0);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(int), &dims_w);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class SliceKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -49,4 +49,8 @@ namespace ops = paddle_mobile::operators; ...@@ -49,4 +49,8 @@ namespace ops = paddle_mobile::operators;
REGISTER_OPERATOR_CPU(nearest_interp, ops::NearestInterpolationOp); REGISTER_OPERATOR_CPU(nearest_interp, ops::NearestInterpolationOp);
#endif #endif
#if PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(nearest_interp, ops::NearestInterpolationOp)
#endif
#endif #endif
...@@ -686,6 +686,7 @@ class ConcatParam : public OpParam { ...@@ -686,6 +686,7 @@ class ConcatParam : public OpParam {
inputs_ = InputMultiFrom<GType>(inputs, *scope); inputs_ = InputMultiFrom<GType>(inputs, *scope);
out_ = OutFrom<GType>(outputs, *scope); out_ = OutFrom<GType>(outputs, *scope);
axis_ = GetAttr<int>("axis", attrs); axis_ = GetAttr<int>("axis", attrs);
original_output_dims_size_ = out_->dims().size();
} }
vector<GType *> Inputs() const { return inputs_; } vector<GType *> Inputs() const { return inputs_; }
...@@ -694,10 +695,11 @@ class ConcatParam : public OpParam { ...@@ -694,10 +695,11 @@ class ConcatParam : public OpParam {
const int &Axis() const { return axis_; } const int &Axis() const { return axis_; }
private: public:
vector<GType *> inputs_; vector<GType *> inputs_;
GType *out_; GType *out_;
int axis_; int axis_;
int original_output_dims_size_;
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
...@@ -1590,6 +1592,8 @@ class SliceParam : public OpParam { ...@@ -1590,6 +1592,8 @@ class SliceParam : public OpParam {
axes_ = GetAttr<std::vector<int>>("axes", attrs); axes_ = GetAttr<std::vector<int>>("axes", attrs);
starts_ = GetAttr<std::vector<int>>("starts", attrs); starts_ = GetAttr<std::vector<int>>("starts", attrs);
ends_ = GetAttr<std::vector<int>>("ends", attrs); ends_ = GetAttr<std::vector<int>>("ends", attrs);
original_output_dims_size_ = output_->dims().size();
} }
public: public:
...@@ -1598,6 +1602,7 @@ class SliceParam : public OpParam { ...@@ -1598,6 +1602,7 @@ class SliceParam : public OpParam {
std::vector<int> axes_; std::vector<int> axes_;
std::vector<int> starts_; std::vector<int> starts_;
std::vector<int> ends_; std::vector<int> ends_;
int original_output_dims_size_;
}; };
#endif #endif
......
...@@ -24,11 +24,42 @@ void SliceOp<Dtype, T>::InferShape() const { ...@@ -24,11 +24,42 @@ void SliceOp<Dtype, T>::InferShape() const {
auto axes = this->param_.axes_; auto axes = this->param_.axes_;
auto input = this->param_.input_; auto input = this->param_.input_;
auto output = this->param_.output_; auto output = this->param_.output_;
#ifdef PADDLE_MOBILE_CL
auto output_dims = output->dims();
auto output_dims_size = output_dims.size();
bool should_resize = true;
if (output_dims_size > 4) {
for (int i = 0; i < output_dims_size - 4; ++i) {
if (output_dims[i] != 0 && output_dims[i] != 1) {
should_resize = false;
break;
}
}
if (should_resize) {
std::vector<int64_t> temp_output_dims;
temp_output_dims.reserve(static_cast<size_t>(4));
for (int i = output_dims_size - 4; i < output_dims_size; ++i) {
temp_output_dims.push_back(output_dims[i]);
}
framework::DDim temp_ddim = framework::make_ddim(temp_output_dims);
this->param_.output_->Resize(temp_ddim);
}
}
#endif
PADDLE_MOBILE_ENFORCE(axes.size() == 1, "axes size should equals 1"); PADDLE_MOBILE_ENFORCE(axes.size() == 1, "axes size should equals 1");
PADDLE_MOBILE_ENFORCE(input->dims().size() == output->dims().size(), PADDLE_MOBILE_ENFORCE(input->dims().size() == output->dims().size(),
"input dim size should equals output dim size"); "input dim size should equals output dim size");
#ifdef PADDLE_MOBILE_CL
PADDLE_MOBILE_ENFORCE(
input->dims().size() -
(axes[0] - (this->param_.original_output_dims_size_ -
this->param_.output_->dims().size())) ==
3,
"op only support slice channel now");
#else
PADDLE_MOBILE_ENFORCE(input->dims().size() - axes[0] == 3, PADDLE_MOBILE_ENFORCE(input->dims().size() - axes[0] == 3,
"op only support slice channel now"); "op only support slice channel now");
#endif
} }
} // namespace operators } // namespace operators
...@@ -41,4 +72,7 @@ REGISTER_OPERATOR_CPU(slice, ops::SliceOp); ...@@ -41,4 +72,7 @@ REGISTER_OPERATOR_CPU(slice, ops::SliceOp);
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(slice, ops::SliceOp); REGISTER_OPERATOR_FPGA(slice, ops::SliceOp);
#endif #endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(slice, ops::SliceOp);
#endif #endif
#endif // SLICE_OP
...@@ -48,9 +48,11 @@ void test(int argc, char *argv[]) { ...@@ -48,9 +48,11 @@ void test(int argc, char *argv[]) {
// config.load_when_predict = true; // config.load_when_predict = true;
paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile(config); paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile(config);
paddle_mobile.SetCLPath("/data/local/tmp/bin"); paddle_mobile.SetCLPath("/data/local/tmp/bin");
std::cout << "testing opencl yyz " << std::endl;
#else #else
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile(config); paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile(config);
paddle_mobile.SetThreadNum(1); paddle_mobile.SetThreadNum(1);
std::cout << "testing cpu yyz " << std::endl;
#endif #endif
int dim_count = std::stoi(argv[arg_index]); int dim_count = std::stoi(argv[arg_index]);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册