提交 467bbfe7 编写于 作者: X xiebaiyuan 提交者: GitHub

Merge pull request #1319 from yangfei963158659/develop

imp googlenet for gpu
......@@ -68,6 +68,13 @@ class CLImage {
InitCLImage(context, command_queue, folder_converter);
}
void InitNormalCLImage(cl_context context, cl_command_queue command_queue) {
PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr,
" need call SetTensorData first");
CLImageConverterNormal *normal_converter = new CLImageConverterNormal();
InitCLImage(context, command_queue, normal_converter);
}
void InitCLImage(cl_context context, cl_command_queue command_queue,
CLImageConverterBase *converter) {
if (image_converter_ != nullptr) {
......
......@@ -22,7 +22,6 @@ void FeedOp<DeviceType, T>::InferShape() const {
auto out_dims = this->param_.Out()->dims();
out_dims[0] = this->param_.BatchSize();
auto input_dims = this->param_.InputX()->dims();
DLOG << input_dims.size();
if (input_dims.size() == 4) {
this->param_.Out()->Resize(input_dims);
} else {
......
......@@ -60,6 +60,9 @@ REGISTER_FUSION_MATCHER(fusion_fc, ops::FusionFcMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_fc, ops::FusionFcOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fusion_fc, ops::FusionFcOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(fusion_fc, ops::FusionFcOp);
#endif
......
......@@ -13,7 +13,27 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
/*
__kernel void concatByC0(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_W) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
int2 input_pos ;
input_pos.x = in_c * out_W + in_w;
input_pos.y = in_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 input;
input = read_imageh(input_image, sampler,input_pos);
write_imageh(output_image, input_pos, input);
}
__kernel void concatByC(__read_only image2d_t input_image1,
__read_only image2d_t input_image2,
......@@ -24,13 +44,13 @@ __kernel void concatByC(__read_only image2d_t input_image1,
__private const int out_C_Start,
__private const int in_W,
__private const int in_H,
__private const int int_C1,
__private const int int_C2) {
__private const int in_C1,
__private const int in_C2) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
int out_c1 = (out_C_Start)/4 + in_c;
int out_c1 = (out_C_Start + 3)/4 -1 + in_c;
int out_c2 = out_c1 + 1;
......@@ -45,7 +65,7 @@ __kernel void concatByC(__read_only image2d_t input_image1,
int2 input_pos1;
if(in_c==0){
input_pos1.x = ((in_C1-1)/4) * in_W + in_w;
input_pos1.x = ((in_C1 + 3)/4-1) * in_W + in_w;
}else{
input_pos1.x = (in_c - 1) * in_W + in_w;
}
......@@ -103,26 +123,6 @@ __kernel void concatByC(__read_only image2d_t input_image1,
write_imageh(output_image, output_pos2, output2);
}
__kernel void concatByW0(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_W) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
int2 input_pos = in_c * out_W + in_w;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 input;
input = read_imageh(input_image, sampler,input_pos);
write_imageh(output_image, input_pos, input);
}
*/
__kernel void concatByH(__read_only image2d_t input_image,
__write_only image2d_t output_image,
......
......@@ -692,6 +692,238 @@ __kernel void conv_1x1_4(__private const int global_size_dim0,
*/
// conv_7x7: direct 7x7 convolution over half-precision image2d tensors.
// One work-item produces one half4 output pixel (four consecutive output
// channels) at (out_c, out_w, out_nh). Input channel blocks are packed
// along the image x axis, `input_width` columns per block.
__kernel void conv_7x7(__private const int global_size_dim0,
                       __private const int global_size_dim1,
                       __private const int global_size_dim2,
                       __read_only image2d_t input_image,
                       __read_only image2d_t filter_image,
#ifdef BIASE
                       __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
                       __read_only image2d_t new_scale,
                       __read_only image2d_t new_biase,
#endif
                       __write_only image2d_t output_image,
                       __private const int stride,
                       __private const int offset,
                       __private const int input_c,
                       __private const int dilation,
                       __private const int input_width,  /* of one block */
                       __private const int input_height, /* of one block */
                       __private const int output_width,
                       __private const int output_height) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  // Guard against the padded NDRange.
  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
      out_nh >= global_size_dim2) {
    return;
  }

  // BUGFIX: these were declared `const filter_nX` with no type; implicit
  // int is not valid OpenCL C and fails to compile on strict drivers.
  const int filter_n0 = 4 * out_c + 0;
  const int filter_n1 = 4 * out_c + 1;
  const int filter_n2 = 4 * out_c + 2;
  const int filter_n3 = 4 * out_c + 3;

  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  // Top-left tap position (within one channel block) of the receptive field.
  int2 in_pos_in_one_block;
  in_pos_in_one_block.x = out_w * stride + offset;
  in_pos_in_one_block.y = out_nh * stride + offset;

#ifdef BIASE
  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
  half4 output = 0.0f;
#endif

  half4 input;
  half4 filter[4];
  int2 filter_pos0;
  int2 filter_pos1;
  int2 filter_pos2;
  int2 filter_pos3;

  for (int i = 0; i < input_c; ++i) {
    int2 pos_in =
        (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
    for (int j = 0; j < 7; j++) {
      for (int k = 0; k < 7; k++) {
        // Zero out taps whose coordinates fall outside the input block
        // (border padding); select mask bit 15 flags out-of-range reads.
        input = select(
            read_imageh(input_image, sampler,
                        (int2)(pos_in.x + (j - 3) * dilation,
                               pos_in.y + (k - 3) * dilation)),
            (half4)(0.0f),
            (ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
                       in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
                       in_pos_in_one_block.x + (j - 3) * dilation >=
                           input_width ||
                       in_pos_in_one_block.y + (k - 3) * dilation >=
                           input_height)
                      << 15));
        int filter_h = k;
        int filter_w = j;
        int filter_c = i;
        // Filter image layout: x = channel*7 + kw, y = out_channel*7 + kh.
        filter_pos0.x = filter_c * 7 + filter_w;
        filter_pos0.y = filter_n0 * 7 + filter_h;
        filter_pos1.x = filter_c * 7 + filter_w;
        filter_pos1.y = filter_n1 * 7 + filter_h;
        filter_pos2.x = filter_c * 7 + filter_w;
        filter_pos2.y = filter_n2 * 7 + filter_h;
        filter_pos3.x = filter_c * 7 + filter_w;
        filter_pos3.y = filter_n3 * 7 + filter_h;
        filter[0] = read_imageh(filter_image, sampler, filter_pos0);
        filter[1] = read_imageh(filter_image, sampler, filter_pos1);
        filter[2] = read_imageh(filter_image, sampler, filter_pos2);
        filter[3] = read_imageh(filter_image, sampler, filter_pos3);
        output.x += dot(input, filter[0]);
        output.y += dot(input, filter[1]);
        output.z += dot(input, filter[2]);
        output.w += dot(input, filter[3]);
      }
    }
  }

#ifdef BATCH_NORM
  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
           read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif

#ifdef RELU
  output = activation(output);
#endif

  write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh),
               output);
}
// conv_5x5: direct 5x5 convolution over half-precision image2d tensors.
// Mirrors conv_7x7 with kernel size 5 and half-window 2; one work-item
// computes one half4 output pixel (four consecutive output channels).
__kernel void conv_5x5(__private const int global_size_dim0,
                       __private const int global_size_dim1,
                       __private const int global_size_dim2,
                       __read_only image2d_t input_image,
                       __read_only image2d_t filter_image,
#ifdef BIASE
                       __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
                       __read_only image2d_t new_scale,
                       __read_only image2d_t new_biase,
#endif
                       __write_only image2d_t output_image,
                       __private const int stride,
                       __private const int offset,
                       __private const int input_c,
                       __private const int dilation,
                       __private const int input_width,  /* of one block */
                       __private const int input_height, /* of one block */
                       __private const int output_width,
                       __private const int output_height) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  // Guard against the padded NDRange.
  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
      out_nh >= global_size_dim2) {
    return;
  }

  // BUGFIX: these were declared `const filter_nX` with no type; implicit
  // int is not valid OpenCL C and fails to compile on strict drivers.
  const int filter_n0 = 4 * out_c + 0;
  const int filter_n1 = 4 * out_c + 1;
  const int filter_n2 = 4 * out_c + 2;
  const int filter_n3 = 4 * out_c + 3;

  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  // Top-left tap position (within one channel block) of the receptive field.
  int2 in_pos_in_one_block;
  in_pos_in_one_block.x = out_w * stride + offset;
  in_pos_in_one_block.y = out_nh * stride + offset;

#ifdef BIASE
  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
  half4 output = 0.0f;
#endif

  half4 input;
  half4 filter[4];
  int2 filter_pos0;
  int2 filter_pos1;
  int2 filter_pos2;
  int2 filter_pos3;

  for (int i = 0; i < input_c; ++i) {
    int2 pos_in =
        (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
    for (int j = 0; j < 5; j++) {
      for (int k = 0; k < 5; k++) {
        // Zero out taps whose coordinates fall outside the input block
        // (border padding); select mask bit 15 flags out-of-range reads.
        input = select(
            read_imageh(input_image, sampler,
                        (int2)(pos_in.x + (j - 2) * dilation,
                               pos_in.y + (k - 2) * dilation)),
            (half4)(0.0f),
            (ushort4)((in_pos_in_one_block.x + (j - 2) * dilation < 0 ||
                       in_pos_in_one_block.y + (k - 2) * dilation < 0 ||
                       in_pos_in_one_block.x + (j - 2) * dilation >=
                           input_width ||
                       in_pos_in_one_block.y + (k - 2) * dilation >=
                           input_height)
                      << 15));
        int filter_h = k;
        int filter_w = j;
        int filter_c = i;
        // Filter image layout: x = channel*5 + kw, y = out_channel*5 + kh.
        filter_pos0.x = filter_c * 5 + filter_w;
        filter_pos0.y = filter_n0 * 5 + filter_h;
        filter_pos1.x = filter_c * 5 + filter_w;
        filter_pos1.y = filter_n1 * 5 + filter_h;
        filter_pos2.x = filter_c * 5 + filter_w;
        filter_pos2.y = filter_n2 * 5 + filter_h;
        filter_pos3.x = filter_c * 5 + filter_w;
        filter_pos3.y = filter_n3 * 5 + filter_h;
        filter[0] = read_imageh(filter_image, sampler, filter_pos0);
        filter[1] = read_imageh(filter_image, sampler, filter_pos1);
        filter[2] = read_imageh(filter_image, sampler, filter_pos2);
        filter[3] = read_imageh(filter_image, sampler, filter_pos3);
        output.x += dot(input, filter[0]);
        output.y += dot(input, filter[1]);
        output.z += dot(input, filter[2]);
        output.w += dot(input, filter[3]);
      }
    }
  }

#ifdef BATCH_NORM
  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
           read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif

#ifdef RELU
  output = activation(output);
#endif

  write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh),
               output);
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
// Local Response Normalization across channels:
//   out[c] = in[c] / pow(k + alpha * sum_{i in window(n)} in[c+i]^2, beta)
// Channels are packed four-per-pixel along the image x axis
// (x = (c/4) * out_W + w), so each work-item normalizes one half4 pixel.
__kernel void lrn(__read_only image2d_t input_image,
                  __write_only image2d_t output_image,
                  __private const int out_C,
                  __private const int out_W,
                  __private const int n,
                  __private const float k,
                  __private const float alpha,
                  __private const float beta) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  // Window of n channels centred on each output channel.
  // BUGFIX: `end` was declared without a type (`const end`); implicit int
  // is not valid OpenCL C.
  const int start = -(n - 1) / 2;
  const int end = start + n;

  // Sum of squares over the LRN window for each of the four channels
  // packed into this pixel (q selects the half4 component).
  float sqr_sum[4] = {0.0f, 0.0f, 0.0f, 0.0f};
  for (int q = 0; q < 4; q++) {
    const int c_base = out_c * 4 + q;
    for (int i = start; i < end; i++) {
      const int c = c_base + i;
      if (c >= 0 && c < out_C) {
        int2 pos;
        pos.x = (c / 4) * out_W + out_w;
        pos.y = out_nh;
        float4 v = convert_float4(read_imageh(input_image, sampler, pos));
        const int r = c % 4;
        // Pick the component of the packed pixel holding channel c.
        float comp = (r == 0) ? v.x : (r == 1) ? v.y : (r == 2) ? v.z : v.w;
        sqr_sum[q] += comp * comp;
      }
    }
  }

  int2 output_pos;
  output_pos.x = out_c * out_W + out_w;
  output_pos.y = out_nh;
  float4 in = convert_float4(read_imageh(input_image, sampler, output_pos));

  // Only divide components that correspond to real channels (the last
  // pixel of a row may be partially filled when out_C % 4 != 0).
  float4 output = (float4)0.0f;
  output.x = in.x / (pow(k + alpha * (sqr_sum[0]), beta));
  if (out_C - 4 * out_c >= 2) {
    output.y = in.y / (pow(k + alpha * (sqr_sum[1]), beta));
  }
  if (out_C - 4 * out_c >= 3) {
    output.z = in.z / (pow(k + alpha * (sqr_sum[2]), beta));
  }
  if (out_C - 4 * out_c >= 4) {
    output.w = in.w / (pow(k + alpha * (sqr_sum[3]), beta));
  }

  half4 tmp = convert_half4(output);
  write_imageh(output_image, output_pos, tmp);
}
\ No newline at end of file
......@@ -31,11 +31,13 @@ __kernel void pool_max(
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int start_h = max(out_h * stride_h - pad_top, 0);
int start_h = out_h * stride_h - pad_top;
int end_h = min(start_h + ksize_h, in_height);
start_h = max(start_h,0);
int start_w = max(out_w * stride_w - pad_left, 0);
int start_w = out_w * stride_w - pad_left;
int end_w = min(start_w + ksize_w, in_width);
start_w = max(start_w,0);
const int pos_in_x = out_c * in_width;
const int pos_in_y = out_n * in_height;
......
......@@ -23,12 +23,17 @@ template <>
bool ConcatKernel<GPU_CL, float>::Init(ConcatParam<GPU_CL> *param) {
if (param->Out()->dims().size() < 4) {
this->cl_helper_.AddKernel("concatByH", "concat_kernel.cl");
} else if (param->Out()->dims().size() == 4) {
this->cl_helper_.AddKernel("concatByC0", "concat_kernel.cl");
this->cl_helper_.AddKernel("concatByC", "concat_kernel.cl");
}
return true;
}
template <>
void ConcatKernel<GPU_CL, float>::Compute(const ConcatParam<GPU_CL> &param) {
DLOG << "yangfei50";
DLOG << param.Out()->dims();
if (param.Out()->dims().size() < 4) {
auto kernel = this->cl_helper_.KernelAt(0);
auto inputs = param.Inputs();
......@@ -62,6 +67,76 @@ void ConcatKernel<GPU_CL, float>::Compute(const ConcatParam<GPU_CL> &param) {
out_H_Start += inputs[i]->dims()[0];
}
}
} else {
auto kernel0 = this->cl_helper_.KernelAt(0);
auto kernel1 = this->cl_helper_.KernelAt(1);
auto inputs = param.Inputs();
auto *output_image = param.Out()->GetCLImage();
int out_C_Start = 0;
auto input_image = inputs[0]->GetCLImage();
auto default_work_size = this->cl_helper_.DefaultWorkSize(*inputs[0]);
int out_W = param.Out()->dims()[3];
cl_int status;
status = clSetKernelArg(kernel0, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel0, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel0, 2, sizeof(int), &out_W);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel0, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
out_C_Start += inputs[0]->dims()[1];
for (int i = 1; i < inputs.size(); i++) {
auto input_image1 = inputs[i - 1]->GetCLImage();
auto input_image2 = inputs[i]->GetCLImage();
default_work_size = this->cl_helper_.DefaultWorkSize(*inputs[i]);
int out_C = param.Out()->dims()[1];
int out_H = param.Out()->dims()[2];
int in_W = inputs[i]->dims()[3];
int in_H = inputs[i]->dims()[2];
int in_C1 = inputs[i - 1]->dims()[1];
int in_C2 = inputs[i]->dims()[1];
DLOG << "第" << i << "个";
DLOG << "out_C=" << out_C;
DLOG << "out_H=" << out_H;
DLOG << "in_W=" << in_W;
DLOG << "in_H=" << in_H;
DLOG << "in_C1=" << in_C1;
DLOG << "in_C2=" << in_C2;
DLOG << "out_C_Start = " << out_C_Start;
status = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &input_image1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 1, sizeof(cl_mem), &input_image2);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 2, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 3, sizeof(int), &out_C);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 4, sizeof(int), &out_H);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 5, sizeof(int), &out_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 6, sizeof(int), &out_C_Start);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 7, sizeof(int), &in_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 8, sizeof(int), &in_H);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 9, sizeof(int), &in_C1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 10, sizeof(int), &in_C2);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel1, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
out_C_Start += inputs[i]->dims()[1];
}
}
}
......
......@@ -51,8 +51,16 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
this->cl_helper_.AddKernel("conv_3x3", "conv_add_kernel.cl");
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
} else if (param->Filter()->dims()[2] == 7 &&
param->Filter()->dims()[3] == 7) {
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_7x7", "conv_add_kernel.cl");
} else if (param->Filter()->dims()[2] == 5 &&
param->Filter()->dims()[3] == 5) {
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_5x5", "conv_add_kernel.cl");
}
return true;
......
......@@ -52,6 +52,16 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
this->cl_helper_.AddKernel("conv_3x3", "conv_add_relu_kernel.cl");
} else if (param->Filter()->dims()[2] == 7 &&
param->Filter()->dims()[3] == 7) {
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_7x7", "conv_add_relu_kernel.cl");
} else if (param->Filter()->dims()[2] == 5 &&
param->Filter()->dims()[3] == 5) {
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_5x5", "conv_add_relu_kernel.cl");
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_OP
#include <memory>

#include "operators/kernel/fusion_fc_kernel.h"
#include "operators/math/math_function.h"
namespace paddle_mobile {
namespace operators {
template <>
bool FusionFcKernel<GPU_CL, float>::Init(FusionFcParam<GPU_CL> *param) {
  // Stage the weight (Y) and bias (Z) images in the normal CL layout so the
  // compute path can read them back to host tensors.
  param->InputY()->InitNormalCLImage(this->cl_helper_.CLContext(),
                                     this->cl_helper_.CLCommandQueue());
  param->InputZ()->InitNormalCLImage(this->cl_helper_.CLContext(),
                                     this->cl_helper_.CLCommandQueue());
  // "fetch" copies a CLImage into a host tensor; "feed" uploads one back.
  this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
  this->cl_helper_.AddKernel("feed", "feed_kernel.cl");
  return true;
}
template <typename P>
void FusionFcCompute(const FusionFcParam<GPU_CL> &param, cl_context context,
cl_command_queue commandQueue, cl_kernel kernel0,
cl_kernel kernel1) {
auto *input_x_image = param.InputX();
auto *input_y_image = param.InputY();
auto *input_z_image = param.InputZ();
int axis = param.Axis();
auto *out_image = param.Out();
Tensor *input_x = new Tensor();
input_x->Resize(input_x_image->dims());
input_x->mutable_data<float>();
framework::CLImageToTensor(input_x_image, input_x, context, commandQueue,
kernel0);
Tensor *input_y = new Tensor();
input_y->Resize(input_y_image->dims());
input_y->mutable_data<float>();
framework::CLImageToTensor(input_y_image, input_y, context, commandQueue,
kernel0);
Tensor *input_z = new Tensor();
input_z->Resize(input_z_image->dims());
input_z->mutable_data<float>();
framework::CLImageToTensor(input_z_image, input_z, context, commandQueue,
kernel0);
auto *input_z_data = input_z->data<float>();
DLOG << *input_x;
DLOG << *input_y;
DLOG << *input_z;
Tensor *out = new Tensor();
out->Resize(out_image->dims());
out->mutable_data<float>();
auto *out_data = out->mutable_data<float>();
const Tensor x_matrix =
input_x->dims().size() > 2
? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
: *input_x;
const Tensor y_matrix =
input_y->dims().size() > 2
? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
: *input_y;
auto out_dim = out->dims();
if (out_dim.size() != 2) {
out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "inpu_z size must be 1");
PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0],
" out_dim.size must be 2.");
axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ");
int64_t classes = input_z->numel();
for (int i = 0; i < out_dim[0]; i++) {
memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
}
// for (int i = 0; i < out->numel(); i++) {
// DLOG << out_data[i];
// }
// bias_data的维度和out的维度一致
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(1), false);
out_image->InitEmptyImage(context, commandQueue, out->dims());
framework::TensorToCLImage(out, out_image, context, commandQueue, kernel1);
DLOG << *out;
delete (input_x);
delete (input_y);
delete (input_z);
delete (out);
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
// if (out_dim.size() != 2) {
// out->Resize(out_dim);
// }
}
template <>
void FusionFcKernel<GPU_CL, float>::Compute(
    const FusionFcParam<GPU_CL> &param) {
  // Kernel 0 was registered as "fetch" (CLImage -> Tensor) and kernel 1 as
  // "feed" (Tensor -> CLImage) in Init(); forward both to the CPU fallback.
  auto fetch_kernel = this->cl_helper_.KernelAt(0);
  auto feed_kernel = this->cl_helper_.KernelAt(1);
  FusionFcCompute<float>(param, this->cl_helper_.CLContext(),
                         this->cl_helper_.CLCommandQueue(), fetch_kernel,
                         feed_kernel);
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef LRN_OP
#include "operators/kernel/lrn_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool LrnKernel<GPU_CL, float>::Init(LrnParam<GPU_CL> *param) {
  // Build the "lrn" kernel from lrn_kernel.cl; launch sizes and arguments
  // are derived later in Compute() from the op's parameters.
  this->cl_helper_.AddKernel("lrn", "lrn_kernel.cl");
  return true;
}
// Binds the LRN kernel arguments (input/output images, channel count C,
// row width W, window size n, and the k/alpha/beta hyper-parameters) and
// launches it over the default work size for the output image.
template <>
void LrnKernel<GPU_CL, float>::Compute(const LrnParam<GPU_CL> &param) {
  auto kernel = this->cl_helper_.KernelAt(0);
  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
  auto input_image = param.InputX()->GetCLImage();
  auto x_dims = param.InputX()->dims();
  auto output_image = param.Out()->GetCLImage();
  // NCHW dims of the input; the kernel needs C and W to locate a channel
  // inside the packed image layout.
  const int N = x_dims[0];
  const int C = x_dims[1];
  const int H = x_dims[2];
  const int W = x_dims[3];
  const int n = param.N();
  const float alpha = param.Alpha();
  const float beta = param.Beta();
  const float k = param.K();
  DLOG << "n=" << n;
  DLOG << "alpha=" << alpha;
  DLOG << "beta=" << beta;
  DLOG << "k=" << k;
  DLOG << default_work_size;
  DLOG << C;
  DLOG << W;
  cl_int status;
  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 2, sizeof(int), &C);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 3, sizeof(int), &W);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 4, sizeof(int), &n);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 5, sizeof(float), &k);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 6, sizeof(float), &alpha);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 7, sizeof(float), &beta);
  // BUGFIX: the result of setting arg 7 was silently overwritten by the
  // enqueue call below; check it before launching.
  CL_CHECK_ERRORS(status);
  status = clEnqueueNDRangeKernel(
      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
      default_work_size.data(), NULL, 0, NULL, NULL);
  CL_CHECK_ERRORS(status);
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef LRN_OP
#include "lrn_op.h"
#include "operators/lrn_op.h"
namespace paddle_mobile {
namespace operators {
......@@ -32,6 +32,9 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(lrn, ops::LrnOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(lrn, ops::LrnOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp);
#endif
......
......@@ -1631,11 +1631,11 @@ class FusionFcParam : public OpParam {
y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
axis_ = GetAttr<int>("axis", attrs);
}
const GType *InputX() const { return input_x_; }
GType *InputX() const { return input_x_; }
const RType *InputY() const { return input_y_; }
RType *InputY() const { return input_y_; }
const RType *InputZ() const { return input_z_; }
RType *InputZ() const { return input_z_; }
GType *Out() const { return out_; }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册