diff --git a/README.md b/README.md index 2572f25444dc4268e7a6a3f43cfdc1b38dae8e02..c3a30009825a7f8f9f5c4940a847fc88fe6a840e 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,8 @@ 开发文档主要是关于编译、运行等问题。做为开发者,它可以和贡献文档共同结合使用。 * [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md) -* [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md) +* [Android_CPU](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md) +* [Android_GPU](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android_GPU.md) * [FPGA](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_fpga.md) * [ARM_LINUX](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_arm_linux.md) diff --git a/src/operators/kernel/cl/cl_kernel/concat_kernel.cl b/src/operators/kernel/cl/cl_kernel/concat_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..b07ee4d819b25ef77729ed868c54b19a3d8699ae --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/concat_kernel.cl @@ -0,0 +1,154 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +/* + +__kernel void concatByC(__read_only image2d_t input_image1, + __read_only image2d_t input_image2, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int out_C_Start, + __private const int in_W, + __private const int in_H, + __private const int int_C1, + __private const int int_C2) { + + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + int out_c1 = (out_C_Start)/4 + in_c; + + int out_c2 = out_c1 + 1; + + int2 output_pos1; + int2 output_pos2; + + output_pos1.x = out_c1 * out_W + in_w; + output_pos1.y = in_nh; + + output_pos2.x = out_c2 * out_W + in_w; + output_pos2.y = in_nh; + + int2 input_pos1; + if(in_c==0){ + input_pos1.x = ((in_C1-1)/4) * in_W + in_w; + }else{ + input_pos1.x = (in_c - 1) * in_W + in_w; + } + + input_pos1.y = in_nh; + + int2 input_pos2; + input_pos2.x = in_c * in_W + in_w; + input_pos2.y = in_nh; + + half4 output1; + half4 output2; + half4 input1; + half4 input2; + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + if(in_c==0){ + input1 = read_imageh(input_image1, sampler,input_pos1); + + }else { + input1 = read_imageh(input_image2, sampler,input_pos1); + } + input2 = read_imageh(input_image2, sampler,input_pos2); + output1 = input1; + + if(out_C_Start%4==0){ + output2 = input2; + + }else if(out_C_Start%4==1){ + output1.y = input2.x; + output1.z = input2.y; + output1.w = input2.z; + output2.x = input2.w; + output2.y = 0.0f; + output2.z = 0.0f; + output2.w = 0.0f; + + }else if(out_C_Start%4==2){ + output1.z = input2.x; + output1.w = input2.y; + output2.x = input2.z; + output2.y = input2.w; + output2.z = 0.0f; + output2.w = 0.0f; + + }else if(out_C_Start%4==3){ + output1.w = input2.x; + output2.x = input2.y; + output2.y = input2.z; + output2.z = input2.w; + output2.w = 0.0f; + } + write_imageh(output_image, output_pos1, output1); + write_imageh(output_image, output_pos2, output2); +} + +__kernel void concatByW0(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_W) { + + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + + int2 input_pos = in_c * out_W + in_w; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + half4 input; + input = read_imageh(input_image, sampler,input_pos); + + write_imageh(output_image, input_pos, input); + +} +*/ + +__kernel void concatByH(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_W, + __private const int out_H_Start) { + + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + + int2 input_pos; + input_pos.x = in_c * out_W + in_w; + input_pos.y = in_nh; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + half4 input; + input = read_imageh(input_image, sampler,input_pos); + + int2 output_pos; + output_pos.x = input_pos.x; + output_pos.y = out_H_Start + input_pos.y; + + write_imageh(output_image, output_pos, input); + +} + + diff --git a/src/operators/kernel/cl/concat_kernel.cpp b/src/operators/kernel/cl/concat_kernel.cpp index 48fbd03315957f54873e5ee18bc95896ca306554..3deb31e7aa0c408cc2b87c523d324001f75ade88 100644 --- a/src/operators/kernel/cl/concat_kernel.cpp +++ b/src/operators/kernel/cl/concat_kernel.cpp @@ -21,11 +21,49 @@ namespace operators { template <> bool ConcatKernel::Init(ConcatParam *param) { + if (param->Out()->dims().size() < 4) { + this->cl_helper_.AddKernel("concatByH", "concat_kernel.cl"); + } return true; } template <> -void ConcatKernel::Compute(const ConcatParam ¶m) {} +void ConcatKernel::Compute(const ConcatParam ¶m) { + if (param.Out()->dims().size() < 4) { + auto kernel = this->cl_helper_.KernelAt(0); + auto inputs = param.Inputs(); + auto *output_image = param.Out()->GetCLImage(); + int out_W = 0; + if (param.Out()->dims().size() == 3) { + out_W = param.Out()->dims()[2]; + } else if (param.Out()->dims().size() == 2) { + out_W = param.Out()->dims()[1]; + } + int out_H_Start = 0; + for (int i = 0; i < inputs.size(); i++) { + auto input_image = inputs[i]->GetCLImage(); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*inputs[i]); + cl_int status; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &out_W); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(int), &out_H_Start); + CL_CHECK_ERRORS(status); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), + NULL, default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + if (param.Out()->dims().size() == 3) { + out_H_Start += inputs[i]->dims()[1]; + } else if (param.Out()->dims().size() == 2) { + out_H_Start += inputs[i]->dims()[0]; + } + } + } +} } // namespace operators } // namespace paddle_mobile