From 190cb27806281e04262b7901c9bd1e1f0c70ade7 Mon Sep 17 00:00:00 2001 From: yangfei Date: Wed, 21 Nov 2018 10:09:05 +0800 Subject: [PATCH] imp transpose and reshape kernel --- src/framework/cl/cl_helper.h | 7 + src/framework/cl/cl_image.h | 8 +- src/framework/cl/cl_image_converter.cpp | 37 ++++ src/framework/cl/cl_image_converter.h | 25 +++ .../kernel/cl/cl_kernel/conv_kernel.inc.cl | 8 +- .../kernel/cl/cl_kernel/prior_box_kernel.cl | 40 ++++- src/operators/kernel/cl/cl_kernel/reshape.cl | 158 ++++++++++++++++-- .../kernel/cl/cl_kernel/transpose_kernel.cl | 129 ++++++++++++++ src/operators/kernel/cl/fetch_kernel.cpp | 42 +++-- src/operators/kernel/cl/prior_box_kernel.cpp | 73 +++++--- src/operators/kernel/cl/reshape_kernel.cpp | 80 ++++++--- src/operators/kernel/cl/transpose_kernel.cpp | 38 ++++- src/operators/op_param.h | 2 + 13 files changed, 547 insertions(+), 100 deletions(-) create mode 100644 src/operators/kernel/cl/cl_kernel/transpose_kernel.cl diff --git a/src/framework/cl/cl_helper.h b/src/framework/cl/cl_helper.h index bea91ee24c..8a79d9bad7 100644 --- a/src/framework/cl/cl_helper.h +++ b/src/framework/cl/cl_helper.h @@ -61,9 +61,16 @@ class CLHelper { auto work_size_2 = n * h; return {work_size_0, work_size_1, work_size_2}; } else if (image_dim.size() == 2) { + auto h = image_dim[0]; + auto w = image_dim[1]; return {1, image.ImageWidth(), image.ImageHeight()}; } else if (image_dim.size() == 1) { return {1, image.ImageWidth(), 1}; + } else if (image_dim.size() == 3) { + int c = image_dim[0]; + int h = image_dim[1]; + int w = image_dim[2]; + return {(c + 3) / 4, w, h}; } PADDLE_MOBILE_THROW_EXCEPTION(" not support this dim, need imp "); } diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h index 35f60d3b77..0c19661ede 100644 --- a/src/framework/cl/cl_image.h +++ b/src/framework/cl/cl_image.h @@ -120,17 +120,19 @@ class CLImage { PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr, " empty image tensor data shouldn't have value"); - CLImageConverterFolder *folder_converter = new CLImageConverterFolder(); + // CLImageConverterFolder *folder_converter = new + // CLImageConverterFolder(); + CLImageConverterNormal *normal_converter = new CLImageConverterNormal(); DLOG << " to get image dims "; - image_dims_ = folder_converter->InitImageDimInfoWith(dim); + image_dims_ = normal_converter->InitImageDimInfoWith(dim); DLOG << " end get image dims " << image_dims_; InitCLImage(context, image_dims_[0], image_dims_[1], nullptr); tensor_dims_ = dim; command_queue_ = command_queue; - image_converter_ = folder_converter; + image_converter_ = normal_converter; cl_event_ = CLEngine::Instance()->CreateEvent(context); initialized_ = true; DLOG << " end init cl image"; diff --git a/src/framework/cl/cl_image_converter.cpp b/src/framework/cl/cl_image_converter.cpp index 13094a8d05..1e63bd0567 100644 --- a/src/framework/cl/cl_image_converter.cpp +++ b/src/framework/cl/cl_image_converter.cpp @@ -389,5 +389,42 @@ void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor, } } +const DDim &CLImageConverterNormal::InitImageDimInfoWith( + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + + width_of_one_block_ = W; + height_of_one_block_ = H; + c_block_ = width / W; + + return make_ddim({width, height}); +} + +void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0, + "tensor dim is not support "); + + CLImageConverterDefault default_converter; + default_converter.NCHWToImage(tensor, image, tensor_dim); +} + +void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + CLImageConverterDefault default_converter; + default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim); +} + } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/cl/cl_image_converter.h b/src/framework/cl/cl_image_converter.h index 02887b0cd4..ad5994f852 100644 --- a/src/framework/cl/cl_image_converter.h +++ b/src/framework/cl/cl_image_converter.h @@ -63,6 +63,31 @@ class CLImageConverterFolder : public CLImageConverterBase { int height_of_one_block_; }; +class CLImageConverterNormal : public CLImageConverterBase { + public: + const DDim &InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); + + /* + * width of original tensor + * */ + inline size_t WidthOfOneBlock() const { return width_of_one_block_; } + + /* + * height of original tensor + * */ + inline size_t HeightOfOneBlock() const { return height_of_one_block_; } + + int GetCBlock() const { return c_block_; } + + private: + int c_block_; + int width_of_one_block_; + int height_of_one_block_; +}; + class CLImageConverterNWBlock : public CLImageConverterBase { const DDim &InitImageDimInfoWith(const DDim &tensor_dim); void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl index 63e6e62345..2247df59fb 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -138,19 +138,19 @@ __kernel void conv_3x3(__private const int global_size_dim0, int2 pos_of_weight; pos_of_weight.x = i * 3 + j % 3; pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - float4 weight_x = read_imagef(filter, sampler, pos_of_weight); + half4 weight_x = read_imageh(filter, sampler, pos_of_weight); output.x += dot(input[j], weight_x); pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - float4 weight_y = read_imagef(filter, sampler, pos_of_weight); + half4 weight_y = read_imageh(filter, sampler, pos_of_weight); output.y += dot(input[j], weight_y); pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - float4 weight_z = read_imagef(filter, sampler, pos_of_weight); + half4 weight_z = read_imageh(filter, sampler, pos_of_weight); output.z += dot(input[j], weight_z); pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - float4 weight_w = read_imagef(filter, sampler, pos_of_weight); + half4 weight_w = read_imageh(filter, sampler, pos_of_weight); output.w += dot(input[j], weight_w); } */ diff --git a/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl b/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl index 311a5a195a..699d381ce6 100644 --- a/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl @@ -19,7 +19,9 @@ __kernel void prior_box(__private const int global_size_dim0, __private const int global_size_dim2, __global float *box_width, __global float *box_height, - __write_only image2d_t output_image, + __global float *variances_Buffer, + __write_only image2d_t output_boxes, + __write_only image2d_t output_variances, __private const float step_width, __private const float step_height, __private const float offset, @@ -44,16 +46,25 @@ __kernel void prior_box(__private const int global_size_dim0, float center_y = ((float)out_n + offset) * step_height; half4 output[4]; + half4 variances[4]; output[0].x = convert_half((center_x0 - box_width[out_h]) / (float)img_width); output[1].x = convert_half((center_y - box_height[out_h]) / (float)img_height); output[2].x = convert_half((center_x0 + box_width[out_h]) / (float)img_width); output[3].x = convert_half((center_y + box_height[out_h]) / (float)img_height); + variances[0].x = convert_half(variances_Buffer[0]); + variances[1].x = convert_half(variances_Buffer[1]); + variances[2].x = convert_half(variances_Buffer[2]); + variances[3].x = convert_half(variances_Buffer[3]); if(C - 4 * out_c>=2){ output[0].y = convert_half((center_x1 - box_width[out_h]) / (float)img_width); output[1].y = convert_half((center_y - box_height[out_h]) / (float)img_height); output[2].y = convert_half((center_x1 + box_width[out_h]) / (float)img_width); output[3].y = convert_half((center_y + box_height[out_h]) / (float)img_height); + variances[0].y = convert_half(variances_Buffer[0]); + variances[1].y = convert_half(variances_Buffer[1]); + variances[2].y = convert_half(variances_Buffer[2]); + variances[3].y = convert_half(variances_Buffer[3]); }else{ output[0].y = 0.0f; output[1].y = 0.0f; @@ -65,6 +76,10 @@ __kernel void prior_box(__private const int global_size_dim0, output[1].z = convert_half((center_y - box_height[out_h]) / (float)img_height); output[2].z = convert_half((center_x2 + box_width[out_h]) / (float)img_width); output[3].z = convert_half((center_y + box_height[out_h]) / (float)img_height); + variances[0].z = convert_half(variances_Buffer[0]); + variances[1].z = convert_half(variances_Buffer[1]); + variances[2].z = convert_half(variances_Buffer[2]); + variances[3].z = convert_half(variances_Buffer[3]); }else{ output[0].z = 0.0f; output[1].z = 0.0f; @@ -76,6 +91,10 @@ __kernel void prior_box(__private const int global_size_dim0, output[1].w = convert_half((center_y - box_height[out_h]) / (float)img_height); output[2].w = convert_half((center_x3 + box_width[out_h]) / (float)img_width); output[3].w = convert_half((center_y + box_height[out_h]) / (float)img_height); + variances[0].w = convert_half(variances_Buffer[0]); + variances[1].w = convert_half(variances_Buffer[1]); + variances[2].w = convert_half(variances_Buffer[2]); + variances[3].w = convert_half(variances_Buffer[3]); }else{ output[0].w = 0.0f; output[1].w = 0.0f; @@ -88,10 +107,21 @@ __kernel void prior_box(__private const int global_size_dim0, output[2] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[2]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); output[3] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[3]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); } + if(output_pos.x == 0 && output_pos.y == 1){ + float4 out = (float4)(output[0].x, output[1].x, output[2].x, output[3].x); + printf("output = %v4hlf \n", out); + + } + + write_imageh(output_boxes, (int2)(output_pos.x + 0, output_pos.y), output[0]); + write_imageh(output_boxes, (int2)(output_pos.x + 1, output_pos.y), output[1]); + write_imageh(output_boxes, (int2)(output_pos.x + 2, output_pos.y), output[2]); + write_imageh(output_boxes, (int2)(output_pos.x + 3, output_pos.y), output[3]); + + write_imageh(output_variances, (int2)(output_pos.x + 0, output_pos.y), variances[0]); + write_imageh(output_variances, (int2)(output_pos.x + 1, output_pos.y), variances[1]); + write_imageh(output_variances, (int2)(output_pos.x + 2, output_pos.y), variances[2]); + write_imageh(output_variances, (int2)(output_pos.x + 3, output_pos.y), variances[3]); - write_imageh(output_image, (int2)(output_pos.x + 0, output_pos.y), output[0]); - write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output[1]); - write_imageh(output_image, (int2)(output_pos.x + 2, output_pos.y), output[2]); - write_imageh(output_image, (int2)(output_pos.x + 3, output_pos.y), output[3]); } \ No newline at end of file diff --git a/src/operators/kernel/cl/cl_kernel/reshape.cl b/src/operators/kernel/cl/cl_kernel/reshape.cl index 0ffc64f15c..7957001c96 100644 --- a/src/operators/kernel/cl/cl_kernel/reshape.cl +++ b/src/operators/kernel/cl/cl_kernel/reshape.cl @@ -14,26 +14,150 @@ limitations under the License. */ #pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void reshape(__read_only image2d_t input, - __write_only image2d_t output, - __private const int d0, - __private const int d1, - __private const int d2, - __private const int d3, - __private const int x0, - __private const int x1, - __private const int x2, - __private const int x3) { - const int x = get_global_id(0); - const int y = get_global_id(1); +__kernel void reshape(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H, + __private const int in_Stride0, + __private const int in_Stride1, + __private const int in_Stride2, + __private const int out_Stride0, + __private const int out_Stride1, + __private const int out_Stride2) { - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh/out_H; + const int out_h = out_nh%out_H; + const int out_c0 = out_c * 4; + const int out_c1 = out_c * 4 + 1; + const int out_c2 = out_c * 4+ 2; + const int out_c3 = out_c * 4+ 3; + + int count0 = out_n * out_Stride2 + out_c0 * out_Stride1 + out_h * out_Stride0 + out_w; + int count1 = out_n * out_Stride2 + out_c1 * out_Stride1 + out_h * out_Stride0 + out_w; + int count2 = out_n * out_Stride2 + out_c2 * out_Stride1 + out_h * out_Stride0 + out_w; + int count3 = out_n * out_Stride2 + out_c3 * out_Stride1 + out_h * out_Stride0 + out_w; + + int in_n0 = count0/in_Stride2; + int in_n1 = count1/in_Stride2; + int in_n2 = count1/in_Stride2; + int in_n3 = count2/in_Stride2; + + count0 = count0%in_Stride2; + count1 = count1%in_Stride2; + count2 = count2%in_Stride2; + count3 = count3%in_Stride2; + + int in_c0 = count0/in_Stride1; + int in_c1 = count1/in_Stride1; + int in_c2 = count2/in_Stride1; + int in_c3 = count3/in_Stride1; + + int in_h0 = (count0%in_Stride1)/in_Stride0; + int in_h1 = (count1%in_Stride1)/in_Stride0; + int in_h2 = (count2%in_Stride1)/in_Stride0; + int in_h3 = (count3%in_Stride1)/in_Stride0; + + int in_w0 = (count0%in_Stride1)%in_Stride0; + int in_w1 = (count1%in_Stride1)%in_Stride0; + int in_w2 = (count2%in_Stride1)%in_Stride0; + int in_w3 = (count3%in_Stride1)%in_Stride0; + + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + + input_pos0.x = (in_c0/4) * in_W + in_w0; + input_pos0.y = in_n0 * in_H + in_h0; + + input_pos1.x = (in_c1/4) * in_W + in_w1; + input_pos1.y = in_n1 * in_H + in_h1; + + input_pos2.x = (in_c2/4) * in_W + in_w2; + input_pos2.y = in_n2 * in_H + in_h2; + + input_pos3.x = (in_c3/4) * in_W + in_w3; + input_pos3.y = in_n3 * in_H + in_h3; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + half4 input0; + half4 input1; + half4 input2; + half4 input3; + half4 output; + + input0 = read_imageh(input_image, sampler,input_pos0); + if(in_c0%4==0){ + output.x = input0.x; + }else if(in_c0%4==1){ + output.x = input0.y; + }else if(in_c0%4==2){ + output.x = input0.z; + }else{ + output.x = input0.w; + } + if(out_C - out_c * 4>=2){ + input1 = read_imageh(input_image, sampler,input_pos1); + if(in_c1%4==0){ + output.y = input1.x; + }else if(in_c1%4==1){ + output.y = input1.y; + }else if(in_c1%4==2){ + output.y = input1.z; + }else{ + output.y = input1.w; + } + + }else{ + output.y = 0.0f; + } + + if(out_C - out_c * 4>=3){ + input2 = read_imageh(input_image, sampler,input_pos2); + + if(in_c2%4==0){ + output.z = input2.x; + }else if(in_c2%4==1){ + output.z = input1.y; + }else if(in_c2%4==2){ + output.z = input2.z; + }else{ + output.z = input2.w; + } + }else{ + output.z = 0.0f; + } - half4 in = read_imageh(input, sampler, (int2)(x, y)); + if(out_C - out_c * 4>=4){ + input3 = read_imageh(input_image, sampler,input_pos3); + if(in_c3%4==0){ + output.w = input3.x; + }else if(in_c3%4==1){ + output.w = input3.y; + }else if(in_c3%4==2){ + output.w = input3.z; + }else{ + output.w = input3.w; + } + }else{ + output.w = 0.0f; + } - write_imageh(output, (int2)(x, y), in); + write_imageh(output_image, output_pos, output); } diff --git a/src/operators/kernel/cl/cl_kernel/transpose_kernel.cl b/src/operators/kernel/cl/cl_kernel/transpose_kernel.cl new file mode 100644 index 0000000000..93d2148c83 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/transpose_kernel.cl @@ -0,0 +1,129 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +__kernel void transpose_4d( __read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W + ){ + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = 1; + const int out_h = out_nh%out_H; + const int out_c0 = out_c * 4; + const int out_c1 = out_c * 4 + 1; + const int out_c2 = out_c * 4+ 2; + const int out_c3 = out_c * 4+ 3; + + const int in_n = out_n; + const int in_c = out_w / 4; + const int in_h0 = out_c0; + const int in_h1 = out_c1; + const int in_h2 = out_c2; + const int in_h3 = out_c3; + const int in_w = out_h; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n * in_h0; + + input_pos1.x = in_W * in_c + in_w; + input_pos1.y = in_n * in_h1; + + input_pos2.x = in_W * in_c + in_w; + input_pos2.y = in_n * in_h2; + + input_pos3.x = in_W * in_c + in_w; + input_pos3.y = in_n * in_h3; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + half4 input0; + half4 input1; + half4 input2; + half4 input3; + half4 output; + input0 = read_imageh(input_image, sampler,input_pos0); + + if(out_w%4==0){ + output.x = input0.x; + }else if(out_w%4==1){ + output.x = input0.y; + }else if(out_w%4==2){ + output.x = input0.z; + }else{ + output.x = input0.w; + } + if(out_C - out_c * 4>=2){ + input1 = read_imageh(input_image, sampler,input_pos1); + if(out_w%4==0){ + output.y = input1.x; + }else if(out_w%4==1){ + output.y = input1.y; + }else if(out_w%4==2){ + output.y = input1.z; + }else{ + output.y = input1.w; + } + + }else{ + output.y = 0.0f; + } + + if(out_C - out_c * 4>=3){ + input2 = read_imageh(input_image, sampler,input_pos2); + + if(out_w%4==0){ + output.z = input2.x; + }else if(out_w%4==1){ + output.z = input1.y; + }else if(out_w%4==2){ + output.z = input2.z; + }else{ + output.z = input2.w; + } + }else{ + output.z = 0.0f; + } + + if(out_C - out_c * 4>=4){ + input3 = read_imageh(input_image, sampler,input_pos3); + if(out_w%4==0){ + output.w = input3.x; + }else if(out_w%4==1){ + output.w = input3.y; + }else if(out_w%4==2){ + output.w = input3.z; + }else{ + output.w = input3.w; + } + }else{ + output.w = 0.0f; + } + write_imageh(output_image, output_pos, output); +} \ No newline at end of file diff --git a/src/operators/kernel/cl/fetch_kernel.cpp b/src/operators/kernel/cl/fetch_kernel.cpp index 8ea0b3ad3d..ded90ff43f 100644 --- a/src/operators/kernel/cl/fetch_kernel.cpp +++ b/src/operators/kernel/cl/fetch_kernel.cpp @@ -22,11 +22,11 @@ namespace operators { template <> bool FetchKernel::Init(FetchParam *param) { - if (param->InputX()->dims().size() <= 2) { - this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl"); - } else { - this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); - } + // if (param->InputX()->dims().size() <= 2) { + // this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl"); + // } else { + this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); + // } return true; } @@ -49,11 +49,11 @@ void FetchKernel::Compute(const FetchParam ¶m) { C = new_dims[1]; in_height = new_dims[2]; - if (dim.size() <= 2) { - in_width = param.InputX()->ImageWidth(); - } else { - in_width = new_dims[3]; - } + // if (dim.size() <= 2) { + // in_width = param.InputX()->ImageWidth(); + // } else { + in_width = new_dims[3]; + // } CLTensor out_cl_tensor(this->cl_helper_.CLContext(), this->cl_helper_.CLCommandQueue()); @@ -64,16 +64,16 @@ void FetchKernel::Compute(const FetchParam ¶m) { clSetKernelArg(kernel, 1, sizeof(int), &in_width); clSetKernelArg(kernel, 2, sizeof(cl_mem), &input); clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer); - if (dim.size() > 2) { - int size_ch = in_height * in_width; - int size_block = size_ch * 4; - int size_batch = size_ch * C; - int out_c = new_dims[1]; - clSetKernelArg(kernel, 4, sizeof(int), &size_ch); - clSetKernelArg(kernel, 5, sizeof(int), &size_block); - clSetKernelArg(kernel, 6, sizeof(int), &size_batch); - clSetKernelArg(kernel, 7, sizeof(int), &out_c); - } + // if (dim.size() > 2) { + int size_ch = in_height * in_width; + int size_block = size_ch * 4; + int size_batch = size_ch * C; + int out_c = new_dims[1]; + clSetKernelArg(kernel, 4, sizeof(int), &size_ch); + clSetKernelArg(kernel, 5, sizeof(int), &size_block); + clSetKernelArg(kernel, 6, sizeof(int), &size_batch); + clSetKernelArg(kernel, 7, sizeof(int), &out_c); + // } // cl_event wait_event = param.InpdutX()->GetClEvent(); clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, @@ -93,8 +93,6 @@ void FetchKernel::Compute(const FetchParam ¶m) { // << "ms" << std::endl; memcpy(out->data(), out_cl_tensor.Data(), out->memory_size()); - DLOG << *param.InputX(); - DLOG << *out; } template class FetchKernel; diff --git a/src/operators/kernel/cl/prior_box_kernel.cpp b/src/operators/kernel/cl/prior_box_kernel.cpp index 7867cf5ab1..92764b379e 100644 --- a/src/operators/kernel/cl/prior_box_kernel.cpp +++ b/src/operators/kernel/cl/prior_box_kernel.cpp @@ -79,6 +79,8 @@ void PriorBoxKernel::Compute( paddle_mobile::memory::Alloc(sizeof(float) * num_priors)); float *box_height = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * num_priors)); + float *variancesptr = + static_cast(paddle_mobile::memory::Alloc(sizeof(float) * 4)); int idx = 0; for (size_t s = 0; s < min_sizes.size(); ++s) { auto min_size = min_sizes[s]; @@ -112,6 +114,9 @@ void PriorBoxKernel::Compute( } } } + for (int i = 0; i < variances.size(); i++) { + variancesptr[i] = variances[i]; + } cl_int status; auto kernel = this->cl_helper_.KernelAt(0); auto default_work_size = @@ -135,23 +140,33 @@ void PriorBoxKernel::Compute( cl_mem box_height_Buffer = box_height_cl_tensor.mutable_with_data(box_height); - DLOG << "c_block:" << c_block; - DLOG << "w:" << w; - DLOG << "nh:" << nh; - DLOG << "step_width:" << step_width; - DLOG << "step_height:" << step_height; - DLOG << "offset:" << offset; - DLOG << "img_width:" << img_width; - DLOG << "img_height:" << img_height; - DLOG << "num_priors:" << num_priors; - DLOG << "C:" << C; - DLOG << "isclip:" << isclip; - for (int i = 0; i < num_priors; i++) { - DLOG << box_width[i]; - } - for (int i = 0; i < num_priors; i++) { - DLOG << box_height[i]; - } + framework::CLTensor variances_cl_tensor(this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + + std::vector variances_shape({4}); + framework::DDim vddim = framework::make_ddim(variances_shape); + + variances_cl_tensor.Resize(vddim); + cl_mem variances_Buffer = + variances_cl_tensor.mutable_with_data(variancesptr); + + // DLOG << "c_block:" << c_block; + // DLOG << "w:" << w; + // DLOG << "nh:" << nh; + // DLOG << "step_width:" << step_width; + // DLOG << "step_height:" << step_height; + // DLOG << "offset:" << offset; + // DLOG << "img_width:" << img_width; + // DLOG << "img_height:" << img_height; + // DLOG << "num_priors:" << num_priors; + // DLOG << "C:" << C; + // DLOG << "isclip:" << isclip; + // printf("param.MinMaxAspectRatiosOrder() = + // %d\n",param.MinMaxAspectRatiosOrder()); for (int i = 0; i < + // num_priors; i++) { + // DLOG << box_width[i]; + // DLOG << box_height[i]; + // } status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); CL_CHECK_ERRORS(status); status = clSetKernelArg(kernel, 1, sizeof(int), &w); @@ -162,30 +177,36 @@ void PriorBoxKernel::Compute( CL_CHECK_ERRORS(status); status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &box_height_Buffer); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output_boxes); + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &variances_Buffer); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output_boxes); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(float), &step_width); + status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &output_variances); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(float), &step_height); + status = clSetKernelArg(kernel, 8, sizeof(float), &step_width); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 8, sizeof(float), &offset); + status = clSetKernelArg(kernel, 9, sizeof(float), &step_height); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 9, sizeof(int), &img_width); + status = clSetKernelArg(kernel, 10, sizeof(float), &offset); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 10, sizeof(int), &img_height); + status = clSetKernelArg(kernel, 11, sizeof(int), &img_width); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 11, sizeof(int), &num_priors); + status = clSetKernelArg(kernel, 12, sizeof(int), &img_height); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 12, sizeof(int), &C); + status = clSetKernelArg(kernel, 13, sizeof(int), &num_priors); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 13, sizeof(int), &isclip); + status = clSetKernelArg(kernel, 14, sizeof(int), &C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 15, sizeof(int), &isclip); CL_CHECK_ERRORS(status); size_t global_work_size[2] = {c_block, nh}; status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); + paddle_mobile::memory::Free(box_width); paddle_mobile::memory::Free(box_height); + paddle_mobile::memory::Free(variancesptr); } template class PriorBoxKernel; diff --git a/src/operators/kernel/cl/reshape_kernel.cpp b/src/operators/kernel/cl/reshape_kernel.cpp index fb3aa9b52f..4e8d3e1d60 100644 --- a/src/operators/kernel/cl/reshape_kernel.cpp +++ b/src/operators/kernel/cl/reshape_kernel.cpp @@ -26,40 +26,76 @@ bool ReshapeKernel::Init(ReshapeParam *param) { template <> void ReshapeKernel::Compute(const ReshapeParam ¶m) { auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); const auto *input = param.InputX(); auto *output = param.Out(); - auto inputImage = input->GetCLImage(); - auto outputImage = output->GetCLImage(); - clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); + auto input_image = input->GetCLImage(); + auto output_image = output->GetCLImage(); const auto &inputDim = input->dims(); const auto &outputDim = output->dims(); - int dims[4] = {1, 1, 1, 1}; - int odims[4] = {1, 1, 1, 1}; + int input_dims[4] = {1, 1, 1, 1}; + int output_dims[4] = {1, 1, 1, 1}; // 1 1000 1 1 for (int i = 0; i < inputDim.size(); i++) { - dims[4 - inputDim.size() + i] = inputDim[i]; + input_dims[4 - inputDim.size() + i] = inputDim[i]; } // 1 1 1 1000 for (int i = 0; i < outputDim.size(); i++) { - odims[4 - outputDim.size() + i] = outputDim[i]; + output_dims[4 - outputDim.size() + i] = outputDim[i]; } - clSetKernelArg(kernel, 2, sizeof(cl_int), &dims); - clSetKernelArg(kernel, 3, sizeof(cl_int), &dims[1]); - clSetKernelArg(kernel, 4, sizeof(cl_int), &dims[2]); - clSetKernelArg(kernel, 5, sizeof(cl_int), &dims[3]); - clSetKernelArg(kernel, 6, sizeof(cl_int), &odims); - clSetKernelArg(kernel, 7, sizeof(cl_int), &odims[1]); - clSetKernelArg(kernel, 8, sizeof(cl_int), &odims[1]); - clSetKernelArg(kernel, 9, sizeof(cl_int), &odims[1]); - const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()}; - // cl_event out_event = param.Out()->GetClEvent(); - // cl_event wait_event = param.InputX()->GetClEvent(); - - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, - work_size, NULL, 0, NULL, NULL); + int out_C = output_dims[1]; + int out_H = output_dims[2]; + int out_W = output_dims[3]; + int in_W = input_dims[3]; + int in_H = input_dims[2]; + int in_Stride0 = in_W; + int in_Stride1 = input_dims[2] * input_dims[3]; + int in_Stride2 = input_dims[1] * input_dims[2] * input_dims[3]; + int out_Stride0 = out_W; + int out_Stride1 = out_H * out_W; + int out_Stride2 = out_C * out_H * out_W; + DLOG << "out_C=" << out_C; + DLOG << "out_H=" << out_H; + DLOG << "out_W=" << out_W; + DLOG << "in_W=" << in_W; + DLOG << "default_work_size=" << default_work_size; + DLOG << "in_Stride0=" << in_Stride0; + DLOG << "in_Stride1=" << in_Stride1; + DLOG << "out_Stride0=" << out_Stride0; + DLOG << "out_Stride1=" << out_Stride1; + cl_int status; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &out_C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(int), &out_H); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(int), &out_W); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(int), &in_W); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(int), &in_H); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(int), &in_Stride0); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 8, sizeof(int), &in_Stride1); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 9, sizeof(int), &in_Stride2); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 10, sizeof(int), &out_Stride0); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 11, sizeof(int), &out_Stride1); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 12, sizeof(int), &out_Stride2); + CL_CHECK_ERRORS(status); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); } template class ReshapeKernel; diff --git a/src/operators/kernel/cl/transpose_kernel.cpp b/src/operators/kernel/cl/transpose_kernel.cpp index 3b41753f18..b5be025a15 100644 --- a/src/operators/kernel/cl/transpose_kernel.cpp +++ b/src/operators/kernel/cl/transpose_kernel.cpp @@ -20,12 +20,48 @@ namespace operators { template <> bool TransposeKernel::Init(TransposeParam *param) { + if (param->Out()->dims().size() == 4) { + this->cl_helper_.AddKernel("transpose_4d", "transpose_kernel.cl"); + } return true; } template <> void TransposeKernel::Compute( - const TransposeParam ¶m) {} + const TransposeParam ¶m) { + if (param.Out()->dims().size() == 4) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); + int out_C = param.Out()->dims()[1]; + int out_H = param.Out()->dims()[2]; + int out_W = param.Out()->dims()[3]; + int in_W = param.InputX()->dims()[3]; + auto output_image = param.Out()->GetCLImage(); + auto input_image = param.InputX()->GetCLImage(); + DLOG << "out_C=" << out_C; + DLOG << "out_H=" << out_H; + DLOG << "out_W=" << out_W; + DLOG << "in_C=" << in_W; + DLOG << "default_work_size=" << default_work_size; + cl_int status; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &out_C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(int), &out_H); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(int), &out_W); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(int), &in_W); + CL_CHECK_ERRORS(status); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), + NULL, default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } +} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/op_param.h b/src/operators/op_param.h index cc87d3106f..4d4878789e 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -849,6 +849,8 @@ class PriorBoxParam : public OpParam { if (HasAttr("min_max_aspect_ratios_order", attrs)) { min_max_aspect_ratios_order_ = GetAttr("min_max_aspect_ratios_order", attrs); + } else { + min_max_aspect_ratios_order_ = false; } flip_ = GetAttr("flip", attrs); clip_ = GetAttr("clip", attrs); -- GitLab