Commit 8b71275c authored by zhangyang

Merge remote-tracking branch 'upstream/develop' into develop

......@@ -61,9 +61,16 @@ class CLHelper {
auto work_size_2 = n * h;
return {work_size_0, work_size_1, work_size_2};
} else if (image_dim.size() == 2) {
auto h = image_dim[0];
auto w = image_dim[1];
return {1, image.ImageWidth(), image.ImageHeight()};
} else if (image_dim.size() == 1) {
return {1, image.ImageWidth(), 1};
} else if (image_dim.size() == 3) {
int c = image_dim[0];
int h = image_dim[1];
int w = image_dim[2];
return {(c + 3) / 4, w, h};
}
PADDLE_MOBILE_THROW_EXCEPTION(" not support this dim, need imp ");
}
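For reference, a minimal C++ sketch of the work-size rule the branches above implement. The 4-D branch is only partially visible in this hunk, so its reconstruction here (c_block derived from the packed image width) is an assumption; helper names are illustrative, not the real CLHelper API:

#include <array>
#include <cstdint>
#include <stdexcept>
#include <vector>

// Illustrative only: map an image dim to an OpenCL global work size.
std::array<size_t, 3> DefaultWorkSizeSketch(const std::vector<int64_t> &image_dim,
                                            size_t image_width, size_t image_height) {
  if (image_dim.size() == 4) {
    size_t n = image_dim[0], h = image_dim[2], w = image_dim[3];
    return {image_width / w, w, n * h};       // assumed: c_block, w, n*h
  } else if (image_dim.size() == 2) {
    return {1, image_width, image_height};    // whole image as one block
  } else if (image_dim.size() == 1) {
    return {1, image_width, 1};
  } else if (image_dim.size() == 3) {
    size_t c = image_dim[0], h = image_dim[1], w = image_dim[2];
    return {(c + 3) / 4, w, h};               // pack channels 4 per block
  }
  throw std::runtime_error("unsupported image dim rank");
}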
......
......@@ -120,17 +120,19 @@ class CLImage {
PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
" empty image tensor data shouldn't have value");
CLImageConverterFolder *folder_converter = new CLImageConverterFolder();
// CLImageConverterFolder *folder_converter = new
// CLImageConverterFolder();
CLImageConverterNormal *normal_converter = new CLImageConverterNormal();
DLOG << " to get image dims ";
image_dims_ = folder_converter->InitImageDimInfoWith(dim);
image_dims_ = normal_converter->InitImageDimInfoWith(dim);
DLOG << " end get image dims " << image_dims_;
InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
tensor_dims_ = dim;
command_queue_ = command_queue;
image_converter_ = folder_converter;
image_converter_ = normal_converter;
cl_event_ = CLEngine::Instance()->CreateEvent(context);
initialized_ = true;
DLOG << " end init cl image";
......
......@@ -389,5 +389,42 @@ void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
}
}
const DDim &CLImageConverterNormal::InitImageDimInfoWith(
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
width_of_one_block_ = W;
height_of_one_block_ = H;
c_block_ = width / W;
return make_ddim({width, height});
}
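A standalone sketch of the layout rule InitImageDimInfoWith encodes: the tensor dim is right-aligned into NCHW, channels are packed four to a half4 pixel along the image width, and the batch is stacked along the height, so a {1, 24, 16, 16} tensor maps to a 96 x 16 image (c_block = 6). Illustrative code only, not the framework API:

#include <array>
#include <cstddef>
#include <cstdint>
#include <vector>

std::array<size_t, 2> NormalImageDim(const std::vector<int64_t> &tensor_dim) {
  size_t d[4] = {1, 1, 1, 1};
  for (size_t j = 0; j < tensor_dim.size(); ++j) {
    d[4 - tensor_dim.size() + j] = static_cast<size_t>(tensor_dim[j]);
  }
  const size_t N = d[0], C = d[1], H = d[2], W = d[3];
  // width packs C in groups of 4, height stacks the batch
  return {W * ((C + 3) / 4), H * N};  // {1, 24, 16, 16} -> {96, 16}
}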
void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0,
"tensor dim is not support ");
CLImageConverterDefault default_converter;
default_converter.NCHWToImage(tensor, image, tensor_dim);
}
void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
CLImageConverterDefault default_converter;
default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim);
}
} // namespace framework
} // namespace paddle_mobile
......@@ -63,6 +63,31 @@ class CLImageConverterFolder : public CLImageConverterBase {
int height_of_one_block_;
};
class CLImageConverterNormal : public CLImageConverterBase {
public:
const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
/*
* width of original tensor
* */
inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
/*
* height of original tensor
* */
inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
int GetCBlock() const { return c_block_; }
private:
int c_block_;
int width_of_one_block_;
int height_of_one_block_;
};
class CLImageConverterNWBlock : public CLImageConverterBase {
const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
......
......@@ -138,19 +138,19 @@ __kernel void conv_3x3(__private const int global_size_dim0,
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
float4 weight_x = read_imagef(filter, sampler, pos_of_weight);
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
float4 weight_y = read_imagef(filter, sampler, pos_of_weight);
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
float4 weight_z = read_imagef(filter, sampler, pos_of_weight);
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
float4 weight_w = read_imagef(filter, sampler, pos_of_weight);
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
*/
......
......@@ -19,47 +19,52 @@ __kernel void prior_box(__private const int global_size_dim0,
__private const int global_size_dim2,
__global float *box_width,
__global float *box_height,
__write_only image2d_t output_image,
__global float *variances_Buffer,
__write_only image2d_t output_boxes,
__write_only image2d_t output_variances,
__private const float step_width,
__private const float step_height,
__private const float offset,
__private const int img_width,
__private const int img_height,
__private const int num_priors,
__private const int C){
__private const int C,
__private const int clip){
const int out_c = get_global_id(0);
const int out_nh = get_global_id(1);
const int out_n = out_nh/num_priors;
const int out_h = out_nh%num_priors;
if (out_c >= global_size_dim0 ||out_nh >= global_size_dim2) {
return;
}
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos;
output_pos.x = out_c * 4;
output_pos.y = out_nh;
float center_x0 = (offset + out_c * 4) * step_width;
float center_x1 = (offset + out_c * 4 + 1) * step_width;
float center_x2 = (offset + out_c * 4 + 2) * step_width;
float center_x3 = (offset + out_c * 4 + 3) * step_width;
float center_y = (out_n + offset) * step_height;
float center_x0 = (offset + (float)(out_c * 4)) * step_width;
float center_x1 = (offset + (float)(out_c * 4 + 1)) * step_width;
float center_x2 = (offset + (float)(out_c * 4 + 2)) * step_width;
float center_x3 = (offset + (float)(out_c * 4 + 3)) * step_width;
float center_y = ((float)out_n + offset) * step_height;
half4 output[4];
output[0].x = convert_half((center_x0 - box_width[out_h]) / img_width);
output[1].x = convert_half((center_y - box_height[out_h]) / img_height);
output[2].x = convert_half((center_x0 + box_width[out_h]) / img_width);
output[3].x = convert_half((center_y + box_height[out_h]) / img_height);
half4 variances[4];
output[0].x = convert_half((center_x0 - box_width[out_h]) / (float)img_width);
output[1].x = convert_half((center_y - box_height[out_h]) / (float)img_height);
output[2].x = convert_half((center_x0 + box_width[out_h]) / (float)img_width);
output[3].x = convert_half((center_y + box_height[out_h]) / (float)img_height);
variances[0].x = convert_half(variances_Buffer[0]);
variances[1].x = convert_half(variances_Buffer[1]);
variances[2].x = convert_half(variances_Buffer[2]);
variances[3].x = convert_half(variances_Buffer[3]);
if(C - 4 * out_c>=2){
output[0].y = convert_half((center_x1 - box_width[out_h]) / img_width);
output[1].y = convert_half((center_y - box_height[out_h]) / img_height);
output[2].y = convert_half((center_x1 + box_width[out_h]) / img_width);
output[3].y = convert_half((center_y + box_height[out_h]) / img_height);
output[0].y = convert_half((center_x1 - box_width[out_h]) / (float)img_width);
output[1].y = convert_half((center_y - box_height[out_h]) / (float)img_height);
output[2].y = convert_half((center_x1 + box_width[out_h]) / (float)img_width);
output[3].y = convert_half((center_y + box_height[out_h]) / (float)img_height);
variances[0].y = convert_half(variances_Buffer[0]);
variances[1].y = convert_half(variances_Buffer[1]);
variances[2].y = convert_half(variances_Buffer[2]);
variances[3].y = convert_half(variances_Buffer[3]);
}else{
output[0].y = 0.0f;
output[1].y = 0.0f;
......@@ -67,10 +72,14 @@ __kernel void prior_box(__private const int global_size_dim0,
output[3].y = 0.0f;
}
if(C - 4 * out_c>=3){
output[0].z = convert_half((center_x2 - box_width[out_h]) / img_width);
output[1].z = convert_half((center_y - box_height[out_h]) / img_height);
output[2].z = convert_half((center_x2 + box_width[out_h]) / img_width);
output[3].z = convert_half((center_y + box_height[out_h]) / img_height);
output[0].z = convert_half((center_x2 - box_width[out_h]) / (float)img_width);
output[1].z = convert_half((center_y - box_height[out_h]) / (float)img_height);
output[2].z = convert_half((center_x2 + box_width[out_h]) / (float)img_width);
output[3].z = convert_half((center_y + box_height[out_h]) / (float)img_height);
variances[0].z = convert_half(variances_Buffer[0]);
variances[1].z = convert_half(variances_Buffer[1]);
variances[2].z = convert_half(variances_Buffer[2]);
variances[3].z = convert_half(variances_Buffer[3]);
}else{
output[0].z = 0.0f;
output[1].z = 0.0f;
......@@ -78,23 +87,41 @@ __kernel void prior_box(__private const int global_size_dim0,
output[3].z = 0.0f;
}
if(C - 4 * out_c>=4){
output[0].w = convert_half((center_x3 - box_width[out_h]) / img_width);
output[1].w = convert_half((center_y - box_height[out_h]) / img_height);
output[2].w = convert_half((center_x3 + box_width[out_h]) / img_width);
output[3].w = convert_half((center_y + box_height[out_h]) / img_height);
output[0].w = convert_half((center_x3 - box_width[out_h]) / (float)img_width);
output[1].w = convert_half((center_y - box_height[out_h]) / (float)img_height);
output[2].w = convert_half((center_x3 + box_width[out_h]) / (float)img_width);
output[3].w = convert_half((center_y + box_height[out_h]) / (float)img_height);
variances[0].w = convert_half(variances_Buffer[0]);
variances[1].w = convert_half(variances_Buffer[1]);
variances[2].w = convert_half(variances_Buffer[2]);
variances[3].w = convert_half(variances_Buffer[3]);
}else{
output[0].z = 0.0f;
output[1].z = 0.0f;
output[2].z = 0.0f;
output[3].z = 0.0f;
output[0].w = 0.0f;
output[1].w = 0.0f;
output[2].w = 0.0f;
output[3].w = 0.0f;
}
if(clip==1){
output[0] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[0]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
output[1] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[1]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
output[2] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[2]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
output[3] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[3]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
}
output[0] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[0]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
output[1] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[1]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
output[2] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[2]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
output[3] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[3]),(half4)(1.0f, 1.0f, 1.0f, 1.0f));
write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output[0]);
write_imageh(output_image, (int2)(output_pos.x + 2, output_pos.y), output[1]);
write_imageh(output_image, (int2)(output_pos.x + 3, output_pos.y), output[2]);
write_imageh(output_image, (int2)(output_pos.x + 4, output_pos.y), output[3]);
if(output_pos.x == 0 && output_pos.y == 1){
float4 out = (float4)(output[0].x, output[1].x, output[2].x, output[3].x);
printf("output = %v4hlf \n", out);
}
write_imageh(output_boxes, (int2)(output_pos.x + 0, output_pos.y), output[0]);
write_imageh(output_boxes, (int2)(output_pos.x + 1, output_pos.y), output[1]);
write_imageh(output_boxes, (int2)(output_pos.x + 2, output_pos.y), output[2]);
write_imageh(output_boxes, (int2)(output_pos.x + 3, output_pos.y), output[3]);
write_imageh(output_variances, (int2)(output_pos.x + 0, output_pos.y), variances[0]);
write_imageh(output_variances, (int2)(output_pos.x + 1, output_pos.y), variances[1]);
write_imageh(output_variances, (int2)(output_pos.x + 2, output_pos.y), variances[2]);
write_imageh(output_variances, (int2)(output_pos.x + 3, output_pos.y), variances[3]);
}
\ No newline at end of file
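The kernel above now also takes a variances buffer, a separate output_variances image, and a clip flag, and clamping to [0, 1] happens only when clip == 1. A CPU sketch of what one lane of one work item computes for a single prior (centers come from (offset + column) * step; illustrative code, not the framework API):

#include <algorithm>
#include <array>

std::array<float, 4> PriorBoxRef(float center_x, float center_y,
                                 float box_w, float box_h,
                                 int img_w, int img_h, bool clip) {
  std::array<float, 4> box = {
      (center_x - box_w) / img_w, (center_y - box_h) / img_h,   // xmin, ymin
      (center_x + box_w) / img_w, (center_y + box_h) / img_h};  // xmax, ymax
  if (clip) {
    for (float &v : box) v = std::min(std::max(v, 0.0f), 1.0f);
  }
  return box;
}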
......@@ -14,26 +14,150 @@ limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void reshape(__read_only image2d_t input,
__write_only image2d_t output,
__private const int d0,
__private const int d1,
__private const int d2,
__private const int d3,
__private const int x0,
__private const int x1,
__private const int x2,
__private const int x3) {
const int x = get_global_id(0);
const int y = get_global_id(1);
__kernel void reshape(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H,
__private const int out_W,
__private const int in_W,
__private const int in_H,
__private const int in_Stride0,
__private const int in_Stride1,
__private const int in_Stride2,
__private const int out_Stride0,
__private const int out_Stride1,
__private const int out_Stride2) {
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = out_nh/out_H;
const int out_h = out_nh%out_H;
const int out_c0 = out_c * 4;
const int out_c1 = out_c * 4 + 1;
const int out_c2 = out_c * 4+ 2;
const int out_c3 = out_c * 4+ 3;
int count0 = out_n * out_Stride2 + out_c0 * out_Stride1 + out_h * out_Stride0 + out_w;
int count1 = out_n * out_Stride2 + out_c1 * out_Stride1 + out_h * out_Stride0 + out_w;
int count2 = out_n * out_Stride2 + out_c2 * out_Stride1 + out_h * out_Stride0 + out_w;
int count3 = out_n * out_Stride2 + out_c3 * out_Stride1 + out_h * out_Stride0 + out_w;
int in_n0 = count0/in_Stride2;
int in_n1 = count1/in_Stride2;
int in_n2 = count2/in_Stride2;
int in_n3 = count3/in_Stride2;
count0 = count0%in_Stride2;
count1 = count1%in_Stride2;
count2 = count2%in_Stride2;
count3 = count3%in_Stride2;
int in_c0 = count0/in_Stride1;
int in_c1 = count1/in_Stride1;
int in_c2 = count2/in_Stride1;
int in_c3 = count3/in_Stride1;
int in_h0 = (count0%in_Stride1)/in_Stride0;
int in_h1 = (count1%in_Stride1)/in_Stride0;
int in_h2 = (count2%in_Stride1)/in_Stride0;
int in_h3 = (count3%in_Stride1)/in_Stride0;
int in_w0 = (count0%in_Stride1)%in_Stride0;
int in_w1 = (count1%in_Stride1)%in_Stride0;
int in_w2 = (count2%in_Stride1)%in_Stride0;
int in_w3 = (count3%in_Stride1)%in_Stride0;
int2 input_pos0;
int2 input_pos1;
int2 input_pos2;
int2 input_pos3;
input_pos0.x = (in_c0/4) * in_W + in_w0;
input_pos0.y = in_n0 * in_H + in_h0;
input_pos1.x = (in_c1/4) * in_W + in_w1;
input_pos1.y = in_n1 * in_H + in_h1;
input_pos2.x = (in_c2/4) * in_W + in_w2;
input_pos2.y = in_n2 * in_H + in_h2;
input_pos3.x = (in_c3/4) * in_W + in_w3;
input_pos3.y = in_n3 * in_H + in_h3;
int2 output_pos;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 input0;
half4 input1;
half4 input2;
half4 input3;
half4 output;
input0 = read_imageh(input_image, sampler,input_pos0);
if(in_c0%4==0){
output.x = input0.x;
}else if(in_c0%4==1){
output.x = input0.y;
}else if(in_c0%4==2){
output.x = input0.z;
}else{
output.x = input0.w;
}
if(out_C - out_c * 4>=2){
input1 = read_imageh(input_image, sampler,input_pos1);
if(in_c1%4==0){
output.y = input1.x;
}else if(in_c1%4==1){
output.y = input1.y;
}else if(in_c1%4==2){
output.y = input1.z;
}else{
output.y = input1.w;
}
}else{
output.y = 0.0f;
}
if(out_C - out_c * 4>=3){
input2 = read_imageh(input_image, sampler,input_pos2);
if(in_c2%4==0){
output.z = input2.x;
}else if(in_c2%4==1){
output.z = input2.y;
}else if(in_c2%4==2){
output.z = input2.z;
}else{
output.z = input2.w;
}
}else{
output.z = 0.0f;
}
half4 in = read_imageh(input, sampler, (int2)(x, y));
if(out_C - out_c * 4>=4){
input3 = read_imageh(input_image, sampler,input_pos3);
if(in_c3%4==0){
output.w = input3.x;
}else if(in_c3%4==1){
output.w = input3.y;
}else if(in_c3%4==2){
output.w = input3.z;
}else{
output.w = input3.w;
}
}else{
output.w = 0.0f;
}
write_imageh(output, (int2)(x, y), in);
write_imageh(output_image, output_pos, output);
}
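The reshape kernel above keeps the linear NCHW offset fixed and only changes the shape: each output coordinate is flattened with the output strides, then unflattened with the input strides to locate the source pixel. A CPU sketch of that index math (strides follow the host code: stride0 = W, stride1 = H*W, stride2 = C*H*W; illustrative only):

#include <array>

std::array<int, 4> ReshapeSourceCoord(int out_n, int out_c, int out_h, int out_w,
                                      int out_s0, int out_s1, int out_s2,
                                      int in_s0, int in_s1, int in_s2) {
  int offset = out_n * out_s2 + out_c * out_s1 + out_h * out_s0 + out_w;
  int in_n = offset / in_s2;
  offset %= in_s2;
  int in_c = offset / in_s1;
  int in_h = (offset % in_s1) / in_s0;
  int in_w = (offset % in_s1) % in_s0;
  return {in_n, in_c, in_h, in_w};  // source NCHW coordinate
}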
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void transpose_4d( __read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H,
__private const int out_W,
__private const int in_W
){
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = 1;
const int out_h = out_nh%out_H;
const int out_c0 = out_c * 4;
const int out_c1 = out_c * 4 + 1;
const int out_c2 = out_c * 4+ 2;
const int out_c3 = out_c * 4+ 3;
const int in_n = out_n;
const int in_c = out_w / 4;
const int in_h0 = out_c0;
const int in_h1 = out_c1;
const int in_h2 = out_c2;
const int in_h3 = out_c3;
const int in_w = out_h;
int2 output_pos;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;
int2 input_pos0;
int2 input_pos1;
int2 input_pos2;
int2 input_pos3;
input_pos0.x = in_W * in_c + in_w;
input_pos0.y = in_n * in_h0;
input_pos1.x = in_W * in_c + in_w;
input_pos1.y = in_n * in_h1;
input_pos2.x = in_W * in_c + in_w;
input_pos2.y = in_n * in_h2;
input_pos3.x = in_W * in_c + in_w;
input_pos3.y = in_n * in_h3;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 input0;
half4 input1;
half4 input2;
half4 input3;
half4 output;
input0 = read_imageh(input_image, sampler,input_pos0);
if(out_w%4==0){
output.x = input0.x;
}else if(out_w%4==1){
output.x = input0.y;
}else if(out_w%4==2){
output.x = input0.z;
}else{
output.x = input0.w;
}
if(out_C - out_c * 4>=2){
input1 = read_imageh(input_image, sampler,input_pos1);
if(out_w%4==0){
output.y = input1.x;
}else if(out_w%4==1){
output.y = input1.y;
}else if(out_w%4==2){
output.y = input1.z;
}else{
output.y = input1.w;
}
}else{
output.y = 0.0f;
}
if(out_C - out_c * 4>=3){
input2 = read_imageh(input_image, sampler,input_pos2);
if(out_w%4==0){
output.z = input2.x;
}else if(out_w%4==1){
output.z = input2.y;
}else if(out_w%4==2){
output.z = input2.z;
}else{
output.z = input2.w;
}
}else{
output.z = 0.0f;
}
if(out_C - out_c * 4>=4){
input3 = read_imageh(input_image, sampler,input_pos3);
if(out_w%4==0){
output.w = input3.x;
}else if(out_w%4==1){
output.w = input3.y;
}else if(out_w%4==2){
output.w = input3.z;
}else{
output.w = input3.w;
}
}else{
output.w = 0.0f;
}
write_imageh(output_image, output_pos, output);
}
\ No newline at end of file
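The coordinate mapping the transpose_4d kernel encodes is output(n, c, h, w) = input(n, w, c, h) with the batch hard-coded to 1, which appears to correspond to a perm {0, 2, 3, 1} transpose. A CPU sketch over plain NCHW buffers (illustrative only; the kernel itself works on the packed half4 image layout):

#include <vector>

void Transpose4DRef(const std::vector<float> &in, std::vector<float> *out,
                    int out_C, int out_H, int out_W) {
  // input logical shape here is (1, out_W, out_C, out_H)
  out->assign(static_cast<size_t>(out_C) * out_H * out_W, 0.0f);
  for (int c = 0; c < out_C; ++c)
    for (int h = 0; h < out_H; ++h)
      for (int w = 0; w < out_W; ++w) {
        int in_idx = (w * out_C + c) * out_H + h;   // input (0, w, c, h)
        int out_idx = (c * out_H + h) * out_W + w;  // output (0, c, h, w)
        (*out)[out_idx] = in[in_idx];
      }
}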
......@@ -22,11 +22,11 @@ namespace operators {
template <>
bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) {
if (param->InputX()->dims().size() <= 2) {
this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl");
} else {
this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
}
// if (param->InputX()->dims().size() <= 2) {
// this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl");
// } else {
this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
// }
return true;
}
......@@ -49,11 +49,11 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
C = new_dims[1];
in_height = new_dims[2];
if (dim.size() <= 2) {
in_width = param.InputX()->ImageWidth();
} else {
in_width = new_dims[3];
}
// if (dim.size() <= 2) {
// in_width = param.InputX()->ImageWidth();
// } else {
in_width = new_dims[3];
// }
CLTensor out_cl_tensor(this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
......@@ -64,16 +64,16 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
clSetKernelArg(kernel, 1, sizeof(int), &in_width);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer);
if (dim.size() > 2) {
int size_ch = in_height * in_width;
int size_block = size_ch * 4;
int size_batch = size_ch * C;
int out_c = new_dims[1];
clSetKernelArg(kernel, 4, sizeof(int), &size_ch);
clSetKernelArg(kernel, 5, sizeof(int), &size_block);
clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
clSetKernelArg(kernel, 7, sizeof(int), &out_c);
}
// if (dim.size() > 2) {
int size_ch = in_height * in_width;
int size_block = size_ch * 4;
int size_batch = size_ch * C;
int out_c = new_dims[1];
clSetKernelArg(kernel, 4, sizeof(int), &size_ch);
clSetKernelArg(kernel, 5, sizeof(int), &size_block);
clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
clSetKernelArg(kernel, 7, sizeof(int), &out_c);
// }
// cl_event wait_event = param.InpdutX()->GetClEvent();
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
......@@ -93,8 +93,6 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
// << "ms" << std::endl;
memcpy(out->data<float>(), out_cl_tensor.Data<float>(), out->memory_size());
DLOG << *param.InputX();
DLOG << *out;
}
template class FetchKernel<GPU_CL, float>;
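With the 2-D special case commented out, the fetch kernel is always given the general buffer-size arguments: size_ch = H*W elements per channel plane, size_block = 4*H*W per packed half4 block, and size_batch = C*H*W per batch item; for a 1 x 1000 x 1 x 1 output that is {1, 4, 1000}. A small sketch of that computation (illustrative only):

struct FetchSizes {
  int size_ch;     // elements per channel plane: H * W
  int size_block;  // elements per packed half4 block: 4 * H * W
  int size_batch;  // elements per batch item: C * H * W
};

FetchSizes ComputeFetchSizes(int C, int H, int W) {
  int size_ch = H * W;
  return {size_ch, size_ch * 4, size_ch * C};  // e.g. {1, 4, 1000}
}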
......
......@@ -39,6 +39,10 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
const auto &input_aspect_ratio = param.AspectRatios();
const bool &flip = param.Flip();
const bool &clip = param.Clip();
int isclip = 0;
if (clip) {
isclip = 1;
}
const float &step_w = param.StepW();
const float &step_h = param.StepH();
const float &offset = param.Offset();
......@@ -75,6 +79,8 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
paddle_mobile::memory::Alloc(sizeof(float) * num_priors));
float *box_height = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * num_priors));
float *variancesptr =
static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * 4));
int idx = 0;
for (size_t s = 0; s < min_sizes.size(); ++s) {
auto min_size = min_sizes[s];
......@@ -108,6 +114,9 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
}
}
}
for (int i = 0; i < variances.size(); i++) {
variancesptr[i] = variances[i];
}
cl_int status;
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size =
......@@ -116,7 +125,7 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
int w = default_work_size[1];
int nh = default_work_size[2];
std::vector<int64_t> box_shape({1, 1, 1, num_priors});
std::vector<int64_t> box_shape({num_priors});
framework::DDim ddim = framework::make_ddim(box_shape);
framework::CLTensor box_width_cl_tensor(this->cl_helper_.CLContext(),
......@@ -131,16 +140,33 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
cl_mem box_height_Buffer =
box_height_cl_tensor.mutable_with_data<float>(box_height);
DLOG << "c_block:" << c_block;
DLOG << "w:" << w;
DLOG << "nh:" << nh;
DLOG << "step_width:" << step_width;
DLOG << "step_height:" << step_height;
DLOG << "offset:" << offset;
DLOG << "img_width:" << img_width;
DLOG << "img_height:" << img_height;
DLOG << "num_priors:" << num_priors;
DLOG << "C:" << C;
framework::CLTensor variances_cl_tensor(this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
std::vector<int64_t> variances_shape({4});
framework::DDim vddim = framework::make_ddim(variances_shape);
variances_cl_tensor.Resize(vddim);
cl_mem variances_Buffer =
variances_cl_tensor.mutable_with_data<float>(variancesptr);
// DLOG << "c_block:" << c_block;
// DLOG << "w:" << w;
// DLOG << "nh:" << nh;
// DLOG << "step_width:" << step_width;
// DLOG << "step_height:" << step_height;
// DLOG << "offset:" << offset;
// DLOG << "img_width:" << img_width;
// DLOG << "img_height:" << img_height;
// DLOG << "num_priors:" << num_priors;
// DLOG << "C:" << C;
// DLOG << "isclip:" << isclip;
// printf("param.MinMaxAspectRatiosOrder() =
// %d\n",param.MinMaxAspectRatiosOrder()); for (int i = 0; i <
// num_priors; i++) {
// DLOG << box_width[i];
// DLOG << box_height[i];
// }
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
......@@ -151,28 +177,36 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &box_height_Buffer);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output_boxes);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &variances_Buffer);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(float), &step_width);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output_boxes);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(float), &step_height);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &output_variances);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(float), &offset);
status = clSetKernelArg(kernel, 8, sizeof(float), &step_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &img_width);
status = clSetKernelArg(kernel, 9, sizeof(float), &step_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &img_height);
status = clSetKernelArg(kernel, 10, sizeof(float), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &num_priors);
status = clSetKernelArg(kernel, 11, sizeof(int), &img_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &C);
status = clSetKernelArg(kernel, 12, sizeof(int), &img_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 13, sizeof(int), &num_priors);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 14, sizeof(int), &C);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 15, sizeof(int), &isclip);
CL_CHECK_ERRORS(status);
size_t global_work_size[2] = {c_block, nh};
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
paddle_mobile::memory::Free(box_width);
paddle_mobile::memory::Free(box_height);
paddle_mobile::memory::Free(variancesptr);
}
template class PriorBoxKernel<GPU_CL, float>;
......
......@@ -26,40 +26,76 @@ bool ReshapeKernel<GPU_CL, float>::Init(ReshapeParam<GPU_CL> *param) {
template <>
void ReshapeKernel<GPU_CL, float>::Compute(const ReshapeParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
const auto *input = param.InputX();
auto *output = param.Out();
auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage();
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
auto input_image = input->GetCLImage();
auto output_image = output->GetCLImage();
const auto &inputDim = input->dims();
const auto &outputDim = output->dims();
int dims[4] = {1, 1, 1, 1};
int odims[4] = {1, 1, 1, 1};
int input_dims[4] = {1, 1, 1, 1};
int output_dims[4] = {1, 1, 1, 1};
// 1 1000 1 1
for (int i = 0; i < inputDim.size(); i++) {
dims[4 - inputDim.size() + i] = inputDim[i];
input_dims[4 - inputDim.size() + i] = inputDim[i];
}
// 1 1 1 1000
for (int i = 0; i < outputDim.size(); i++) {
odims[4 - outputDim.size() + i] = outputDim[i];
output_dims[4 - outputDim.size() + i] = outputDim[i];
}
clSetKernelArg(kernel, 2, sizeof(cl_int), &dims);
clSetKernelArg(kernel, 3, sizeof(cl_int), &dims[1]);
clSetKernelArg(kernel, 4, sizeof(cl_int), &dims[2]);
clSetKernelArg(kernel, 5, sizeof(cl_int), &dims[3]);
clSetKernelArg(kernel, 6, sizeof(cl_int), &odims);
clSetKernelArg(kernel, 7, sizeof(cl_int), &odims[1]);
clSetKernelArg(kernel, 8, sizeof(cl_int), &odims[1]);
clSetKernelArg(kernel, 9, sizeof(cl_int), &odims[1]);
const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()};
// cl_event out_event = param.Out()->GetClEvent();
// cl_event wait_event = param.InputX()->GetClEvent();
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
work_size, NULL, 0, NULL, NULL);
int out_C = output_dims[1];
int out_H = output_dims[2];
int out_W = output_dims[3];
int in_W = input_dims[3];
int in_H = input_dims[2];
int in_Stride0 = in_W;
int in_Stride1 = input_dims[2] * input_dims[3];
int in_Stride2 = input_dims[1] * input_dims[2] * input_dims[3];
int out_Stride0 = out_W;
int out_Stride1 = out_H * out_W;
int out_Stride2 = out_C * out_H * out_W;
DLOG << "out_C=" << out_C;
DLOG << "out_H=" << out_H;
DLOG << "out_W=" << out_W;
DLOG << "in_W=" << in_W;
DLOG << "default_work_size=" << default_work_size;
DLOG << "in_Stride0=" << in_Stride0;
DLOG << "in_Stride1=" << in_Stride1;
DLOG << "out_Stride0=" << out_Stride0;
DLOG << "out_Stride1=" << out_Stride1;
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &out_C);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(int), &out_H);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(int), &out_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(int), &in_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(int), &in_H);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(int), &in_Stride0);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(int), &in_Stride1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &in_Stride2);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &out_Stride0);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &out_Stride1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &out_Stride2);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class ReshapeKernel<GPU_CL, float>;
......
......@@ -20,12 +20,48 @@ namespace operators {
template <>
bool TransposeKernel<GPU_CL, float>::Init(TransposeParam<GPU_CL> *param) {
if (param->Out()->dims().size() == 4) {
this->cl_helper_.AddKernel("transpose_4d", "transpose_kernel.cl");
}
return true;
}
template <>
void TransposeKernel<GPU_CL, float>::Compute(
const TransposeParam<GPU_CL> &param) {}
const TransposeParam<GPU_CL> &param) {
if (param.Out()->dims().size() == 4) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
int out_C = param.Out()->dims()[1];
int out_H = param.Out()->dims()[2];
int out_W = param.Out()->dims()[3];
int in_W = param.InputX()->dims()[3];
auto output_image = param.Out()->GetCLImage();
auto input_image = param.InputX()->GetCLImage();
DLOG << "out_C=" << out_C;
DLOG << "out_H=" << out_H;
DLOG << "out_W=" << out_W;
DLOG << "in_C=" << in_W;
DLOG << "default_work_size=" << default_work_size;
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &out_C);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(int), &out_H);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(int), &out_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(int), &in_W);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
}
} // namespace operators
} // namespace paddle_mobile
......
......@@ -849,6 +849,8 @@ class PriorBoxParam : public OpParam {
if (HasAttr("min_max_aspect_ratios_order", attrs)) {
min_max_aspect_ratios_order_ =
GetAttr<bool>("min_max_aspect_ratios_order", attrs);
} else {
min_max_aspect_ratios_order_ = false;
}
flip_ = GetAttr<bool>("flip", attrs);
clip_ = GetAttr<bool>("clip", attrs);
......
......@@ -366,5 +366,8 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-eng net/test_eng.cpp test_helper.h test_include.h)
target_link_libraries(test-eng paddle-mobile)
# gen test
ADD_EXECUTABLE(test-super net/test_super.cpp test_helper.h test_include.h)
target_link_libraries(test-super paddle-mobile)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif ()
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../../src/common/types.h"
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile;
// paddle_mobile.SetThreadNum(4);
auto time1 = paddle_mobile::time();
#ifdef PADDLE_MOBILE_CL
paddle_mobile.SetCLPath("/data/local/tmp/bin");
#endif
auto isok = paddle_mobile.Load(std::string(g_super) + "/model",
std::string(g_super) + "/params", true, false,
1, true);
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_mul), true);
if (isok) {
auto time2 = paddle_mobile::time();
std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
<< std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 1, 300, 300};
GetInput<float>(g_yolo_img, &input, dims);
std::vector<float> vec_result;
auto time3 = paddle_mobile::time();
int max = 10;
for (int i = 0; i < max; ++i) {
vec_result = paddle_mobile.Predict(input, dims);
}
auto time4 = paddle_mobile::time();
std::cout << "predict cost :"
<< paddle_mobile::time_diff(time3, time4) / max << "ms"
<< std::endl;
std::vector<float>::iterator biggest =
std::max_element(std::begin(vec_result), std::end(vec_result));
std::cout << " Max element is " << *biggest << " at position "
<< std::distance(std::begin(vec_result), biggest) << std::endl;
}
std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
"是否存在?"
<< std::endl;
return 0;
}
......@@ -36,16 +36,19 @@ static const char *g_squeezenet = "../models/squeezenet";
static const char *g_googlenet = "../models/googlenet";
static const char *g_googlenet_quali = "../models/googlenet_combine_quali";
static const char *g_mobilenet = "../models/mobilenet";
static const char *g_mobilenet_mul = "../models/mobilenet_mul";
static const char *g_mobilenet_mul = "../models/r";
static const char *g_alexnet = "../models/alexnet";
static const char *g_inceptionv4 = "../models/inceptionv4";
static const char *g_inceptionv3 =
"../models/InceptionV3_Spatial_Attention_Model";
static const char *g_nlp = "../models/nlp";
static const char *g_super = "../models/superresoltion";
static const char *g_resnet_50 = "../models/resnet_50";
static const char *g_resnet = "../models/resnet";
static const char *g_googlenet_combine = "../models/googlenet_combine";
static const char *g_yolo = "../models/yolo";
static const char *g_yolo_combined = "../models/yolo_combined";
static const char *g_yolo_mul = "../models/yolo_mul";
static const char *g_yolo_mul = "../models/d";
static const char *g_fluid_fssd_new = "../models/fluid_fssd_new";
static const char *g_test_image_1x3x224x224 =
"../images/test_image_1x3x224x224_float";
......