diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..8d7950d6b897df833ada56e2de5be7c6203de9ea
--- /dev/null
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl
@@ -0,0 +1,428 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cl_common.h>
+
+__kernel void conv2d_3x3(__private const int global_size_dim0,
+                         __private const int global_size_dim1,
+                         __private const int global_size_dim2,
+                         __read_only image2d_t input_image,
+                         __read_only image2d_t filter,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+                         __read_only image2d_t bias,
+#endif
+                         __write_only image2d_t output_image,
+                         __private const int stride,
+                         __private const int offset,
+                         __private const int input_c,
+                         __private const int dilation,
+                         __private const int input_width,  /* of one block */
+                         __private const int input_height, /* of one block */
+                         __private const int output_width,
+                         __private const int output_height,
+                         __private const int output_c,
+                         __private const int filter_channel,
+                         __private const int filter_width,
+                         __private const int filter_height,
+                         __private const int group) {
+
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+
+  if (out_c >= global_size_dim0 ||
+      out_w >= global_size_dim1 ||
+      out_nh >= global_size_dim2) {
+    return;
+  }
+
+
+  int2 stride_xy;
+  stride_xy.x = stride;
+  stride_xy.y = stride;
+
+  int2 ouput_pos_in_one_block;
+  ouput_pos_in_one_block.x = out_w;
+  ouput_pos_in_one_block.y = out_nh;
+
+  int2 in_pos_in_one_block;
+  in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+  in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+
+#ifdef BIASE_CH
+  CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
+#elif defined(BIASE_ELE)
+  CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
+#else
+  CL_DTYPE4 output = 0.0f;
+#endif
+
+  CL_DTYPE4 input[9];  // 3x3 region of input
+  if (group == 1) {
+    for (int i = 0; i < input_c; ++i) {  // each run for 3x3
+      int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+
+      input[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+                                      (int2)(pos_in.x - dilation, pos_in.y - dilation)),
+                        (CL_DTYPE4)(0.0f),
+                        (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
+
+      input[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+                                      (int2)(pos_in.x, pos_in.y - dilation)),
+                        (CL_DTYPE4)(0.0f),
+                        (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + + input[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + + input[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + input[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x, pos_in.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + input[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + input[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + + input[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x, pos_in.y + dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + + input[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + + int j = 0; + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + CL_DTYPE4 weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y += 3; + CL_DTYPE4 weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y += 3; + CL_DTYPE4 weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y += 3; + CL_DTYPE4 weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 1; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, 
filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 2; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 3; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 4; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 5; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 6; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = 
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 7; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 8; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + } + } else { // group != 1 + for (int i = 0; i < 4; i++) { + int used_input_channel_num = + (out_c * 4 + i) / (output_c / group) * filter_channel; + for (int f_c = 0; f_c < filter_channel; ++f_c) { + int input_c = used_input_channel_num + f_c; + int input_block = input_c / 4; + int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, + in_pos_in_one_block.y); + input[0] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[1] = + select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[2] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[3] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + input[4] = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(pos_in.x, pos_in.y)), + 
            (CL_DTYPE4)(0.0f),
+            (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
+                       in_pos_in_one_block.x >= input_width ||
+                       in_pos_in_one_block.y >= input_height)
+                      << 15));
+        input[5] =
+            select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+                                 (int2)(pos_in.x + dilation, pos_in.y)),
+                   (CL_DTYPE4)(0.0f),
+                   (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                              in_pos_in_one_block.y < 0 ||
+                              in_pos_in_one_block.x + dilation >= input_width ||
+                              in_pos_in_one_block.y >= input_height)
+                             << 15));
+        input[6] = select(
+            READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+                          (int2)(pos_in.x - dilation, pos_in.y + dilation)),
+            (CL_DTYPE4)(0.0f),
+            (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                       in_pos_in_one_block.y + dilation < 0 ||
+                       in_pos_in_one_block.x - dilation >= input_width ||
+                       in_pos_in_one_block.y + dilation >= input_height)
+                      << 15));
+        input[7] =
+            select(READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+                                 (int2)(pos_in.x, pos_in.y + dilation)),
+                   (CL_DTYPE4)(0.0f),
+                   (ushort4)((in_pos_in_one_block.x < 0 ||
+                              in_pos_in_one_block.y + dilation < 0 ||
+                              in_pos_in_one_block.x >= input_width ||
+                              in_pos_in_one_block.y + dilation >= input_height)
+                             << 15));
+        input[8] = select(
+            READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,
+                          (int2)(pos_in.x + dilation, pos_in.y + dilation)),
+            (CL_DTYPE4)(0.0f),
+            (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                       in_pos_in_one_block.y + dilation < 0 ||
+                       in_pos_in_one_block.x + dilation >= input_width ||
+                       in_pos_in_one_block.y + dilation >= input_height)
+                      << 15));
+
+        CL_DTYPE tmp_out = 0;
+        for (int j = 0; j < 9; j++) {
+          int2 pos_of_weight;
+          pos_of_weight.x = (f_c / 4) * 3 + j % 3;
+          pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3;
+          CL_DTYPE4 weight = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, pos_of_weight);
+
+          int f_c_offset = f_c % 4;
+          CL_DTYPE f_value;
+          if (f_c_offset == 0) {
+            f_value = weight.x;
+          } else if (f_c_offset == 1) {
+            f_value = weight.y;
+          } else if (f_c_offset == 2) {
+            f_value = weight.z;
+          } else if (f_c_offset == 3) {
+            f_value = weight.w;
+          }
+
+          int input_c_offset = input_c % 4;
+          CL_DTYPE input_value;
+          if (input_c_offset == 0) {
+            input_value = input[j].x;
+          } else if (input_c_offset == 1) {
+            input_value = input[j].y;
+          } else if (input_c_offset == 2) {
+            input_value = input[j].z;
+          } else if (input_c_offset == 3) {
+            input_value = input[j].w;
+          }
+          tmp_out += f_value * input_value;
+        }
+
+        if (i == 0) {
+          output.x += tmp_out;
+        } else if (i == 1) {
+          output.y += tmp_out;
+        } else if (i == 2) {
+          output.z += tmp_out;
+        } else if (i == 3) {
+          output.w += tmp_out;
+        }
+      }
+    }
+  }
+
+  output = activation_type4(output);
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
+}
diff --git a/lite/kernels/opencl/conv_compute.cc b/lite/kernels/opencl/conv_compute.cc
index 0cc256478a80f17ce2efe15b8e43adc38a789921..c3d3e2a6c27f794268ef42ac97ab492ddd4e9de1 100644
--- a/lite/kernels/opencl/conv_compute.cc
+++ b/lite/kernels/opencl/conv_compute.cc
@@ -362,6 +362,20 @@ void ConvImageCompute::PrepareForRun() {
         filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
     impl_ = &ConvImageCompute::Conv2d1x1;
+  } else if (kernel_h == 3 && kernel_w == 3) {
+    // conv2d_3x3
+    kernel_func_names_.push_back("conv2d_3x3");
+    kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
+
+    CLImageConverterFolder converter;
+    const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
+    std::vector filter_image_v(filter_image_dims[0] *
+                               filter_image_dims[1] * 4);  // 4 : RGBA
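Note: the folder-layout filter image built here is the same image the conv2d_3x3 kernel above reads through pos_of_weight. Below is a minimal standalone sketch of the coordinate mapping implied by that indexing and by the test's filter_image_width/filter_image_height formulas; the helper and struct names are illustrative, not Paddle-Lite API.

```cpp
#include <cstdio>

// Where weight W[oc][ic][kh][kw] is assumed to land in the folder-layout
// filter image (4 consecutive input channels share one RGBA texel):
//   image width  = kw * ceil(filter_channel / 4)
//   image height = oc * kh
struct FilterTexel { int x; int y; int rgba; };

FilterTexel FolderFilterPos(int oc, int ic, int kh_idx, int kw_idx,
                            int kh = 3, int kw = 3) {
  FilterTexel t;
  t.x = (ic / 4) * kw + kw_idx;  // input-channel block picks a kw-wide column group
  t.y = oc * kh + kh_idx;        // one kh-row band per output channel
  t.rgba = ic % 4;               // component within the RGBA texel
  return t;
}

int main() {
  // e.g. oc = 1, ic = 5, tap (kh, kw) = (2, 0) -> texel (3, 5), component .y
  FilterTexel t = FolderFilterPos(1, 5, 2, 0);
  printf("x=%d y=%d rgba=%d\n", t.x, t.y, t.rgba);
  return 0;
}
```

Under this mapping, stepping pos_of_weight.y by 3 (as the kernel does) moves to the weights of the next output channel within the current out_c block.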
converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims); + filter_gpu_image_.mutable_data( + filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); + + impl_ = &ConvImageCompute::Conv2d3x3; } else if (kernel_h == 5 && kernel_w == 5) { // conv2d_5x5 kernel_func_names_.push_back("conv2d_5x5"); @@ -582,6 +596,184 @@ void ConvImageCompute::Conv2d1x1() { CL_CHECK_FATAL(status); context.cl_wait_list()->emplace(out_image, event_); } + +void ConvImageCompute::Conv2d3x3() { + const auto& param = *param_.get_mutable(); + auto input_dims = param.x->dims(); + auto paddings = *param.paddings; + auto strides = param.strides; + + auto* input_image = param.x->data(); + auto* filter_image = filter_gpu_image_.data(); + auto filter_dims = param.filter->dims(); + auto output_dims = param.output->dims(); + + int input_width = input_dims[3]; + int input_height = input_dims[2]; + int input_channel = input_dims[1]; + int output_width = output_dims[3]; + int output_height = output_dims[2]; + int output_channel = output_dims[1]; + int filter_width = filter_dims[3]; + int filter_height = filter_dims[2]; + int filter_channel = filter_dims[1]; + auto out_image_shape = InitImageDimInfoWith(output_dims); + auto* out_image = param.output->mutable_data( + out_image_shape["width"], out_image_shape["height"]); + + const bool has_bias = param.bias != nullptr; + const bool is_element_wise_bias = + has_bias && param.output->dims() == param.bias->dims(); + int offset = static_cast(param.filter->dims()[2]) / 2 - + static_cast(paddings[0]); + + // calc input_c_block + auto input_image_shape = InitImageDimInfoWith(input_dims); + int input_c_block = input_image_shape["width"] / input_dims[3]; + int input_c = input_dims[1]; + auto dilations = *param.dilations; + + // re-calc group + int new_groups{param.groups}; + if (filter_dims[0] == output_dims[1] && filter_dims[1] == input_dims[1]) { + new_groups = 1; + } else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) { + new_groups = input_channel / filter_channel; + } + /* TODO(ysh329): mobile has no case below + else { + LOG(FATAL) << "Not support conv3x3 case with" + << " input_dims:" << input_dims << " output_dims:" << + output_dims + << " filter_dims:" << filter_dims; + } + */ + + const std::vector& default_work_size = + DefaultWorkSize(output_dims, + DDim(std::vector{ + static_cast(out_image_shape["width"]), + static_cast(out_image_shape["height"])})); + + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + + VLOG(4) << "============ conv2d params ============"; + VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," + << input_image_shape["height"]; + VLOG(4) << "input_c_block: " << input_c_block; + VLOG(4) << "input_c: " << input_c; + VLOG(4) << "input_image: " << input_image; + VLOG(4) << "input_dims: " << input_dims; + VLOG(4) << "filter_dims: " << filter_dims; + VLOG(4) << "filter_image: " << filter_image; + VLOG(4) << "output_dims: " << output_dims; + VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " + << out_image_shape["height"]; + VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; + VLOG(4) << "has bias: " << has_bias; + VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; + VLOG(4) << "strides: " << strides[0] << "," << strides[1]; + VLOG(4) << "offset: " << offset; + VLOG(4) << "dilations.size : " << dilations.size(); + VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + VLOG(4) << "param.groups(groups):" << 
param.groups; + VLOG(4) << "new_groups:" << new_groups; + VLOG(4) << "default work size{c_block, w, nh}: " + << "{" << c_block << ", " << w << ", " << nh << "" + << "}"; + + CHECK_GE(dilations.size(), 2); + CHECK(dilations[0] == dilations[1]); + CHECK_GE(input_dims.size(), 4); + CHECK_GE(paddings.size(), 2); + CHECK(paddings[0] == paddings[1]); + CHECK_GE(strides.size(), 2); + CHECK(strides[0] == strides[1]); + + const cl::Image2D* bias_image = nullptr; + if (has_bias) { + bias_image = bias_gpu_image_.data(); + } + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + STL::stringstream kernel_key; + kernel_key << kernel_func_names_[0] << build_options_[0]; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + VLOG(4) << "kernel_key: " << kernel_key.str(); + VLOG(4) << "kernel ready ... " << kernel_key.str(); + VLOG(4) << "w: " << w; + + cl_int status; + int arg_idx = 0; + status = kernel.setArg(arg_idx, c_block); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, w); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, nh); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *input_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *filter_image); + CL_CHECK_FATAL(status); + if (has_bias) { + VLOG(4) << "set bias_image: "; + status = kernel.setArg(++arg_idx, *bias_image); + CL_CHECK_FATAL(status); + } + status = kernel.setArg(++arg_idx, *out_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, strides[0]); + CL_CHECK_FATAL(status); + + status = kernel.setArg(++arg_idx, offset); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_c_block); + CL_CHECK_FATAL(status); + + status = kernel.setArg(++arg_idx, dilations[0]); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_height); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, output_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, output_height); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, output_channel); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, filter_channel); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, filter_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, filter_height); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, new_groups); + CL_CHECK_FATAL(status); + + auto global_work_size = + cl::NDRange{static_cast(default_work_size.data()[0]), + static_cast(default_work_size.data()[1]), + static_cast(default_work_size.data()[2])}; + + VLOG(4) << "out_image: " << out_image; + VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," + << global_work_size[1] << "," << global_work_size[2] << "}"; + + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(out_image, event_); +} + void ConvImageCompute::Conv2d5x5() { const auto& param = *param_.get_mutable(); auto input_dims = param.x->dims(); @@ -726,6 +918,7 @@ void ConvImageCompute::Conv2d5x5() { CL_CHECK_FATAL(status); context.cl_wait_list()->emplace(out_image, event_); } + void ConvImageCompute::Conv2d7x7() { const auto& param = *param_.get_mutable(); auto input_dims = param.x->dims(); diff --git a/lite/kernels/opencl/conv_compute.h b/lite/kernels/opencl/conv_compute.h index 
5b98767af0a740ce4a0adbc671000a36a156240e..d5dd65cdc855ebc25624e8316866a5944a2418b8 100644 --- a/lite/kernels/opencl/conv_compute.h +++ b/lite/kernels/opencl/conv_compute.h @@ -71,6 +71,7 @@ class ConvImageCompute : public KernelLite 1) { + filter_channel = 1; + } + + const int oh = + ConvOutputSize(ih, ksize, dilation, pad, pad, stride); + const int ow = + ConvOutputSize(iw, ksize, dilation, pad, pad, stride); + SHADOW_LOG << "to get kernel ..."; + auto kernels = + KernelRegistry::Global().Create("conv2d", + TARGET(kOpenCL), + PRECISION(kFloat), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + CHECK(batch_size == 1) << "conv3x3 only supprt batch_size == 1"; + + auto kernel = std::move(kernels.front()); + SHADOW_LOG << "created conv2d kernel"; + + SHADOW_LOG << "prepare kernel ------"; + + lite::Tensor input, filter, bias, output; + operators::ConvParam param; + param.x = &input; + param.filter = &filter; + param.output = &output; + param.groups = group; + if (bias_flag) { + param.bias = &bias; + } + if (relu_flag == "relu") { + param.fuse_relu = true; + } else if (relu_flag == "None") { + param.fuse_relu = false; + } else if (relu_flag == "relu6") { + param.activation_param.Relu_clipped_coef = 6.f; + param.activation_param.has_active = true; + param.activation_param.active_type = + lite_api::ActivationType::kRelu6; + } + + std::vector paddings = {pad, pad, pad, pad}; + std::vector dilations = {dilation, dilation}; + + param.paddings = std::make_shared>(paddings); + param.dilations = std::make_shared>(dilations); + param.strides = std::vector{stride, stride}; + + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + std::unique_ptr conv_1x1_context( + new KernelContext); + context->As().CopySharedTo( + &(conv_1x1_context->As())); + kernel->SetContext(std::move(conv_1x1_context)); + + const DDim& input_dim = + lite::DDim{std::vector({batch_size, ic, ih, iw})}; + + const DDim& filter_dim = lite::DDim{ + std::vector({oc, filter_channel, ksize, ksize})}; + const DDim& out_dim = + lite::DDim{std::vector({batch_size, oc, oh, ow})}; + // element wise bias + const DDim& bias_dim = lite::DDim{std::vector({oc})}; + + LOG(INFO) << "input_dim:" << input_dim + << " filter_dim:" << filter_dim + << " out_dim:" << out_dim; + + param.x->Resize(input_dim); + param.filter->Resize(filter_dim); + param.output->Resize(out_dim); + if (bias_flag) { + param.bias->Resize(bias_dim); + } + + kernel->SetParam(param); + + size_t input_image_width = iw * ((ic + 3) / 4); + size_t input_image_height = ih * batch_size; + + size_t out_image_width = ow * ((oc + 3) / 4); + size_t out_image_height = oh * batch_size; + + size_t bias_image_width = ow * ((oc + 3) / 4); + size_t bias_image_height = oh * batch_size; + + size_t filter_image_width = ksize * ((filter_channel + 3) / 4); + size_t filter_image_height = oc * ksize; + + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + + std::default_random_engine engine; + std::uniform_real_distribution gen(-5, 5); + + std::vector input_v(batch_size * ic * ih * iw); + std::vector filter_v(oc * filter_channel * ksize * ksize); + std::vector output_v(batch_size * oc * oh * ow); + std::vector bias_v(oc); + + SHADOW_LOG << "gen input and filter ..."; + for (int i = 0; i < input_v.size(); ++i) { + input_v[i] = i; // gen(engine); + } + for (int i = 0; i < filter_v.size(); ++i) { + filter_v[i] = 1; // gen(engine); + } + + SHADOW_LOG << "after gen input and filter ..."; + SHADOW_LOG << "input_v.size(): " << input_v.size(); + 
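The oh/ow values used above to size output_v come from ConvOutputSize. A minimal sketch of the standard output-size arithmetic it is assumed to implement (the standalone function name is only for illustration):

```cpp
#include <cassert>

// Usual convolution output-size formula, matching the argument order of the
// ConvOutputSize(in, ksize, dilation, pad, pad, stride) call in the test.
int ConvOutSize(int in, int ksize, int dilation, int pad0, int pad1, int stride) {
  const int dkernel = dilation * (ksize - 1) + 1;  // effective (dilated) kernel extent
  return (in + pad0 + pad1 - dkernel) / stride + 1;
}

int main() {
  // 3x3, pad=1, stride=1, dilation=1 keeps the spatial size: (8 + 2 - 3) / 1 + 1 == 8
  assert(ConvOutSize(8, 3, 1, 1, 1, 1) == 8);
  // stride=2 roughly halves it: (8 + 2 - 3) / 2 + 1 == 4
  assert(ConvOutSize(8, 3, 1, 1, 1, 2) == 4);
  return 0;
}
```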
SHADOW_LOG << "filter_v.size(): " << filter_v.size(); + SHADOW_LOG << "output_v.size(): " << output_v.size(); + SHADOW_LOG << "bias_v.size(): " << bias_v.size(); + SHADOW_LOG << "input_dim.production(): " + << input_dim.production(); + SHADOW_LOG << "filter_dim.production(): " + << filter_dim.production(); + SHADOW_LOG << "out_dim.production(): " << out_dim.production(); + SHADOW_LOG << "bias_dim.production(): " << bias_dim.production(); + SHADOW_LOG << "input_image_height:" << input_image_height + << " input_image_width:" << input_image_width; + SHADOW_LOG << "filter_image_height:" << filter_image_height + << " filter_image_width:" << filter_image_width; + SHADOW_LOG << "4 * input_image_height *input_image_width: " + << 4 * input_image_height * input_image_width; + SHADOW_LOG << "4 * filter_image_width * filter_image_height: " + << 4 * filter_image_width * filter_image_height; + + CHECK(input_dim.production() == input_v.size()); + CHECK_LE(input_dim.production(), + 4 * input_image_height * input_image_width); + CHECK(filter_dim.production() == filter_v.size()); + CHECK_LE(filter_dim.production(), + 4 * filter_image_width * filter_image_height); + + paddle::lite::CLImageConverterDefault default_convertor; + SHADOW_LOG << "set mapped input ..."; + std::vector x_image_v(input_image_width * + input_image_height * 4); // 4 :RGBA + std::vector filter_image_v( + filter_image_width * filter_image_height * 4); // 4 : RGBA + std::vector bias_image_v( + bias_image_width * bias_image_height * 4); // 4 : RGBA + std::vector out_image_v(out_image_width * + out_image_height * 4); // 4 :RGBA + + default_convertor.NCHWToImage( + input_v.data(), x_image_v.data(), input_dim); + SHADOW_LOG << "输入: ---- "; + for (int i = 0; i < input_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << input_v[i]; + } + SHADOW_LOG << "输入image : ---- "; + for (int i = 0; i < x_image_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << x_image_v[i]; + } + SHADOW_LOG << "set mapped filter ..."; + CLImageConverterFolder folder_convertor; + + folder_convertor.NCHWToImage( + filter_v.data(), filter_image_v.data(), filter_dim); + SHADOW_LOG << "卷积核: ---- "; + for (int i = 0; i < filter_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << filter_v[i]; + } + SHADOW_LOG << "卷积核image: ---- "; + for (int i = 0; i < filter_image_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << filter_image_v[i]; + } + auto* input_image2d = input.mutable_data( + input_image_width, input_image_height, x_image_v.data()); + // assign filter as target arm + filter.Assign(filter_v.data(), + filter_dim); + // filter kernel + // auto* filter_image2d = filter.mutable_data( + // filter_image_width, + // filter_image_height, + // filter_image_v.data()); + + if (bias_flag) { + for (int i = 0; i < bias_dim.production(); ++i) { + bias_v[i] = static_cast(gen(engine)); + } + bias.Assign(bias_v.data(), + bias_dim); + // CLImageConverterFolder folder_convertor; + // folder_convertor.NCHWToImage( + // bias_v.data(), bias_image_v.data(), + // bias_dim); + // + // auto* bias_data = bias.mutable_data( + // bias_image_width, bias_image_height, + // bias_image_v.data()); + } + + SHADOW_LOG << "resize output ..."; + output.Resize(out_dim); + + // cpu conv basic calc + lite::Tensor out_ref; + out_ref.Resize(out_dim); + + SHADOW_LOG << "prepare kernel ready"; + + SHADOW_LOG << "kernel launch ..."; + kernel->Launch(); + SHADOW_LOG << "mutable output ..."; + auto* output_image2d = output.mutable_data( + out_image_width, out_image_height); + + auto* wait_list = 
context->As().cl_wait_list(); + auto* out_ptr = param.output->data(); + auto it = wait_list->find(out_ptr); + + if (it != wait_list->end()) { + SHADOW_LOG << "--- Find the sync event for the target cl " + "tensor. ---"; + auto& event = *(it->second); + event.wait(); + } else { + LOG(FATAL) << "Could not find the sync event for the target " + "cl tensor."; + } + + TargetWrapperCL::ImgcpySync(out_image_v.data(), + output.data(), + out_image_width, + out_image_height, + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + + DDim out_image_shape = + default_convertor.InitImageDimInfoWith(output.dims()); + + default_convertor.ImageToNCHW(out_image_v.data(), + output_v.data(), + out_image_shape, + output.dims()); + + SHADOW_LOG << "输出: ---- "; + for (int i = 0; i < output_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << output_v[i]; + } + + SHADOW_LOG << "输出image: ---- "; + for (int i = 0; i < out_image_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << out_image_v[i]; + } + SHADOW_LOG << "mutable_data out_ref_data: "; + + // run cpu ref + auto* out_ref_data = out_ref.mutable_data(TARGET(kARM)); + + SHADOW_LOG << " conv_basic beigin ..... "; + + conv_basic(input_v.data(), + out_ref_data, + batch_size, + oc, + oh, + ow, + ic, + ih, + iw, + filter_v.data(), + bias_v.data(), // mapped_bias, + group, + ksize, + ksize, + stride, + stride, + dilation, + dilation, + pad, + pad, + bias_flag, + relu_flag); + SHADOW_LOG << " conv_basic end ..... "; + + SHADOW_LOG << " out_dim: " << out_dim; + const DDim& out_image_dims = lite::DDim{std::vector( + {static_cast(out_image_width), + static_cast(out_image_height)})}; + +#ifdef PRINT_RESULT + for (int i = 0; i < out_dim.production(); i++) { + VLOG(4) << "output_v[" << i << "]:" << output_v[i] + << " out_ref_data[" << i << "]:" << out_ref_data[i]; + } +#endif + + for (int i = 0; i < out_dim.production(); i++) { + EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2); + if (abs(output_v[i] - out_ref_data[i]) > 1e-2) { + LOG(FATAL) << "error idx:" << i; + } + } + +#ifdef LOOP_TEST + } + } + } + } + } + } +#else +// nothing to do. +#endif +} +#undef LOOP_TEST +#undef PRINT_RESULT + // #define PRINT_RESULT // #define LOOP_TEST TEST(conv2d, compute_image2d_5x5) {
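Both this 3x3 test and the 5x5 test below rely on the default image2d layout when converting between NCHW buffers and images (NCHWToImage / ImageToNCHW). A minimal sketch of that layout, inferred from the kernel's pos_in indexing and the tests' image-size formulas (width = W * ceil(C / 4), height = H * N); the helper names are illustrative:

```cpp
#include <cstdio>

// Where element X[n][c][h][w] of an NCHW tensor is assumed to sit in the
// default image2d layout; four consecutive channels share one RGBA texel.
struct Texel { int x; int y; int rgba; };

Texel DefaultImagePos(int n, int c, int h, int w, int H, int W) {
  Texel t;
  t.x = (c / 4) * W + w;  // channel block selects a W-wide column group
  t.y = n * H + h;        // one H-row band per batch element
  t.rgba = c % 4;         // component within the RGBA texel
  return t;
}

int main() {
  // e.g. n=0, c=6, h=2, w=3 with H=4, W=5 -> texel (8, 2), component .z
  Texel t = DefaultImagePos(0, 6, 2, 3, 4, 5);
  printf("x=%d y=%d rgba=%d\n", t.x, t.y, t.rgba);
  return 0;
}
```

The kernel's pos_in.x = i * input_width + in_pos_in_one_block.x is this same mapping with i as the channel block index.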