From 8cfd96f2e0d53921f57785a6d79f41bf7974d127 Mon Sep 17 00:00:00 2001 From: xiebaiyuan Date: Sat, 8 Feb 2020 10:46:47 +0800 Subject: [PATCH] =?UTF-8?q?[LITE][OPENCL][Image]develop=201x1/5x5/7x7=20ro?= =?UTF-8?q?uting=20in=20conv=5Fcompute=20,tes=E2=80=A6=20(#2818)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [LITE][OPENCL][Image]develop 1x1/5x5/7x7 routing in conv_compute ,test=develop * [LITE][OPENCL][Image]develop 1x1/5x5/7x7 routing in conv_compute ,convert bias filter in prepare for run ,test=develop --- .../cl_kernel/image/conv2d_1x1_kernel.cl | 4 +- .../cl_kernel/image/conv2d_5x5_kernel.cl | 169 +++ .../cl_kernel/image/conv2d_7x7_kernel.cl | 134 ++ lite/kernels/opencl/CMakeLists.txt | 6 +- lite/kernels/opencl/conv_compute.cc | 599 ++++++++- lite/kernels/opencl/conv_compute.h | 25 + .../opencl/conv_image2d_compute_test.cc | 1115 +++++++++++++++++ 7 files changed, 2048 insertions(+), 4 deletions(-) create mode 100644 lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl create mode 100644 lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl create mode 100644 lite/kernels/opencl/conv_image2d_compute_test.cc diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl index 2b037080b7..37e03e802c 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl @@ -360,12 +360,12 @@ __read_only image2d_t new_scale, READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); #endif -#ifdef RELU + output0 = activation_type4(output0); output1 = activation_type4(output1); output2 = activation_type4(output2); output3 = activation_type4(output3); -#endif + if (out_w0 < old_w) { WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl 
b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl new file mode 100644 index 0000000000..d856af6a1d --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl @@ -0,0 +1,169 @@ +#include + +__kernel void conv2d_5x5(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + + if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } + + const int batch_index = out_nh / output_height; + const int out_nh_in_one_batch = out_nh % output_height; + + const int filter_n0 = 4 * out_c + 0; + const int filter_n1 = 4 * out_c + 1; + const int filter_n2 = 4 * out_c + 2; + const int filter_n3 = 4 * out_c + 3; + + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh_in_one_batch; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + +#ifdef 
BIASE_CH + CL_DTYPE4 output = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); +#elif defined(BIASE_ELE) + CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); +#else + CL_DTYPE4 output = 0.0f; +#endif + + CL_DTYPE4 input; + CL_DTYPE4 filter[4]; + int2 filter_pos0; + int2 filter_pos1; + int2 filter_pos2; + int2 filter_pos3; + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, + in_pos_in_one_block.y + batch_index * input_height); + for (int j = 0; j < 5; j++) { + for (int k = 0; k < 5; k++) { + input = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x + (j - 2) * dilation, + pos_in.y + (k - 2) * dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)( + (in_pos_in_one_block.x + (j - 2) * dilation < 0 || + in_pos_in_one_block.y + (k - 2) * dilation < 0 || + in_pos_in_one_block.x + (j - 2) * dilation >= input_width || + in_pos_in_one_block.y + (k - 2) * dilation >= input_height) + << 15)); + int filter_h = k; + int filter_w = j; + int filter_c = i; + + filter_pos0.x = filter_c * 5 + filter_w; + filter_pos0.y = filter_n0 * 5 + filter_h; + + filter_pos1.x = filter_c * 5 + filter_w; + filter_pos1.y = filter_n1 * 5 + filter_h; + + filter_pos2.x = filter_c * 5 + filter_w; + filter_pos2.y = filter_n2 * 5 + filter_h; + + filter_pos3.x = filter_c * 5 + filter_w; + filter_pos3.y = filter_n3 * 5 + filter_h; + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos0); + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos1); + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos2); + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos3); + + output.x += dot(input, filter[0]); + output.y += dot(input, filter[1]); + output.z += dot(input, filter[2]); + output.w += dot(input, filter[3]); + // + // if (output_pos.x == 0 && output_pos.y == 5) { + // printf("i,j,k ={ %d, %d , %d 
}\n", i,j,k); + // printf("in={ %f , %f , %f , %f } \n", + // convert_float(input.x), + // convert_float(input.y), + // convert_float(input.z), + // convert_float(input.w)); + // printf("filter0={ %f , %f , %f , %f } \n", + // convert_float(filter[0].x), + // convert_float(filter[0].y), + // convert_float(filter[0].z), + // convert_float(filter[0].w)); + // printf("filter1={ %f , %f , %f , %f } \n", + // convert_float(filter[1].x), + // convert_float(filter[1].y), + // convert_float(filter[1].z), + // convert_float(filter[1].w)); + // printf("filter2={ %f , %f , %f , %f } \n", + // convert_float(filter[2].x), + // convert_float(filter[2].y), + // convert_float(filter[2].z), + // convert_float(filter[2].w)); + // printf("filter3={ %f , %f , %f , %f } \n", + // convert_float(filter[3].x), + // convert_float(filter[3].y), + // convert_float(filter[3].z), + // convert_float(filter[3].w)); + // printf("output={ %f , %f , %f , %f } \n", + // convert_float(output.x), + // convert_float(output.y), + // convert_float(output.z), + // convert_float(output.w)); + // } + } + } + } + +#ifdef BATCH_NORM + output = + output * READ_IMG_TYPE( + CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); +#endif + + output = activation_type4(output); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); + } diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl new file mode 100644 index 0000000000..1f99322812 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl @@ -0,0 +1,134 @@ +#include + +__kernel void conv2d_7x7(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM 
+ __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + + if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } + + const int batch_index = out_nh / output_height; + const int out_nh_in_one_batch = out_nh % output_height; + + const int filter_n0 = 4 * out_c + 0; + const int filter_n1 = 4 * out_c + 1; + const int filter_n2 = 4 * out_c + 2; + const int filter_n3 = 4 * out_c + 3; + + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh_in_one_batch; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + +#ifdef BIASE_CH + CL_DTYPE4 output = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); +#elif defined(BIASE_ELE) + CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); +#else + CL_DTYPE4 output = 0.0f; +#endif + + CL_DTYPE4 input; + CL_DTYPE4 filter[4]; + int2 filter_pos0; + int2 filter_pos1; + int2 filter_pos2; + int2 filter_pos3; + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, + in_pos_in_one_block.y + batch_index * input_height); + for (int j = 0; j < 7; j++) {
+ for (int k = 0; k < 7; k++) { + input = select( + READ_IMG_TYPE(CL_DTYPE_CHAR, + input_image, + sampler, + (int2)(pos_in.x + (j - 3) * dilation, + pos_in.y + (k - 3) * dilation)), + (CL_DTYPE4)(0.0f), + (ushort4)( + (in_pos_in_one_block.x + (j - 3) * dilation < 0 || + in_pos_in_one_block.y + (k - 3) * dilation < 0 || + in_pos_in_one_block.x + (j - 3) * dilation >= input_width || + in_pos_in_one_block.y + (k - 3) * dilation >= input_height) + << 15)); + int filter_h = k; + int filter_w = j; + int filter_c = i; + + filter_pos0.x = filter_c * 7 + filter_w; + filter_pos0.y = filter_n0 * 7 + filter_h; + + filter_pos1.x = filter_c * 7 + filter_w; + filter_pos1.y = filter_n1 * 7 + filter_h; + + filter_pos2.x = filter_c * 7 + filter_w; + filter_pos2.y = filter_n2 * 7 + filter_h; + + filter_pos3.x = filter_c * 7 + filter_w; + filter_pos3.y = filter_n3 * 7 + filter_h; + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos0); + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos1); + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos2); + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos3); + + output.x += dot(input, filter[0]); + output.y += dot(input, filter[1]); + output.z += dot(input, filter[2]); + output.w += dot(input, filter[3]); + } + } + } + +#ifdef BATCH_NORM + output = output * READ_IMG_TYPE( + CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); +#endif + + output = activation_type4(output); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index d8efd496e7..ab0fabf7af 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -17,7 +17,7 @@ add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps}) 
add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps}) #add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps}) add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps} cl_image_converter) add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps}) lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc @@ -70,6 +70,10 @@ lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc DEPS conv_opencl op_registry program context ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) +lite_cc_test(test_conv_image2d_opencl SRCS conv_image2d_compute_test.cc + DEPS conv_opencl op_registry program context cl_image_converter + ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) + lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc DEPS layout_opencl op_registry program context ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) diff --git a/lite/kernels/opencl/conv_compute.cc b/lite/kernels/opencl/conv_compute.cc index 6bd61d660f..0cc256478a 100644 --- a/lite/kernels/opencl/conv_compute.cc +++ b/lite/kernels/opencl/conv_compute.cc @@ -13,9 +13,13 @@ // limitations under the License. 
#include "lite/kernels/opencl/conv_compute.h" + #include + +#include "lite/backends/opencl/cl_image_converter.h" #include "lite/backends/opencl/cl_include.h" #include "lite/core/op_registry.h" +#include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" namespace paddle { @@ -242,7 +246,6 @@ void ConvCompute::Conv2d1x1() { GemmBatched(kernel, x_d, filter_d, bias_d, output_d, batch_size, m, n, k); } - // a: filter_d ==> <=> // b: x_d ==> <=> // c: output_d ==> <=> @@ -294,6 +297,582 @@ void ConvCompute::GemmBatched(cl::Kernel& kernel, void ConvCompute::Run() { (this->*impl_)(); } +/* image kernel*/ +void ConvImageCompute::PrepareForRun() { + const auto& param = this->Param(); + auto x_dims = param.x->dims(); + auto filter_dims = param.filter->dims(); + auto output_dims = param.output->dims(); + + float* filter_cpu = param.filter->mutable_data(); + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + + int bs = x_dims[0]; + int c_in = x_dims[1]; + int h_out = output_dims[2]; + int w_out = output_dims[3]; + int kernel_h = filter_dims[2]; // oihw + int kernel_w = filter_dims[3]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int stride_h = param.strides[0]; + int stride_w = param.strides[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; + int groups = param.groups; + bool relu_fused = param.fuse_relu; + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); + bool zero_pad = (pad_h == 0) && (pad_w == 0); + + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[1] == paddings[2]) && + (paddings[2] == paddings[3])); + bool stride_equal = stride_h == stride_w; + bool dilation_equal = dilations[0] == dilations[1]; + + CHECK(pad_equal && stride_equal && dilation_equal); + + VLOG(3) << "Is relu fused? / " << (relu_fused ? 
"Yes" : "No"); + VLOG(3) << "groups:" << groups << " stride_h:" << stride_h + << " stride_w:" << stride_w << " pad_h:" << pad_h + << " pad_w:" << pad_w << " kernel_h:" << kernel_h + << " kernel_h:" << kernel_h; + VLOG(3) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2] + << " " << x_dims[3]; + VLOG(3) << "output_dims:" << output_dims[0] << " " << output_dims[1] << " " + << output_dims[2] << " " << output_dims[3]; + VLOG(3) << "filter_dims:" << filter_dims[0] << " " << filter_dims[1] << " " + << filter_dims[2] << " " << filter_dims[3]; + if (kernel_h == 1 && kernel_w == 1) { + // conv2d_1x1 + if (param.x->dims()[1] % 4 == 0) { + kernel_func_names_.push_back("conv2d_1x1_simple"); + } else { + kernel_func_names_.push_back("conv2d_1x1"); + } + kernel_func_paths_.push_back("image/conv2d_1x1_kernel.cl"); + + CLImageConverterNWBlock converter; + const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); + std::vector filter_image_v(filter_image_dims[0] * + filter_image_dims[1] * 4); // 4 : RGBA + converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims); + filter_gpu_image_.mutable_data( + filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); + + impl_ = &ConvImageCompute::Conv2d1x1; + } else if (kernel_h == 5 && kernel_w == 5) { + // conv2d_5x5 + kernel_func_names_.push_back("conv2d_5x5"); + kernel_func_paths_.push_back("image/conv2d_5x5_kernel.cl"); + + CLImageConverterFolder converter; + const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); + std::vector filter_image_v(filter_image_dims[0] * + filter_image_dims[1] * 4); // 4 : RGBA + converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims); + filter_gpu_image_.mutable_data( + filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); + + impl_ = &ConvImageCompute::Conv2d5x5; + } else if (kernel_h == 7 && kernel_w == 7) { + // conv2d_7x7 + kernel_func_names_.push_back("conv2d_7x7"); + 
kernel_func_paths_.push_back("image/conv2d_7x7_kernel.cl"); + + CLImageConverterFolder converter; + const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); + std::vector filter_image_v(filter_image_dims[0] * + filter_image_dims[1] * 4); // 4 : RGBA + converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims); + this->filter_gpu_image_.mutable_data( + filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); + + impl_ = &ConvImageCompute::Conv2d7x7; + } else { + LOG(FATAL) << "conv image compute not support this condition yet! "; + } + + std::string build_options_single(" -DCL_DTYPE_float"); + // relu options + if (relu_fused) { + build_options_single += " -DRELU"; + } else if (param.activation_param.active_type == + lite_api::ActivationType::kRelu6) { + build_options_single += " -DRELU6"; + } else { + // do nothing + } + // bias options + const bool has_bias = param.bias != nullptr; + const bool is_element_wise_bias = + has_bias && param.output->dims() == param.bias->dims(); + if (has_bias) { + build_options_single += + is_element_wise_bias ? 
" -DBIASE_ELE" : " -DBIASE_CH"; + + // convert cpu buffer bias --> gpu image + CLImageConverterFolder bias_converter; + const DDim& bias_image_dims = + bias_converter.InitImageDimInfoWith(param.bias->dims()); + std::vector bias_image_v(bias_image_dims[0] * bias_image_dims[1] * + 4); + float* bias_cpu_data = param.bias->mutable_data(); + bias_converter.NCHWToImage( + bias_cpu_data, bias_image_v.data(), param.bias->dims()); + this->bias_gpu_image_.mutable_data( + bias_image_dims[0], bias_image_dims[1], bias_image_v.data()); + // convert cpu buffer bias --> gpu image --- end ---- + } + + build_options_.push_back(build_options_single); + + for (size_t i = 0; i < kernel_func_names_.size(); i++) { + context.cl_context()->AddKernel( + kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]); + } +} + +void ConvImageCompute::Conv2d1x1() { + const auto& param = *param_.get_mutable(); + auto input_dims = param.x->dims(); + auto paddings = *param.paddings; + auto strides = param.strides; + auto* input_image = param.x->data(); + auto* filter_image = filter_gpu_image_.data(); + auto filter_dims = param.filter->dims(); + auto output_dims = param.output->dims(); + + int input_width = input_dims[3]; + int input_height = input_dims[2]; + int output_width = output_dims[3]; + int output_height = output_dims[2]; + auto out_image_shape = InitImageDimInfoWith(output_dims); + auto* out_image = param.output->mutable_data( + out_image_shape["width"], out_image_shape["height"]); + + const bool has_bias = param.bias != nullptr; + const bool is_element_wise_bias = + has_bias && param.output->dims() == param.bias->dims(); + int offset = static_cast(param.filter->dims()[2]) / 2 - + static_cast(paddings[0]); + + // calc input_c_block + auto input_image_shape = InitImageDimInfoWith(input_dims); + int input_c_block = input_image_shape["width"] / input_dims[3]; + int input_c = input_dims[1]; + auto dilations = *param.dilations; + + const std::vector& default_work_size = + 
DefaultWorkSize(output_dims, + DDim(std::vector{ + static_cast(out_image_shape["width"]), + static_cast(out_image_shape["height"])})); + + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + + VLOG(4) << "============ conv2d_1x1 params ============"; + VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," + << input_image_shape["height"]; + VLOG(4) << "input_c_block: " << input_c_block; + VLOG(4) << "input_c: " << input_c; + VLOG(4) << "input_image: " << input_image; + VLOG(4) << "filter_dims: " << filter_dims; + VLOG(4) << "filter_image: " << filter_image; + VLOG(4) << "output_dims: " << output_dims; + VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " + << out_image_shape["height"]; + VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; + VLOG(4) << "has bias: " << has_bias; + VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; + VLOG(4) << "strides: " << strides[0] << "," << strides[1]; + VLOG(4) << "offset: " << offset; + VLOG(4) << "dilations.size : " << dilations.size(); + VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + VLOG(4) << "default work size{c_block, w, nh}: " + << "{" << c_block << ", " << w << ", " << nh << "" + << "}"; + + CHECK_GE(dilations.size(), 2); + CHECK(dilations[0] == dilations[1]); + CHECK_GE(input_dims.size(), 4); + CHECK_GE(paddings.size(), 2); + CHECK(paddings[0] == paddings[1]); + CHECK_GE(strides.size(), 2); + CHECK(strides[0] == strides[1]); + + // handle bias use buffer for channel wise , use image for element wise + const cl::Buffer* bias_buf = nullptr; + const cl::Image2D* bias_image = nullptr; + if (has_bias) { + bias_image = bias_gpu_image_.data(); + } + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + std::stringstream kernel_key; + kernel_key << kernel_func_names_[0] << build_options_[0]; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + int maped_w = 
maptofactor(w, 4); + + VLOG(4) << "kernel_key: " << kernel_key.str(); + VLOG(4) << "kernel ready ... " << kernel_key.str(); + VLOG(4) << "maped_w: " << maped_w; + VLOG(4) << "hasbias: " << has_bias; + + cl_int status; + int arg_idx = 0; + status = kernel.setArg(arg_idx, c_block); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, maped_w); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, nh); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *input_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *filter_image); + CL_CHECK_FATAL(status); + if (has_bias) { + status = kernel.setArg(++arg_idx, *bias_image); + CL_CHECK_FATAL(status); + } + status = kernel.setArg(++arg_idx, *out_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, strides[0]); + CL_CHECK_FATAL(status); + + status = kernel.setArg(++arg_idx, offset); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_c_block); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_c); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, dilations[0]); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_height); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, output_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, output_height); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, w); + CL_CHECK_FATAL(status); + + auto global_work_size = + cl::NDRange{static_cast(default_work_size.data()[0]), + static_cast(maped_w), + static_cast(default_work_size.data()[2])}; + + VLOG(4) << "out_image: " << out_image; + VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," + << global_work_size[1] << "," << global_work_size[2] << "}"; + + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + 
nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(out_image, event_); +} +void ConvImageCompute::Conv2d5x5() { + const auto& param = *param_.get_mutable(); + auto input_dims = param.x->dims(); + auto paddings = *param.paddings; + auto strides = param.strides; + auto* input_image = param.x->data(); + auto* filter_image = filter_gpu_image_.data(); + auto filter_dims = param.filter->dims(); + auto output_dims = param.output->dims(); + + int input_width = input_dims[3]; + int input_height = input_dims[2]; + int output_width = output_dims[3]; + int output_height = output_dims[2]; + int filter_width = filter_dims[3]; + int filter_height = filter_dims[2]; + auto out_image_shape = InitImageDimInfoWith(output_dims); + auto* out_image = param.output->mutable_data( + out_image_shape["width"], out_image_shape["height"]); + + const bool has_bias = param.bias != nullptr; + const bool is_element_wise_bias = + has_bias && param.output->dims() == param.bias->dims(); + int offset = static_cast(param.filter->dims()[2]) / 2 - + static_cast(paddings[0]); + + // calc input_c_block + auto input_image_shape = InitImageDimInfoWith(input_dims); + int input_c_block = input_image_shape["width"] / input_dims[3]; + int input_c = input_dims[1]; + auto dilations = *param.dilations; + + const std::vector& default_work_size = + DefaultWorkSize(output_dims, + DDim(std::vector{ + static_cast(out_image_shape["width"]), + static_cast(out_image_shape["height"])})); + + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + + VLOG(4) << "============ conv2d params ============"; + VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," + << input_image_shape["height"]; + VLOG(4) << "input_c_block: " << input_c_block; + VLOG(4) << "input_c: " << input_c; + VLOG(4) << "input_image: " << input_image; + VLOG(4) << "input_dims: " << input_dims; + VLOG(4) << "filter_dims: " << filter_dims; + VLOG(4) << 
"filter_image: " << filter_image; + VLOG(4) << "output_dims: " << output_dims; + VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " + << out_image_shape["height"]; + VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; + VLOG(4) << "has bias: " << has_bias; + VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; + VLOG(4) << "strides: " << strides[0] << "," << strides[1]; + VLOG(4) << "offset: " << offset; + VLOG(4) << "dilations.size : " << dilations.size(); + VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + VLOG(4) << "default work size{c_block, w, nh}: " + << "{" << c_block << ", " << w << ", " << nh << "" + << "}"; + + CHECK_GE(dilations.size(), 2); + CHECK(dilations[0] == dilations[1]); + CHECK_GE(input_dims.size(), 4); + CHECK_GE(paddings.size(), 2); + CHECK(paddings[0] == paddings[1]); + CHECK_GE(strides.size(), 2); + CHECK(strides[0] == strides[1]); + + const cl::Image2D* bias_image = nullptr; + if (has_bias) { + bias_image = bias_gpu_image_.data(); + } + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + STL::stringstream kernel_key; + kernel_key << kernel_func_names_[0] << build_options_[0]; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + VLOG(4) << "kernel_key: " << kernel_key.str(); + VLOG(4) << "kernel ready ... 
" << kernel_key.str(); + VLOG(4) << "w: " << w; + + cl_int status; + int arg_idx = 0; + status = kernel.setArg(arg_idx, c_block); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, w); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, nh); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *input_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *filter_image); + CL_CHECK_FATAL(status); + if (has_bias) { + VLOG(4) << "set bias_image: "; + status = kernel.setArg(++arg_idx, *bias_image); + CL_CHECK_FATAL(status); + } + status = kernel.setArg(++arg_idx, *out_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, strides[0]); + CL_CHECK_FATAL(status); + + status = kernel.setArg(++arg_idx, offset); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_c_block); + CL_CHECK_FATAL(status); + + status = kernel.setArg(++arg_idx, dilations[0]); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_height); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, output_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, output_height); + CL_CHECK_FATAL(status); + + auto global_work_size = + cl::NDRange{static_cast(default_work_size.data()[0]), + static_cast(default_work_size.data()[1]), + static_cast(default_work_size.data()[2])}; + + VLOG(4) << "out_image: " << out_image; + VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," + << global_work_size[1] << "," << global_work_size[2] << "}"; + + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(out_image, event_); +} +void ConvImageCompute::Conv2d7x7() { + const auto& param = *param_.get_mutable(); + auto input_dims = param.x->dims(); + auto paddings = 
*param.paddings; + auto strides = param.strides; + auto* input_image = param.x->data(); + auto* filter_image = filter_gpu_image_.data(); + auto filter_dims = param.filter->dims(); + auto output_dims = param.output->dims(); + + int input_width = input_dims[3]; + int input_height = input_dims[2]; + int output_width = output_dims[3]; + int output_height = output_dims[2]; + int filter_width = filter_dims[3]; + int filter_height = filter_dims[2]; + auto out_image_shape = InitImageDimInfoWith(output_dims); + auto* out_image = param.output->mutable_data( + out_image_shape["width"], out_image_shape["height"]); + + const bool has_bias = param.bias != nullptr; + const bool is_element_wise_bias = + has_bias && param.output->dims() == param.bias->dims(); + int offset = static_cast(param.filter->dims()[2]) / 2 - + static_cast(paddings[0]); + + // calc input_c_block + auto input_image_shape = InitImageDimInfoWith(input_dims); + int input_c_block = input_image_shape["width"] / input_dims[3]; + int input_c = input_dims[1]; + auto dilations = *param.dilations; + + const std::vector& default_work_size = + DefaultWorkSize(output_dims, + DDim(std::vector{ + static_cast(out_image_shape["width"]), + static_cast(out_image_shape["height"])})); + + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + + VLOG(4) << "============ conv2d params ============"; + VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," + << input_image_shape["height"]; + VLOG(4) << "input_c_block: " << input_c_block; + VLOG(4) << "input_c: " << input_c; + VLOG(4) << "input_image: " << input_image; + VLOG(4) << "input_dims: " << input_dims; + VLOG(4) << "filter_dims: " << filter_dims; + VLOG(4) << "filter_image: " << filter_image; + VLOG(4) << "output_dims: " << output_dims; + VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " + << out_image_shape["height"]; + VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; + VLOG(4) << 
"has bias: " << has_bias; + VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; + VLOG(4) << "strides: " << strides[0] << "," << strides[1]; + VLOG(4) << "offset: " << offset; + VLOG(4) << "dilations.size : " << dilations.size(); + VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + VLOG(4) << "default work size{c_block, w, nh}: " + << "{" << c_block << ", " << w << ", " << nh << "" + << "}"; + + CHECK_GE(dilations.size(), 2); + CHECK(dilations[0] == dilations[1]); + CHECK_GE(input_dims.size(), 4); + CHECK_GE(paddings.size(), 2); + CHECK(paddings[0] == paddings[1]); + CHECK_GE(strides.size(), 2); + CHECK(strides[0] == strides[1]); + + const cl::Image2D* bias_image = nullptr; + if (has_bias) { + bias_image = bias_gpu_image_.data(); + } + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + STL::stringstream kernel_key; + kernel_key << kernel_func_names_[0] << build_options_[0]; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + VLOG(4) << "kernel_key: " << kernel_key.str(); + VLOG(4) << "kernel ready ... 
" << kernel_key.str(); + VLOG(4) << "w: " << w; + + cl_int status; + int arg_idx = 0; + status = kernel.setArg(arg_idx, c_block); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, w); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, nh); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *input_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *filter_image); + CL_CHECK_FATAL(status); + if (has_bias) { + VLOG(4) << "set bias_image: "; + status = kernel.setArg(++arg_idx, *bias_image); + CL_CHECK_FATAL(status); + } + status = kernel.setArg(++arg_idx, *out_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, strides[0]); + CL_CHECK_FATAL(status); + + status = kernel.setArg(++arg_idx, offset); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_c_block); + CL_CHECK_FATAL(status); + + status = kernel.setArg(++arg_idx, dilations[0]); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_height); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, output_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, output_height); + CL_CHECK_FATAL(status); + + auto global_work_size = + cl::NDRange{static_cast(default_work_size.data()[0]), + static_cast(default_work_size.data()[1]), + static_cast(default_work_size.data()[2])}; + + VLOG(4) << "out_image: " << out_image; + VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," + << global_work_size[1] << "," << global_work_size[2] << "}"; + + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(out_image, event_); +} + +void ConvImageCompute::Run() { (this->*impl_)(); } + } // namespace opencl } // namespace kernels } // namespace lite @@ -310,3 +889,21 
@@ REGISTER_LITE_KERNEL(conv2d, .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .Finalize(); + +REGISTER_LITE_KERNEL(conv2d, + kOpenCL, + kFloat, + kImageDefault, + paddle::lite::kernels::opencl::ConvImageCompute, + image2d) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFloat), + DATALAYOUT(kImageDefault))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFloat), + DATALAYOUT(kImageDefault))}) + .Finalize(); diff --git a/lite/kernels/opencl/conv_compute.h b/lite/kernels/opencl/conv_compute.h index 37c8893bb8..5b98767af0 100644 --- a/lite/kernels/opencl/conv_compute.h +++ b/lite/kernels/opencl/conv_compute.h @@ -17,6 +17,7 @@ #include #include #include + #include "lite/backends/opencl/cl_include.h" #include "lite/core/kernel.h" #include "lite/core/tensor.h" @@ -57,6 +58,30 @@ class ConvCompute std::shared_ptr event_{new cl::Event}; }; +class ConvImageCompute : public KernelLite { + public: + using param_t = operators::ConvParam; + using kernel_t = void (ConvImageCompute::*)(); + + void PrepareForRun() override; + + void Run() override; + + private: + void Conv2d1x1(); + void Conv2d5x5(); + void Conv2d7x7(); + + kernel_t impl_; + std::vector kernel_func_names_{}; + std::vector kernel_func_paths_{}; + std::vector build_options_{}; + std::shared_ptr event_{new cl::Event}; + Tensor filter_gpu_image_; + Tensor bias_gpu_image_; +}; } // namespace opencl } // namespace kernels } // namespace lite diff --git a/lite/kernels/opencl/conv_image2d_compute_test.cc b/lite/kernels/opencl/conv_image2d_compute_test.cc new file mode 100644 index 0000000000..5404ffa868 --- /dev/null +++ b/lite/kernels/opencl/conv_image2d_compute_test.cc @@ -0,0 +1,1115 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "lite/backends/opencl/cl_image_converter.h" +#include "lite/backends/opencl/target_wrapper.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +// #define SHADOW_LOG LOG(INFO) +#define SHADOW_LOG VLOG(4) + +template +static void conv_basic(const Dtype1* din, + Dtype2* dout, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const Dtype1* weights, + const Dtype2* bias, + int group, + int kernel_w, + int kernel_h, + int stride_w, + int stride_h, + int dila_w, + int dila_h, + int pad_w, + int pad_h, + bool flag_bias, + std::string flag_relu) { + Dtype2 beta = 0; + auto src_data = din; + auto dst_data_ref = dout; + auto weights_data = weights; + auto with_bias = flag_bias; + auto bias_data = bias; + + int in_num = num; + int out_channels = chout; + int out_h = hout; + int out_w = wout; + + int in_channel = chin; + int in_h = hin; + int in_w = win; + int out_c_group = out_channels / group; + int in_c_group = in_channel / group; + + for (int n = 0; n < in_num; ++n) { + for (int g = 0; g < group; ++g) { + for (int oc = 0; oc < out_c_group; ++oc) { + for (int oh = 0; oh < out_h; ++oh) { + for (int ow = 0; ow < out_w; ++ow) { + int out_idx = n * group * out_c_group * out_h * out_w + + g * out_c_group * out_h * out_w + oc * out_h * out_w + + oh * out_w + ow; + Dtype2 bias_d = + with_bias ? 
(bias_data[g * out_c_group + oc]) : (Dtype2)0; + dst_data_ref[out_idx] = bias_d; // + dst_data_ref[out_idx] * beta; + for (int ic = 0; ic < in_c_group; ++ic) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int iw = ow * stride_w - pad_w + kw * (dila_w); + int ih = oh * stride_h - pad_h + kh * (dila_h); + if (iw < 0 || iw >= in_w) continue; + if (ih < 0 || ih >= in_h) continue; + + int iidx = n * in_channel * in_h * in_w + + g * in_c_group * in_h * in_w + ic * in_h * in_w + + ih * in_w + iw; + int widx = + g * out_c_group * in_c_group * kernel_h * kernel_w + + oc * in_c_group * kernel_h * kernel_w + + ic * kernel_h * kernel_w + kh * kernel_w + kw; + + dst_data_ref[out_idx] += src_data[iidx] * weights_data[widx]; + } + } + } + if (flag_relu == "relu") { + dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0 + ? dst_data_ref[out_idx] + : (Dtype2)0; + } else if (flag_relu == "relu6") { + auto dst_tmp = (dst_data_ref[out_idx] > (Dtype2)0) + ? dst_data_ref[out_idx] + : (Dtype2)0; + dst_data_ref[out_idx] = (dst_tmp < 6.f) ? 
dst_tmp : 6.f; + } + } + } + } + } + } +} +int ConvOutputSize(int input_size, + int filter_size, + int dilation, + int pad_left, + int pad_right, + int stride) { + const int dkernel = dilation * (filter_size - 1) + 1; + int output_size = + (input_size + (pad_left + pad_right) - dkernel) / stride + 1; + + return output_size; +} +// #define PRINT_RESULT +// #define LOOP_TEST +TEST(conv2d, compute_image2d_1x1) { + // conv infos + const int ksize = 1; + const int stride = 1; + const int pad = 0; + const int group = 1; + const int dilation = 0; +// int loop_cnt = 0; + +#ifdef LOOP_TEST + for (int batch_size = 1; batch_size < 4; ++batch_size) { + for (int oc = 4; oc < 10; oc += 1) { // oc + for (int ih = 4; ih < 9; ih += 1) { // ih + int iw = ih; + for (int iw = 4; iw < 10; iw += 1) { // iw + for (int ic = 4; ic < 10; ic += 1) { // ic + for (bool bias_flag : {true, false}) { + for (std::string relu_flag : {"relu"}) { +#else + const int batch_size = 1; + const int oc = 4; + const int ih = 8; + const int iw = 8; + const int ic = 4; + const bool bias_flag = true; + const std::string relu_flag = "relu"; +#endif + const int oh = ih; + const int ow = iw; + + SHADOW_LOG << "to get kernel ..."; + auto kernels = + KernelRegistry::Global().Create("conv2d", + TARGET(kOpenCL), + PRECISION(kFloat), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + + auto kernel = std::move(kernels.front()); + SHADOW_LOG << "created conv2d_1x1 kernel"; + + SHADOW_LOG << "prepare kernel ------"; + + lite::Tensor input, filter, bias, output; + operators::ConvParam param; + param.x = &input; + param.filter = &filter; + param.output = &output; + if (bias_flag) { + param.bias = &bias; + } + if (relu_flag == "relu") { + param.fuse_relu = true; + } else if (relu_flag == "None") { + param.fuse_relu = false; + } else if (relu_flag == "relu6") { + param.activation_param.Relu_clipped_coef = 6.f; + param.activation_param.has_active = true; + param.activation_param.active_type = + 
lite_api::ActivationType::kRelu6; + } + + std::vector paddings = {pad, pad, pad, pad}; + std::vector dilations = {dilation, dilation}; + + param.paddings = std::make_shared>(paddings); + param.dilations = std::make_shared>(dilations); + param.strides = std::vector{stride, stride}; + + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + std::unique_ptr conv_1x1_context( + new KernelContext); + context->As().CopySharedTo( + &(conv_1x1_context->As())); + kernel->SetContext(std::move(conv_1x1_context)); + + const DDim& input_dim = + lite::DDim{std::vector({batch_size, ic, ih, iw})}; + + const DDim& filter_dim = + lite::DDim{std::vector({oc, ic, ksize, ksize})}; + const DDim& out_dim = + lite::DDim{std::vector({batch_size, oc, ih, iw})}; + // element wise bias + const DDim& bias_dim = lite::DDim{std::vector({oc})}; + + param.x->Resize(input_dim); + param.filter->Resize(filter_dim); + param.output->Resize(out_dim); + if (bias_flag) { + param.bias->Resize(bias_dim); + } + + kernel->SetParam(param); + + size_t input_image_width = iw * ((ic + 3) / 4); + size_t input_image_height = ih * batch_size; + + size_t out_image_width = ow * ((oc + 3) / 4); + size_t out_image_height = oh * batch_size; + + size_t bias_image_width = ow * ((oc + 3) / 4); + size_t bias_image_height = oh * batch_size; + + size_t filter_image_width = ksize * ((oc + 3) / 4); + size_t filter_image_height = ic * ksize; + + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + + std::default_random_engine engine; + std::uniform_real_distribution gen(-5, 5); + + std::vector input_v(batch_size * ic * ih * iw); + std::vector filter_v(oc * ic * ksize * ksize); + std::vector output_v(batch_size * oc * ih * iw); + std::vector bias_v(oc); + + SHADOW_LOG << "gen input and filter ..."; + + for (auto& i : input_v) { + i = gen(engine); + } + for (auto& f : filter_v) { + f = gen(engine); + } + + SHADOW_LOG << "after gen input and filter ..."; + SHADOW_LOG << 
"input_v.size(): " << input_v.size(); + SHADOW_LOG << "filter_v.size(): " << filter_v.size(); + SHADOW_LOG << "output_v.size(): " << output_v.size(); + SHADOW_LOG << "bias_v.size(): " << bias_v.size(); + SHADOW_LOG << "input_dim.production(): " + << input_dim.production(); + SHADOW_LOG << "filter_dim.production(): " + << filter_dim.production(); + SHADOW_LOG << "out_dim.production(): " << out_dim.production(); + SHADOW_LOG << "bias_dim.production(): " + << bias_dim.production(); + SHADOW_LOG << "4 * input_image_height * input_image_width: " + << 4 * input_image_height * input_image_width; + SHADOW_LOG << "4 * filter_image_width * filter_image_height: " + << 4 * filter_image_width * filter_image_height; + + CHECK(input_dim.production() == input_v.size()); + CHECK_LE(input_dim.production(), + 4 * input_image_height * input_image_width); + CHECK(filter_dim.production() == filter_v.size()); + CHECK_LE(filter_dim.production(), + 4 * filter_image_width * filter_image_height); + + paddle::lite::CLImageConverterDefault default_convertor; + SHADOW_LOG << "set mapped input ..."; + std::vector x_image_v( + input_image_width * input_image_height * 4); // 4 : RGBA + std::vector filter_image_v( + filter_image_width * filter_image_height * 4); // 4 :RGBA + std::vector bias_image_v( + bias_image_width * bias_image_height * 4); // 4 : RGBA + std::vector out_image_v( + out_image_width * out_image_height * 4); // 4 : RGBA + + default_convertor.NCHWToImage( + input_v.data(), x_image_v.data(), input_dim); + + SHADOW_LOG << "set mapped filter ..."; + paddle::lite::CLImageConverterNWBlock nw_convertor; + nw_convertor.NCHWToImage( + filter_v.data(), filter_image_v.data(), filter_dim); + + auto* input_image2d = input.mutable_data( + input_image_width, input_image_height, x_image_v.data()); + // assign filter as target arm + filter.Assign(filter_v.data(), + filter_dim); + // auto* filter_image2d = + // filter.mutable_data( + // filter_image_width, + // filter_image_height, + // 
filter_image_v.data()); + SHADOW_LOG << "卷积核: ---- "; + for (int i = 0; i < filter_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << filter_v[i]; + } + + SHADOW_LOG << "卷积核1: ---- "; + const float* filter_p = filter.data(); + for (int i = 0; i < filter_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << *filter_p; + filter_p++; + } + SHADOW_LOG << "卷积核2: ---- "; + const float* filter_p2 = filter.mutable_data(); + for (int i = 0; i < filter_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << *filter_p2; + filter_p2++; + } + if (bias_flag) { + for (int i = 0; i < bias_dim.production(); ++i) { + bias_v[i] = static_cast(gen(engine)); + } + bias.Assign(bias_v.data(), + bias_dim); + // CLImageConverterFolder folder_convertor; + // folder_convertor.NCHWToImage( + // bias_v.data(), bias_image_v.data(), + // bias_dim); + // + // auto* bias_data = bias.mutable_data( + // bias_image_width, bias_image_height, + // bias_image_v.data()); + } + + SHADOW_LOG << "resize output ..."; + output.Resize(out_dim); + + // cpu conv basic calc + lite::Tensor out_ref; + out_ref.Resize(out_dim); + + SHADOW_LOG << "prepare kernel ready"; + + SHADOW_LOG << "kernel launch ..."; + kernel->Launch(); + SHADOW_LOG << "mutable output ..."; + auto* output_image2d = output.mutable_data( + out_image_width, out_image_height); + + auto* wait_list = context->As().cl_wait_list(); + auto* out_ptr = param.output->data(); + auto it = wait_list->find(out_ptr); + + if (it != wait_list->end()) { + SHADOW_LOG << "--- Find the sync event for the target cl " + "tensor. 
---"; + auto& event = *(it->second); + event.wait(); + } else { + LOG(FATAL) << "Could not find the sync event for the target" + "cl tensor."; + } + + TargetWrapperCL::ImgcpySync(out_image_v.data(), + output.data(), + out_image_width, + out_image_height, + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + + DDim out_image_shape = + default_convertor.InitImageDimInfoWith(output.dims()); + + default_convertor.ImageToNCHW(out_image_v.data(), + output_v.data(), + out_image_shape, + output.dims()); + SHADOW_LOG << "mutable_data out_ref_data: "; + + // run cpu ref + auto* out_ref_data = out_ref.mutable_data(TARGET(kARM)); + + SHADOW_LOG << " conv_basic beigin ..... "; + + conv_basic(input_v.data(), + out_ref_data, + batch_size, + oc, + oh, + ow, + ic, + ih, + iw, + filter_v.data(), + bias_v.data(), // mapped_bias, + group, + ksize, + ksize, + stride, + stride, + dilation, + dilation, + pad, + pad, + bias_flag, + relu_flag); + SHADOW_LOG << " conv_basic end ..... "; + + SHADOW_LOG << " out_dim: " << out_dim; + const DDim& out_image_dims = lite::DDim{std::vector( + {static_cast(out_image_width), + static_cast(out_image_height)})}; + + for (int i = 0; i < out_dim.production(); i++) { + EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2); + if (abs(output_v[i] - out_ref_data[i]) > 1e-2) { + LOG(FATAL) << "error idx:" << i; + } + } + +#ifdef LOOP_TEST + } + } + } + } + } + } + } +#else +// nothing to do. 
+#endif +} +#undef LOOP_TEST +#undef PRINT_RESULT + +// #define PRINT_RESULT +// #define LOOP_TEST +TEST(conv2d, compute_image2d_5x5) { + // conv infos + const int ksize = 5; + const int stride = 1; + const int pad = 2; + const int group = 1; + const int dilation = 1; +// int loop_cnt = 0; + +#ifdef LOOP_TEST + for (int batch_size = 2; batch_size < 4; ++batch_size) { + for (int oc = 1; oc < 10; oc += 1) { // oc + for (int ih = 5; ih < 9; ih += 1) { // ih + int iw = ih; + for (int ic = 1; ic < 10; ic += 1) { // ic + for (bool bias_flag : {true, false}) { + for (std::string relu_flag : {/*true,*/ "relu"}) { +#else + const int batch_size = 2; + const int oc = 1; + const int ih = 5; + const int iw = 5; + const int ic = 1; + const bool bias_flag = true; + const std::string relu_flag = "relu"; +#endif + + const int oh = + ConvOutputSize(ih, ksize, dilation, pad, pad, stride); + const int ow = + ConvOutputSize(iw, ksize, dilation, pad, pad, stride); + SHADOW_LOG << "to get kernel ..."; + auto kernels = + KernelRegistry::Global().Create("conv2d", + TARGET(kOpenCL), + PRECISION(kFloat), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + + auto kernel = std::move(kernels.front()); + SHADOW_LOG << "created conv2d kernel"; + + SHADOW_LOG << "prepare kernel ------"; + + lite::Tensor input, filter, bias, output; + operators::ConvParam param; + param.x = &input; + param.filter = &filter; + param.output = &output; + if (bias_flag) { + param.bias = &bias; + } + if (relu_flag == "relu") { + param.fuse_relu = true; + } else if (relu_flag == "None") { + param.fuse_relu = false; + } else if (relu_flag == "relu6") { + param.activation_param.Relu_clipped_coef = 6.f; + param.activation_param.has_active = true; + param.activation_param.active_type = + lite_api::ActivationType::kRelu6; + } + + std::vector paddings = {pad, pad, pad, pad}; + std::vector dilations = {dilation, dilation}; + + param.paddings = std::make_shared>(paddings); + param.dilations = 
std::make_shared>(dilations); + param.strides = std::vector{stride, stride}; + + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + std::unique_ptr conv_1x1_context( + new KernelContext); + context->As().CopySharedTo( + &(conv_1x1_context->As())); + kernel->SetContext(std::move(conv_1x1_context)); + + const DDim& input_dim = + lite::DDim{std::vector({batch_size, ic, ih, iw})}; + + const DDim& filter_dim = + lite::DDim{std::vector({oc, ic, ksize, ksize})}; + const DDim& out_dim = + lite::DDim{std::vector({batch_size, oc, oh, ow})}; + // element wise bias + const DDim& bias_dim = lite::DDim{std::vector({oc})}; + + param.x->Resize(input_dim); + param.filter->Resize(filter_dim); + param.output->Resize(out_dim); + if (bias_flag) { + param.bias->Resize(bias_dim); + } + + kernel->SetParam(param); + + size_t input_image_width = iw * ((ic + 3) / 4); + size_t input_image_height = ih * batch_size; + + size_t out_image_width = ow * ((oc + 3) / 4); + size_t out_image_height = oh * batch_size; + + size_t bias_image_width = ow * ((oc + 3) / 4); + size_t bias_image_height = oh * batch_size; + + size_t filter_image_width = ksize * ((ic + 3) / 4); + size_t filter_image_height = oc * ksize; + + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + + std::default_random_engine engine; + std::uniform_real_distribution gen(-5, 5); + + std::vector input_v(batch_size * ic * ih * iw); + std::vector filter_v(oc * ic * ksize * ksize); + std::vector output_v(batch_size * oc * oh * ow); + std::vector bias_v(oc); + + SHADOW_LOG << "gen input and filter ..."; + for (auto& i : input_v) { + i = gen(engine); + } + for (auto& f : filter_v) { + f = gen(engine); + } + + SHADOW_LOG << "after gen input and filter ..."; + SHADOW_LOG << "input_v.size(): " << input_v.size(); + SHADOW_LOG << "filter_v.size(): " << filter_v.size(); + SHADOW_LOG << "output_v.size(): " << output_v.size(); + SHADOW_LOG << "bias_v.size(): " << bias_v.size(); + SHADOW_LOG << 
"input_dim.production(): " + << input_dim.production(); + SHADOW_LOG << "filter_dim.production(): " + << filter_dim.production(); + SHADOW_LOG << "out_dim.production(): " << out_dim.production(); + SHADOW_LOG << "bias_dim.production(): " << bias_dim.production(); + SHADOW_LOG << "4 * input_image_height *input_image_width: " + << 4 * input_image_height * input_image_width; + SHADOW_LOG << "4 * filter_image_width * filter_image_height: " + << 4 * filter_image_width * filter_image_height; + + CHECK(input_dim.production() == input_v.size()); + CHECK_LE(input_dim.production(), + 4 * input_image_height * input_image_width); + CHECK(filter_dim.production() == filter_v.size()); + CHECK_LE(filter_dim.production(), + 4 * filter_image_width * filter_image_height); + + paddle::lite::CLImageConverterDefault default_convertor; + SHADOW_LOG << "set mapped input ..."; + std::vector x_image_v(input_image_width * + input_image_height * 4); // 4 :RGBA + std::vector filter_image_v( + filter_image_width * filter_image_height * 4); // 4 : RGBA + std::vector bias_image_v( + bias_image_width * bias_image_height * 4); // 4 : RGBA + std::vector out_image_v(out_image_width * + out_image_height * 4); // 4 :RGBA + + default_convertor.NCHWToImage( + input_v.data(), x_image_v.data(), input_dim); + SHADOW_LOG << "输入: ---- "; + for (int i = 0; i < input_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << input_v[i]; + } + SHADOW_LOG << "输入image : ---- "; + for (int i = 0; i < x_image_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << x_image_v[i]; + } + SHADOW_LOG << "set mapped filter ..."; + CLImageConverterFolder folder_convertor; + + folder_convertor.NCHWToImage( + filter_v.data(), filter_image_v.data(), filter_dim); + SHADOW_LOG << "卷积核: ---- "; + for (int i = 0; i < filter_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << filter_v[i]; + } + SHADOW_LOG << "卷积核image: ---- "; + for (int i = 0; i < filter_image_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << filter_image_v[i]; + } + auto* 
input_image2d = input.mutable_data( + input_image_width, input_image_height, x_image_v.data()); + // assign filter as target arm + filter.Assign(filter_v.data(), + filter_dim); + // filter kernel + // auto* filter_image2d = filter.mutable_data( + // filter_image_width, + // filter_image_height, + // filter_image_v.data()); + + if (bias_flag) { + for (int i = 0; i < bias_dim.production(); ++i) { + bias_v[i] = static_cast(gen(engine)); + } + bias.Assign(bias_v.data(), + bias_dim); + // CLImageConverterFolder folder_convertor; + // folder_convertor.NCHWToImage( + // bias_v.data(), bias_image_v.data(), + // bias_dim); + // + // auto* bias_data = bias.mutable_data( + // bias_image_width, bias_image_height, + // bias_image_v.data()); + } + + SHADOW_LOG << "resize output ..."; + output.Resize(out_dim); + + // cpu conv basic calc + lite::Tensor out_ref; + out_ref.Resize(out_dim); + + SHADOW_LOG << "prepare kernel ready"; + + SHADOW_LOG << "kernel launch ..."; + kernel->Launch(); + SHADOW_LOG << "mutable output ..."; + auto* output_image2d = output.mutable_data( + out_image_width, out_image_height); + + auto* wait_list = context->As().cl_wait_list(); + auto* out_ptr = param.output->data(); + auto it = wait_list->find(out_ptr); + + if (it != wait_list->end()) { + SHADOW_LOG << "--- Find the sync event for the target cl " + "tensor. 
---"; + auto& event = *(it->second); + event.wait(); + } else { + LOG(FATAL) << "Could not find the sync event for the target " + "cl tensor."; + } + + TargetWrapperCL::ImgcpySync(out_image_v.data(), + output.data(), + out_image_width, + out_image_height, + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + + DDim out_image_shape = + default_convertor.InitImageDimInfoWith(output.dims()); + + default_convertor.ImageToNCHW(out_image_v.data(), + output_v.data(), + out_image_shape, + output.dims()); + + SHADOW_LOG << "输出: ---- "; + for (int i = 0; i < output_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << output_v[i]; + } + + SHADOW_LOG << "输出image: ---- "; + for (int i = 0; i < out_image_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << out_image_v[i]; + } + SHADOW_LOG << "mutable_data out_ref_data: "; + + // run cpu ref + auto* out_ref_data = out_ref.mutable_data(TARGET(kARM)); + + SHADOW_LOG << " conv_basic beigin ..... "; + + conv_basic(input_v.data(), + out_ref_data, + batch_size, + oc, + oh, + ow, + ic, + ih, + iw, + filter_v.data(), + bias_v.data(), // mapped_bias, + group, + ksize, + ksize, + stride, + stride, + dilation, + dilation, + pad, + pad, + bias_flag, + relu_flag); + SHADOW_LOG << " conv_basic end ..... "; + + SHADOW_LOG << " out_dim: " << out_dim; + const DDim& out_image_dims = lite::DDim{std::vector( + {static_cast(out_image_width), + static_cast(out_image_height)})}; + + for (int i = 0; i < out_dim.production(); i++) { + EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2); + if (abs(output_v[i] - out_ref_data[i]) > 1e-2) { + LOG(FATAL) << "error idx:" << i; + } + } + +#ifdef LOOP_TEST + } + } + } + } + } + } +#else +// nothing to do. 
+#endif +} +#undef LOOP_TEST +#undef PRINT_RESULT + +// #define LOOP_TEST +TEST(conv2d, compute_image2d_7x7) { + // conv infos + const int ksize = 7; + const int stride = 1; + const int pad = 2; + const int group = 1; + const int dilation = 1; +// int loop_cnt = 0; + +#ifdef LOOP_TEST + for (int batch_size = 2; batch_size < 4; ++batch_size) { + for (int oc = 1; oc < 10; oc += 1) { // oc + for (int ih = 7; ih < 15; ih += 1) { // ih + int iw = ih; + for (int ic = 1; ic < 10; ic += 1) { // ic + for (bool bias_flag : {true, false}) { + for (std::string relu_flag : {"relu"}) { +#else + const int batch_size = 2; + const int oc = 1; + const int ih = 7; + const int iw = 7; + const int ic = 1; + const bool bias_flag = false; + const std::string relu_flag = ""; +#endif + + const int oh = + ConvOutputSize(ih, ksize, dilation, pad, pad, stride); + const int ow = + ConvOutputSize(iw, ksize, dilation, pad, pad, stride); + SHADOW_LOG << "to get kernel ..."; + auto kernels = + KernelRegistry::Global().Create("conv2d", + TARGET(kOpenCL), + PRECISION(kFloat), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + + auto kernel = std::move(kernels.front()); + SHADOW_LOG << "created conv2d kernel"; + + SHADOW_LOG << "prepare kernel ------"; + + lite::Tensor input, filter, bias, output; + operators::ConvParam param; + param.x = &input; + param.filter = &filter; + param.output = &output; + if (bias_flag) { + param.bias = &bias; + } + if (relu_flag == "relu") { + param.fuse_relu = true; + } else if (relu_flag == "None") { + param.fuse_relu = false; + } else if (relu_flag == "relu6") { + param.activation_param.Relu_clipped_coef = 6.f; + param.activation_param.has_active = true; + param.activation_param.active_type = + lite_api::ActivationType::kRelu6; + } + std::vector paddings = {pad, pad, pad, pad}; + std::vector dilations = {dilation, dilation}; + + param.paddings = std::make_shared>(paddings); + param.dilations = std::make_shared>(dilations); + param.strides = 
std::vector{stride, stride}; + + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + std::unique_ptr conv_1x1_context( + new KernelContext); + context->As().CopySharedTo( + &(conv_1x1_context->As())); + kernel->SetContext(std::move(conv_1x1_context)); + + const DDim& input_dim = + lite::DDim{std::vector({batch_size, ic, ih, iw})}; + + const DDim& filter_dim = + lite::DDim{std::vector({oc, ic, ksize, ksize})}; + const DDim& out_dim = + lite::DDim{std::vector({batch_size, oc, oh, ow})}; + // element wise bias + const DDim& bias_dim = lite::DDim{std::vector({oc})}; + + param.x->Resize(input_dim); + param.filter->Resize(filter_dim); + param.output->Resize(out_dim); + if (bias_flag) { + param.bias->Resize(bias_dim); + } + + kernel->SetParam(param); + + size_t input_image_width = iw * ((ic + 3) / 4); + size_t input_image_height = ih * batch_size; + + size_t out_image_width = ow * ((oc + 3) / 4); + size_t out_image_height = oh * batch_size; + + size_t bias_image_width = ow * ((oc + 3) / 4); + size_t bias_image_height = oh * batch_size; + + size_t filter_image_width = ksize * ((ic + 3) / 4); + size_t filter_image_height = oc * ksize; + + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + + std::default_random_engine engine; + std::uniform_real_distribution gen(-5, 5); + + std::vector input_v(batch_size * ic * ih * iw); + std::vector filter_v(oc * ic * ksize * ksize); + std::vector output_v(batch_size * oc * oh * ow); + std::vector bias_v(oc); + + SHADOW_LOG << "gen input and filter ..."; + for (auto& i : input_v) { + i = gen(engine); + // i = 1; + } + for (auto& f : filter_v) { + f = gen(engine); + // f = 1; + } + LOG(INFO) << "bias: " << bias_flag; + LOG(INFO) << "relu: " << relu_flag; + + LOG(INFO) << "inputdims : " << input_dim; + LOG(INFO) << "filterdims: " << filter.dims(); + LOG(INFO) << "outputdims : " << output.dims(); + SHADOW_LOG << "after gen input and filter ..."; + SHADOW_LOG << "input_v.size(): " << 
input_v.size(); + SHADOW_LOG << "filter_v.size(): " << filter_v.size(); + SHADOW_LOG << "output_v.size(): " << output_v.size(); + SHADOW_LOG << "bias_v.size(): " << bias_v.size(); + SHADOW_LOG << "input_dim.production(): " + << input_dim.production(); + SHADOW_LOG << "filter_dim.production(): " + << filter_dim.production(); + SHADOW_LOG << "out_dim.production(): " << out_dim.production(); + SHADOW_LOG << "bias_dim.production(): " << bias_dim.production(); + SHADOW_LOG << "4 * input_image_height * input_image_width: " + << 4 * input_image_height * input_image_width; + SHADOW_LOG << "4 * filter_image_width * filter_image_height: " + << 4 * filter_image_width * filter_image_height; + + CHECK(input_dim.production() == input_v.size()); + CHECK_LE(input_dim.production(), + 4 * input_image_height * input_image_width); + CHECK(filter_dim.production() == filter_v.size()); + CHECK_LE(filter_dim.production(), + 4 * filter_image_width * filter_image_height); + + paddle::lite::CLImageConverterDefault default_convertor; + SHADOW_LOG << "set mapped input ..."; + std::vector x_image_v(input_image_width * + input_image_height * 4); // 4 : RGBA + std::vector filter_image_v( + filter_image_width * filter_image_height * 4); // 4 : RGBA + std::vector bias_image_v( + bias_image_width * bias_image_height * 4); // 4 : RGBA + std::vector out_image_v(out_image_width * + out_image_height * 4); // 4 : RGBA + + default_convertor.NCHWToImage( + input_v.data(), x_image_v.data(), input_dim); + SHADOW_LOG << "输入: ---- "; + for (int i = 0; i < input_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << input_v[i]; + } + SHADOW_LOG << "输入image : ---- "; + for (int i = 0; i < x_image_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << x_image_v[i]; + } + SHADOW_LOG << "set mapped filter ..."; + CLImageConverterFolder folder_convertor; + + folder_convertor.NCHWToImage( + filter_v.data(), filter_image_v.data(), filter_dim); + SHADOW_LOG << "卷积核: ---- "; + for (int i = 0; i < filter_v.size(); i++) { + 
SHADOW_LOG << "(" << i << ")" << filter_v[i]; + } + SHADOW_LOG << "卷积核image: ---- "; + for (int i = 0; i < filter_image_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << filter_image_v[i]; + } + auto* input_image2d = input.mutable_data( + input_image_width, input_image_height, x_image_v.data()); + + // assign filter as target arm + filter.Assign(filter_v.data(), + filter_dim); + + // auto* filter_image2d = filter.mutable_data( + // filter_image_width, + // filter_image_height, + // filter_image_v.data()); + + if (bias_flag) { + for (int i = 0; i < bias_dim.production(); ++i) { + bias_v[i] = static_cast(gen(engine)); + } + bias.Assign(bias_v.data(), + bias_dim); + // CLImageConverterFolder folder_convertor; + // folder_convertor.NCHWToImage( + // bias_v.data(), bias_image_v.data(), + // bias_dim); + // + // auto* bias_data = bias.mutable_data( + // bias_image_width, bias_image_height, + // bias_image_v.data()); + } + + SHADOW_LOG << "resize output ..."; + output.Resize(out_dim); + + // cpu conv basic calc + lite::Tensor out_ref; + out_ref.Resize(out_dim); + + SHADOW_LOG << "prepare kernel ready"; + + SHADOW_LOG << "kernel launch ..."; + kernel->Launch(); + SHADOW_LOG << "mutable output ..."; + auto* output_image2d = output.mutable_data( + out_image_width, out_image_height); + + auto* wait_list = context->As().cl_wait_list(); + auto* out_ptr = param.output->data(); + auto it = wait_list->find(out_ptr); + + if (it != wait_list->end()) { + SHADOW_LOG << "--- Find the sync event for the target cl " + "tensor. 
---"; + auto& event = *(it->second); + event.wait(); + } else { + LOG(FATAL) << "Could not find the sync event for the target " + "cl tensor."; + } + + TargetWrapperCL::ImgcpySync(out_image_v.data(), + output.data(), + out_image_width, + out_image_height, + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + + DDim out_image_shape = + default_convertor.InitImageDimInfoWith(output.dims()); + + default_convertor.ImageToNCHW(out_image_v.data(), + output_v.data(), + out_image_shape, + output.dims()); + + SHADOW_LOG << "输出: ---- "; + for (int i = 0; i < output_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << output_v[i]; + } + + SHADOW_LOG << "输出image: ---- "; + for (int i = 0; i < out_image_v.size(); i++) { + SHADOW_LOG << "(" << i << ")" << out_image_v[i]; + } + SHADOW_LOG << "mutable_data out_ref_data: "; + + // run cpu ref + auto* out_ref_data = out_ref.mutable_data(TARGET(kARM)); + + SHADOW_LOG << " conv_basic beigin ..... "; + + conv_basic(input_v.data(), + out_ref_data, + batch_size, + oc, + oh, + ow, + ic, + ih, + iw, + filter_v.data(), + bias_v.data(), // mapped_bias, + group, + ksize, + ksize, + stride, + stride, + dilation, + dilation, + pad, + pad, + bias_flag, + relu_flag); + SHADOW_LOG << " conv_basic end ..... "; + + SHADOW_LOG << " out_dim: " << out_dim; + const DDim& out_image_dims = lite::DDim{std::vector( + {static_cast(out_image_width), + static_cast(out_image_height)})}; + + for (int i = 0; i < out_dim.production(); i++) { + EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2); + if (abs(output_v[i] - out_ref_data[i]) > 1e-2) { + LOG(FATAL) << "error idx:" << i; + } + } + +#ifdef LOOP_TEST + } + } + } + } + } + } +#else +// nothing to do. +#endif +} +#undef LOOP_TEST +#undef PRINT_RESULT +#undef SHADOW_LOG + +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kImageDefault, image2d); -- GitLab