From 600c8c20be7104af0c22201c1c51d972640bebee Mon Sep 17 00:00:00 2001 From: Jiaying Zhao Date: Mon, 16 Dec 2019 15:21:05 +0800 Subject: [PATCH] [LITE][OPENCL] Add depthwise_conv_3x3 opencl kernel (#2601) * [LITE][OPENCL] Add depthwise_conv_3x3 opencl kernel * [LITE][OPENCL] Add depthwise_conv_3x3 opencl kernel. test=develop * [LITE][OPENCL] Add Pool opencl kernel. test=develop --- lite/backends/opencl/cl_image_converter.h | 1 + lite/backends/opencl/cl_kernel/cl_common.h | 6 +- .../image/depthwise_conv2d_kernel.cl | 322 ++++++++++++++++++ .../opencl/cl_kernel/image/pool_kernel.cl | 56 +-- lite/backends/opencl/target_wrapper.cc | 21 +- lite/backends/opencl/target_wrapper.h | 3 +- lite/core/memory.h | 5 +- lite/core/tensor.h | 6 +- lite/kernels/opencl/CMakeLists.txt | 4 +- .../opencl/depthwise_conv2d_compute.cc | 236 +++++++++++++ .../opencl/depthwise_conv2d_compute_test.cc | 131 ++++++- lite/kernels/opencl/pool_compute.cc | 118 +++++++ lite/kernels/opencl/pool_compute_test.cc | 97 +++++- 13 files changed, 962 insertions(+), 44 deletions(-) create mode 100755 lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl diff --git a/lite/backends/opencl/cl_image_converter.h b/lite/backends/opencl/cl_image_converter.h index e318a0b86a..962eb8d3ef 100644 --- a/lite/backends/opencl/cl_image_converter.h +++ b/lite/backends/opencl/cl_image_converter.h @@ -114,6 +114,7 @@ class CLImageConverterNWBlock : public CLImageConverterBase { const DDim &tensor_dim) override; }; class CLImageConverterDWBlock : public CLImageConverterBase { + public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, float *image, diff --git a/lite/backends/opencl/cl_kernel/cl_common.h b/lite/backends/opencl/cl_kernel/cl_common.h index 815409eefd..8f60ea4503 100644 --- a/lite/backends/opencl/cl_kernel/cl_common.h +++ b/lite/backends/opencl/cl_kernel/cl_common.h @@ -40,10 +40,10 @@ limitations under the License. */ #define WRITE_IMG_TYPE(type_char, img, pos, value) \ _WRITE_IMG_TYPE(type_char, img, pos, value) -#define _READ_IMG_TYPE(type_char, img, pos, sampler) \ +#define _READ_IMG_TYPE(type_char, img, sampler, pos) \ read_image##type_char(img, sampler, pos) -#define READ_IMG_TYPE(type_char, img, pos, sampler) \ - _READ_IMG_TYPE(type_char, img, pos, sampler) +#define READ_IMG_TYPE(type_char, img, sampler, pos) \ + _READ_IMG_TYPE(type_char, img, sampler, pos) inline CL_DTYPE activation(CL_DTYPE in #ifdef PRELU diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl new file mode 100755 index 0000000000..8bb7be6a42 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -0,0 +1,322 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + + +#include + +__kernel void depth_conv_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input, + __read_only image2d_t filter, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int dilation, + __private const int input_c, + __private const int input_width,/* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + const int batch_index = out_nh / output_height; + + const int out_nh_in_one_batch = out_nh % output_height; + + + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); + + int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + +#ifdef BIASE_CH + CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); +#elif defined(BIASE_ELE) + CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos); +#else + CL_DTYPE4 output = 0.0f; +#endif + + const int filter_width = 3; + const int filter_height = 3; + + int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); + + int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height); + + int filter_x = pos_in_filter_block.x ; + int filter_y = pos_in_filter_block.y ; + + CL_DTYPE4 inputs[9]; + + inputs[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); + + inputs[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); + + inputs[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); + + inputs[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + /* + if (output_pos.x == 112 && output_pos.y == 0) { + CL_DTYPE4 input1 = inputs[3]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 3 - %v4hlf \n", in); + printf(" --- %d ---\n", in_pos_in_one_block.x - 1); + } + */ + + + inputs[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + inputs[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + inputs[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); + + inputs[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); + + inputs[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (CL_DTYPE4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); + + CL_DTYPE4 filters[9]; + filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y)); + filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y)); + filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y)); + filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1)); + filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1)); + filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1)); + filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2)); + filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2)); + filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2)); + + for(int i = 0 ;i < 9 ; i++){ + output += inputs[i] * filters[i]; + } +#ifdef BATCH_NORM + output = output * READ_IMG_TYPE(CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef RELU + output = activation(output); +#endif + + + /* + + if (output_pos.x == 112 && output_pos.y == 0) { + + for (int i = 0; i < 9; ++i) { + CL_DTYPE4 input1 = inputs[i]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 %d - %v4hlf \n", i, in); + } + + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" depth wise output output4 = %v4hlf \n", out); + printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x); + printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); + printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); + printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); + } + + */ + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); + +} + + + +__kernel void depth_conv_3x3s1(__private const int ou_ch_blk, + __private const int ou_w_blk, + __private const int ou_nh, + __read_only image2d_t input, + __read_only image2d_t filter, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int in_ch, + __private const int in_w,/* of one block */ + __private const int in_h, /* of one block */ + __private const int ou_w, + __private const int ou_h) { + + const int ou_ch_blk_id = get_global_id(0); + const int ou_w_blk_id = get_global_id(1); + const int ou_nh_id = get_global_id(2); + const int w_blk_size = 2; + + const int batch_id = ou_nh_id / ou_h; + int ou_col_id = ou_w_blk_id * w_blk_size; + int ou_row_id = ou_nh_id % ou_h; + int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id); + + // input pos in one block and on batch + int col_id = ou_col_id - pad; + int row_id = ou_row_id - pad; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + +#ifdef BIASE_CH + CL_DTYPE4 output[2]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_ch_blk_id, 0)); + output[1] = output[0]; +#elif defined(BIASE_ELE) + CL_DTYPE4 output[2]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x, ou_nh_id)); + if (ou_col_id + 1 < ou_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x + 1, ou_nh_id)); + } +#else + CL_DTYPE4 output[2] = {0.0f}; +#endif + + CL_DTYPE4 inputs[12]; + + int filter_x = ou_ch_blk_id * 3; + int filter_y = 0; + CL_DTYPE4 filters[9]; + filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y)); + filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y)); + filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y)); + + int in_x = mad24(ou_ch_blk_id, in_w, col_id); + int in_y = mad24(batch_id, in_h, row_id); + + int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h); + int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w); + inputs[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y0)); + int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w); + inputs[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y0)); + int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w); + inputs[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y0)); + int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w); + inputs[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y0)); + + output[0] = mad(inputs[0], filters[0], output[0]); + output[1] = mad(inputs[1], filters[0], output[1]); + + output[0] = mad(inputs[1], filters[1], output[0]); + output[1] = mad(inputs[2], filters[1], output[1]); + + output[0] = mad(inputs[2], filters[2], output[0]); + output[1] = mad(inputs[3], filters[2], output[1]); + + + filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 1)); + filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 1)); + filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 1)); + + + int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h); + inputs[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y1)); + inputs[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y1)); + inputs[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y1)); + inputs[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y1)); + + + output[0] = mad(inputs[4], filters[3], output[0]); + output[1] = mad(inputs[5], filters[3], output[1]); + + output[0] = mad(inputs[5], filters[4], output[0]); + output[1] = mad(inputs[6], filters[4], output[1]); + + output[0] = mad(inputs[6], filters[5], output[0]); + output[1] = mad(inputs[7], filters[5], output[1]); + + + filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x,filter_y + 2)); + filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 1,filter_y + 2)); + filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler,(int2)(filter_x + 2,filter_y + 2)); + + int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h); + inputs[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y2)); + inputs[9] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y2)); + inputs[10] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y2)); + inputs[11] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y2)); + + + output[0] = mad(inputs[8], filters[6], output[0]); + output[1] = mad(inputs[9], filters[6], output[1]); + + output[0] = mad(inputs[9], filters[7], output[0]); + output[1] = mad(inputs[10], filters[7], output[1]); + + output[0] = mad(inputs[10], filters[8], output[0]); + output[1] = mad(inputs[11], filters[8], output[1]); +#ifdef BATCH_NORM + CL_DTYPE4 scale = READ_IMG_TYPE(CL_DTYPE_CHAR, new_scale, sampler, (int2)(ou_ch_blk_id, 0)); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(ou_ch_blk_id, 0)); + output[0] = mad(scale, output[0], biase); + if (ou_col_id + 1 < ou_w) { + output[1] = mad(scale, output[1], biase); + } +#endif + +#ifdef RELU + output[0] = activation(output[0]); + output[1] = activation(output[1]); +#endif + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]); + if (ou_col_id + 1 < ou_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x + 1, ou_nh_id), output[1]); + } + +} + diff --git a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl index 0ca3b9141d..4a7c53c980 100644 --- a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl @@ -12,15 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #define MIN_VALUE -FLT_MAX -__kernel void pool_max( - __private const int in_height, __private const int in_width, - __private const int out_height, __private const int out_width, - __private const int pad_top, __private const int pad_left, - __private const int stride_h, __private const int stride_w, - __private const int ksize_h, __private const int ksize_w, - __read_only image2d_t input, __write_only image2d_t output) { +__kernel void pool_max(__read_only image2d_t input, + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -40,25 +46,30 @@ __kernel void pool_max( const int pos_in_x = out_c * in_width; const int pos_in_y = out_n * in_height; - float4 max_value = (float4)(MIN_VALUE); + CL_DTYPE4 max_value = (CL_DTYPE4)(MIN_VALUE); for (int y = start_h; y < end_h; ++y) { for (int x = start_w; x < end_w; ++x) { - float4 tmp = read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + CL_DTYPE4 tmp = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); max_value = max(max_value, tmp); } } const int pos_out_x = mad24(out_c, out_width, out_w); - write_imagef(output, (int2)(pos_out_x, out_nh), max_value); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_out_x, out_nh), max_value); } -__kernel void pool_avg( - __private const int in_height, __private const int in_width, - __private const int out_height, __private const int out_width, - __private const int pad_top, __private const int pad_left, - __private const int stride_h, __private const int stride_w, - __private const int ksize_h, __private const int ksize_w, - __read_only image2d_t input, __write_only image2d_t output) { +__kernel void pool_avg(__read_only image2d_t input, + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -76,15 +87,14 @@ __kernel void pool_avg( const int pos_in_x = out_c * in_width; const int pos_in_y = out_n * in_height; - float4 sum = (float4)(0.0f); - int num = 0; + CL_DTYPE4 sum = (CL_DTYPE4)(0.0f); + for (int y = start_h; y < end_h; ++y) { for (int x = start_w; x < end_w; ++x) { - sum += read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); - num++; + sum += READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); } } - float4 avg = sum / num; + CL_DTYPE4 avg = sum / (ksize_h * ksize_w); const int pos_out_x = mad24(out_c, out_width, out_w); - write_imagef(output, (int2)(pos_out_x, out_nh), avg); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_out_x, out_nh), avg); } diff --git a/lite/backends/opencl/target_wrapper.cc b/lite/backends/opencl/target_wrapper.cc index 575f87d0f8..0dece6c582 100644 --- a/lite/backends/opencl/target_wrapper.cc +++ b/lite/backends/opencl/target_wrapper.cc @@ -58,17 +58,18 @@ void TargetWrapperCL::Free(void *ptr) { template <> void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, - const size_t cl_image2d_height) { + const size_t cl_image2d_height, + void *host_ptr) { cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFloat))); cl_int status; cl::Image2D *cl_image = new cl::Image2D(CLRuntime::Global()->context(), - CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, + CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), img_format, cl_image2d_width, cl_image2d_height, 0, - nullptr, + host_ptr, &status); if (status != CL_SUCCESS) { delete cl_image; @@ -80,17 +81,18 @@ void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, template <> void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, - const size_t cl_image2d_height) { + const size_t cl_image2d_height, + void *host_ptr) { cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kInt8))); cl_int status; cl::Image2D *cl_image = new cl::Image2D(CLRuntime::Global()->context(), - CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, + CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), img_format, cl_image2d_width, cl_image2d_height, 0, - nullptr, + host_ptr, &status); if (status != CL_SUCCESS) { delete cl_image; @@ -102,17 +104,18 @@ void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, template <> void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, - const size_t cl_image2d_height) { + const size_t cl_image2d_height, + void *host_ptr) { cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kInt32))); cl_int status; cl::Image2D *cl_image = new cl::Image2D(CLRuntime::Global()->context(), - CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, + CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), img_format, cl_image2d_width, cl_image2d_height, 0, - nullptr, + host_ptr, &status); if (status != CL_SUCCESS) { delete cl_image; diff --git a/lite/backends/opencl/target_wrapper.h b/lite/backends/opencl/target_wrapper.h index 7753448052..c5ff9e900a 100644 --- a/lite/backends/opencl/target_wrapper.h +++ b/lite/backends/opencl/target_wrapper.h @@ -48,7 +48,8 @@ class TargetWrapper { template static void* MallocImage(const size_t cl_image2d_width, - const size_t cl_image2d_height); + const size_t cl_image2d_height, + void* host_ptr = nullptr); static void FreeImage(void* image); static void* Map(void* buffer, size_t offset, size_t size); diff --git a/lite/core/memory.h b/lite/core/memory.h index cb4ac044e7..18b9958911 100644 --- a/lite/core/memory.h +++ b/lite/core/memory.h @@ -100,13 +100,14 @@ class Buffer { template void ResetLazyImage2D(TargetType target, const size_t img_w, - const size_t img_h) { + const size_t img_h, + void* host_ptr = nullptr) { size_t size = sizeof(T) * img_w * img_h * 4; // 4 for RGBA, un-used for opencl Image2D if (target != target_ || cl_image2d_width_ < img_w || cl_image2d_height_ < img_h) { Free(); - data_ = TargetWrapperCL::MallocImage(img_w, img_h); + data_ = TargetWrapperCL::MallocImage(img_w, img_h, host_ptr); target_ = target; space_ = size; // un-used for opencl Image2D cl_image2d_width_ = img_w; diff --git a/lite/core/tensor.h b/lite/core/tensor.h index 8c4fe1604a..ca2e0e9a98 100644 --- a/lite/core/tensor.h +++ b/lite/core/tensor.h @@ -147,9 +147,11 @@ class TensorLite { #ifdef LITE_WITH_OPENCL template - R *mutable_data(const size_t img_w, const size_t img_h) { + R *mutable_data(const size_t img_w, + const size_t img_h, + void *host_ptr = nullptr) { target_ = TARGET(kOpenCL); - buffer_->ResetLazyImage2D(target_, img_w, img_h); + buffer_->ResetLazyImage2D(target_, img_w, img_h, host_ptr); return static_cast(buffer_->data()); } #endif diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index ebdd512597..c1b9798fc6 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -23,7 +23,7 @@ lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) lite_cc_test(test_pool_opencl SRCS pool_compute_test.cc - DEPS pool_opencl op_registry program context + DEPS pool_opencl op_registry program context cl_image_converter ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) lite_cc_test(test_fc_opencl SRCS fc_compute_test.cc @@ -45,7 +45,7 @@ lite_cc_test(test_io_copy_compute_opencl SRCS io_copy_compute_test.cc # ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc - DEPS depthwise_conv2d_opencl op_registry program context + DEPS depthwise_conv2d_opencl op_registry program context cl_image_converter ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) lite_cc_test(test_conv2d_1x1_opencl SRCS conv2d_1x1_compute_test.cc diff --git a/lite/kernels/opencl/depthwise_conv2d_compute.cc b/lite/kernels/opencl/depthwise_conv2d_compute.cc index ed942d7f0c..ac1b1e715d 100644 --- a/lite/kernels/opencl/depthwise_conv2d_compute.cc +++ b/lite/kernels/opencl/depthwise_conv2d_compute.cc @@ -16,6 +16,7 @@ #include "lite/backends/opencl/cl_include.h" #include "lite/core/kernel.h" #include "lite/core/op_registry.h" +#include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" #include "lite/utils/replace_stl/stream.h" @@ -114,6 +115,216 @@ class DepthwiseConv2dCompute std::shared_ptr event_{new cl::Event}; }; +class DepthwiseConv2dComputeFP16Image + : public KernelLite { + public: + using param_t = operators::ConvParam; + + void PrepareForRun() override { + const auto& param = *param_.get_mutable(); + if (param.fuse_relu) { + build_options_ += " -DRELU"; + } + auto& context = ctx_->As(); + context.cl_context()->AddKernel( + kernel_func_name_, "image/depthwise_conv2d_kernel.cl", build_options_); + } + + void Run() override { + const auto& param = *param_.get_mutable(); + auto x_dims = param.x->dims(); + auto filter_dims = param.filter->dims(); + auto output_dims = param.output->dims(); + auto paddings = *param.paddings; + auto strides = param.strides; + auto dilations = *param.dilations; + int offset = filter_dims[2] / 2 - paddings[0]; + int input_c_block = (x_dims[1] + 3) / 4; + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + auto* input_img = param.x->data(); + auto* filter_img = param.filter->data(); + + auto* bias_img = param.bias == nullptr + ? static_cast(nullptr) + : param.bias->data(); + + auto image_shape = InitImageDimInfoWith(output_dims); + + auto* output_img = param.output->mutable_data( + image_shape["width"], image_shape["height"]); + + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + + int c_block = (output_dims[1] + 3) / 4; + int w = output_dims[3]; + int nh = output_dims[0] * output_dims[2]; + auto global_work_size = cl::NDRange(c_block, w, nh); + + LOG(INFO) << "setArg"; + LOG(INFO) << "c_block = " << c_block; + LOG(INFO) << "w = " << w; + LOG(INFO) << "nh = " << nh; + + LOG(INFO) << "strides = " << strides[0]; + LOG(INFO) << "offset = " << offset; + LOG(INFO) << "dilations = " << dilations[0]; + LOG(INFO) << "input_c_block = " << input_c_block; + LOG(INFO) << "x_dims[3] = " << x_dims[3]; + LOG(INFO) << "x_dims[2] = " << x_dims[2]; + LOG(INFO) << "output_dims[3] = " << output_dims[3]; + LOG(INFO) << "output_dims[2] = " << output_dims[2]; + + cl_int status; + int arg_idx = 0; + status = kernel.setArg(arg_idx, static_cast(c_block)); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(w)); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(nh)); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *input_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *filter_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *output_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(strides[0])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(offset)); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(dilations[0])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(input_c_block)); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(x_dims[3])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(x_dims[2])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(output_dims[3])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); + CL_CHECK_FATAL(status); + + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(output_img, event_); + } + + private: + std::string kernel_func_name_{"depth_conv_3x3"}; + std::string build_options_{"-DCL_DTYPE_float"}; + std::shared_ptr event_{new cl::Event}; +}; + +class DepthwiseConv2d3x3s1ComputeFP16Image + : public KernelLite { + public: + using param_t = operators::ConvParam; + + void PrepareForRun() override { + const auto& param = *param_.get_mutable(); + if (param.fuse_relu) { + build_options_ += " -DRELU"; + } + auto& context = ctx_->As(); + context.cl_context()->AddKernel( + kernel_func_name_, "image/depthwise_conv2d_kernel.cl", build_options_); + } + + void Run() override { + const auto& param = *param_.get_mutable(); + auto x_dims = param.x->dims(); + auto filter_dims = param.filter->dims(); + auto output_dims = param.output->dims(); + auto paddings = *param.paddings; + auto strides = param.strides; + auto dilations = *param.dilations; + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + auto* input_img = param.x->data(); + auto* filter_img = param.filter->data(); + + auto* bias_img = param.bias == nullptr + ? static_cast(nullptr) + : param.bias->data(); + + auto image_shape = InitImageDimInfoWith(output_dims); + + auto* output_img = param.output->mutable_data( + image_shape["width"], image_shape["height"]); + + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + + int c_block = (output_dims[1] + 3) / 4; + int w = output_dims[3]; + int nh = output_dims[0] * output_dims[2]; + + int w_blk_size = 2; + int w_blk = (w + w_blk_size - 1) / w_blk_size; + + auto global_work_size = cl::NDRange(c_block, w_blk, nh); + + cl_int status; + int arg_idx = 0; + status = kernel.setArg(arg_idx, static_cast(c_block)); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(w_blk)); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(nh)); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *input_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *filter_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *output_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(strides[0])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(paddings[0])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(dilations[0])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(x_dims[1])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(x_dims[3])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(x_dims[2])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(output_dims[3])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); + CL_CHECK_FATAL(status); + + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(output_img, event_); + } + + private: + std::string kernel_func_name_{"depth_conv_3x3s1"}; + std::string build_options_{"-DCL_DTYPE_float"}; + std::shared_ptr event_{new cl::Event}; +}; + } // namespace opencl } // namespace kernels } // namespace lite @@ -130,3 +341,28 @@ REGISTER_LITE_KERNEL(depthwise_conv2d, .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .Finalize(); + +REGISTER_LITE_KERNEL( + depthwise_conv2d, + kOpenCL, + kFloat, + kNHWC, + paddle::lite::kernels::opencl::DepthwiseConv2dComputeFP16Image, + image2d) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Bias", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Filter", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/opencl/depthwise_conv2d_compute_test.cc b/lite/kernels/opencl/depthwise_conv2d_compute_test.cc index 3556d1abed..6ab78d4f25 100644 --- a/lite/kernels/opencl/depthwise_conv2d_compute_test.cc +++ b/lite/kernels/opencl/depthwise_conv2d_compute_test.cc @@ -14,6 +14,7 @@ #include #include +#include "lite/backends/opencl/cl_image_converter.h" #include "lite/backends/opencl/target_wrapper.h" #include "lite/core/op_registry.h" #include "lite/core/tensor.h" @@ -89,7 +90,7 @@ void depth_conv(const T* input_data, } } -TEST(depthwise_conv2d, compute) { +TEST(depthwise_conv2d, compute_buffer) { LOG(INFO) << "to get kernel ..."; auto kernels = KernelRegistry::Global().Create("depthwise_conv2d", TARGET(kOpenCL), @@ -176,7 +177,135 @@ TEST(depthwise_conv2d, compute) { TargetWrapperCL::Unmap(input_data, mapped_input); } +TEST(depthwise_conv2d, compute_image2d) { + LOG(INFO) << "to get kernel ..."; + auto kernels = KernelRegistry::Global().Create("depthwise_conv2d", + TARGET(kOpenCL), + PRECISION(kFloat), + DATALAYOUT(kNHWC)); + ASSERT_FALSE(kernels.empty()); + + auto kernel = std::move(kernels.front()); + + LOG(INFO) << "get kernel"; + lite::Tensor input, filter, output; + operators::ConvParam param; + param.x = &input; + param.filter = &filter; + param.output = &output; + std::vector paddings = {0, 0}; + param.paddings = std::make_shared>(paddings); + param.strides = std::vector{1, 1}; + std::vector dilations = {1, 1}; + param.dilations = std::make_shared>(dilations); + + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + kernel->SetParam(param); + std::unique_ptr dep_context(new KernelContext); + context->As().CopySharedTo( + &(dep_context->As())); + kernel->SetContext(std::move(dep_context)); + + LOG(INFO) << "kernel ready"; + std::default_random_engine engine; + std::uniform_real_distribution gen(-5, 5); + std::vector input_v(1 * 32 * 112 * 112); + std::vector filter_v(32 * 1 * 3 * 3); + for (auto& i : input_v) { + i = gen(engine); + } + for (auto& f : filter_v) { + f = gen(engine); + } + + LOG(INFO) << "prepare input"; + input.Resize({1, 32, 112, 112}); + CLImageConverterDefault* default_converter = new CLImageConverterDefault(); + DDim input_image_shape = + default_converter->InitImageDimInfoWith(input.dims()); + LOG(INFO) << "input_image_shape = " << input_image_shape[0] << " " + << input_image_shape[1]; + std::vector input_image_data(input_image_shape.production() * + 4); // 4 : RGBA + default_converter->NCHWToImage( + input_v.data(), input_image_data.data(), input.dims()); + auto* input_image = input.mutable_data( + input_image_shape[0], input_image_shape[1], input_image_data.data()); + + LOG(INFO) << "prepare kernel"; + filter.Resize({32, 1, 3, 3}); + CLImageConverterNWBlock* nw_converter = new CLImageConverterNWBlock(); + DDim filter_image_shape = nw_converter->InitImageDimInfoWith(filter.dims()); + LOG(INFO) << "filter_image_shape = " << filter_image_shape[0] << " " + << filter_image_shape[1]; + std::vector filter_image_data(filter_image_shape.production() * + 4); // 4 : RGBA + nw_converter->NCHWToImage( + filter_v.data(), filter_image_data.data(), filter.dims()); + auto* filter_image = filter.mutable_data( + filter_image_shape[0], filter_image_shape[1], filter_image_data.data()); + + LOG(INFO) << "launch"; + output.Resize({1, 32, 110, 110}); + DDim output_image_shape = + default_converter->InitImageDimInfoWith(output.dims()); + LOG(INFO) << "output_image_shape = " << output_image_shape[0] << " " + << output_image_shape[1]; + auto* output_image = output.mutable_data( + output_image_shape[0], output_image_shape[1]); + + kernel->Launch(); + + auto* wait_list = context->As().cl_wait_list(); + auto* out_ptr = param.output->data(); + auto it = wait_list->find(out_ptr); + if (it != wait_list->end()) { + VLOG(4) << "--- Find the sync event for the target cl tensor. ---"; + LOG(INFO) << "--- Find the sync event for the target cl tensor. ---"; + auto& event = *(it->second); + event.wait(); + } else { + LOG(FATAL) << "Could not find the sync event for the target cl tensor."; + LOG(INFO) << "Could not find the sync event for the target cl tensor."; + } + + lite::Tensor output_ref; + output_ref.Resize({1, 32, 110, 110}); + auto* output_ref_data = output_ref.mutable_data(TARGET(kARM)); + depth_conv(input_v.data(), + input.dims(), + filter_v.data(), + filter.dims(), + output_ref_data, + output_ref.dims()); + + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + + float* output_image_data = new float[output_image_shape.production() * 4]; + TargetWrapperCL::ImgcpySync(output_image_data, + output_image, + output_image_shape[0], + output_image_shape[1], + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + + float* output_data = new float[output_image_shape.production() * 4]; + default_converter->ImageToNCHW( + output_image_data, output_data, output_image_shape, output.dims()); + + LOG(INFO) << "output_data vs output_ref_data"; + for (int i = 0; i < output.dims().production(); i++) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-4); + LOG(INFO) << output_data[i] << " " << output_ref_data[i]; + } +} + } // namespace lite } // namespace paddle USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFloat, kNCHW, def); +USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFloat, kNHWC, image2d); diff --git a/lite/kernels/opencl/pool_compute.cc b/lite/kernels/opencl/pool_compute.cc index d275b312d6..8cdc127a37 100644 --- a/lite/kernels/opencl/pool_compute.cc +++ b/lite/kernels/opencl/pool_compute.cc @@ -16,6 +16,7 @@ #include "lite/backends/opencl/cl_include.h" #include "lite/core/kernel.h" #include "lite/core/op_registry.h" +#include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" #include "lite/utils/replace_stl/stream.h" #include "lite/utils/string.h" @@ -117,6 +118,107 @@ class PoolCompute std::shared_ptr event_{new cl::Event}; }; +class PoolComputeImage2D + : public KernelLite { + public: + using param_t = operators::PoolParam; + + void PrepareForRun() override { + const auto& param = *param_.get_mutable(); + kernel_func_name_ += param.pooling_type; + auto& context = ctx_->As(); + context.cl_context()->AddKernel( + kernel_func_name_, "image/pool_kernel.cl", build_options_); + } + + void Run() override { + const auto& param = *param_.get_mutable(); + const auto& in_dims = param.x->dims(); + const auto& out_dims = param.output->dims(); + const std::string pooling_type = param.pooling_type; + const bool global_pooling = param.global_pooling; + std::vector paddings = *param.paddings; + std::vector strides = param.strides; + std::vector ksize = param.ksize; + if (global_pooling) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[2 * i] = 0; + paddings[2 * i + 1] = 0; + ksize[i] = static_cast(in_dims[i + 2]); + } + } + bool pads_equal = + (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]); + if (!pads_equal) { + LOG(FATAL) + << "padding requires pad_left == pad_right, pad_top == pad_bottom"; + } + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + + auto* x_img = param.x->data(); + LOG(INFO) << "x_image" << x_img; + + auto out_image_shape = InitImageDimInfoWith(out_dims); + LOG(INFO) << "out_image_shape = " << out_image_shape["width"] << " " + << out_image_shape["height"]; + auto* out_img = param.output->mutable_data( + out_image_shape["width"], out_image_shape["height"]); + LOG(INFO) << "out_image" << out_img; + + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + + int c_block = (out_dims[1] + 3) / 4; + int w = out_dims[3]; + int nh = out_dims[0] * out_dims[2]; + auto global_work_size = cl::NDRange(c_block, w, nh); + + cl_int status; + int arg_idx = 0; + status = kernel.setArg(arg_idx, *x_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *out_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(in_dims[2])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(in_dims[3])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(out_dims[2])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(out_dims[3])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(ksize[0])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(ksize[1])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(strides[0])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(strides[1])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(paddings[2])); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(paddings[0])); + CL_CHECK_FATAL(status); + + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(out_img, event_); + } + + private: + std::string kernel_func_name_{"pool_"}; + std::string build_options_{"-DCL_DTYPE_float"}; + std::shared_ptr event_{new cl::Event}; +}; + } // namespace opencl } // namespace kernels } // namespace lite @@ -131,3 +233,19 @@ REGISTER_LITE_KERNEL(pool2d, .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .Finalize(); + +REGISTER_LITE_KERNEL(pool2d, + kOpenCL, + kFloat, + kNHWC, + paddle::lite::kernels::opencl::PoolComputeImage2D, + image2d) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/opencl/pool_compute_test.cc b/lite/kernels/opencl/pool_compute_test.cc index 25f0e72634..269c31d3bd 100644 --- a/lite/kernels/opencl/pool_compute_test.cc +++ b/lite/kernels/opencl/pool_compute_test.cc @@ -73,7 +73,7 @@ void pool_avg(const int padding_height, } } -TEST(pool2d, compute) { +TEST(pool2d, compute_buffer) { LOG(INFO) << "to get kernel ..."; auto kernels = KernelRegistry::Global().Create( "pool2d", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)); @@ -143,7 +143,102 @@ TEST(pool2d, compute) { TargetWrapperCL::Unmap(out_data, mapped_out); } +TEST(pool2d, compute_image2d) { + LOG(INFO) << "to get kernel ..."; + auto kernels = KernelRegistry::Global().Create( + "pool2d", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)); + ASSERT_FALSE(kernels.empty()); + + auto kernel = std::move(kernels.front()); + + LOG(INFO) << "get kernel"; + + lite::Tensor x, out; + operators::PoolParam param; + param.x = &x; + param.output = &out; + param.global_pooling = false; + param.pooling_type = "avg"; + std::vector paddings = {0, 0, 0, 0}; + param.strides = std::vector{1, 1}; + param.ksize = std::vector{7, 7}; + param.paddings = std::make_shared>(paddings); + + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + kernel->SetParam(param); + std::unique_ptr pool_context(new KernelContext); + context->As().CopySharedTo( + &(pool_context->As())); + kernel->SetContext(std::move(pool_context)); + + const DDim in_dim = DDim(std::vector{4, 11, 107, 107}); + const DDim out_dim = DDim(std::vector{4, 11, 101, 101}); + x.Resize(in_dim); + out.Resize(out_dim); + + std::default_random_engine engine; + std::uniform_real_distribution dist(-5, 5); + std::vector input_v(4 * 11 * 107 * 107); + for (auto& i : input_v) { + i = dist(engine); + } + + LOG(INFO) << "prepare input"; + CLImageConverterDefault* default_converter = new CLImageConverterDefault(); + DDim x_image_shape = default_converter->InitImageDimInfoWith(in_dim); + LOG(INFO) << "x_image_shape = " << x_image_shape[0] << " " + << x_image_shape[1]; + std::vector x_image_data(x_image_shape.production() * 4); // 4 : RGBA + default_converter->NCHWToImage(input_v.data(), x_image_data.data(), in_dim); + auto* x_image = x.mutable_data( + x_image_shape[0], x_image_shape[1], x_image_data.data()); + LOG(INFO) << "x_image" << x_image; + + DDim out_image_shape = default_converter->InitImageDimInfoWith(out_dim); + LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " " + << out_image_shape[1]; + auto* out_image = out.mutable_data(out_image_shape[0], + out_image_shape[1]); + LOG(INFO) << "out_image" << out_image; + kernel->Launch(); + + auto* wait_list = context->As().cl_wait_list(); + auto* out_ptr = param.output->data(); + auto it = wait_list->find(out_ptr); + if (it != wait_list->end()) { + VLOG(4) << "--- Find the sync event for the target cl tensor. ---"; + auto& event = *(it->second); + event.wait(); + } else { + LOG(FATAL) << "Could not find the sync event for the target cl tensor."; + } + + std::unique_ptr out_ref(new float[out_dim.production()]); + pool_avg(0, 0, 1, 1, 7, 7, input_v.data(), in_dim, out_ref.get(), out_dim); + + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + float* out_image_data = new float[out_image_shape.production() * 4]; + TargetWrapperCL::ImgcpySync(out_image_data, + out_image, + out_image_shape[0], + out_image_shape[1], + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + float* out_data = new float[out_image_shape.production() * 4]; + default_converter->ImageToNCHW( + out_image_data, out_data, out_image_shape, out_dim); + + for (int i = 0; i < out_dim.production(); i++) { + EXPECT_NEAR(out_data[i], out_ref[i], 1e-6); + } +} + } // namespace lite } // namespace paddle USE_LITE_KERNEL(pool2d, kOpenCL, kFloat, kNCHW, def); +USE_LITE_KERNEL(pool2d, kOpenCL, kFloat, kNHWC, image2d); -- GitLab