From c899d438fd110755ede0367a66f479c743e8df52 Mon Sep 17 00:00:00 2001 From: liuqi Date: Sun, 3 Dec 2017 12:08:49 +0800 Subject: [PATCH] Finish max/avg pooling opencl kernel. --- mace/kernels/neon/pooling_neon.cc | 46 ++-- mace/kernels/opencl/cl/common.h | 3 + mace/kernels/opencl/cl/conv_2d.cl | 13 +- mace/kernels/opencl/cl/conv_2d_3x3.cl | 10 +- mace/kernels/opencl/cl/pooling.cl | 254 ++++++------------- mace/kernels/opencl/conv_2d_opencl_3x3.cc | 1 - mace/kernels/opencl/pooling_opencl.cc | 157 +++++------- mace/kernels/pooling.h | 121 +++++---- mace/ops/pooling.h | 15 -- mace/ops/pooling_test.cc | 286 ++++++++++------------ 10 files changed, 384 insertions(+), 522 deletions(-) diff --git a/mace/kernels/neon/pooling_neon.cc b/mace/kernels/neon/pooling_neon.cc index 0f916234..76868335 100644 --- a/mace/kernels/neon/pooling_neon.cc +++ b/mace/kernels/neon/pooling_neon.cc @@ -58,19 +58,27 @@ void PoolingFunctor::operator()( const Tensor *input_tensor, Tensor *output_tensor) { + std::vector output_shape(4); + std::vector paddings(2); + std::vector filter_shape(4); + filter_shape[0] = input_tensor->shape()[1]; + filter_shape[1] = input_tensor->shape()[1]; + filter_shape[2] = kernels_[0]; + filter_shape[3] = kernels_[1]; + + kernels::CalcPaddingAndOutputSize( + input_tensor->shape().data(), filter_shape.data(), this->dilations_, + strides_, this->padding_, output_shape.data(), + paddings.data()); + output_tensor->Resize(output_shape); + const float *input = input_tensor->data(); float *output = output_tensor->mutable_data(); const index_t *input_shape = input_tensor->shape().data(); - const index_t *output_shape = output_tensor->shape().data(); - int paddings[2]; - std::vector filter_shape = {input_shape[1], input_shape[0], - kernels_[0], kernels_[1]}; - kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_, - strides_, this->padding_, paddings); #ifdef __COPY_MAKE_PADDING Tensor padded_input; - ConstructInputWithPadding(input_tensor, paddings, &padded_input); + ConstructInputWithPadding(input_tensor, paddings.data(), &padded_input); input = padded_input.data(); input_shape = padded_input.shape().data(); #endif @@ -80,17 +88,17 @@ void PoolingFunctor::operator()( // kernel_size: 2x2, strides: 2x2 if (pooling_type_ == MAX) { // MAX_POOL_2x2s2x2 #ifdef __COPY_MAKE_PADDING - PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape); + PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data()); #else - PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape, - paddings); + PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape.data(), + paddings.data()); #endif } else { // AVG_POOL_2x2s2x2 #ifdef __COPY_MAKE_PADDING - PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape); + PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data()); #else - PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape, - paddings); + PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape.data(), + paddings.data()); #endif } } else if (kernels_[0] == 3 && kernels_[1] == 3 && strides_[0] == 2 && @@ -98,17 +106,17 @@ void PoolingFunctor::operator()( // kernel_size: 3x3, strides: 2x2 if (pooling_type_ == MAX) { // MAX_POOL_3x3s2x2 #ifdef __COPY_MAKE_PADDING - PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape); + PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data()); #else - PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape, - paddings); + 
PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape.data(), + paddings.data()); #endif } else { // AVG_POOL_3x3s2x2 #ifdef __COPY_MAKE_PADDING - PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape); + PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data()); #else - PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape, - paddings); + PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape.data(), + paddings.data()); #endif } } else { // not implement yet diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h index 13d99c41..499c8164 100644 --- a/mace/kernels/opencl/cl/common.h +++ b/mace/kernels/opencl/cl/common.h @@ -18,4 +18,7 @@ #define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE) #define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE) + +__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + #endif // MACE_KERNELS_OPENCL_CL_COMMON_H_ diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl index d4a65670..e5ddb3d7 100644 --- a/mace/kernels/opencl/cl/conv_2d.cl +++ b/mace/kernels/opencl/cl/conv_2d.cl @@ -57,15 +57,14 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ // Unrolling this loop hurt perfmance for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) { - for (short width_idx = 0; width_idx < filter_width; ++width_idx) { - - in_idx = in_ch_blk * in_width; - int in_hb_value = height_idx + hb_idx; - in_hb_value = select(in_hb_value + batch_idx, - -1, - (in_hb_value < 0 || in_hb_value >= in_height)); + int in_hb_value = height_idx + hb_idx; + in_hb_value = select(in_hb_value + batch_idx, + -1, + (in_hb_value < 0 || in_hb_value >= in_height)); + for (short width_idx = 0; width_idx < filter_width; ++width_idx) { + in_idx = in_ch_blk * in_width; int in_width_value; #define READ_INPUT(i) \ in_width_value = in_width##i + width_idx; \ diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index 98847ab0..08bf04d3 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -59,15 +59,13 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] // Unrolling this loop hurt perfmance for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { for (short hb_idx = 0; hb_idx < 3; ++hb_idx) { + int in_hb_value = height_idx + hb_idx; + in_hb_value = select(in_hb_value + batch_idx, + -1, + (in_hb_value < 0 || in_hb_value >= in_height)); for (short width_idx = 0; width_idx < 3; ++width_idx) { in_idx = in_ch_blk * in_width; - - int in_hb_value = height_idx + hb_idx; - in_hb_value = select(in_hb_value + batch_idx, - -1, - (in_hb_value < 0 || in_hb_value >= in_height)); - int in_width_value; #define READ_INPUT(i) \ in_width_value = in_width##i + width_idx; \ diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl index bc987ddd..b55202d3 100644 --- a/mace/kernels/opencl/cl/pooling.cl +++ b/mace/kernels/opencl/cl/pooling.cl @@ -1,193 +1,87 @@ #include -VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s1(const DATA_TYPE *input_ptr, const int in_width) { - VEC_DATA_TYPE(DATA_TYPE,4) row00 = vload4(0, input_ptr); - VEC_DATA_TYPE(DATA_TYPE,2) row01 = vload2(0, input_ptr + 4); - VEC_DATA_TYPE(DATA_TYPE,4) row10 = vload4(0, input_ptr + in_width); - VEC_DATA_TYPE(DATA_TYPE,2) row11 = vload2(0, input_ptr + in_width + 4); - 
VEC_DATA_TYPE(DATA_TYPE,4) row20 = vload4(0, input_ptr + in_width * 2); - VEC_DATA_TYPE(DATA_TYPE,2) row21 = vload2(0, input_ptr + in_width * 2 + 4); - - VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01212323); - VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row01.s0, row00.s3, row01.s01); - VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01212323); - VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row11.s0, row10.s3, row11.s01); - VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01212323); - VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row21.s0, row20.s3, row21.s01); - - VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20); - VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21); - - VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1), - (VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2)); - res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03)); - - return res; -} - -VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s2(const DATA_TYPE *input_ptr, const int in_width) { - VEC_DATA_TYPE(DATA_TYPE,8) row00 = vload8(0, input_ptr); - DATA_TYPE row01 = *(input_ptr + 8); - VEC_DATA_TYPE(DATA_TYPE,8) row10 = vload8(0, input_ptr + in_width); - DATA_TYPE row11 = *(input_ptr + in_width + 8); - VEC_DATA_TYPE(DATA_TYPE,8) row20 = vload8(0, input_ptr + in_width * 2); - DATA_TYPE row21 = *(input_ptr + in_width * 2 + 8); - - VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01223445); - VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row00.s667, row01); - VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01223445); - VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row10.s667, row11); - VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01223445); - VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row20.s667, row21); - - VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20); - VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21); - - VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1), - (VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2)); - res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03)); - - return res; -} - -DATA_TYPE inner_pooling_3(const DATA_TYPE *input_ptr, const int in_width) { - VEC_DATA_TYPE(DATA_TYPE,3) row0 = vload3(0, input_ptr); - VEC_DATA_TYPE(DATA_TYPE,3) row1 = vload3(0, input_ptr + in_width); - VEC_DATA_TYPE(DATA_TYPE,3) row2 = vload3(0, input_ptr + in_width * 2); - - VEC_DATA_TYPE(DATA_TYPE,3) data = fmax(fmax(row0, row1), row2); - - DATA_TYPE res = fmax(fmax(data.s0, data.s1), data.s2); - return res; -} - -// Supported data type: half/float -__kernel void pooling3(__global const DATA_TYPE *input, /* n, c, h, w */ - __private const int in_height, - __private const int in_width, - __private const int out_chan_num, - __private const int out_height, - __private const int out_width, - __private const int stride, - __global DATA_TYPE *output) { - int batch = get_global_id(0); - int out_chan_blk = get_global_id(1); - int out_pixel_blk = get_global_id(2); - - const int round_out_width = (out_width + 3) / 4; - const int out_pixel_height = out_pixel_blk / round_out_width; - const int out_pixel_width = out_pixel_blk % round_out_width; - - const int out_chan_begin = out_chan_blk * 4; - const int 
out_chan_end = min(out_chan_begin + 4, out_chan_num); - const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; - const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); - const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4; - - const int in_pixel = in_height * in_width; - const int out_pixel = out_height * out_width; - - const int in_offset = batch * out_chan_num * in_pixel; - const int out_offset = batch * out_chan_num * out_pixel; - const DATA_TYPE *input_base = input + in_offset + in_pixel_begin; - DATA_TYPE *output_base = output + out_offset + out_pixel_begin; - - const int pixels = out_pixel_end - out_pixel_begin; - - for (int i = out_chan_begin; i < out_chan_end; ++i) { - const DATA_TYPE *input_ptr = input_base + i * in_pixel; - DATA_TYPE *output_ptr = output_base + i * out_pixel; - if (pixels == 4) { - VEC_DATA_TYPE(DATA_TYPE,4) res; -#ifdef STRIDE_1 - res = vec_pooling_3_s1(input_ptr, in_width); +#ifdef FP16 +#define MIN_VALUE -HALF_MAX #else - res = vec_pooling_3_s2(input_ptr, in_width); +#define MIN_VALUE -FLT_MAX #endif - vstore4(res, 0, output_ptr); - } else { - for (int p = 0; p < pixels; ++p) { - output_ptr[p] = inner_pooling_3(input_ptr, in_width); - input_ptr += stride; - } - } - } -} -int calculate_avg_block_size(const int pos_h, - const int pos_w, - const int pool_size, - const int pad_h, - const int pad_w, - const int h_size, - const int w_size) { - const int h_start = max(0, pos_h - pad_h); - const int w_start = max(0, pos_w - pad_w); - const int h_end = min(pos_h + pool_size - pad_h, h_size); - const int w_end = min(pos_w + pool_size - pad_w, w_size); +inline int calculate_avg_block_size(const int pool_size, + const int pos_h, + const int pos_w, + const int h_size, + const int w_size) { + const int h_start = max(0, pos_h); + const int w_start = max(0, pos_w); + const int h_end = min(pos_h + pool_size, h_size); + const int w_end = min(pos_w + pool_size, w_size); return (h_end - h_start) * (w_end - w_start); } // Supported data type: half/float -__kernel void poolingn(__global const DATA_TYPE *input, /* n, c, h, w */ - __private const int in_height, - __private const int in_width, - __private const int out_chan_num, - __private const int out_height, - __private const int out_width, - __private const int stride, - __private const int pad_h, - __private const int pad_w, - __private const int pooling_size, - __global DATA_TYPE *output) { - int batch = get_global_id(0); - int out_chan_idx = get_global_id(1); - int out_pixel_idx = get_global_id(2); - - const int out_pixel_height = out_pixel_idx / out_width; - const int out_pixel_width = out_pixel_idx % out_width; - - const int out_chan_begin = out_chan_idx * 4; - const int out_chan_end = min(out_chan_begin + 4, out_chan_num); - const int in_pixel_idx = out_pixel_height * stride * in_width - + out_pixel_width * stride; - - const int in_pixel = in_height * in_width; - const int out_pixel = out_height * out_width; - - const int in_offset = batch * out_chan_num * in_pixel; - const int out_offset = batch * out_chan_num * out_pixel; - const DATA_TYPE *input_base = input + in_offset + in_pixel_idx; - DATA_TYPE *output_base = output + out_offset + out_pixel_idx; - - const int block_size = calculate_avg_block_size( - out_pixel_height * stride, - out_pixel_width * stride, - pooling_size, - pad_h/2, - pad_w/2, - in_height - pad_h, - in_width - pad_w); - for (int i = out_chan_begin; i < out_chan_end; ++i) { - VEC_DATA_TYPE(DATA_TYPE,8) 
sum8 = 0.0f; - DATA_TYPE sum1 = 0.0f; - DATA_TYPE *output_ptr = output_base + i * out_pixel; - for (int y = 0; y < pooling_size; ++y) { - const DATA_TYPE *input_ptr = input_base + i * in_pixel + y * in_width; - int x = 0; - for (; x < (pooling_size-8); x += 8) { - VEC_DATA_TYPE(DATA_TYPE,8) data = vload8(0, input_ptr); - sum8 += data; - input_ptr += 8; - } - for (; x < pooling_size; ++x) { - sum1 += *input_ptr; - input_ptr++; +__kernel void pooling(__read_only image2d_t input, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int pad_top, + __private const int pad_left, + __private const int stride, + __private const int pooling_size, + __write_only image2d_t output) { + const int out_chan_idx = get_global_id(0); + const int out_width_idx = get_global_id(1); + const int out_width = get_global_size(1); + const int out_hb_idx = get_global_id(2); + + const int batch_idx = (out_hb_idx / out_height) * in_height; + const int in_height_start = (out_hb_idx % out_height) * stride - pad_top; + const int in_width_start = out_width_idx * stride - pad_left; + const int in_channel_offset = out_chan_idx * in_width; + + +#ifdef POOL_AVG + DATA_TYPE4 res = 0; + for (int height = 0; height < pooling_size; ++height) { + int in_height_idx = in_height_start + height; + in_height_idx = select(batch_idx + in_height_idx, + -1, + (in_height_idx < 0 || in_height_idx >= in_height)); + for (int width = 0; width < pooling_size; ++width) { + int in_width_idx = in_width_start + width; + in_width_idx = select(in_channel_offset + in_width_idx, + -1, + (in_width_idx < 0 || in_width_idx >= in_width)); + + DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx)); + res = res + in; + } + } + const int block_size = calculate_avg_block_size(pooling_size, + in_height_start, in_width_start, + in_height, in_width); + res /= block_size; +#else + DATA_TYPE4 res = (DATA_TYPE4)(MIN_VALUE); + for (int height = 0; height < pooling_size; ++height) { + int in_height_idx = in_height_start + height; + in_height_idx = select(batch_idx + in_height_idx, + -1, + (in_height_idx < 0 || in_height_idx >= in_height)); + if (in_height_idx != -1) { + for (int width = 0; width < pooling_size; ++width) { + int in_width_idx = in_width_start + width; + in_width_idx = select(in_channel_offset + in_width_idx, + -1, + (in_width_idx < 0 || in_width_idx >= in_width)); + + if (in_width_idx != -1) { + DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx)); + res = fmax(res, in); + } } } - VEC_DATA_TYPE(DATA_TYPE,4) sum4 = sum8.s0123 + sum8.s4567; - VEC_DATA_TYPE(DATA_TYPE,2) sum2 = sum4.s01 + sum4.s23; - - *output_ptr = (sum2.s0 + sum2.s1 + sum1) / block_size; } +#endif + + WRITE_IMAGET(output, (int2)(out_chan_idx * out_width + out_width_idx, out_hb_idx), res); } diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 24bf90a1..7b7453ad 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -38,7 +38,6 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, auto program = runtime->program(); auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options); - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); uint32_t idx = 0; conv_2d_kernel.setArg(idx++, *(static_cast(input->buffer()))); diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index 
fb9216f7..349c6195 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -10,131 +10,94 @@ namespace mace { namespace kernels { -static void Pooling3(const Tensor *input, - const int *stride, - const PoolingType type, - Tensor *output) { - if (type != MAX) { - MACE_NOT_IMPLEMENTED; - } +static void Pooling(const Tensor *input, + const int *stride, + const int *paddings, + const int pooling_size, + const PoolingType type, + const DataType dt, + Tensor *output) { index_t batch = output->dim(0); - index_t channels = output->dim(1); - index_t out_height = output->dim(2); - index_t out_width = output->dim(3); + index_t out_height = output->dim(1); + index_t out_width = output->dim(2); + index_t channels = output->dim(3); - index_t channel_blk = (channels + 3) / 4; - const index_t pixel_width = (out_width + 3) / 4 ; + index_t channel_blocks = (channels + 3) / 4; const uint32_t gws[3] = { - static_cast(batch), - static_cast(channel_blk), - static_cast(pixel_width * out_height), + static_cast(channel_blocks), + static_cast(out_width), + static_cast(batch * out_height), }; auto runtime = OpenCLRuntime::Get(); std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype())); - built_options.emplace(stride[0] == 1 ? "-DSTRIDE_1" : ""); - auto pooling_kernel = runtime->BuildKernel("pooling", "pooling3", built_options); + if (type == MAX && input->dtype() == output->dtype()) { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + built_options.emplace(dt == DT_HALF ? "-DFP16" : ""); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + } + if (type == AVG) { + built_options.emplace("-DPOOL_AVG"); + } + auto pooling_kernel = runtime->BuildKernel("pooling", "pooling", built_options); + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel); - const uint32_t lws[3] = {1, 8, 128}; + uint32_t lws[3]; + lws[0] = std::min(channel_blocks, kwg_size); + lws[1] = std::min(out_width, kwg_size / lws[0]); + lws[2] = std::min(out_height * batch, kwg_size / (lws[0] * lws[1])); uint32_t idx = 0; - pooling_kernel.setArg(idx++, *(static_cast(input->buffer()))); + pooling_kernel.setArg(idx++, *(static_cast(input->buffer()))); + pooling_kernel.setArg(idx++, static_cast(input->dim(1))); pooling_kernel.setArg(idx++, static_cast(input->dim(2))); - pooling_kernel.setArg(idx++, static_cast(input->dim(3))); - pooling_kernel.setArg(idx++, static_cast(channels)); pooling_kernel.setArg(idx++, static_cast(out_height)); - pooling_kernel.setArg(idx++, static_cast(out_width)); + pooling_kernel.setArg(idx++, paddings[0] / 2); + pooling_kernel.setArg(idx++, paddings[1] / 2); pooling_kernel.setArg(idx++, stride[0]); - pooling_kernel.setArg(idx++, *(static_cast(output->buffer()))); + pooling_kernel.setArg(idx++, pooling_size); + pooling_kernel.setArg(idx++, *(static_cast(output->buffer()))); cl_int error = runtime->command_queue().enqueueNDRangeKernel( pooling_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(lws[0], lws[1], lws[2]), NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS); + MACE_CHECK(error == CL_SUCCESS) << error; } -static void PoolingN(const Tensor *input, - const int *stride, - const int *paddings, - const int pooling_size, - const PoolingType type, - Tensor *output) { - if (type != AVG) { - MACE_NOT_IMPLEMENTED; - 
} - index_t batch = output->dim(0); - index_t channels = output->dim(1); - index_t out_height = output->dim(2); - index_t out_width = output->dim(3); - - index_t channel_blk = (channels + 3) / 4; - const uint32_t gws[3] = { - static_cast(batch), - static_cast(channel_blk), - static_cast(out_height * out_width), +template +void PoolingFunctor::operator()(const Tensor *input, + Tensor *output) { + MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) << "Pooling opencl kernel not support dilation yet"; + std::vector output_shape(4); + std::vector paddings(2); + std::vector filter_shape = { + kernels_[0], kernels_[1], + input->dim(3), input->dim(3) }; - auto runtime = OpenCLRuntime::Get(); - std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype())); - auto pooling_kernel = runtime->BuildKernel("pooling", "poolingn", built_options); + kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter_shape.data(), + dilations_, strides_, this->padding_, + output_shape.data(), paddings.data()); - const uint32_t lws[3] = {1, 8, 128}; + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + output->ResizeImage(output_shape, output_image_shape); - uint32_t idx = 0; - pooling_kernel.setArg(idx++, *(static_cast(input->buffer()))); - pooling_kernel.setArg(idx++, static_cast(input->dim(2))); - pooling_kernel.setArg(idx++, static_cast(input->dim(3))); - pooling_kernel.setArg(idx++, static_cast(channels)); - pooling_kernel.setArg(idx++, static_cast(out_height)); - pooling_kernel.setArg(idx++, static_cast(out_width)); - pooling_kernel.setArg(idx++, stride[0]); - pooling_kernel.setArg(idx++, paddings[0]); - pooling_kernel.setArg(idx++, paddings[1]); - pooling_kernel.setArg(idx++, pooling_size); - pooling_kernel.setArg(idx++, *(static_cast(output->buffer()))); + Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_, + DataTypeToEnum::value, output); - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - pooling_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), - NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS); -} - -template <> -void PoolingFunctor::operator()(const Tensor *input, - Tensor *output) { - int paddings[2]; - std::vector filter_shape = {input->dim(1), input->dim(0), - kernels_[0], kernels_[1]}; - kernels::CalPaddingSize(input->shape().data(), filter_shape.data(), this->dilations_, - strides_, this->padding_, paddings); -#define POOLING_HELPER \ - switch(kernels_[0]) { \ - case 3: \ - Pooling3(input, strides_, pooling_type_, output); \ - break; \ - default: \ - PoolingN(input, strides_, paddings, kernels_[0], \ - pooling_type_, output); \ - break; \ - } - - if (paddings[0] > 0 || paddings[1] > 0) { - Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum::v()); - ConstructInputWithPadding(input, paddings, &padded_input, pooling_type_ == MAX); - input = &padded_input; - POOLING_HELPER - } else { - POOLING_HELPER - } -#undef POOLING_HELPER } +template +struct PoolingFunctor; +template +struct PoolingFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h index 11c05e47..0a1960a4 100644 --- a/mace/kernels/pooling.h +++ b/mace/kernels/pooling.h @@ -18,36 +18,66 @@ enum PoolingType { namespace kernels { -template -struct PoolingFunctor { - PoolingFunctor(const PoolingType pooling_type, - const int *kernels, - const int 
*strides, - const Padding padding, - const int *dilations) +struct PoolingFunctorBase { + PoolingFunctorBase(const PoolingType pooling_type, + const int *kernels, + const int *strides, + const Padding padding, + const int *dilations) : pooling_type_(pooling_type), kernels_(kernels), strides_(strides), padding_(padding), dilations_(dilations) {} + const PoolingType pooling_type_; + const int *kernels_; + const int *strides_; + const Padding padding_; + const int *dilations_; +}; + +template +struct PoolingFunctor : PoolingFunctorBase { + PoolingFunctor(const PoolingType pooling_type, + const int *kernels, + const int *strides, + const Padding padding, + const int *dilations) + : PoolingFunctorBase(pooling_type, kernels, + strides, padding, + dilations) {} + void operator()(const Tensor *input_tensor, Tensor *output_tensor) { + + std::vector output_shape(4); + std::vector paddings(2); + std::vector filter_shape = { + kernels_[0], kernels_[1], + input_tensor->dim(3), input_tensor->dim(3) + }; + + kernels::CalcNHWCPaddingAndOutputSize( + input_tensor->shape().data(), filter_shape.data(), + dilations_, strides_, this->padding_, + output_shape.data(), paddings.data()); + output_tensor->Resize(output_shape); + Tensor::MappingGuard in_guard(input_tensor); Tensor::MappingGuard out_guard(output_tensor); const T *input = input_tensor->data(); T *output = output_tensor->mutable_data(); const index_t *input_shape = input_tensor->shape().data(); - const index_t *output_shape = output_tensor->shape().data(); index_t batch = output_shape[0]; - index_t channels = output_shape[1]; - index_t height = output_shape[2]; - index_t width = output_shape[3]; + index_t height = output_shape[1]; + index_t width = output_shape[2]; + index_t channels = output_shape[3]; index_t out_image_size = height * width; - index_t input_channels = input_shape[1]; - index_t input_height = input_shape[2]; - index_t input_width = input_shape[3]; + index_t input_height = input_shape[1]; + index_t input_width = input_shape[2]; + index_t input_channels = input_shape[3]; index_t in_image_size = input_height * input_width; int kernel_h = kernels_[0]; @@ -59,11 +89,6 @@ struct PoolingFunctor { int dilation_h = dilations_[0]; int dilation_w = dilations_[1]; - int paddings[2]; - std::vector filter_shape = {input_shape[1], input_shape[0], - kernels_[0], kernels_[1]}; - kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_, - strides_, this->padding_, paddings); // The left-upper most offset of the padded input int padded_h_start = 0 - paddings[0] / 2; int padded_w_start = 0 - paddings[1] / 2; @@ -71,25 +96,24 @@ struct PoolingFunctor { if (pooling_type_ == MAX) { #pragma omp parallel for collapse(2) for (int b = 0; b < batch; ++b) { - for (int c = 0; c < channels; ++c) { - index_t out_offset = (b * channels + c) * out_image_size; - index_t in_offset = (b * input_channels + c) * in_image_size; - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - T max = std::numeric_limits::lowest(); + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + for (int c = 0; c < channels; ++c) { + index_t in_offset = b * in_image_size * input_channels + c; + T res = std::numeric_limits::lowest(); for (int kh = 0; kh < kernel_h; ++kh) { for (int kw = 0; kw < kernel_w; ++kw) { int inh = padded_h_start + h * stride_h + dilation_h * kh; int inw = padded_w_start + w * stride_w + dilation_w * kw; if (inh >= 0 && inh < input_height && inw >= 0 && inw < input_width) { - index_t input_offset = in_offset + inh * 
input_width + inw; - max = std::max(max, input[input_offset]); + index_t input_offset = in_offset + (inh * input_width + inw) * input_channels; + res = std::max(res, input[input_offset]); } } } - output[out_offset] = max; - out_offset += 1; + *output = res; + output++; } } } @@ -97,11 +121,10 @@ struct PoolingFunctor { } else if (pooling_type_ == AVG) { #pragma omp parallel for collapse(2) for (int b = 0; b < batch; ++b) { - for (int c = 0; c < channels; ++c) { - index_t out_offset = (b * channels + c) * out_image_size; - index_t in_offset = (b * input_channels + c) * in_image_size; - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + for (int c = 0; c < channels; ++c) { + index_t in_offset = b * in_image_size * input_channels + c; T sum = 0; int block_size = 0; for (int kh = 0; kh < kernel_h; ++kh) { @@ -110,14 +133,14 @@ struct PoolingFunctor { int inw = padded_w_start + w * stride_w + dilation_w * kw; if (inh >= 0 && inh < input_height && inw >= 0 && inw < input_width) { - index_t input_offset = in_offset + inh * input_width + inw; + index_t input_offset = in_offset + (inh * input_width + inw) * input_channels; sum += input[input_offset]; block_size += 1; } } } - output[out_offset] = sum / block_size; - out_offset += 1; + *output = sum / block_size; + output++; } } } @@ -125,22 +148,26 @@ struct PoolingFunctor { } } - const PoolingType pooling_type_; - const int *kernels_; - const int *strides_; - const Padding padding_; - const int *dilations_; }; -template <> +template<> void PoolingFunctor::operator()( const Tensor *input_tensor, Tensor *output_tensor); -template <> -void PoolingFunctor::operator()( - const Tensor *input_tensor, - Tensor *output_tensor); +template +struct PoolingFunctor : PoolingFunctorBase { + PoolingFunctor(const PoolingType pooling_type, + const int *kernels, + const int *strides, + const Padding padding, + const int *dilations) + : PoolingFunctorBase(pooling_type, kernels, + strides, padding, + dilations) {} + void operator()(const Tensor *input_tensor, + Tensor *output_tensor); +}; } // namespace kernels } // namespace mace diff --git a/mace/ops/pooling.h b/mace/ops/pooling.h index f62992f5..bbc653ab 100644 --- a/mace/ops/pooling.h +++ b/mace/ops/pooling.h @@ -27,21 +27,6 @@ class PoolingOp : public ConvPool2dOpBase { const Tensor *input = this->Input(INPUT); Tensor *output = this->Output(OUTPUT); - std::vector output_shape(4); - std::vector paddings(2); - std::vector filter_shape(4); - // TODO(chenghui): is it kind of a hack? 
- filter_shape[0] = input->shape()[1]; - filter_shape[1] = input->shape()[0]; - filter_shape[2] = kernels_[0]; - filter_shape[3] = kernels_[1]; - - kernels::CalcPaddingAndOutputSize( - input->shape().data(), filter_shape.data(), this->dilations_.data(), - this->strides_.data(), this->padding_, output_shape.data(), - paddings.data()); - output->Resize(output_shape); - functor_(input, output); return true; }; diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index bf2b1824..c02c976f 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -28,48 +28,20 @@ TEST_F(PoolingOpTest, MAX_VALID) { // Add input data net.AddInputFromArray( - "Input", {1, 2, 4, 4}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); + "Input", {1, 4, 4, 2}, + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); // Run net.RunOp(); // Check auto expected = - CreateTensor({1, 2, 2, 2}, {5, 7, 13, 15, 21, 23, 29, 31}); + CreateTensor({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } -TEST_F(PoolingOpTest, AVG_VALID) { - // Construct graph - auto &net = test_net(); - OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") - .AddIntsArg("kernels", {2, 2}) - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("pooling_type", PoolingType::AVG) - .Finalize(net.NewOperatorDef()); - - // Add input data - net.AddInputFromArray( - "Input", {1, 2, 4, 4}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); - - // Run - net.RunOp(); - - // Check - auto expected = CreateTensor( - {1, 2, 2, 2}, {2.5, 4.5, 10.5, 12.5, 18.5, 20.5, 26.5, 28.5}); - - ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); -} TEST_F(PoolingOpTest, MAX_SAME) { // Construct graph @@ -85,14 +57,14 @@ TEST_F(PoolingOpTest, MAX_SAME) { .Finalize(net.NewOperatorDef()); // Add input data - net.AddInputFromArray("Input", {1, 1, 3, 3}, - {0, 1, 2, 3, 4, 5, 6, 7, 8}); + net.AddInputFromArray("Input", {1, 3, 3, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8}); // Run net.RunOp(); // Check - auto expected = CreateTensor({1, 1, 2, 2}, {4, 5, 7, 8}); + auto expected = CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } @@ -112,14 +84,14 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { // Add input data net.AddInputFromArray( - "Input", {1, 1, 4, 4}, + "Input", {1, 4, 4, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); // Run net.RunOp(); // Check - auto expected = CreateTensor({1, 1, 2, 2}, {10, 11, 14, 15}); + auto expected = CreateTensor({1, 2, 2, 1}, {10, 11, 14, 15}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } @@ -139,42 +111,57 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { // Add input data net.AddInputFromArray( - "Input", {1, 1, 2, 9}, + "Input", {1, 2, 9, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); // Run - net.RunOp(DeviceType::NEON); + net.RunOp(); // Check - auto expected = CreateTensor({1, 1, 1, 5}, {10, 12, 14, 16, 17}); + auto expected = CreateTensor({1, 1, 5, 1}, {10, 12, 14, 16, 17}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } - -template +template static void SimpleMaxPooling3S2() { // Construct graph OpsTestNet net; - OpDefBuilder("Pooling", 
"PoolingTest") - .Input("Input") - .Output("Output") - .AddIntArg("pooling_type", PoolingType::MAX) - .AddIntsArg("kernels", {3, 3}) - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( - "Input", {1, 1, 3, 9}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + "Input", {1, 3, 9, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}); - // Run - net.RunOp(D); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + OpDefBuilder("Pooling", "PoolingTest") + .Input("InputImage") + .Output("OutputImage") + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + net.RunOp(D); + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + } else { + // Run + OpDefBuilder("Pooling", "PoolingTest") + .Input("Input") + .Output("Output") + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + net.RunOp(D); + } // Check - auto expected = CreateTensor({1, 1, 1, 4}, {20, 22, 24, 26}); + auto expected = CreateTensor({1, 1, 4, 1}, {20, 22, 24, 26}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } @@ -182,15 +169,15 @@ static void SimpleMaxPooling3S2() { TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) { SimpleMaxPooling3S2(); } -TEST_F(PoolingOpTest, NEONSimpleMaxPooling3S2) { - SimpleMaxPooling3S2(); -} + TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) { SimpleMaxPooling3S2(); } -template -static void AlignedMaxPooling3S2(Padding padding) { +template +static void MaxPooling3S2(const std::vector &input_shape, + const std::vector strides, + Padding padding) { // Construct graph OpsTestNet net; OpDefBuilder("Pooling", "PoolingTest") @@ -198,22 +185,33 @@ static void AlignedMaxPooling3S2(Padding padding) { .Output("Output") .AddIntArg("pooling_type", PoolingType::MAX) .AddIntsArg("kernels", {3, 3}) - .AddIntsArg("strides", {2, 2}) + .AddIntsArg("strides", strides) .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {3, 128, 64, 64}); - // Run - net.RunOp(D); + net.AddRandomInput("Input", input_shape); + + // run on cpu + net.RunOp(); Tensor expected; expected.Copy(*net.GetOutput("Output")); - // Run on cpu - net.RunOp(); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + OpDefBuilder("Pooling", "PoolingTest") + .Input("InputImage") + .Output("OutputImage") + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", strides) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + net.RunOp(D); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); - ExpectTensorNear(*net.GetOutput("Output"), expected, 0.001); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); } // TODO(chenghui) : there is a bug. 
@@ -223,152 +221,140 @@ static void AlignedMaxPooling3S2(Padding padding) { //} TEST_F(PoolingOpTest, OPENCLAlignedMaxPooling3S2) { - AlignedMaxPooling3S2(Padding::VALID); - AlignedMaxPooling3S2(Padding::SAME); + MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::VALID); + MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::VALID); + MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::SAME); + MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::SAME); } -template -static void UnalignedMaxPooling3S2(Padding padding) { +TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) { + MaxPooling3S2({3, 41, 43, 47}, {1, 1}, Padding::VALID); + MaxPooling3S2({3, 41, 43, 47}, {2, 2}, Padding::VALID); + MaxPooling3S2({3, 41, 43, 47}, {1, 1}, Padding::SAME); + MaxPooling3S2({3, 41, 43, 47}, {2, 2}, Padding::SAME); +} + +TEST_F(PoolingOpTest, AVG_VALID) { // Construct graph - OpsTestNet net; + auto &net = test_net(); OpDefBuilder("Pooling", "PoolingTest") .Input("Input") .Output("Output") - .AddIntArg("pooling_type", PoolingType::MAX) - .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("kernels", {2, 2}) .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", padding) + .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("pooling_type", PoolingType::AVG) .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {3, 113, 43, 47}); - // Run - net.RunOp(D); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + net.AddInputFromArray( + "Input", {1, 4, 4, 2}, + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); - // Run on cpu + // Run net.RunOp(); - ExpectTensorNear(*net.GetOutput("Output"), expected, 0.001); -} - -// TODO(chenghui) : there is a bug. -//TEST_F(PoolingOpTest, NEONUnalignedMaxPooling3S2) { -// UnalignedMaxPooling3S2(); -//} + // Check + auto expected = CreateTensor( + {1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5}); -TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) { - UnalignedMaxPooling3S2(Padding::VALID); - UnalignedMaxPooling3S2(Padding::SAME); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } -template +template static void SimpleAvgPoolingTest() { // Construct graph OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 2, 8, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") + .Input("InputImage") + .Output("OutputImage") .AddIntArg("pooling_type", PoolingType::AVG) .AddIntsArg("kernels", {2, 2}) .AddIntsArg("strides", {2, 2}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); - - // Add input data - net.AddInputFromArray( - "Input", {1, 1, 2, 8}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); // Run net.RunOp(D); + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); // Check - auto expected = CreateTensor({1, 1, 1, 4}, {4.5, 6.5, 8.5, 10.5}); + auto expected = CreateTensor({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } -TEST_F(PoolingOpTest, NEONSimpleAvgPooling) { - SimpleAvgPoolingTest(); -} - TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) { SimpleAvgPoolingTest(); } -template -static void AlignedAvgPoolingTest(Padding padding) { +template +static void AvgPoolingTest(const std::vector &shape, + const std::vector 
&kernels, + const std::vector &strides, + Padding padding) { // Construct graph OpsTestNet net; OpDefBuilder("Pooling", "PoolingTest") .Input("Input") .Output("Output") .AddIntArg("pooling_type", PoolingType::AVG) - .AddIntsArg("kernels", {4, 4}) - .AddIntsArg("strides", {4, 4}) + .AddIntsArg("kernels", kernels) + .AddIntsArg("strides", strides) .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {3, 128, 15, 15}); - // Run - net.RunOp(D); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + net.AddRandomInput("Input", shape); - // Run on cpu + // run on cpu net.RunOp(); + Tensor expected; + expected.Copy(*net.GetOutput("Output")); - ExpectTensorNear(*net.GetOutput("Output"), expected, 1e-5); -} - -TEST_F(PoolingOpTest, NEONAlignedAvgPooling) { - AlignedAvgPoolingTest(Padding::VALID); - AlignedAvgPoolingTest(Padding::SAME); -} - -TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) { - AlignedAvgPoolingTest(Padding::VALID); - AlignedAvgPoolingTest(Padding::SAME); -} - -template -static void UnAlignedAvgPoolingTest(Padding padding) { - // Construct graph - OpsTestNet net; + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") + .Input("InputImage") + .Output("OutputImage") .AddIntArg("pooling_type", PoolingType::AVG) - .AddIntsArg("kernels", {7, 7}) - .AddIntsArg("strides", {7, 7}) + .AddIntsArg("kernels", kernels) + .AddIntsArg("strides", strides) .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); - - // Add input data - net.AddRandomInput("Input", {3, 128, 31, 37}); - // Run net.RunOp(D); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); - // Run on cpu - net.RunOp(); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); +} - ExpectTensorNear(*net.GetOutput("Output"), expected, 1e-5); +TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) { + AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID); + AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME); } -TEST_F(PoolingOpTest, NEONUnAlignedAvgPooling) { - UnAlignedAvgPoolingTest(Padding::VALID); - UnAlignedAvgPoolingTest(Padding::SAME); +TEST_F(PoolingOpTest, OPENCLAlignedLargeKernelAvgPooling) { + AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::VALID); + AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::SAME); } TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) { - UnAlignedAvgPoolingTest(Padding::VALID); - UnAlignedAvgPoolingTest(Padding::SAME); + AvgPoolingTest({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::VALID); + AvgPoolingTest({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::SAME); } + +TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) { + AvgPoolingTest({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::VALID); + AvgPoolingTest({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::SAME); +} + -- GitLab
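
Note on the shape arithmetic used by the new NHWC path: PoolingFunctor now calls kernels::CalcNHWCPaddingAndOutputSize to get the output shape and total paddings, and the OpenCL kernel receives paddings[0] / 2 and paddings[1] / 2 as pad_top / pad_left. The sketch below is a self-contained illustration of the usual TensorFlow-style SAME/VALID formulas that this computation corresponds to; the helper names (CalcSame, CalcValid) are made up for the example and this is not the MACE implementation itself.

    #include <algorithm>
    #include <cstdio>

    // One spatial axis of the pooling shape computation.
    struct PoolShape {
      int out_size;   // output extent along this axis
      int pad_total;  // total padding along this axis (kernel gets pad_total / 2 as pad_top/pad_left)
    };

    // SAME: output covers ceil(in / stride) positions; pad just enough to make that legal.
    PoolShape CalcSame(int in_size, int kernel, int stride, int dilation) {
      const int eff_kernel = (kernel - 1) * dilation + 1;
      const int out_size = (in_size + stride - 1) / stride;  // ceil(in / stride)
      const int pad_total =
          std::max(0, (out_size - 1) * stride + eff_kernel - in_size);
      return {out_size, pad_total};
    }

    // VALID: no padding; only fully covered windows contribute.
    PoolShape CalcValid(int in_size, int kernel, int stride, int dilation) {
      const int eff_kernel = (kernel - 1) * dilation + 1;
      const int out_size = (in_size - eff_kernel) / stride + 1;
      return {out_size, 0};
    }

    int main() {
      // Example matching the 3x3 kernel, stride 2 test cases above, for a 41-wide axis.
      PoolShape same = CalcSame(41, 3, 2, 1);
      PoolShape valid = CalcValid(41, 3, 2, 1);
      std::printf("SAME : out=%d pad=%d\n", same.out_size, same.pad_total);    // SAME : out=21 pad=2
      std::printf("VALID: out=%d pad=%d\n", valid.out_size, valid.pad_total);  // VALID: out=20 pad=0
      return 0;
    }

With pad_total = 2 the kernel would be launched with pad_top (or pad_left) = 1, which is consistent with the paddings[i] / 2 arguments set in pooling_opencl.cc above.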